# Installation



In [2]:

import tensorflow_hub as hub
import pandas as pd
import common_code
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

%load_ext autoreload
%autoreload 2


# Load the USE model and the dataset



In [3]:
# Read the raw train / test splits (comma-separated CSV files).
df_train = pd.read_csv("data/train.csv", sep=",")
df_test = pd.read_csv("data/test.csv", sep=",")

# Load the Universal Sentence Encoder (version 4) from TensorFlow Hub.
# Warning: downloading this model might take 5+ minutes.
use_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")














In [4]:
# Toggle for where the cleaned text comes from:
#   False -> run common_code.clean_text_wrapper on the frames loaded earlier
#   True  -> read the cleaned CSV files from the data/ directory instead
USE_EXISTING_CLEANED_FILES = False

if not USE_EXISTING_CLEANED_FILES:
    df_train, df_test = common_code.clean_text_wrapper(df_train, df_test)
else:
    df_train = pd.read_csv("data/cleaned_train.csv")
    df_test = pd.read_csv("data/cleaned_test.csv")

Start cleaning...
done!


## Sentence Encoder ##

In [5]:
def sentences_encoder(col):
    """Encode a column of sentences with the global USE model.

    Args:
        col: Object exposing ``tolist()`` (e.g. a pandas Series); each
            element is passed to the model as one sentence.

    Returns:
        The value returned by ``use_model`` for the list of sentences.
        For the model loaded in this notebook, indexing the result gives
        one 512-dimensional float32 tensor per sentence.
    """
    return use_model(col.tolist())

In [6]:

# Encode the cleaned text of each split into USE sentence vectors.
train_sentence_vectors = sentences_encoder(df_train['cleaned_text'])
# The test vectors are built from df_test (not df_train) so that each vector
# corresponds to a row of the test set.
test_sentence_vectors = sentences_encoder(df_test['cleaned_text'])

# Show the embedding of the first training sentence as a quick sanity check.
print("the first sentence encoded result:")
train_sentence_vectors[0]

the first sentence encoded result:


<tf.Tensor: shape=(512,), dtype=float32, numpy=
array([ 1.52491881e-02,  2.40026116e-02, -4.81963307e-02,  4.22714651e-02,
        6.96127266e-02,  1.31124798e-02,  1.23990076e-02, -6.05101418e-03,
       -2.40373928e-02,  6.81885630e-02,  1.65025219e-02, -6.16021566e-02,
        8.09205137e-03, -4.74354252e-02,  2.16256715e-02, -9.46175158e-02,
        5.27474517e-03, -2.02022884e-02,  3.91790159e-02, -7.39929378e-02,
        4.05305214e-02,  2.56662313e-02,  6.91196322e-02,  6.75732456e-03,
       -1.03321504e-02,  4.63298485e-02,  5.33153601e-02, -7.00938404e-02,
        7.46023422e-03,  3.92535664e-02,  3.54592688e-02,  3.92078534e-02,
        2.56102644e-02,  3.53632942e-02, -2.08516866e-02,  1.05187576e-02,
        4.96120565e-03, -3.78673477e-03, -3.78143303e-02, -5.01396973e-03,
        1.96129121e-02,  4.35654045e-04,  8.30168091e-03, -8.42987597e-02,
       -1.20632192e-02, -7.46258870e-02, -7.25122690e-02,  5.99875972e-02,
       -2.88609471e-02,  5.33716269e-02,  4.09351587

## Convert Y labels to numeric values ##

In [7]:
# Map the string labels in df_train['Label'] to integer class ids and store
# them in a new "upt_label" column.
le = LabelEncoder()
df_train["upt_label"] = le.fit_transform(df_train['Label'])

# Lookup tables in both directions: class name -> id, and id -> class name.
le_name_mapping = {
    name: code for name, code in zip(le.classes_, le.transform(le.classes_))
}
le_label_mapping = {code: name for name, code in le_name_mapping.items()}

print("name maps to numeric label: ", le_name_mapping)
print("numeric label maps to name: ", le_label_mapping)

name maps to numeric label:  {'amusement': 0, 'anger': 1, 'disapproval': 2, 'gratitude': 3, 'love': 4}
numeric label maps to name:  {0: 'amusement', 1: 'anger', 2: 'disapproval', 3: 'gratitude', 4: 'love'}


# Train and Create the model #
## Logistic Regression ##

In [8]:
# Hold out 10% of the training rows as a verification set.
# The row positions are split together with the features and labels so each
# sample in either part can be traced back to its position in df_train.
indices = np.arange(len(df_train))
(
    X_train, X_veri,
    y_train, y_veri,
    indices_train, indices_veri,
) = train_test_split(
    pd.DataFrame(train_sentence_vectors),
    df_train['upt_label'],
    indices,
    test_size=0.1,
    random_state=42,
)

print("Training Data shape:\t\t", X_train.shape, y_train.shape)
print("Verification Data shape:\t", X_veri.shape, y_veri.shape)


Training Data shape:		 (6750, 512) (6750,)
Verification Data shape:	 (750, 512) (750,)


In [9]:

# Logistic regression on the USE sentence vectors.
# The "saga" solver shuffles the data while fitting, so random_state is pinned
# (same seed as the train/verification split and the random forest) to keep
# the fitted model repeatable from run to run.
log_model = LogisticRegression(C=0.85, solver="saga", max_iter=1000, random_state=42)
log_model.fit(X_train, y_train)

# Score the model on the held-out verification split.
y_veri_pred = log_model.predict(X_veri)

print('\n------------------------------')
print("Logistic Regression accuracy: {:.2f}".format(metrics.accuracy_score(y_veri, y_veri_pred)))
print("Logistic F1 Score micro:", metrics.f1_score(y_veri, y_veri_pred, average='micro'))
print("Logistic F1 Score macro:", metrics.f1_score(y_veri, y_veri_pred, average='macro'))
print("Logistic confusion matrix:")
# Rows are true labels, columns are predicted labels.
print(metrics.confusion_matrix(y_veri, y_veri_pred))


------------------------------
Logistic Regression accuracy: 0.77
Logistic F1 Score micro: 0.7706666666666667
Logistic F1 Score macro: 0.7600537543052039
Logistic confusion matrix:
[[116  11  11   6   5]
 [  7  66  35   4   4]
 [ 12  20 123   4   7]
 [  3   5   7 161   5]
 [  7   6   9   4 112]]


## SVM ##

In [10]:
from sklearn.svm import SVC

# Fit a support vector classifier with a linear kernel on the training split.
svm_model = SVC(kernel='linear').fit(X_train, y_train)

# Predict on both splits so training and verification accuracy can be compared.
y_train_pred = svm_model.predict(X_train)
y_veri_pred = svm_model.predict(X_veri)

# Compute every metric first, then print the report in one go.
svm_train_accuracy = metrics.accuracy_score(y_train, y_train_pred)
svm_veri_accuracy = metrics.accuracy_score(y_veri, y_veri_pred)
svm_f1_micro = metrics.f1_score(y_veri, y_veri_pred, average='micro')
svm_f1_macro = metrics.f1_score(y_veri, y_veri_pred, average='macro')
svm_confusion = metrics.confusion_matrix(y_veri, y_veri_pred)

print("SVC Train accuracy: {:.2f}".format(svm_train_accuracy))
# NOTE: despite the "Regression" wording in the label, this line reports the
# accuracy on the verification split.
print("SVC Regression accuracy: {:.2f}".format(svm_veri_accuracy))
print("SVC F1 Score micro:", svm_f1_micro)
print("SVC F1 Score macro:", svm_f1_macro)
print("SVC confusion matrix:")
print(svm_confusion)
print('------------------------------')

SVC Train accuracy: 0.81
SVC Regression accuracy: 0.76
SVC F1 Score micro: 0.7573333333333333
SVC F1 Score macro: 0.7491267102955541
SVC confusion matrix:
[[113  13  10   6   7]
 [  6  69  35   4   2]
 [ 13  21 120   6   6]
 [  6   7   7 155   6]
 [  7   7   9   4 111]]
------------------------------


## Random Forest ##

In [12]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 100 trees; the seed is fixed so the fit is repeatable.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict on both splits so training and verification accuracy can be compared.
y_train_pred = rf_model.predict(X_train)
y_veri_pred = rf_model.predict(X_veri)

# Compute every metric first, then print the report in one go.
rf_train_accuracy = metrics.accuracy_score(y_train, y_train_pred)
rf_veri_accuracy = metrics.accuracy_score(y_veri, y_veri_pred)
rf_f1_micro = metrics.f1_score(y_veri, y_veri_pred, average='micro')
rf_f1_macro = metrics.f1_score(y_veri, y_veri_pred, average='macro')
rf_confusion = metrics.confusion_matrix(y_veri, y_veri_pred)

print("Randomforest training accuracy: {:.2f}".format(rf_train_accuracy))
print("Randomforest verification accuracy: {:.2f}".format(rf_veri_accuracy))
print("Randomforest F1 Score micro:", rf_f1_micro)
print("Randomforest F1 Score macro:", rf_f1_macro)
print("Randomforest confusion matrix:")
print(rf_confusion)
print('------------------------------')

Randomforest training accuracy: 1.00
Randomforest verification accuracy: 0.73
Randomforest F1 Score micro: 0.732
Randomforest F1 Score macro: 0.7196979895334621
Randomforest confusion matrix:
[[110  11  12   8   8]
 [  4  54  47   8   3]
 [ 10  16 130   4   6]
 [  4   5  14 155   3]
 [  7   4  23   4 100]]
------------------------------


# What is USE?

The Universal Sentence Encoder (USE) is a model that encodes text into high-dimensional vectors that can be used for a variety of natural language processing (NLP) tasks. It was developed by Google AI and is available as a pre-trained model on TensorFlow Hub.


The output of the model is a 512-dimensional vector that represents the meaning of the input text.

There are two main versions of USE: one is Transformer-based and the other is DAN-based.
We are using the DAN-based version.
This model is a deep learning model that uses a technique called a deep averaging network (DAN).




The USE model can be used for a variety of NLP tasks, including:

Text classification: Categorizing text into different classes, such as news, social media, or product reviews.

Semantic similarity: Measuring the similarity between two pieces of text.

Clustering: Grouping similar pieces of text together.

Question answering: Answering questions about text.

Machine translation: Translating text from one language to another.


Unlike traditional word embeddings like Word2Vec or GloVe that generate vectors for individual words, the Universal Sentence Encoder focuses on entire sentences or short text snippets.


 This makes it particularly useful for scenarios where understanding the meaning of a full sentence in context is important.

In [19]:

# Sanity check: clean the raw CSVs again and compare the resulting test frame
# with the cleaned test file stored on disk.
# NOTE(review): this rebinds df_train / df_test to whatever
# clean_text_wrapper returns, so columns added to df_train in earlier cells
# (e.g. "upt_label") are presumably missing afterwards. Confirm before
# re-running earlier cells.
df_test_problem = pd.read_csv("data/cleaned_test.csv", encoding = "utf-8")

df_train, df_test = common_code.clean_text_wrapper(
    pd.read_csv("data/train.csv"),
    pd.read_csv("data/test.csv"),
)

# Show the freshly cleaned frame first, then the one read from disk.
display(df_test, df_test_problem)

Start cleaning...
done!


Unnamed: 0,Id,Text,cleaned_text
0,0,i love this i am big fun of cross over and Lor...,love big fun cross lore skin
1,1,FUCK LES BRUNS,fuck les brun
2,2,Anyone who can pass the test proceeds. The sta...,anyone pass test proceed . standard exist prot...
3,3,yeah you're right. i just glanced over and tho...,"ye relief , happy right . glance think swish l..."
4,4,I get abused on the daily fam,get abuse daily fam
...,...,...,...
2495,2495,"[NAME] pnr play has not been good, they are de...","pnr play not good , defend jok let get rim"
2496,2496,You k how it's bad when a furry is roasting you,k bad furry roast
2497,2497,Don't you know that kids need constant supervi...,not know kid need constant supervision turn ? ...
2498,2498,DON'T DO THE TEENAGE MUTANT NINJA TURTLES LIKE...,not teenage mutant ninja turtle like not want ...


Unnamed: 0,Id,Text,cleaned_text
0,0,i love this i am big fun of cross over and Lor...,love big fun cross lore skin
1,1,FUCK LES BRUNS,fuck les brun
2,2,Anyone who can pass the test proceeds. The sta...,anyone pass test proceed . standard exist prot...
3,3,yeah you're right. i just glanced over and tho...,"ye relief , happy right . glance think swish l..."
4,4,I get abused on the daily fam,get abuse daily fam
...,...,...,...
2495,2495,"[NAME] pnr play has not been good, they are de...","pnr play not good , defend jok let get rim"
2496,2496,You k how it's bad when a furry is roasting you,k bad furry roast
2497,2497,Don't you know that kids need constant supervi...,not know kid need constant supervision turn ? ...
2498,2498,DON'T DO THE TEENAGE MUTANT NINJA TURTLES LIKE...,not teenage mutant ninja turtle like not want ...


In [22]:
# Count the training rows per label; value_counts orders the counts from the
# most frequent label to the least frequent one.
result = df_train["Label"].value_counts()

# Display only the label names, in that frequency order.
result.index

Index(['gratitude', 'disapproval', 'amusement', 'love', 'anger'], dtype='object')