# This notebook compares the Accuracy and AUC socre on Logistic Regression classifier using two different word embedding methods.

In [1]:
from nltk.tokenize import word_tokenize, sent_tokenize
from gensim.models import Word2Vec
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from IPython.display import display, Markdown, Latex
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [2]:
class word2vec:         
    def transform(self,text):
        l = [[j.lower() for j in word_tokenize(i)] for i in text]
        model = Word2Vec(l,size=300, \
            window=7, \
            min_count=0,\
            workers=1)
        model.train(l, total_examples=len(l), epochs=10)
        updated_vector = []
        for i in l:
            p=0
            for j in i:
                p+=model.wv[j]
            updated_vector.append(p)
        return(updated_vector)
    def transformSpacy(self,text):
        model = spacy.load("en_core_web_md")
      #  text2vec = [model(i).vector for i in text['comment'].fillna(" ").tolist()]
        #dimmension of vector = 300x1
        text2vec = [model(i).vector for i in text]
        return text2vec

In [3]:
#parameters:
#size ->  dimensionality of the word vector
#window -> the window size(maximum distance between the current and predicted word within a sentence)
#min_count -> ignores all words with total frequency lower than this
#workers -> faster training with multicore machine

df_train = pd.read_csv('../Preprocessed Data/cleaned_dataset_train.csv')
df_test = pd.read_csv('../Preprocessed Data//cleaned_dataset_test.csv')
#df = pd.read_csv('../cleaned_dataset.csv')
#print(df_train["comment"].fillna(" ").tolist());


# Word embedding used - Contineous bag of words

In [4]:
df_train = df_train.fillna('')
df_test = df_test.fillna('')
X_train = df_train["comment"]
Y_train = df_train["insult"].tolist()

X_test = df_test["comment"]
Y_test = df_test["insult"].tolist()


In [5]:
vectorizer = CountVectorizer()
vectorizer.fit(X_train)

X_train = vectorizer.transform(X_train)
X_test = vectorizer.transform(X_test)

In [6]:

clf = LogisticRegression(C = 3)
clf.fit(X_train,Y_train)

LogisticRegression(C=3, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [7]:
predictions=clf.predict(X_test)

count = 0
for prediction in range(len(predictions)):
    if predictions[prediction] == Y_test[prediction]:
        count += 1
acc = count/len(predictions)
aoc = metrics.roc_auc_score(Y_test,predictions)
data = "**Accuracy** : " + str(acc) + "<br>" + "**AUC Score** : " + str(aoc) 
display(Markdown(data))

**Accuracy** : 0.8451076690593124<br>**AUC Score** : 0.7698364696829385

# Word embedding used - TF-IDF

In [8]:
df_train = df_train.fillna('')
df_test = df_test.fillna('')
X_train = df_train["comment"]
Y_train = df_train["insult"].tolist()

X_test = df_test["comment"]
Y_test = df_test["insult"].tolist()


In [9]:
vectorizer = TfidfVectorizer()
vectorizer.fit(X_train)

X_train = vectorizer.transform(X_train)
X_test = vectorizer.transform(X_test)

In [10]:

clf = LogisticRegression(C = 3)
clf.fit(X_train,Y_train)

LogisticRegression(C=3, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [11]:
predictions=clf.predict(X_test)

count = 0
for prediction in range(len(predictions)):
    if predictions[prediction] == Y_test[prediction]:
        count += 1
acc = count/len(predictions)
aoc = metrics.roc_auc_score(Y_test,predictions)
data = "**Accuracy** : " + str(acc) + "<br>" + "**AUC Score** : " + str(aoc) 
display(Markdown(data))

**Accuracy** : 0.84246316584813<br>**AUC Score** : 0.7415051967252582

# Word embedding used - Word2vec(Gensim)

In [12]:
df_train = df_train.fillna('')
df_test = df_test.fillna('')
X_train = df_train["comment"]
Y_train = df_train["insult"].tolist()

X_test = df_test["comment"]
Y_test = df_test["insult"].tolist()


In [13]:
#vectorizer = word2vec()
#X_train=vectorizer.transform(X_train)
#X_test=vectorizer.transform(X_test)

In [14]:
print(type(X_train),type(X_test)) 

<class 'pandas.core.series.Series'> <class 'pandas.core.series.Series'>


In [15]:
#n_neighbors = 15
#clf = KNeighborsClassifier(n_neighbors)
#clf.fit(X_train,Y_train);

In [16]:
'''predictions=clf.predict(X_test)

count = 0
for prediction in range(len(predictions)):
    if predictions[prediction] == Y_test[prediction]:
        count += 1
acc = count/len(predictions)
aoc = metrics.roc_auc_score(Y_test,predictions)
data = "**Accuracy** : " + str(acc) + "<br>" + "**AUC Score** : " + str(aoc) 
display(Markdown(data))
''';

# Word embedding used - Word2vec(spacy)

In [17]:
df_train = df_train.fillna('')
df_test = df_test.fillna('')
X_train = df_train["comment"]
Y_train = df_train["insult"].tolist()

X_test = df_test["comment"]
Y_test = df_test["insult"].tolist()


In [None]:
vectorizer = word2vec()
X_train=vectorizer.transformSpacy(X_train)
X_test=vectorizer.transformSpacy(X_test)

In [30]:
print (type(X_train),type(Y_train))

<class 'list'> <class 'list'>


In [31]:

clf = LogisticRegression(C = 3)
clf.fit(X_train,Y_train)

LogisticRegression(C=3, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [32]:
predictions=clf.predict(X_test)

count = 0
for prediction in range(len(predictions)):
    if predictions[prediction] == Y_test[prediction]:
        count += 1
acc = count/len(predictions)
aoc = metrics.roc_auc_score(Y_test,predictions)
data = "**Accuracy** : " + str(acc) + "<br>" + "**AUC Score** : " + str(aoc) 
display(Markdown(data))


**Accuracy** : 0.8356630147336608<br>**AUC Score** : 0.7359680294685412

#### best word embedding results on k-nearest neighbourusing spacy(Word2vec)