In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Import the library

In [2]:
import numpy as np
import pandas as pd
import gensim
import string
import spacy
np.random.seed(42)
from sklearn import metrics

import warnings
warnings.filterwarnings('ignore')

In [4]:
import os
os.chdir('/content/drive/MyDrive/NLP/Word2Vec')

### Loading the dataset

In [5]:
df = pd.read_csv('data.csv', encoding = 'latin-1')

In [6]:
# first 5 rows
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [7]:
# last 5 rows
df.tail()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
995,fc27bf5d8ed568e0,Take a look at this\n\nWikipedia:Disruptive ed...,0,0,0,0,0,0
996,fc2828355aed9e5e,Your Edits to Albert Einstein,0,0,0,0,0,0
997,fc284cc939fea168,RFC error \nFor some reason the replaced your...,0,0,0,0,0,0
998,fc29b8a68f192b65,"""\n Darwin Rebellion \n\nMy apologies for that...",0,0,0,0,0,0
999,fc2a808948207a7b,Ambrosi\nThank you for experimenting with the ...,0,0,0,0,0,0


In [8]:
# shape of the dataset
df.shape

(1000, 8)

In [9]:
# target column distribution
df['toxic'].value_counts()

0    500
1    500
Name: toxic, dtype: int64

In [10]:
nlp = spacy.load("en_core_web_sm")
stop_words = nlp.Defaults.stop_words
print(stop_words)

{'may', 'under', 'among', 'nor', 'together', 'me', 'as', 'am', 'hereby', 'therefore', 'whereby', 'mine', 'will', 'ourselves', 'first', 'ever', 'though', 'whither', "'s", 'her', 'also', 'by', 'had', 'might', 'three', 'into', 'anything', 'another', 'unless', 'its', 'too', 'quite', 'please', 'whose', 'within', 'something', 'my', 'used', 'in', 'us', 'such', 'what', 'for', 'behind', 'been', 'four', 'beside', 'whole', 'whether', 'anyway', 'it', 'via', 'whatever', 'become', 'both', 'someone', 'became', 'ca', 'has', 'whenever', 'i', 'nine', 'name', 'nevertheless', 'next', 'his', 'two', 'else', 'to', "'re", 'latter', 'yourself', 'top', 'themselves', 'towards', 'n’t', 'these', 'him', 'all', 'between', 'twelve', 'which', 'around', 'can', 'most', 'anywhere', 'part', 'eight', 'not', 'you', 'n‘t', 'hereafter', 'doing', 'onto', 'once', 'because', 'must', 'before', 'through', 'and', 'we', 'above', 'everyone', 'since', 'some', 'herself', 'thereafter', 'whereas', 'over', 'they', 'namely', 'rather', 'lea

In [11]:
punctuations = string.punctuation
print(punctuations)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [12]:
# Creating our tokenizer function

def spacy_tokenizer(sentence):

    # Creating our token object, which is used to create documents with linguistic annotations.
    doc = nlp(sentence)

    # Lemmatizing each token and converting each token into lowercase
    mytokens = [ word.lemma_.lower().strip() for word in doc ]

    # print(mytokens)

    # Removing stop words
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]

    # return preprocessed list of tokens
    return mytokens

In [13]:
sentence = "I am eating apple ?"
spacy_tokenizer(sentence)

['eat', 'apple']

In [15]:
import gensim.downloader as api
print(list(gensim.downloader.info()['models'].keys()))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [16]:
wv = api.load('word2vec-google-news-300')



In [17]:
wv.save('word2vec-google-news-300.kv')

In [18]:
type(wv)

gensim.models.keyedvectors.KeyedVectors

In [19]:
wv['apple']

array([-0.06445312, -0.16015625, -0.01208496,  0.13476562, -0.22949219,
        0.16210938,  0.3046875 , -0.1796875 , -0.12109375,  0.25390625,
       -0.01428223, -0.06396484, -0.08056641, -0.05688477, -0.19628906,
        0.2890625 , -0.05151367,  0.14257812, -0.10498047, -0.04736328,
       -0.34765625,  0.35742188,  0.265625  ,  0.00188446, -0.01586914,
        0.00195312, -0.35546875,  0.22167969,  0.05761719,  0.15917969,
        0.08691406, -0.0267334 , -0.04785156,  0.23925781, -0.05981445,
        0.0378418 ,  0.17382812, -0.41796875,  0.2890625 ,  0.32617188,
        0.02429199, -0.01647949, -0.06494141, -0.08886719,  0.07666016,
       -0.15136719,  0.05249023, -0.04199219, -0.05419922,  0.00108337,
       -0.20117188,  0.12304688,  0.09228516,  0.10449219, -0.00408936,
       -0.04199219,  0.01409912, -0.02111816, -0.13476562, -0.24316406,
        0.16015625, -0.06689453, -0.08984375, -0.07177734, -0.00595093,
       -0.00482178, -0.00089264, -0.30664062, -0.0625    ,  0.07

In [20]:
len(wv['apple'])

300

In [21]:
wv.similarity("apple", "mango")

0.57518554

In [22]:
wv.similarity("apple", "car")

0.12830707

In [23]:
def sent_vec(sent):
    vector_size = wv.vector_size
    wv_res = np.zeros(vector_size)
    # print(wv_res)
    ctr = 1
    for w in sent:
        if w in wv:
            ctr += 1
            wv_res += wv[w]
    wv_res = wv_res/ctr
    return wv_res

In [24]:
sent_vec("I am happy")

array([-0.08447266,  0.10973685, -0.00453404,  0.18362863, -0.05944824,
       -0.04013497, -0.10041155,  0.03710938, -0.02503313,  0.04729353,
       -0.07543836, -0.16434152, -0.1116333 ,  0.02915737, -0.12290737,
        0.10421317, -0.00425502,  0.27535575, -0.01681083, -0.08075387,
       -0.26747349, -0.07281712,  0.08138602, -0.02498954, -0.05358887,
       -0.07969884, -0.30385045,  0.10531616, -0.03756278, -0.04029192,
        0.06455776, -0.02200753, -0.11361694, -0.12702288, -0.19824219,
        0.06921387, -0.14557757,  0.18457031, -0.05542864,  0.10093471,
        0.10728237, -0.07920619, -0.01236398,  0.11485073,  0.09437779,
        0.02020509, -0.08954729, -0.12911551, -0.0363072 ,  0.05212402,
       -0.12200056,  0.23856027, -0.04056222,  0.12512207,  0.05569894,
        0.1237793 , -0.08370536, -0.05489676, -0.0291748 , -0.18401228,
       -0.14592634, -0.04910714, -0.15844727, -0.03636387,  0.00628662,
       -0.30032785, -0.11404855,  0.08530971,  0.01077706,  0.05

In [25]:
df['tokens'] = df['comment_text'].apply(spacy_tokenizer)

In [26]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,tokens
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,"[explanation, edit, username, hardcore, metall..."
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,"[d'aww, match, background, colour, seemingly, ..."
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,"[hey, man, try, edit, war, guy, constantly, re..."
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,"[real, suggestion, improvement, wonder, sectio..."
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,"[sir, hero, chance, remember, page]"


In [27]:
df['vec'] = df['tokens'].apply(sent_vec)

In [28]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,tokens,vec
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,"[explanation, edit, username, hardcore, metall...","[-0.005419776553199405, 0.046755836123511904, ..."
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,"[d'aww, match, background, colour, seemingly, ...","[-0.03402879503038195, -0.042989095052083336, ..."
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,"[hey, man, try, edit, war, guy, constantly, re...","[0.03130035400390625, 0.043255615234375, -0.00..."
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,"[real, suggestion, improvement, wonder, sectio...","[0.008553277878534226, 0.006958734421502976, 0..."
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,"[sir, hero, chance, remember, page]","[0.14371744791666666, 0.051432291666666664, 0...."


In [29]:
# Splitting the data into independent and dependent

x = df['vec'].to_list()
y = df['toxic'].to_list()

In [30]:
x[0]

array([-0.00541978,  0.04675584,  0.00133696,  0.0850801 , -0.06447638,
       -0.00180263, -0.00253596, -0.11783854,  0.12369792,  0.02642146,
       -0.02159773, -0.03835333, -0.02583386,  0.02868071, -0.09855725,
        0.0882859 ,  0.01708984,  0.05018398, -0.01936704, -0.13048444,
        0.074941  ,  0.06057485,  0.08355422,  0.03940982, -0.03506034,
        0.03308033, -0.05092076,  0.09277489,  0.04749698, -0.10054706,
       -0.04815383,  0.01269531, -0.09663028, -0.00693476, -0.02238973,
       -0.0322905 ,  0.08835856,  0.03654298,  0.06050037,  0.1015625 ,
        0.04052734, -0.04001726,  0.16807138, -0.01393345, -0.0494363 ,
       -0.10120501, -0.00805664, -0.02898362, -0.09638904,  0.03300985,
       -0.10766166, -0.01426442, -0.03711337, -0.08377511,  0.00310989,
       -0.01328096, -0.05810547, -0.06438337, -0.023888  , -0.09505208,
       -0.02969215, -0.00195022, -0.02789452, -0.03556315, -0.06668381,
       -0.02664621, -0.0712818 ,  0.02684675, -0.03303019,  0.12

In [31]:
# Splitting the data into train and test

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, stratify = y, random_state = 1)

### Logistic Regression

In [32]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()

In [33]:
model.fit(x_train, y_train)

In [34]:
y_pred = model.predict(x_test)

### Evaluation Metrics

In [35]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [36]:
Accuracy = accuracy_score(y_test, y_pred)
print("Acuracy ", Accuracy)

Acuracy  0.91


In [37]:
Precision = precision_score(y_test, y_pred)
print("Precision ", Precision)

Precision  0.9270833333333334


In [38]:
Recall = recall_score(y_test, y_pred)
print("Recall ", Recall)

Recall  0.89
