# Word Embeddings

## Name: Srinitish Srinivasan
## Reg.No: 21BAI1394

In [1]:
import pandas as pd 
import numpy as np
import re 
import nltk 
import gensim 
from gensim.models import word2vec 
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords,wordnet
from sklearn.manifold import TSNE
from nltk.stem import SnowballStemmer,WordNetLemmatizer
import matplotlib.pyplot as plt 

nltk.download("punkt")
nltk.download("averaged_perceptron_tagger")
nltk.download("wordnet")
nltk.download("stopwords")

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,f1_score,accuracy_score,confusion_matrix,roc_curve,auc,roc_auc_score


[nltk_data] Downloading package punkt to /Users/smudge/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/smudge/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/smudge/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/smudge/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
#Get the dataset
#Spam classification Dataset from UCI Repository

import os 
from dotenv import load_dotenv

load_dotenv('.env')
path=os.getenv("spam_classification")

dataset=pd.read_csv(path,encoding='ISO-8859-1')
dataset.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


## Text Pre-Processing

In [3]:
#Remove punctuations
import re
import string

def preprocess(text):
    text=text.lower()
    text=text.strip()

    text=re.compile('<.&?').sub('',text)
    text=re.compile('[%s]'% re.escape(string.punctuation)).sub(' ', text)
    text = re.sub('\s+', ' ', text)  
    text = re.sub(r'\[[0-9]*\]',' ',text) 
    text=re.sub(r'[^\w\s]', '', str(text).lower().strip())
    text = re.sub(r'\d',' ',text) 
    text = re.sub(r'\s+',' ',text) 

    return text

def stopword(text):
    a=[i for i in text.split() if i not in stopwords.words('english')]

    return ' '.join(a)

def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
    
wl=WordNetLemmatizer()

def lemmatizer(word):
    word_pos_tags=nltk.pos_tag(word_tokenize(word))

    a=[wl.lemmatize(tag[0], get_wordnet_pos(tag[1])) for idx, tag in enumerate(word_pos_tags)]
    return " ".join(a)


def lemmatizer(word):
    word_pos_tags=nltk.pos_tag(word_tokenize(word))

    a=[wl.lemmatize(tag[0], get_wordnet_pos(tag[1])) for idx, tag in enumerate(word_pos_tags)]
    return " ".join(a)



  text = re.sub('\s+', ' ', text)


In [4]:
#Preprocess the text

def total_preprocess(text):
    return lemmatizer(stopword(preprocess(text)))

dataset['v2']=dataset['v2'].apply(lambda x:total_preprocess(x))

dataset.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,go jurong point crazy available bugis n great ...,,,
1,ham,ok lar joking wif u oni,,,
2,spam,free entry wkly comp win fa cup final tkts st ...,,,
3,ham,u dun say early hor u c already say,,,
4,ham,nah think go usf life around though,,,


In [5]:
#Map Target train and test to 0 and 1 from no spam and spam
mapping={
    'spam':1,
    'ham':0
}

dataset['v1']=dataset['v1'].map(mapping)
dataset.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,0,go jurong point crazy available bugis n great ...,,,
1,0,ok lar joking wif u oni,,,
2,1,free entry wkly comp win fa cup final tkts st ...,,,
3,0,u dun say early hor u c already say,,,
4,0,nah think go usf life around though,,,


## Model 1: Word2Vec Model

In [7]:
load_dotenv('.env')
weight_path=os.getenv('word_to_vec_weights')
wv=gensim.models.KeyedVectors.load_word2vec_format(weight_path,binary=True)
wv.init_sims(replace=True)

  wv.init_sims(replace=True)


In [31]:
import logging
def word_averaging(wv, words):
    all_words, mean = set(), []
    
    for word in words:
        if isinstance(word, np.ndarray):
            mean.append(word)
        elif word in wv.key_to_index:
            mean.append(wv.vectors[wv.key_to_index[word]])
            all_words.add(wv.key_to_index[word])

    if not mean:
        return np.zeros(wv.vector_size,)

    mean = gensim.matutils.unitvec(np.array(mean).mean(axis=0)).astype(np.float32)
    return mean

def  word_averaging_list(wv, text_list):
    return np.vstack([word_averaging(wv, post) for post in text_list ])

In [32]:

def w2v_tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text, language='english'):
        for word in nltk.word_tokenize(sent, language='english'):
            if len(word) < 2:
                continue
            tokens.append(word)
    return tokens
    
train, test = train_test_split(dataset, test_size=0.3, random_state = 42)

test_tokenized = test.apply(lambda r: w2v_tokenize_text(r['v2']), axis=1).values
train_tokenized = train.apply(lambda r: w2v_tokenize_text(r['v2']), axis=1).values

X_train_word_average = word_averaging_list(wv,train_tokenized)
X_test_word_average = word_averaging_list(wv,test_tokenized)

In [33]:
X_train_word_average

array([[-0.04694395, -0.00098143, -0.04139368, ..., -0.11545982,
        -0.03328285,  0.01025448],
       [-0.03553115,  0.04626777, -0.00489333, ..., -0.05796808,
         0.0195055 ,  0.02464293],
       [-0.04763703,  0.04311031, -0.0387473 , ..., -0.02275408,
         0.09332368, -0.03774943],
       ...,
       [-0.0418291 ,  0.03640273,  0.00182187, ..., -0.02342421,
        -0.12872402,  0.05463272],
       [-0.06885448,  0.01638936,  0.02868873, ..., -0.08429455,
        -0.06026937, -0.02412523],
       [-0.03394803,  0.06657219, -0.07391234, ..., -0.01207056,
         0.03805465, -0.00542622]])

In [34]:
X_test_word_average

array([[ 0.08171444, -0.01049037, -0.00261955, ..., -0.06282342,
         0.03905517,  0.05368195],
       [ 0.0060008 , -0.06187503,  0.03754118, ..., -0.06349047,
         0.02650882, -0.00918592],
       [ 0.00629334, -0.03687675, -0.0115251 , ..., -0.08536734,
         0.03408094,  0.03699492],
       ...,
       [-0.03060124, -0.0192426 , -0.01480922, ..., -0.02625908,
        -0.03370272,  0.03300186],
       [ 0.12758228,  0.06820801,  0.01670597, ...,  0.02695397,
         0.00995757,  0.05631969],
       [-0.03701865,  0.04309348,  0.06624202, ...,  0.0122826 ,
        -0.0756361 ,  0.01264328]])

In [35]:
#classification
logistic=LogisticRegression()
logistic.fit(X_train_word_average,train['v1'])

In [38]:
pred=logistic.predict(X_test_word_average)
print(f"Accuracy: {accuracy_score(pred,test['v1'])*100} %")

Accuracy: 96.17224880382776 %


## Model 2: Doc2Vec Model

In [51]:
from gensim.models import Doc2Vec
from sklearn import utils 
from gensim.models.doc2vec import TaggedDocument
from gensim.models import doc2vec
from tqdm import tqdm

In [52]:
def label_sentences(corpus, label_type):
    labeled = []
    for i, v in enumerate(corpus):
        label = label_type + '_' + str(i)
        labeled.append(doc2vec.TaggedDocument(v.split(), [label]))
    return labeled

X_train, X_test, y_train, y_test = train_test_split(dataset['v2'], dataset['v1'], random_state=0, test_size=0.3)
X_train = label_sentences(X_train, 'Train')
X_test = label_sentences(X_test, 'Test')
all_data = X_train + X_test

In [53]:
all_data[:3]

[TaggedDocument(words=['mathews', 'tait', 'edward', 'anderson'], tags=['Train_0']),
 TaggedDocument(words=['ok', 'take', 'care', 'umma'], tags=['Train_1']),
 TaggedDocument(words=['make', 'life', 'stressfull', 'always', 'find', 'time', 'laugh', 'may', 'add', 'year', 'life', 'surely', 'add', 'life', 'ur', 'year', 'gud', 'ni', 'swt', 'dream'], tags=['Train_2'])]

In [55]:
model=Doc2Vec(dm=0,vector_size=300,negative=5,min_count=1,alpha=0.050,min_alpha=0.050)
model.build_vocab([x for x in tqdm(all_data)])

for epoch in range(30):
    model.train(utils.shuffle([x for x in tqdm(all_data)]),total_examples=len(all_data),epochs=1)
    model.alpha-=0.002

100%|██████████| 5572/5572 [00:00<00:00, 2001426.90it/s]
100%|██████████| 5572/5572 [00:00<00:00, 9915427.19it/s]
100%|██████████| 5572/5572 [00:00<00:00, 9890250.48it/s]
100%|██████████| 5572/5572 [00:00<00:00, 10451995.48it/s]
100%|██████████| 5572/5572 [00:00<00:00, 10794763.00it/s]
100%|██████████| 5572/5572 [00:00<00:00, 10414733.46it/s]
100%|██████████| 5572/5572 [00:00<00:00, 10720487.10it/s]
100%|██████████| 5572/5572 [00:00<00:00, 9762181.24it/s]
100%|██████████| 5572/5572 [00:00<00:00, 9778519.62it/s]
100%|██████████| 5572/5572 [00:00<00:00, 11008319.31it/s]
100%|██████████| 5572/5572 [00:00<00:00, 10259289.68it/s]
100%|██████████| 5572/5572 [00:00<00:00, 9609647.16it/s]
100%|██████████| 5572/5572 [00:00<00:00, 7330822.42it/s]
100%|██████████| 5572/5572 [00:00<00:00, 10536817.80it/s]
100%|██████████| 5572/5572 [00:00<00:00, 10281857.41it/s]
100%|██████████| 5572/5572 [00:00<00:00, 9844423.71it/s]
100%|██████████| 5572/5572 [00:00<00:00, 9799019.66it/s]
100%|██████████| 5572/5

In [56]:
def get_vectors(model, corpus_size, vectors_size, vectors_type):
    vectors = np.zeros((corpus_size, vectors_size))
    for i in range(0, corpus_size):
        prefix = vectors_type + '_' + str(i)
        vectors[i] = model.docvecs[prefix]
    return vectors
    
train_vectors_dbow = get_vectors(model, len(X_train), 300, 'Train')
test_vectors_dbow = get_vectors(model, len(X_test), 300, 'Test')

  vectors[i] = model.docvecs[prefix]


In [57]:
#Training the model
logistic=LogisticRegression()
logistic.fit(train_vectors_dbow,y_train)

In [59]:
pred=logistic.predict(test_vectors_dbow)
print(f"Accuracy: {accuracy_score(pred,y_test)*100} %")

Accuracy: 98.20574162679426 %


## Accuracy Comparision
|Model|Accuracy|
|----|--------|
|Word2Vec|0.96172|
|Doc2Vec|0.9820|