# Word Embedding using Doc2Vec

In [1]:
import pandas as pd
import numpy as np
import csv
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import metrics
from sklearn.model_selection import train_test_split
from collections import Counter
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from nltk.tokenize import word_tokenize
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

#https://www.analyticsvidhya.com/blog/2020/08/top-4-sentence-embedding-techniques-using-python/
    

### 1) Récupérer nos données

In [7]:
data = pd.read_csv("my_csv_clean.csv",sep = ',') #we got that csv after running the Text Preprocessing file
data.columns = ['tweet', 'class']

categories = ["not_sexist", "sexist"]
    #               2161           989

X = data['tweet']
y = data['class'] 

X_train, X_test, y_train , y_test = train_test_split(X , y ,test_size=0.3)

### 2) Tokeniser les phrases et les catégories

In [8]:
X_token = []
y_token = []

for phrase in X_train:
    X_token.append(word_tokenize(phrase))    
    
for categorie in y_train:
    y_token.append(categorie)

### 3) Construire un corpus contenant des tags

In [9]:
tagged_data = [TaggedDocument(d, [i]) for i, d in enumerate(X_token)]

### 4) Entraîner le modèle

In [10]:
modele = Doc2Vec(tagged_data, vector_size = 20, window = 2, min_count = 1, epochs = 100)

### 5) Tester le modèle

In [12]:
def predire_categorie(phrase):
    phrase = phrase.lower()
    test_doc = word_tokenize(phrase)
    test_doc_vecteur = modele.infer_vector(test_doc)
    phrases_similaires = modele.docvecs.most_similar(positive = [test_doc_vecteur]) 
    phrases_similaires_categories = [y_token[phrases_similaires[i][0]] for i in range(len(phrases_similaires))]

    nb_0 = phrases_similaires_categories.count(0)
    nb_1 = phrases_similaires_categories.count(1)
    
    if(nb_0 <= nb_1):
        return 1
    
    else:
        return 0

predictions = []    
for phrase in X_test:
    predictions.append(predire_categorie(phrase))


  phrases_similaires = modele.docvecs.most_similar(positive = [test_doc_vecteur])


In [13]:
print("Accuracy : ", np.mean(predictions == y_test))
metrics.confusion_matrix(y_test, predictions)
print(metrics.classification_report(y_test, predictions,target_names=categories))

Accuracy :  0.6708994708994709
              precision    recall  f1-score   support

  not_sexist       0.75      0.78      0.76       643
      sexist       0.48      0.43      0.46       302

    accuracy                           0.67       945
   macro avg       0.61      0.61      0.61       945
weighted avg       0.66      0.67      0.67       945



### Word2Vec

In [14]:
# Python program to generate word vectors using Word2Vec

# importing all necessary modules
from nltk.tokenize import sent_tokenize, word_tokenize
import warnings

warnings.filterwarnings(action = 'ignore')

import gensim
from gensim.models import Word2Vec

data = pd.read_csv("my_csv_clean.csv",sep = ',') #we got that csv after running the Text Preprocessing file
data.columns = ['tweet', 'class']
    

X = data['tweet']
y = data['class'] 

X_token = []
# iterate through each sentence in the file
for i in X:
    temp = []

    # tokenize the sentence into words
    for j in word_tokenize(i):
        temp.append(j.lower())

    X_token.append(temp)
    
X_train, X_test, y_train , y_test = train_test_split(X_token , y ,test_size=0.3)

# Create CBOW model
model1 = gensim.models.Word2Vec(X_train, min_count = 1, window = 5)
print(model1.wv.most_similar('homme'))

# Print results

#print("Cosine similarity between 'femme' and 'championne' - CBOW : ",model1.wv.similarity('femme', 'championne'))
#print("Cosine similarity between 'femmes' and 'hommes' - CBOW : ",model1.wv.similarity('femmes', 'hommes'))

# Create Skip Gram model
model2 = gensim.models.Word2Vec(X_train, min_count = 1, window = 5, sg = 1)

# Print results
#print("Cosine similarity between 'femme' and 'championne' - Skip Gram : ",model2.wv.similarity('femme', 'championne'))
#print("Cosine similarity between 'femmes' and 'hommes' - Skip Gram : ",model2.wv.similarity('femmes', 'hommes'))

[('de', 0.9993289113044739), ('est', 0.9992777109146118), ('du', 0.9992663264274597), ('qui', 0.9992426037788391), ('cette', 0.9992223978042603), ('sur', 0.9992102980613708), ('comme', 0.9992092251777649), ('dans', 0.999208927154541), ('aux', 0.9991835355758667), ('pour', 0.999182939529419)]


In [15]:
from pyemd import emd
print(model1.wv.wmdistance('femmes',"femme"))
print(model2.wv.wmdistance('La pomme est verte',"La femme est belle"))

0.06300259136543976
0.09673344925526245


### TEST MODEL CBOW

In [16]:
model1 = gensim.models.Word2Vec(X_train, min_count = 1, window = 5)

my_dict = dict({})
for idx, key in enumerate(model1.wv.key_to_index):
    my_dict[key] = model1.wv[key]
    # Or my_dict[key] = model.wv.get_vector(key)
    # Or my_dict[key] = model.wv.word_vec(key, use_norm=False)


In [17]:
embedding_dim = 100
max_length = 16
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"
training_size=160000
test_portion=.1
print(len(my_dict))
# 9696,100

9771


In [20]:
import tensorflow as tf
embeddings_matrix = my_dict
print(len(embeddings_matrix))
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(9696, 100, input_length=max_length, 
                              weights=[embeddings_matrix.vectors], trainable=True),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Conv1D(64, 5, activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=4),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

9771


AttributeError: 'dict' object has no attribute 'vectors'