# Word Embedding using Doc2Vec and Word2Vec

In [2]:
import pandas as pd
import numpy as np
import csv
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import metrics
from sklearn.model_selection import train_test_split
from collections import Counter
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from nltk.tokenize import word_tokenize
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

#https://www.analyticsvidhya.com/blog/2020/08/top-4-sentence-embedding-techniques-using-python/
    

### Doc2Vec

### 1) Récupérer nos données

In [7]:
data = pd.read_csv("my_csv_clean.csv",sep = ',') #we got that csv after running the Text Preprocessing file
data.columns = ['tweet', 'class']

categories = ["not_sexist", "sexist"]
    #               2161           989

X = data['tweet']
y = data['class'] 

X_train, X_test, y_train , y_test = train_test_split(X , y ,test_size=0.3)

### 2) Tokeniser les phrases et les catégories

In [8]:
X_token = []
y_token = []

for phrase in X_train:
    X_token.append(word_tokenize(phrase))    
    
for categorie in y_train:
    y_token.append(categorie)

### 3) Construire un corpus contenant des tags

In [9]:
tagged_data = [TaggedDocument(d, [i]) for i, d in enumerate(X_token)]

### 4) Entraîner le modèle

In [10]:
modele = Doc2Vec(tagged_data, vector_size = 20, window = 2, min_count = 1, epochs = 100)

### 5) Tester le modèle

In [12]:
def predire_categorie(phrase):
    phrase = phrase.lower()
    test_doc = word_tokenize(phrase)
    test_doc_vecteur = modele.infer_vector(test_doc)
    phrases_similaires = modele.docvecs.most_similar(positive = [test_doc_vecteur]) 
    phrases_similaires_categories = [y_token[phrases_similaires[i][0]] for i in range(len(phrases_similaires))]

    nb_0 = phrases_similaires_categories.count(0)
    nb_1 = phrases_similaires_categories.count(1)
    
    if(nb_0 <= nb_1):
        return 1
    
    else:
        return 0

predictions = []    
for phrase in X_test:
    predictions.append(predire_categorie(phrase))


  phrases_similaires = modele.docvecs.most_similar(positive = [test_doc_vecteur])


In [13]:
print("Accuracy : ", np.mean(predictions == y_test))
metrics.confusion_matrix(y_test, predictions)
print(metrics.classification_report(y_test, predictions,target_names=categories))

Accuracy :  0.6708994708994709
              precision    recall  f1-score   support

  not_sexist       0.75      0.78      0.76       643
      sexist       0.48      0.43      0.46       302

    accuracy                           0.67       945
   macro avg       0.61      0.61      0.61       945
weighted avg       0.66      0.67      0.67       945



### Word2Vec - CBOW 

In [69]:
# Python program to generate word vectors using Word2Vec

# importing all necessary modules
from nltk.tokenize import sent_tokenize, word_tokenize
import warnings

warnings.filterwarnings(action = 'ignore')

import gensim
from gensim.models import Word2Vec

data = pd.read_csv("my_csv_clean.csv",sep = ',') #we got that csv after running the Text Preprocessing file
data.columns = ['tweet', 'class']


categories = ["not_sexist", "sexist"]
    #               2161           989

X = data['tweet']
y = data['class'] 

X_token = []
# iterate through each sentence in the file
for i in X:
    temp = []

    # tokenize the sentence into words
    for j in word_tokenize(i):
        temp.append(j.lower())

    X_token.append(temp)
    
X_train, X_test, y_train , y_test = train_test_split(X_token , y ,test_size=0.3)
# Create CBOW model
model_CBOW = gensim.models.Word2Vec(X_train, min_count = 1, window = 5)

#print("Cosine similarity between 'femme' and 'championne' - CBOW : ",model_CBOW.wv.similarity('femme', 'championne'))
#print("Cosine similarity between 'femmes' and 'hommes' - CBOW : ",model_CBOW.wv.similarity('femmes', 'hommes'))

'''
from pyemd import emd
print(model_CBOW.wv.wmdistance('femmes',"femme"))
print(model_CBOW.wv.wmdistance('La pomme est verte',"La femme est belle"))

print(model_CBOW.wv.most_similar_cosmul(positive=['femme', 'championne'], negative=['homme']))
'''

In [70]:
def get_list_similar_words_tweet(tweet):
    similar_words = []
    for i in range(len(tweet)):
        sw = model_CBOW.wv.most_similar(i)
        for j in sw:
            similar_words.append(j[0])
                
    return list(set(similar_words))
        
    

def get_category_tweet_from_a_word(word):
    list_labels = []
    y_train_list = [i for i in y_train]
    for i in range(len(X_train)):
        if word in X_train[i]:
            list_labels.append(y_train_list[i])
    
    nb0 = list_labels.count(0)
    nb1 = list_labels.count(1)
    if nb0 <= nb1:
        return 1
    else:
        return 0
    
def predict(tweet):
    similar_words = get_list_similar_words_tweet(tweet)
    category_words = [get_category_tweet_from_a_word(i) for i in tweet]
    
    nb0 = category_words.count(0)
    nb1 = category_words.count(1)
    
    if nb0 <= nb1:
        return 1
    else:
        return 0
    

        
predictions = []
for i in X_test:
    predictions.append(predict(i))
    
print(predictions)

[1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 

In [71]:
y_test_list = [i for i in y_test]
acc = 0
for i in range(len(y_test_list)):
    if y_test_list[i] == predictions[i]:
        acc = acc + 1
        
print("Accuracy : ", acc/len(y_test_list))

print("\nMatrice de confusion :  \n" , metrics.confusion_matrix(y_test_list, predictions))

print(metrics.classification_report(y_test_list, predictions,target_names=categories))


Accuracy :  0.6952380952380952

Matrice de confusion :  
 [[537 129]
 [159 120]]
              precision    recall  f1-score   support

  not_sexist       0.77      0.81      0.79       666
      sexist       0.48      0.43      0.45       279

    accuracy                           0.70       945
   macro avg       0.63      0.62      0.62       945
weighted avg       0.69      0.70      0.69       945



### Word2Vec - SKIP GRAM

In [75]:
# Python program to generate word vectors using Word2Vec

# importing all necessary modules
from nltk.tokenize import sent_tokenize, word_tokenize
import warnings

warnings.filterwarnings(action = 'ignore')

import gensim
from gensim.models import Word2Vec

data = pd.read_csv("my_csv_clean.csv",sep = ',') #we got that csv after running the Text Preprocessing file
data.columns = ['tweet', 'class']


categories = ["not_sexist", "sexist"]
    #               2161           989

X = data['tweet']
y = data['class'] 

X_token = []
# iterate through each sentence in the file
for i in X:
    temp = []

    # tokenize the sentence into words
    for j in word_tokenize(i):
        temp.append(j.lower())

    X_token.append(temp)
    
X_train, X_test, y_train , y_test = train_test_split(X_token , y ,test_size=0.3)
# Create SKIP GRAM model
model_skip_gram = gensim.models.Word2Vec(X_train, min_count = 1, window = 5, sg = 1)

#print("Cosine similarity between 'femme' and 'championne' - Skip Gram : ",model_skip_gram.wv.similarity('femme', 'championne'))
#print("Cosine similarity between 'femmes' and 'hommes' - Skip Gram : ",model_skip_gram.wv.similarity('femmes', 'hommes'))


In [76]:
def get_list_similar_words_tweet(tweet):
    similar_words = []
    for i in range(len(tweet)):
        sw = model_skip_gram.wv.most_similar(i)
        for j in sw:
            similar_words.append(j[0])
                
    return list(set(similar_words))
        
    

def get_category_tweet_from_a_word(word):
    list_labels = []
    y_train_list = [i for i in y_train]
    for i in range(len(X_train)):
        if word in X_train[i]:
            list_labels.append(y_train_list[i])
    
    nb0 = list_labels.count(0)
    nb1 = list_labels.count(1)
    if nb0 <= nb1:
        return 1
    else:
        return 0
    
def predict(tweet):
    similar_words = get_list_similar_words_tweet(tweet)
    category_words = [get_category_tweet_from_a_word(i) for i in tweet]
    
    nb0 = category_words.count(0)
    nb1 = category_words.count(1)
    
    if nb0 <= nb1:
        return 1
    else:
        return 0
    

        
predictions = []
for i in X_test:
    predictions.append(predict(i))
    
print(predictions)

[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [77]:
y_test_list = [i for i in y_test]
acc = 0
for i in range(len(y_test_list)):
    if y_test_list[i] == predictions[i]:
        acc = acc + 1
        
print("Accuracy : ", acc/len(y_test_list))

print("\nMatrice de confusion :  \n" , metrics.confusion_matrix(y_test_list, predictions))

print(metrics.classification_report(y_test_list, predictions,target_names=categories))

Accuracy :  0.6825396825396826

Matrice de confusion :  
 [[535 118]
 [182 110]]
              precision    recall  f1-score   support

  not_sexist       0.75      0.82      0.78       653
      sexist       0.48      0.38      0.42       292

    accuracy                           0.68       945
   macro avg       0.61      0.60      0.60       945
weighted avg       0.66      0.68      0.67       945



### TENTATIVE DE COMBINAISON DE WORD2VEC AVEC KERAS (ne marche pas)

In [79]:
model = gensim.models.Word2Vec(X_train, min_count = 1, window = 5)

my_dict = dict({})
for idx, key in enumerate(model.wv.key_to_index):
    my_dict[key] = model.wv[key]
    # Or my_dict[key] = model.wv.get_vector(key)
    # Or my_dict[key] = model.wv.word_vec(key, use_norm=False)


In [81]:
embedding_dim = 100
max_length = 16
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"
training_size=160000
test_portion=.1

In [82]:
import tensorflow as tf
embeddings_matrix = my_dict
print(len(embeddings_matrix))
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(9696, 100, input_length=max_length, 
                              weights=[embeddings_matrix], trainable=True),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Conv1D(64, 5, activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=4),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

8720


ValueError: Layer embedding weight shape (9696, 100) is not compatible with provided weight shape ().