In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn import metrics 

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

from collections import Counter
from sklearn.pipeline import Pipeline

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

### NAIVES BAYES - Utillisation de MultinomialNB()

In [2]:
categories = ["not_sexist", "sexist"]
    #               2161           989

In [3]:
data = pd.read_csv("my_csv_clean.csv",sep = ',')
data.columns = ['tweet', 'class']

X = data['tweet']
y = data['class']

X_train, X_test, y_train , y_test = train_test_split(X , y ,test_size=0.3)

###### Tokenisation avec scikit-learrn

In [4]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
X_train_counts.shape

(2203, 8637)

###### Des occurences aux fréquences

In [5]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(2203, 8637)

###### Construction du model 

In [6]:
clf = MultinomialNB().fit(X_train_tfidf, y_train)

###### Tests sur des exemples simples

In [7]:
tweet_test = ['La femme doit être dans la cuisine', "La femme est belle", "Un homme", "Les hommes sont tous les mêmes"]
X_new_counts = count_vect.transform(tweet_test)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

for doc, category in zip(tweet_test, predicted):
     print('%r => %s' % (doc, categories[category]))

'La femme doit être dans la cuisine' => not_sexist
'La femme est belle' => sexist
'Un homme' => not_sexist
'Les hommes sont tous les mêmes' => not_sexist


###### Test sur X_test

In [8]:
X_test_new_counts =  count_vect.transform(X_test)
X_test_new_tfidf = tfidf_transformer.transform(X_test_new_counts)
predicted = clf.predict(X_test_new_tfidf)
print("Matrice de confusion :")
print(metrics.confusion_matrix(y_test, predicted))
print("Accuracy : ", np.mean(predicted == y_test))
print("Balanced accuracy", metrics.balanced_accuracy_score(y_test, predicted))
print(metrics.classification_report(y_test, predicted,target_names=categories))

Matrice de confusion :
[[653   3]
 [264  25]]
Accuracy :  0.7174603174603175
Balanced accuracy 0.5409660097898557
              precision    recall  f1-score   support

  not_sexist       0.71      1.00      0.83       656
      sexist       0.89      0.09      0.16       289

    accuracy                           0.72       945
   macro avg       0.80      0.54      0.49       945
weighted avg       0.77      0.72      0.62       945



###### OVERSAMPLING

In [9]:
ros = RandomOverSampler()
X_ros, y_ros = ros.fit_resample(np.array(X_train).reshape(-1,1), y_train)
print(Counter(y_ros))

Counter({1: 1503, 0: 1503})


In [10]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_ros.ravel())
X_train_counts.shape

(3006, 8637)

In [11]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(3006, 8637)

In [12]:
clf = MultinomialNB().fit(X_train_tfidf, y_ros)
docs_new = ["y'a que les femmes qui pleurent", "C'est un homme."] 
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

for doc, category in zip(docs_new, predicted):
     print('%r => %s' % (doc, categories[category]))
        
text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB()),])
text_clf.fit(X_ros.ravel(), y_ros)


# resampling X, y
X_ros_test, y_ros_test = ros.fit_resample(np.array(X_test).reshape(-1,1), y_test)# new class distribution 
docs_test = X_ros_test.ravel()
predicted = text_clf.predict(docs_test)
print()
print("Accuracy : ", np.mean(predicted == y_ros_test))
print()
print("Matrice de confusion : ")
print(metrics.confusion_matrix(y_ros_test, predicted))
print()
print(metrics.classification_report(y_ros_test, predicted,target_names=categories))

"y'a que les femmes qui pleurent" => sexist
"C'est un homme." => sexist

Accuracy :  0.7469512195121951

Matrice de confusion : 
[[456 200]
 [132 524]]

              precision    recall  f1-score   support

  not_sexist       0.78      0.70      0.73       656
      sexist       0.72      0.80      0.76       656

    accuracy                           0.75      1312
   macro avg       0.75      0.75      0.75      1312
weighted avg       0.75      0.75      0.75      1312



###### Undersampling

In [13]:
# instantiating the random over sampler 
rus = RandomUnderSampler()
# resampling X, y
X_rus, y_rus = rus.fit_resample(np.array(X_train).reshape(-1,1), y_train)# new class distribution 
print(Counter(y_rus))

Counter({0: 700, 1: 700})


In [14]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_rus.ravel())
X_train_counts.shape

(1400, 6496)

In [15]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(1400, 6496)

In [16]:
clf = MultinomialNB().fit(X_train_tfidf, y_rus)
docs_new = ["y'a que les femmes qui pleurent", "C'est un homme."] 
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

for doc, category in zip(docs_new, predicted):
     print('%r => %s' % (doc, categories[category]))
        
text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB()),])
text_clf.fit(X_rus.ravel(), y_rus)


# resampling X, y
X_rus_test, y_rus_test = rus.fit_resample(np.array(X_test).reshape(-1,1), y_test)# new class distribution 
docs_test = X_rus_test.ravel()
predicted = text_clf.predict(docs_test)
print()
print("Accuracy : ", np.mean(predicted == y_rus_test))
print()
print("Matrice de confusion : ")
print(metrics.confusion_matrix(y_rus_test, predicted))
print()
print(metrics.classification_report(y_rus_test, predicted,target_names=categories))


"y'a que les femmes qui pleurent" => sexist
"C'est un homme." => sexist

Accuracy :  0.7422145328719724

Matrice de confusion : 
[[182 107]
 [ 42 247]]

              precision    recall  f1-score   support

  not_sexist       0.81      0.63      0.71       289
      sexist       0.70      0.85      0.77       289

    accuracy                           0.74       578
   macro avg       0.76      0.74      0.74       578
weighted avg       0.76      0.74      0.74       578



###### Oversampling and Undersampling

In [17]:
over = RandomOverSampler(sampling_strategy=0.5)
under = RandomUnderSampler(sampling_strategy=0.8)

In [18]:
X_over, y_over = over.fit_resample(np.array(X_train).reshape(-1,1), y_train)
print(f"Oversampled: {Counter(y_over)}")

Oversampled: Counter({0: 1503, 1: 751})


In [19]:
# now to comine under sampling 
X_ros, y_ros = under.fit_resample(X_over, y_over)
print(f"Combined Random Sampling: {Counter(y_ros)}")

Combined Random Sampling: Counter({0: 938, 1: 751})


In [20]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_ros.ravel())
X_train_counts.shape

(1689, 7188)

In [21]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(1689, 7188)

In [22]:
clf = MultinomialNB().fit(X_train_tfidf, y_ros)
docs_new = ["y'a que les femmes qui pleurent", "C'est un homme."]
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

for doc, category in zip(docs_new, predicted):
     print('%r => %s' % (doc, categories[category]))
        
text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB()),])
text_clf.fit(X_ros.ravel(), y_ros)

X_ros_test, y_ros_test = ros.fit_resample(np.array(X_test).reshape(-1,1), y_test)# new class distribution 
docs_test = X_ros_test.ravel()
predicted = text_clf.predict(docs_test)
print("Accuracy : ", np.mean(predicted == y_ros_test))
print("Matrice de confusion : ")
print(metrics.confusion_matrix(y_ros_test, predicted))
print(metrics.classification_report(y_ros_test, predicted,target_names=categories))

"y'a que les femmes qui pleurent" => sexist
"C'est un homme." => sexist
Accuracy :  0.7591463414634146
Matrice de confusion : 
[[520 136]
 [180 476]]
              precision    recall  f1-score   support

  not_sexist       0.74      0.79      0.77       656
      sexist       0.78      0.73      0.75       656

    accuracy                           0.76      1312
   macro avg       0.76      0.76      0.76      1312
weighted avg       0.76      0.76      0.76      1312



### NAIVE BAYES - Approche manuelle

In [23]:
from sklearn.feature_extraction.text import CountVectorizer

data = pd.read_csv("my_csv_clean.csv",sep = ',') #we got that csv after running the Text Preprocessing file
data.columns = ['tweet', 'class']
    

X = data['tweet']
y = data['class'] 

X_train, X_test, y_train , y_test = train_test_split(X , y ,test_size=0.3)


categories = ["NotSexist", "Sexist"]

X_train = X_train.tolist()
X_test = X_test.tolist()

y_train = y_train.tolist()
y_test = y_test.tolist()

X_train_0 = []
X_train_1 = []

for i in range(len(y_train)):
    if y_train[i]==0:
        X_train_0.append(X_train[i])
    else:
        X_train_1.append(X_train[i])

vec_train_0 = CountVectorizer()
X_c0 = vec_train_0.fit_transform(X_train_0)
tdm_0 = pd.DataFrame(X_c0.toarray(), columns=vec_train_0.get_feature_names())


vec_train_1 = CountVectorizer()
X_c1 = vec_train_1.fit_transform(X_train_1)
tdm_1 = pd.DataFrame(X_c1.toarray(), columns=vec_train_1.get_feature_names())



In [24]:
word_list_0 = vec_train_0.get_feature_names();    
count_list_0 = X_c0.toarray().sum(axis=0) 
freq_0 = dict(zip(word_list_0,count_list_0))

word_list_1 = vec_train_1.get_feature_names();    
count_list_1 = X_c1.toarray().sum(axis=0) 
freq_1 = dict(zip(word_list_1,count_list_1))

In [25]:
prob_0 = []
for word,count in zip(word_list_0,count_list_0):
    prob_0.append(count/len(word_list_0))
    
prob_1 = []
for word,count in zip(word_list_1,count_list_1):
    prob_1.append(count/len(word_list_1))

In [26]:
vec_0 = CountVectorizer()
X_vec_0 = vec_0.fit_transform(X_train_0)

total_features0 = len(vec_0.get_feature_names())
total_features0

6521

In [27]:
vec_1 = CountVectorizer()
X_vec_1 = vec_1.fit_transform(X_train_1)
total_features1 = len(vec_1.get_feature_names())
total_features1

proba0 = total_features0 / (total_features0+total_features1)
proba1 = total_features1 / (total_features0+total_features1)

print(proba0)
print(proba1)

0.6200437387087573
0.37995626129124277


In [28]:
total_cnts_features_0 = count_list_0.sum(axis=0)
total_cnts_features_1 = count_list_1.sum(axis=0)

In [29]:
from nltk.tokenize import word_tokenize

def proba_sentence_class0(sentence, total_cnts_features, total_features):
    new_word_list = word_tokenize(sentence)
    prob_s_with_ls = []
    for word in new_word_list:
        if word in freq_0.keys():
            count = freq_0[word]
        else:
            count = 0
        prob_s_with_ls.append((count + 1)/(total_cnts_features + total_features))
    d = dict(zip(new_word_list,prob_s_with_ls))
    res = 1
    for word in sentence.split():
        res = res * d[word]

    res = res * proba0
    return res

def proba_sentence_class1(sentence,total_cnts_features, total_features):
    new_word_list = word_tokenize(sentence)
    prob_s_with_ls = []
    for word in new_word_list:
        if word in freq_1.keys():
            count = freq_1[word]
        else:
            count = 0
        prob_s_with_ls.append((count + 1)/(total_cnts_features + total_features))
    d = dict(zip(new_word_list,prob_s_with_ls))
    res = 1
    for word in sentence.split():
        res = res * d[word]

    res = res * proba1
    return res

In [30]:
predictions = []
for i in X_test:
    proba_class0 = proba_sentence_class0(i,total_cnts_features_0,total_features0)
    proba_class1 = proba_sentence_class1(i,total_cnts_features_1,total_features1)
    if proba_class0 > proba_class1:
        predictions.append(0)
    else:
        predictions.append(1)
        

In [31]:
total_labels = len(y_test)
acc = 0
for i in range(len(y_test)):
    if(y_test[i]==predictions[i]):
        acc = acc + 1
        
acc = acc / total_labels
print("Accuracy :" , acc)
print()
print("Balanced accuracy :" , metrics.balanced_accuracy_score(y_test, predictions))
print()
print("Matrice de confusion :")
print(metrics.confusion_matrix(y_test, predictions))
print(metrics.classification_report(y_test, predictions,target_names=categories))

Accuracy : 0.6402116402116402

Balanced accuracy : 0.7217465753424658

Matrice de confusion :
[[337 320]
 [ 20 268]]
              precision    recall  f1-score   support

   NotSexist       0.94      0.51      0.66       657
      Sexist       0.46      0.93      0.61       288

    accuracy                           0.64       945
   macro avg       0.70      0.72      0.64       945
weighted avg       0.80      0.64      0.65       945



### NAIVE BAYES - Approche manuelle + Oversampling

In [32]:
from sklearn.feature_extraction.text import CountVectorizer

data = pd.read_csv("my_csv_clean.csv",sep = ',')
data.columns = ['tweet', 'class']
    

X = data['tweet']
y = data['class'] 

X_train, X_test, y_train , y_test = train_test_split(X , y ,test_size=0.3)


categories = ["NotSexist", "Sexist"]

X_train = X_train.tolist()
X_test = X_test.tolist()

y_train = y_train.tolist()
y_test = y_test.tolist()

ros = RandomOverSampler()

X_ros_train, y_ros_train = ros.fit_resample(np.array(X_train).reshape(-1,1), y_train)
X_ros_test, y_ros_test = ros.fit_resample(np.array(X_test).reshape(-1,1), y_test)


X_train_0 = []
X_train_1 = []

for i in range(len(y_ros_train)):
    if y_ros_train[i]==0:
        X_train_0.append(X_ros_train[i].tolist()[0])
    else:
        X_train_1.append(X_ros_train[i].tolist()[0])




vec_train_0 = CountVectorizer()
X_c0 = vec_train_0.fit_transform(X_train_0)
tdm_0 = pd.DataFrame(X_c0.toarray(), columns=vec_train_0.get_feature_names())


vec_train_1 = CountVectorizer()
X_c1 = vec_train_1.fit_transform(X_train_1)
tdm_1 = pd.DataFrame(X_c1.toarray(), columns=vec_train_1.get_feature_names())



In [33]:
word_list_0 = vec_train_0.get_feature_names();    
count_list_0 = X_c0.toarray().sum(axis=0) 
freq_0 = dict(zip(word_list_0,count_list_0))

word_list_1 = vec_train_1.get_feature_names();    
count_list_1 = X_c1.toarray().sum(axis=0) 
freq_1 = dict(zip(word_list_1,count_list_1))

In [34]:
prob_0 = []
for word,count in zip(word_list_0,count_list_0):
    prob_0.append(count/len(word_list_0))
    
prob_1 = []
for word,count in zip(word_list_1,count_list_1):
    prob_1.append(count/len(word_list_1))

In [35]:
vec_0 = CountVectorizer()
X_vec_0 = vec_0.fit_transform(X_train_0)

total_features0 = len(vec_0.get_feature_names())
total_features0

6606

In [36]:
vec_1 = CountVectorizer()
X_vec_1 = vec_1.fit_transform(X_train_1)
total_features1 = len(vec_1.get_feature_names())
total_features1

proba0 = total_features0 / (total_features0+total_features1)
proba1 = total_features1 / (total_features0+total_features1)

print(proba0)
print(proba1)

0.6222096637468212
0.37779033625317887


In [37]:
total_cnts_features_0 = count_list_0.sum(axis=0)
total_cnts_features_1 = count_list_1.sum(axis=0)

In [38]:
predictions = []
for i in X_ros_test:
    proba_class0 = proba_sentence_class0(i[0],total_cnts_features_0,total_features0)
    proba_class1 = proba_sentence_class1(i[0],total_cnts_features_1,total_features1)
    if proba_class0 > proba_class1:
        predictions.append(0)
    else:
        predictions.append(1)
        

In [39]:
total_labels = len(y_ros_test)
acc = 0
for i in range(len(y_ros_test)):
    if(y_ros_test[i]==predictions[i]):
        acc = acc + 1
        
acc = acc / total_labels
print("Accuracy :" , acc)
print()
print("Matrice de confusion :")
print(metrics.confusion_matrix(y_ros_test, predictions))
print(metrics.classification_report(y_ros_test, predictions,target_names=categories))

Accuracy : 0.7697674418604651

Matrice de confusion :
[[445 200]
 [ 97 548]]
              precision    recall  f1-score   support

   NotSexist       0.82      0.69      0.75       645
      Sexist       0.73      0.85      0.79       645

    accuracy                           0.77      1290
   macro avg       0.78      0.77      0.77      1290
weighted avg       0.78      0.77      0.77      1290



### NAIVE BAYES - Approche manuelle + Undersampling

In [40]:
from sklearn.feature_extraction.text import CountVectorizer

data = pd.read_csv("my_csv_clean.csv",sep = ',') #we got that csv after running the Text Preprocessing file
data.columns = ['tweet', 'class']
    

X = data['tweet']
y = data['class'] 

X_train, X_test, y_train , y_test = train_test_split(X , y ,test_size=0.3)


categories = ["NotSexist", "Sexist"]

X_train = X_train.tolist()
X_test = X_test.tolist()

y_train = y_train.tolist()
y_test = y_test.tolist()

rus = RandomUnderSampler()

X_rus_train, y_rus_train = rus.fit_resample(np.array(X_train).reshape(-1,1), y_train)# new class distribution 
X_rus_test, y_rus_test = rus.fit_resample(np.array(X_test).reshape(-1,1), y_test)# new class distribution 


X_train_0 = []
X_train_1 = []

for i in range(len(y_rus_train)):
    if y_rus_train[i]==0:
        X_train_0.append(X_rus_train[i].tolist()[0])
    else:
        X_train_1.append(X_rus_train[i].tolist()[0])




vec_train_0 = CountVectorizer()
X_c0 = vec_train_0.fit_transform(X_train_0)
tdm_0 = pd.DataFrame(X_c0.toarray(), columns=vec_train_0.get_feature_names())


vec_train_1 = CountVectorizer()
X_c1 = vec_train_1.fit_transform(X_train_1)
tdm_1 = pd.DataFrame(X_c1.toarray(), columns=vec_train_1.get_feature_names())



In [41]:
word_list_0 = vec_train_0.get_feature_names();    
count_list_0 = X_c0.toarray().sum(axis=0) 
freq_0 = dict(zip(word_list_0,count_list_0))

word_list_1 = vec_train_1.get_feature_names();    
count_list_1 = X_c1.toarray().sum(axis=0) 
freq_1 = dict(zip(word_list_1,count_list_1))

In [42]:
prob_0 = []
for word,count in zip(word_list_0,count_list_0):
    prob_0.append(count/len(word_list_0))
    
prob_1 = []
for word,count in zip(word_list_1,count_list_1):
    prob_1.append(count/len(word_list_1))

In [43]:
vec_0 = CountVectorizer()
X_vec_0 = vec_0.fit_transform(X_train_0)

total_features0 = len(vec_0.get_feature_names())
total_features0

3862

In [44]:
vec_1 = CountVectorizer()
X_vec_1 = vec_1.fit_transform(X_train_1)
total_features1 = len(vec_1.get_feature_names())
total_features1

proba0 = total_features0 / (total_features0+total_features1)
proba1 = total_features1 / (total_features0+total_features1)

print(proba0)
print(proba1)

0.49360940695296524
0.5063905930470347


In [45]:
total_cnts_features_0 = count_list_0.sum(axis=0)
total_cnts_features_1 = count_list_1.sum(axis=0)

In [46]:
predictions = []
for i in X_rus_test:
    proba_class0 = proba_sentence_class0(i[0],total_cnts_features_0,total_features0)
    proba_class1 = proba_sentence_class1(i[0],total_cnts_features_1,total_features1)
    if proba_class0 > proba_class1:
        predictions.append(0)
    else:
        predictions.append(1)
        

In [47]:
total_labels = len(y_rus_test)
acc = 0
for i in range(len(y_rus_test)):
    if(y_rus_test[i]==predictions[i]):
        acc = acc + 1
        
acc = acc / total_labels
print("Accuracy :" , acc)
print()
print("Matrice de confusion :")
print(metrics.confusion_matrix(y_rus_test, predictions))
print(metrics.classification_report(y_rus_test, predictions,target_names=categories))

Accuracy : 0.7298657718120806

Matrice de confusion :
[[202  96]
 [ 65 233]]
              precision    recall  f1-score   support

   NotSexist       0.76      0.68      0.72       298
      Sexist       0.71      0.78      0.74       298

    accuracy                           0.73       596
   macro avg       0.73      0.73      0.73       596
weighted avg       0.73      0.73      0.73       596

