In [92]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn import metrics 

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

from collections import Counter
from sklearn.pipeline import Pipeline

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

### NAIVES BAYES - Utillisation de MultinomialNB()

In [93]:
categories = ["not_sexist", "sexist"]
    #               2161           989

In [94]:
data = pd.read_csv("my_csv_clean.csv",sep = ',')
data.columns = ['tweet', 'class']

X = data['tweet']
y = data['class']

X_train, X_test, y_train , y_test = train_test_split(X , y ,test_size=0.3)

###### Tokenisation avec scikit-learrn

In [95]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
X_train_counts.shape

(2203, 8646)

###### Des occurences aux fréquences

In [96]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(2203, 8646)

###### Construction du model 

In [97]:
clf = MultinomialNB().fit(X_train_tfidf, y_train)

###### Tests sur des exemples simples

In [98]:
tweet_test = ['La femme doit être dans la cuisine', "La femme est belle", "Un homme", "Les hommes sont tous les mêmes"]
X_new_counts = count_vect.transform(tweet_test)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

for doc, category in zip(tweet_test, predicted):
     print('%r => %s' % (doc, categories[category]))

'La femme doit être dans la cuisine' => not_sexist
'La femme est belle' => sexist
'Un homme' => not_sexist
'Les hommes sont tous les mêmes' => not_sexist


###### Test sur X_test

In [99]:
X_test_new_counts =  count_vect.transform(X_test)
X_test_new_tfidf = tfidf_transformer.transform(X_test_new_counts)
predicted = clf.predict(X_test_new_tfidf)
print("Matrice de confusion :")
print(metrics.confusion_matrix(y_test, predicted))
print("Accuracy : ", np.mean(predicted == y_test))
print("Balanced accuracy", metrics.balanced_accuracy_score(y_test, predicted))
print(metrics.classification_report(y_test, predicted,target_names=categories))

Matrice de confusion :
[[634   2]
 [279  30]]
Accuracy :  0.7026455026455026
Balanced accuracy 0.5469713622763632
              precision    recall  f1-score   support

  not_sexist       0.69      1.00      0.82       636
      sexist       0.94      0.10      0.18       309

    accuracy                           0.70       945
   macro avg       0.82      0.55      0.50       945
weighted avg       0.77      0.70      0.61       945



###### OVERSAMPLING

In [100]:
ros = RandomOverSampler()
X_ros, y_ros = ros.fit_resample(np.array(X_train).reshape(-1,1), y_train)
print(Counter(y_ros))

Counter({0: 1523, 1: 1523})


In [101]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_ros.ravel())
X_train_counts.shape

(3046, 8646)

In [102]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(3046, 8646)

In [103]:
clf = MultinomialNB().fit(X_train_tfidf, y_ros)
docs_new = ["y'a que les femmes qui pleurent", "C'est un homme."] 
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

for doc, category in zip(docs_new, predicted):
     print('%r => %s' % (doc, categories[category]))
        
text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB()),])
text_clf.fit(X_ros.ravel(), y_ros)


# resampling X, y
X_ros_test, y_ros_test = ros.fit_resample(np.array(X_test).reshape(-1,1), y_test)# new class distribution 
docs_test = X_ros_test.ravel()
predicted = text_clf.predict(docs_test)
print()
print("Accuracy : ", np.mean(predicted == y_ros_test))
print()
print("Matrice de confusion : ")
print(metrics.confusion_matrix(y_ros_test, predicted))
print()
print(metrics.classification_report(y_ros_test, predicted,target_names=categories))

"y'a que les femmes qui pleurent" => sexist
"C'est un homme." => sexist

Accuracy :  0.7421383647798742

Matrice de confusion : 
[[425 211]
 [117 519]]

              precision    recall  f1-score   support

  not_sexist       0.78      0.67      0.72       636
      sexist       0.71      0.82      0.76       636

    accuracy                           0.74      1272
   macro avg       0.75      0.74      0.74      1272
weighted avg       0.75      0.74      0.74      1272



###### Undersampling

In [104]:
# instantiating the random over sampler 
rus = RandomUnderSampler()
# resampling X, y
X_rus, y_rus = rus.fit_resample(np.array(X_train).reshape(-1,1), y_train)# new class distribution 
print(Counter(y_rus))

Counter({0: 680, 1: 680})


In [105]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_rus.ravel())
X_train_counts.shape

(1360, 6426)

In [106]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(1360, 6426)

In [107]:
clf = MultinomialNB().fit(X_train_tfidf, y_rus)
docs_new = ["y'a que les femmes qui pleurent", "C'est un homme."] 
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

for doc, category in zip(docs_new, predicted):
     print('%r => %s' % (doc, categories[category]))
        
text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB()),])
text_clf.fit(X_rus.ravel(), y_rus)


# resampling X, y
X_rus_test, y_rus_test = rus.fit_resample(np.array(X_test).reshape(-1,1), y_test)# new class distribution 
docs_test = X_rus_test.ravel()
predicted = text_clf.predict(docs_test)
print()
print("Accuracy : ", np.mean(predicted == y_rus_test))
print()
print("Matrice de confusion : ")
print(metrics.confusion_matrix(y_rus_test, predicted))
print()
print(metrics.classification_report(y_rus_test, predicted,target_names=categories))


"y'a que les femmes qui pleurent" => sexist
"C'est un homme." => sexist

Accuracy :  0.7411003236245954

Matrice de confusion : 
[[190 119]
 [ 41 268]]

              precision    recall  f1-score   support

  not_sexist       0.82      0.61      0.70       309
      sexist       0.69      0.87      0.77       309

    accuracy                           0.74       618
   macro avg       0.76      0.74      0.74       618
weighted avg       0.76      0.74      0.74       618



###### Oversampling and Undersampling

In [108]:
over = RandomOverSampler(sampling_strategy=0.5)
under = RandomUnderSampler(sampling_strategy=0.8)

In [109]:
X_over, y_over = over.fit_resample(np.array(X_train).reshape(-1,1), y_train)
print(f"Oversampled: {Counter(y_over)}")

Oversampled: Counter({0: 1523, 1: 761})


In [110]:
# now to comine under sampling 
X_ros, y_ros = under.fit_resample(X_over, y_over)
print(f"Combined Random Sampling: {Counter(y_ros)}")

Combined Random Sampling: Counter({0: 951, 1: 761})


In [111]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_ros.ravel())
X_train_counts.shape

(1712, 7165)

In [112]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(1712, 7165)

In [113]:
clf = MultinomialNB().fit(X_train_tfidf, y_ros)
docs_new = ["y'a que les femmes qui pleurent", "C'est un homme."]
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

for doc, category in zip(docs_new, predicted):
     print('%r => %s' % (doc, categories[category]))
        
text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB()),])
text_clf.fit(X_ros.ravel(), y_ros)

X_ros_test, y_ros_test = ros.fit_resample(np.array(X_test).reshape(-1,1), y_test)# new class distribution 
docs_test = X_ros_test.ravel()
predicted = text_clf.predict(docs_test)
print("Accuracy : ", np.mean(predicted == y_ros_test))
metrics.confusion_matrix(y_ros_test, predicted)
print(metrics.classification_report(y_ros_test, predicted,target_names=categories))

"y'a que les femmes qui pleurent" => not_sexist
"C'est un homme." => sexist
Accuracy :  0.7389937106918238
              precision    recall  f1-score   support

  not_sexist       0.72      0.77      0.75       636
      sexist       0.76      0.70      0.73       636

    accuracy                           0.74      1272
   macro avg       0.74      0.74      0.74      1272
weighted avg       0.74      0.74      0.74      1272



### NAIVE BAYES - Approche manuelle

In [114]:
from sklearn.feature_extraction.text import CountVectorizer

data = pd.read_csv("my_csv_clean.csv",sep = ',') #we got that csv after running the Text Preprocessing file
data.columns = ['tweet', 'class']
    

X = data['tweet']
y = data['class'] 

X_train, X_test, y_train , y_test = train_test_split(X , y ,test_size=0.3)


categories = ["NotSexist", "Sexist"]

X_train = X_train.tolist()
X_test = X_test.tolist()

y_train = y_train.tolist()
y_test = y_test.tolist()

X_train_0 = []
X_train_1 = []

for i in range(len(y_train)):
    if y_train[i]==0:
        X_train_0.append(X_train[i])
    else:
        X_train_1.append(X_train[i])

vec_train_0 = CountVectorizer()
X_c0 = vec_train_0.fit_transform(X_train_0)
tdm_0 = pd.DataFrame(X_c0.toarray(), columns=vec_train_0.get_feature_names())


vec_train_1 = CountVectorizer()
X_c1 = vec_train_1.fit_transform(X_train_1)
tdm_1 = pd.DataFrame(X_c1.toarray(), columns=vec_train_1.get_feature_names())

In [115]:
word_list_0 = vec_train_0.get_feature_names();    
count_list_0 = X_c0.toarray().sum(axis=0) 
freq_0 = dict(zip(word_list_0,count_list_0))

word_list_1 = vec_train_1.get_feature_names();    
count_list_1 = X_c1.toarray().sum(axis=0) 
freq_1 = dict(zip(word_list_1,count_list_1))

In [116]:
prob_0 = []
for word,count in zip(word_list_0,count_list_0):
    prob_0.append(count/len(word_list_0))
    
prob_1 = []
for word,count in zip(word_list_1,count_list_1):
    prob_1.append(count/len(word_list_1))

In [117]:
vec_0 = CountVectorizer()
X_vec_0 = vec_0.fit_transform(X_train_0)

total_features0 = len(vec_0.get_feature_names())
total_features0

6533

In [118]:
vec_1 = CountVectorizer()
X_vec_1 = vec_1.fit_transform(X_train_1)
total_features1 = len(vec_1.get_feature_names())
total_features1

proba0 = total_features0 / (total_features0+total_features1)
proba1 = total_features1 / (total_features0+total_features1)

print(proba0)
print(proba1)

0.6194765787976484
0.3805234212023516


In [119]:
total_cnts_features_0 = count_list_0.sum(axis=0)
total_cnts_features_1 = count_list_1.sum(axis=0)

In [120]:
from nltk.tokenize import word_tokenize

def proba_sentence_class0(sentence, total_cnts_features, total_features):
    new_word_list = word_tokenize(sentence)
    prob_s_with_ls = []
    for word in new_word_list:
        if word in freq_0.keys():
            count = freq_0[word]
        else:
            count = 0
        prob_s_with_ls.append((count + 1)/(total_cnts_features + total_features))
    d = dict(zip(new_word_list,prob_s_with_ls))
    res = 1
    for word in sentence.split():
        res = res * d[word]

    res = res * proba0
    return res

def proba_sentence_class1(sentence,total_cnts_features, total_features):
    new_word_list = word_tokenize(sentence)
    prob_s_with_ls = []
    for word in new_word_list:
        if word in freq_1.keys():
            count = freq_1[word]
        else:
            count = 0
        prob_s_with_ls.append((count + 1)/(total_cnts_features + total_features))
    d = dict(zip(new_word_list,prob_s_with_ls))
    res = 1
    for word in sentence.split():
        res = res * d[word]

    res = res * proba1
    return res

In [121]:
predictions = []
for i in X_test:
    proba_class0 = proba_sentence_class0(i,total_cnts_features_0,total_features0)
    proba_class1 = proba_sentence_class1(i,total_cnts_features_1,total_features1)
    if proba_class0 > proba_class1:
        predictions.append(0)
    else:
        predictions.append(1)
        

In [122]:
total_labels = len(y_test)
acc = 0
for i in range(len(y_test)):
    if(y_test[i]==predictions[i]):
        acc = acc + 1
        
acc = acc / total_labels
print("Accuracy :" , acc)
print()
print("Balanced accuracy :" , metrics.balanced_accuracy_score(y_test, predictions))
print()
metrics.confusion_matrix(y_test, predictions)
print(metrics.classification_report(y_test, predictions,target_names=categories))

Accuracy : 0.6264550264550265

Balanced accuracy : 0.7067441860465116

              precision    recall  f1-score   support

   NotSexist       0.93      0.49      0.64       645
      Sexist       0.46      0.93      0.61       300

    accuracy                           0.63       945
   macro avg       0.70      0.71      0.63       945
weighted avg       0.78      0.63      0.63       945



### NAIVE BAYES - Approche manuelle + Oversampling

In [123]:
from sklearn.feature_extraction.text import CountVectorizer

data = pd.read_csv("my_csv_clean.csv",sep = ',')
data.columns = ['tweet', 'class']
    

X = data['tweet']
y = data['class'] 

X_train, X_test, y_train , y_test = train_test_split(X , y ,test_size=0.3)


categories = ["NotSexist", "Sexist"]

X_train = X_train.tolist()
X_test = X_test.tolist()

y_train = y_train.tolist()
y_test = y_test.tolist()

ros = RandomOverSampler()

X_ros_train, y_ros_train = ros.fit_resample(np.array(X_train).reshape(-1,1), y_train)
X_ros_test, y_ros_test = ros.fit_resample(np.array(X_test).reshape(-1,1), y_test)


X_train_0 = []
X_train_1 = []

for i in range(len(y_ros_train)):
    if y_ros_train[i]==0:
        X_train_0.append(X_ros_train[i].tolist()[0])
    else:
        X_train_1.append(X_ros_train[i].tolist()[0])




vec_train_0 = CountVectorizer()
X_c0 = vec_train_0.fit_transform(X_train_0)
tdm_0 = pd.DataFrame(X_c0.toarray(), columns=vec_train_0.get_feature_names())


vec_train_1 = CountVectorizer()
X_c1 = vec_train_1.fit_transform(X_train_1)
tdm_1 = pd.DataFrame(X_c1.toarray(), columns=vec_train_1.get_feature_names())

In [124]:
word_list_0 = vec_train_0.get_feature_names();    
count_list_0 = X_c0.toarray().sum(axis=0) 
freq_0 = dict(zip(word_list_0,count_list_0))

word_list_1 = vec_train_1.get_feature_names();    
count_list_1 = X_c1.toarray().sum(axis=0) 
freq_1 = dict(zip(word_list_1,count_list_1))

In [125]:
prob_0 = []
for word,count in zip(word_list_0,count_list_0):
    prob_0.append(count/len(word_list_0))
    
prob_1 = []
for word,count in zip(word_list_1,count_list_1):
    prob_1.append(count/len(word_list_1))

In [126]:
vec_0 = CountVectorizer()
X_vec_0 = vec_0.fit_transform(X_train_0)

total_features0 = len(vec_0.get_feature_names())
total_features0

6608

In [127]:
vec_1 = CountVectorizer()
X_vec_1 = vec_1.fit_transform(X_train_1)
total_features1 = len(vec_1.get_feature_names())
total_features1

proba0 = total_features0 / (total_features0+total_features1)
proba1 = total_features1 / (total_features0+total_features1)

print(proba0)
print(proba1)

0.6239848914069878
0.3760151085930123


In [128]:
total_cnts_features_0 = count_list_0.sum(axis=0)
total_cnts_features_1 = count_list_1.sum(axis=0)

In [129]:
predictions = []
for i in X_ros_test:
    proba_class0 = proba_sentence_class0(i[0],total_cnts_features_0,total_features0)
    proba_class1 = proba_sentence_class1(i[0],total_cnts_features_1,total_features1)
    if proba_class0 > proba_class1:
        predictions.append(0)
    else:
        predictions.append(1)
        

In [130]:
total_labels = len(y_ros_test)
acc = 0
for i in range(len(y_ros_test)):
    if(y_ros_test[i]==predictions[i]):
        acc = acc + 1
        
acc = acc / total_labels
print("Accuracy :" , acc)
print()
metrics.confusion_matrix(y_ros_test, predictions)
print(metrics.classification_report(y_ros_test, predictions,target_names=categories))

Accuracy : 0.7589147286821706

              precision    recall  f1-score   support

   NotSexist       0.80      0.69      0.74       645
      Sexist       0.73      0.82      0.77       645

    accuracy                           0.76      1290
   macro avg       0.76      0.76      0.76      1290
weighted avg       0.76      0.76      0.76      1290



### NAIVE BAYES - Approche manuelle + Undersampling

In [131]:
from sklearn.feature_extraction.text import CountVectorizer

data = pd.read_csv("my_csv_clean.csv",sep = ',') #we got that csv after running the Text Preprocessing file
data.columns = ['tweet', 'class']
    

X = data['tweet']
y = data['class'] 

X_train, X_test, y_train , y_test = train_test_split(X , y ,test_size=0.3)


categories = ["NotSexist", "Sexist"]

X_train = X_train.tolist()
X_test = X_test.tolist()

y_train = y_train.tolist()
y_test = y_test.tolist()

rus = RandomUnderSampler()

X_rus_train, y_rus_train = rus.fit_resample(np.array(X_train).reshape(-1,1), y_train)# new class distribution 
X_rus_test, y_rus_test = rus.fit_resample(np.array(X_test).reshape(-1,1), y_test)# new class distribution 


X_train_0 = []
X_train_1 = []

for i in range(len(y_rus_train)):
    if y_rus_train[i]==0:
        X_train_0.append(X_rus_train[i].tolist()[0])
    else:
        X_train_1.append(X_rus_train[i].tolist()[0])




vec_train_0 = CountVectorizer()
X_c0 = vec_train_0.fit_transform(X_train_0)
tdm_0 = pd.DataFrame(X_c0.toarray(), columns=vec_train_0.get_feature_names())


vec_train_1 = CountVectorizer()
X_c1 = vec_train_1.fit_transform(X_train_1)
tdm_1 = pd.DataFrame(X_c1.toarray(), columns=vec_train_1.get_feature_names())

In [132]:
word_list_0 = vec_train_0.get_feature_names();    
count_list_0 = X_c0.toarray().sum(axis=0) 
freq_0 = dict(zip(word_list_0,count_list_0))

word_list_1 = vec_train_1.get_feature_names();    
count_list_1 = X_c1.toarray().sum(axis=0) 
freq_1 = dict(zip(word_list_1,count_list_1))

In [133]:
prob_0 = []
for word,count in zip(word_list_0,count_list_0):
    prob_0.append(count/len(word_list_0))
    
prob_1 = []
for word,count in zip(word_list_1,count_list_1):
    prob_1.append(count/len(word_list_1))

In [134]:
vec_0 = CountVectorizer()
X_vec_0 = vec_0.fit_transform(X_train_0)

total_features0 = len(vec_0.get_feature_names())
total_features0

3799

In [135]:
vec_1 = CountVectorizer()
X_vec_1 = vec_1.fit_transform(X_train_1)
total_features1 = len(vec_1.get_feature_names())
total_features1

proba0 = total_features0 / (total_features0+total_features1)
proba1 = total_features1 / (total_features0+total_features1)

print(proba0)
print(proba1)

0.4894357124452461
0.5105642875547539


In [136]:
total_cnts_features_0 = count_list_0.sum(axis=0)
total_cnts_features_1 = count_list_1.sum(axis=0)

In [137]:
predictions = []
for i in X_rus_test:
    proba_class0 = proba_sentence_class0(i[0],total_cnts_features_0,total_features0)
    proba_class1 = proba_sentence_class1(i[0],total_cnts_features_1,total_features1)
    if proba_class0 > proba_class1:
        predictions.append(0)
    else:
        predictions.append(1)
        

In [139]:
total_labels = len(y_rus_test)
acc = 0
for i in range(len(y_rus_test)):
    if(y_rus_test[i]==predictions[i]):
        acc = acc + 1
        
acc = acc / total_labels
print("Accuracy :" , acc)
print()
metrics.confusion_matrix(y_rus_test, predictions)
print(metrics.classification_report(y_rus_test, predictions,target_names=categories))

Accuracy : 0.6889632107023411

              precision    recall  f1-score   support

   NotSexist       0.69      0.70      0.69       299
      Sexist       0.69      0.68      0.69       299

    accuracy                           0.69       598
   macro avg       0.69      0.69      0.69       598
weighted avg       0.69      0.69      0.69       598

