### TUTORIAL

In [73]:
import pandas as pd
import numpy as np
import csv
from sklearn.utils import Bunch
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split


from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

from gensim.models import Word2Vec
import nltk
from gensim.models import KeyedVectors

from nltk.cluster import KMeansClusterer
from nltk.tokenize import word_tokenize
#nltk.download('punkt')
import numpy as np 

from sklearn import cluster
from sklearn import metrics

from tensorflow.keras.preprocessing.text import one_hot
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

import csv
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [74]:
    categories = ["not_sexist", "sexist"]
    #               2161           989

In [75]:
data = pd.read_csv("my_csv.csv",sep = ',')
data.columns = ['tweet', 'class']

X = data['tweet']
y = data['class']

X_train, X_test, y_train , y_test = train_test_split(X , y ,test_size=0.3)

### Tokenizing text with scikit-learn

In [76]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
X_train_counts.shape

(2204, 11289)

In [77]:
count_vect.vocabulary_.get(u'femme')

4021

### From occurrences to frequencies

In [78]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(2204, 11289)

### Training a classifier

In [79]:
clf = MultinomialNB().fit(X_train_tfidf, y_train)

In [80]:
docs_new = ["Les hommes c'est tous les mêmes", 'Il est étudiant.'] #marche pas du tout
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

for doc, category in zip(docs_new, predicted):
     print('%r => %s' % (doc, categories[category]))

"Les hommes c'est tous les mêmes" => not_sexist
'Il est étudiant.' => not_sexist


### Building a pipeline

In [81]:
text_clf = Pipeline([
...     ('vect', CountVectorizer()),
...     ('tfidf', TfidfTransformer()),
...     ('clf', MultinomialNB()),
... ])
text_clf.fit(X_train, y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB())])

### Evaluation of the performance on the test set

In [82]:
docs_test = X_test
predicted = text_clf.predict(docs_test)
np.mean(predicted == y_test)

0.6857142857142857

### SVM

In [83]:
text_clf = Pipeline([
...     ('vect', CountVectorizer()),
...     ('tfidf', TfidfTransformer()),
...     ('clf', SGDClassifier(loss='hinge', penalty='l2',
...                           alpha=1e-3, random_state=42,
...                           max_iter=5, tol=None)),
... ])
text_clf.fit(X_train, y_train)
predicted = text_clf.predict(docs_test)
np.mean(predicted == y_test)

0.7470899470899471

In [84]:
print(metrics.classification_report(y_test, predicted,
...     target_names=categories))

              precision    recall  f1-score   support

  not_sexist       0.75      0.92      0.83       634
      sexist       0.71      0.39      0.50       311

    accuracy                           0.75       945
   macro avg       0.73      0.66      0.67       945
weighted avg       0.74      0.75      0.72       945



In [85]:
metrics.confusion_matrix(y_test, predicted)

array([[586,  48],
       [191, 120]])

### Parameter tuning using grid search

In [86]:
parameters = {
...     'vect__ngram_range': [(1, 1), (1, 2)],
...     'tfidf__use_idf': (True, False),
...     'clf__alpha': (1e-2, 1e-3),
... }

In [87]:
gs_clf = GridSearchCV(text_clf, parameters, cv=5, n_jobs=-1)

In [88]:
gs_clf = gs_clf.fit(X_train[:100], y_train[:100])
#ameliorer vecteur et algo et representation
#-> plus de poids sur les mots sexistes

In [89]:
example_1 = 'La femme'
example_2 = "L'homme"
example_3 = "La femme doit être dans la cuisine"

print(example_1 + "  => " + str(categories[gs_clf.predict([example_1])[0]]))
print(example_2 + "  => " + str(categories[gs_clf.predict([example_2])[0]]))
print(example_3 + "  => " + str(categories[gs_clf.predict([example_3])[0]]))

La femme  => not_sexist
L'homme  => not_sexist
La femme doit être dans la cuisine  => not_sexist


In [90]:
gs_clf.best_score_

0.72

In [91]:
for param_name in sorted(parameters.keys()):
     print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

clf__alpha: 0.01
tfidf__use_idf: False
vect__ngram_range: (1, 2)


### Oversampling

In [92]:
# instantiating the random over sampler 
ros = RandomOverSampler()
# resampling X, y
X_ros, y_ros = ros.fit_resample(np.array(X_train).reshape(-1,1), y_train)# new class distribution 
print(Counter(y_ros))

Counter({0: 1526, 1: 1526})


In [93]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_ros.ravel())
X_train_counts.shape

(3052, 11289)

In [94]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(3052, 11289)

In [95]:
clf = MultinomialNB().fit(X_train_tfidf, y_ros)
docs_new = ["y'a que les femmes qui pleurent", "C'est un homme."] 
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

for doc, category in zip(docs_new, predicted):
     print('%r => %s' % (doc, categories[category]))
        
text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB()),])
text_clf.fit(X_ros.ravel(), y_ros)


# resampling X, y
X_ros_test, y_ros_test = ros.fit_resample(np.array(X_test).reshape(-1,1), y_test)# new class distribution 
docs_test = X_ros_test.ravel()
predicted = text_clf.predict(docs_test)
np.mean(predicted == y_ros_test)
print("Accuracy : ", np.mean(predicted == y_ros_test))
metrics.confusion_matrix(y_ros_test, predicted)

"y'a que les femmes qui pleurent" => sexist
"C'est un homme." => sexist
Accuracy :  0.7413249211356467


array([[405, 229],
       [ 99, 535]])

### Undersampling

In [96]:
# instantiating the random over sampler 
ros = RandomUnderSampler()
# resampling X, y
X_ros, y_ros = ros.fit_resample(np.array(X_train).reshape(-1,1), y_train)# new class distribution 
print(Counter(y_ros))

Counter({0: 678, 1: 678})


In [97]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_ros.ravel())
X_train_counts.shape

(1356, 8098)

In [98]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(1356, 8098)

In [99]:
clf = MultinomialNB().fit(X_train_tfidf, y_ros)
docs_new = ["y'a que les femmes qui pleurent", "C'est un homme."] 
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

for doc, category in zip(docs_new, predicted):
     print('%r => %s' % (doc, categories[category]))
        
text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB()),])
text_clf.fit(X_ros.ravel(), y_ros)


# resampling X, y
X_ros_test, y_ros_test = ros.fit_resample(np.array(X_test).reshape(-1,1), y_test)# new class distribution 
docs_test = X_ros_test.ravel()
predicted = text_clf.predict(docs_test)
print("Accuracy : ", np.mean(predicted == y_ros_test))
metrics.confusion_matrix(y_ros_test, predicted)


"y'a que les femmes qui pleurent" => sexist
"C'est un homme." => sexist
Accuracy :  0.7282958199356914


array([[176, 135],
       [ 34, 277]])

### Oversampling and Undersampling

In [100]:
over = RandomOverSampler(sampling_strategy=0.5)
under = RandomUnderSampler(sampling_strategy=0.8)

In [101]:
X_over, y_over = over.fit_resample(np.array(X_train).reshape(-1,1), y_train)
print(f"Oversampled: {Counter(y_over)}")

Oversampled: Counter({0: 1526, 1: 763})


In [102]:
# now to comine under sampling 
X_ros, y_ros = under.fit_resample(X_over, y_over)
print(f"Combined Random Sampling: {Counter(y_ros)}")

Combined Random Sampling: Counter({0: 953, 1: 763})


In [103]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_ros.ravel())
X_train_counts.shape

(1716, 9158)

In [104]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(1716, 9158)

In [105]:
clf = MultinomialNB().fit(X_train_tfidf, y_ros)
docs_new = ["y'a que les femmes qui pleurent", "C'est un homme."]
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

for doc, category in zip(docs_new, predicted):
     print('%r => %s' % (doc, categories[category]))
        
text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB()),])
text_clf.fit(X_ros.ravel(), y_ros)


# resampling X, y
X_ros_test, y_ros_test = ros.fit_resample(np.array(X_test).reshape(-1,1), y_test)# new class distribution 
docs_test = X_ros_test.ravel()
predicted = text_clf.predict(docs_test)
print("Accuracy : ", np.mean(predicted == y_ros_test))
metrics.confusion_matrix(y_ros_test, predicted)


"y'a que les femmes qui pleurent" => sexist
"C'est un homme." => sexist
Accuracy :  0.7202572347266881


array([[240,  71],
       [103, 208]])

### Naive Bayes classifier

In [145]:
from sklearn.feature_extraction.text import CountVectorizer

data = pd.read_csv("my_csv_clean.csv",sep = ',') #we got that csv after running the Text Preprocessing file
data.columns = ['tweet', 'class']
    

X = data['tweet']
y = data['class'] 


categories = ["NotSexist", "Sexist"]

X_0 = []
X_1 = []
for i in range(len(data)):
    if data['class'][i]==0:
        X_0.append(data['tweet'][i])
        
for i in range(len(data)):
    if data['class'][i]==1:
        X_1.append(data['tweet'][i])



vec_0 = CountVectorizer()
X_c0 = vec_0.fit_transform(X_0)
tdm_0 = pd.DataFrame(X_c0.toarray(), columns=vec_0.get_feature_names())


vec_1 = CountVectorizer()
X_c1 = vec_1.fit_transform(X_1)
tdm_1 = pd.DataFrame(X_c1.toarray(), columns=vec_1.get_feature_names())

tdm_1


Unnamed: 0,02,04,06,10,100,1000,102,12,120,12000,...,yqb,yves,zaimerais,zamzonite,zenash,zeniss,zetais,zlatan,zohrabitan,zoom
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
984,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
985,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
986,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
987,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [146]:
word_list_0 = vec_0.get_feature_names();    
count_list_0 = X_c0.toarray().sum(axis=0) 
freq_0 = dict(zip(word_list_0,count_list_0))
freq_0

{'000': 4,
 '10': 7,
 '100': 10,
 '1000': 1,
 '102': 2,
 '10h': 2,
 '10h00': 1,
 '10h30': 1,
 '11': 4,
 '1100': 1,
 '11000': 1,
 '117': 1,
 '12': 8,
 '1200': 1,
 '12000': 1,
 '120battementsparminute': 1,
 '1215': 1,
 '12h': 1,
 '12joursdaction': 1,
 '13': 7,
 '13h50': 1,
 '14': 9,
 '14eme': 1,
 '14h': 1,
 '14h00': 1,
 '14h15': 2,
 '15': 11,
 '157garlamd': 1,
 '15min': 1,
 '15x': 1,
 '16': 4,
 '169': 1,
 '16days': 2,
 '16h': 3,
 '16jours': 3,
 '17': 7,
 '1700': 1,
 '1703': 1,
 '1730': 1,
 '18': 9,
 '180': 1,
 '18006675005': 1,
 '18006686868': 1,
 '1825': 1,
 '18711955': 1,
 '18h': 2,
 '18h00': 1,
 '18h30': 1,
 '18h55': 1,
 '19': 6,
 '1900': 1,
 '1904': 1,
 '1905': 1,
 '1920': 2,
 '1927': 2,
 '1929': 2,
 '195': 2,
 '1954': 1,
 '1962': 1,
 '1965': 1,
 '1970': 1,
 '1979': 1,
 '1986': 1,
 '1987': 1,
 '1992': 1,
 '1999': 1,
 '19e': 1,
 '19h': 1,
 '19h30': 1,
 '19hruthelkrief': 2,
 '1a0maiscpasperdu': 1,
 '1dexmab': 1,
 '1dexmag': 1,
 '1e': 1,
 '1er': 10,
 '1ere': 17,
 '1iran': 1,
 '1les': 1,

In [147]:
word_list_1 = vec_1.get_feature_names();    
count_list_1 = X_c1.toarray().sum(axis=0) 
freq_1 = dict(zip(word_list_1,count_list_1))
freq_1

{'02': 3,
 '04': 3,
 '06': 1,
 '10': 6,
 '100': 2,
 '1000': 1,
 '102': 1,
 '12': 5,
 '120': 2,
 '12000': 1,
 '12f': 1,
 '13': 4,
 '14': 5,
 '14amp15': 1,
 '14ans': 1,
 '15': 2,
 '150eur': 1,
 '15ans': 1,
 '16': 2,
 '16ans': 1,
 '16days': 2,
 '16jours': 3,
 '17': 1,
 '1964': 1,
 '1990': 1,
 '1992': 2,
 '1997': 2,
 '1998': 1,
 '1999': 1,
 '1cm': 1,
 '1dexmag': 1,
 '1er': 1,
 '1ere': 3,
 '20': 7,
 '2004': 1,
 '2005': 1,
 '2006': 1,
 '2011': 1,
 '20122017': 1,
 '2014': 1,
 '2015': 2,
 '2016': 2,
 '2017': 9,
 '2018': 15,
 '20ans': 1,
 '20hfrance2': 1,
 '20minutes': 2,
 '21': 2,
 '21eme': 2,
 '22': 4,
 '23': 1,
 '24': 1,
 '2499': 1,
 '24hpujadas': 1,
 '25': 2,
 '25janvier': 2,
 '26': 1,
 '26caroll': 1,
 '27': 2,
 '28': 1,
 '2choses': 1,
 '2h': 1,
 '30': 4,
 '31': 1,
 '32': 1,
 '36': 1,
 '3600x': 1,
 '364': 1,
 '3784e': 1,
 '38': 2,
 '3eme': 1,
 '3min': 1,
 '40': 1,
 '40t': 1,
 '44': 1,
 '44t': 1,
 '4h': 1,
 '4meur': 1,
 '50': 5,
 '500': 1,
 '5000': 1,
 '50100500': 1,
 '50euros': 1,
 '50nuanc

In [148]:
prob_0 = []
for word,count in zip(word_list_0,count_list_0):
    prob_0.append(count/len(word_list_0))
dict(zip(word_list_0,prob_0))


{'000': 0.00043677658877484165,
 '10': 0.0007643590303559729,
 '100': 0.0010919414719371041,
 '1000': 0.00010919414719371041,
 '102': 0.00021838829438742082,
 '10h': 0.00021838829438742082,
 '10h00': 0.00010919414719371041,
 '10h30': 0.00010919414719371041,
 '11': 0.00043677658877484165,
 '1100': 0.00010919414719371041,
 '11000': 0.00010919414719371041,
 '117': 0.00010919414719371041,
 '12': 0.0008735531775496833,
 '1200': 0.00010919414719371041,
 '12000': 0.00010919414719371041,
 '120battementsparminute': 0.00010919414719371041,
 '1215': 0.00010919414719371041,
 '12h': 0.00010919414719371041,
 '12joursdaction': 0.00010919414719371041,
 '13': 0.0007643590303559729,
 '13h50': 0.00010919414719371041,
 '14': 0.0009827473247433938,
 '14eme': 0.00010919414719371041,
 '14h': 0.00010919414719371041,
 '14h00': 0.00010919414719371041,
 '14h15': 0.00021838829438742082,
 '15': 0.0012011356191308146,
 '157garlamd': 0.00010919414719371041,
 '15min': 0.00010919414719371041,
 '15x': 0.000109194147193

In [149]:
prob_1 = []
for word,count in zip(word_list_1,count_list_1):
    prob_1.append(count/len(word_list_1))
dict(zip(word_list_1,prob_1))


{'02': 0.0005337128624799857,
 '04': 0.0005337128624799857,
 '06': 0.0001779042874933286,
 '10': 0.0010674257249599715,
 '100': 0.0003558085749866572,
 '1000': 0.0001779042874933286,
 '102': 0.0001779042874933286,
 '12': 0.000889521437466643,
 '120': 0.0003558085749866572,
 '12000': 0.0001779042874933286,
 '12f': 0.0001779042874933286,
 '13': 0.0007116171499733144,
 '14': 0.000889521437466643,
 '14amp15': 0.0001779042874933286,
 '14ans': 0.0001779042874933286,
 '15': 0.0003558085749866572,
 '150eur': 0.0001779042874933286,
 '15ans': 0.0001779042874933286,
 '16': 0.0003558085749866572,
 '16ans': 0.0001779042874933286,
 '16days': 0.0003558085749866572,
 '16jours': 0.0005337128624799857,
 '17': 0.0001779042874933286,
 '1964': 0.0001779042874933286,
 '1990': 0.0001779042874933286,
 '1992': 0.0003558085749866572,
 '1997': 0.0003558085749866572,
 '1998': 0.0001779042874933286,
 '1999': 0.0001779042874933286,
 '1cm': 0.0001779042874933286,
 '1dexmag': 0.0001779042874933286,
 '1er': 0.00017790

In [150]:
from sklearn.feature_extraction.text import CountVectorizer

X_train, X_test, y_train , y_test = train_test_split(X , y ,test_size=0.3)
X_train_0 = []
X_train_1 = []
print(X_train)
print(X_train.index)

index =  X_train.index
for i in index:
    if (y_train[i]==0):
        X_train_0.append(X_train[i])
    else:
        X_train_1.append(X_train[i])       

1609    dans armeedelair  pas inegalites de salaires e...
503              meilleure  aime plus tout balancetonporc
2295    nappellerai pas ca journaliste est tellement s...
783     etmaintenant joignezvous mouvement signez amp ...
1435    fille adoptive de woodyallen detaille agressio...
                              ...                        
732     solidaritefemme membre du collectif ensembleco...
527     angela merkel vient de feliciter hongrois vikt...
777     + affligeant dans laveritesurlesfilles est c c...
2421    campagne plus ridicule du mondeest vrai stress...
79      marianorajoy tweete espagnol theresamay tweete...
Name: tweet, Length: 2203, dtype: object
Int64Index([1609,  503, 2295,  783, 1435, 1466, 1958,  271, 1704, 1950,
            ...
            2848, 1734, 1954,  393, 1243,  732,  527,  777, 2421,   79],
           dtype='int64', length=2203)


In [151]:
vec_0 = CountVectorizer()
X_vec_0 = vec_0.fit_transform(X_train_0)

total_features0 = len(vec_0.get_feature_names())
total_features0


7266

In [152]:
vec_1 = CountVectorizer()
X_vec_1 = vec_1.fit_transform(X_train_1)

total_features1 = len(vec_1.get_feature_names())
total_features1

proba0 = total_features0 / (total_features0+total_features1)
proba1 = total_features1 / (total_features0+total_features1)

print(proba0)
print(proba1)

0.6232095376962004
0.37679046230379964


In [153]:
total_cnts_features_0 = count_list_0.sum(axis=0)
total_cnts_features_1 = count_list_1.sum(axis=0)

In [154]:
from nltk.tokenize import word_tokenize
new_sentence = 'what is the price of the book'
new_word_list = word_tokenize(new_sentence)

In [155]:
def proba_sentence_class0(sentence):
    new_word_list = word_tokenize(sentence)
    prob_s_with_ls = []
    for word in new_word_list:
        if word in freq_0.keys():
            count = freq_0[word]
        else:
            count = 0
        prob_s_with_ls.append((count + 1)/(total_cnts_features_0 + total_features0))
    d = dict(zip(new_word_list,prob_s_with_ls))
    res = 1
    for word in sentence.split():
        res = res * d[word]

    res = res * proba0
    return res


In [156]:
def proba_sentence_class1(sentence):
    new_word_list = word_tokenize(sentence)
    prob_s_with_ls = []
    for word in new_word_list:
        if word in freq_1.keys():
            count = freq_1[word]
        else:
            count = 0
        prob_s_with_ls.append((count + 1)/(total_cnts_features_1 + total_features1))
    d = dict(zip(new_word_list,prob_s_with_ls))
    res = 1
    for word in sentence.split():
        res = res * d[word]

    res = res * proba1
    return res

print(proba_sentence_class1("what is the price of the book"))

2.075591881789903e-31


In [157]:
predictions = []
for i in X_test:
    print(i)
    proba_class0 = proba_sentence_class0(i)
    proba_class1 = proba_sentence_class1(i)
    if proba_class0 > proba_class1:
        predictions.append(0)
    else:
        predictions.append(1)
        
print(predictions)

balancetonporc  responsabilite collective  by sophiebarel
hallucinantcatherine deneuve signataires mais pour certains est tribune de catherine deneuvemalhonnetete ou paresse intellectuelle  fatigue
article  mansplaining  de wikipediafr ete cree mars 2013 avant equivalent anglais sexisme
voir sur couverture de magazines pseudo plaidoyer pour quon donne enfin  parole aux hommes  pauvres femmes sont si mechantes  wtf consternant heureusement lire intervention de robert redford qui vous rasserene  metoo balancetonporc
bref sexisme est vraiment cool
 joxe il capable du meilleur comme du pire mais est dans pire quil meilleur  metoo balancetonporc
miss france est pas balancetonporc est balance cv  jade feret
 appel de 1 000 jeunes filles  contre 100 merci pour cette tribune feministe  metoo
laprovence justice passe sans sexisme djihad warcrime amp crime contre humanity oumanite apartheid de humanite ps  partie pour rejoindre neant dont elle naurait jamais du sortir causeur tvlofficiel libe
su

rt orsys pourquoi jecrirai lettre chaque propos sexiste entendu dans cadre de travail  sexisme parite feminisme entreprise carriere o
balancetonporc frero passe me voir voisin interpelle lui disant retourne chez toi arabe entre autre frero voulu lui expliqe vi sodomisant il e parti courant bh dormire moin con sachant q 1 elevage de porc cote dchez moi
maintenant est femme  bigre  ===gt gerald darmanin revele avoir ete accuse de viol par homme via valeurs
acimed revient sur traitement mediatique de balancetonporc premier article sur biais un certain nombre de medias commentateurs
quand fais de merde ne vais pas applaudir exemple meilleure amie de 20 ans enceinte ayant ni travail ni projets suis incapable applaudir ou etre heureux face ca
16jours fait environ 120 millions de femmes agees de moins de 20 ans ont deja ete victimes un viol ou actes sexuels forces source unicef moiaussi unicefsa unfpasa unwomenafrica unsouthafrica ilesttemps orangerlemonde
aujourhui berlin emmanuelmacron ange

prefet72 marleneschiappa jcboulard lemenerdom cmorancais damienpichereau fpersonne slefoll karamanli72 nadinegrelet ljncg72 comment parler egalite entre homme femme lorsque femme base de propres ideesfortune quelle ne beneficie de strictement rien
cet article du monde diplomatique sur sexisme tres convainquant promis jour ce journal autorisera meme femmes ecrire pour lui
jsavais pas ct possible etre connasse ce point mais meufs nont pas de limite
brutofficiel mais bien sur  egalite salariale femmehomme dans code du travail vigueur depuis de nombreuses annees france google ami monsieur journaliste
le recours aux ordonnances pour reforme du code du travail na pas empeche debat il me parait legitime gouvernement saisisse un sujet tel reforme de sncf debat aura bien lieu parlement spinetta franceinfo lesinformes
justice va devoir pencher sur raisons qui ont conduit renvoi du proces de tron balancetonporc prend pas sur droit 
grandparisexpress deputes lareman groupemodem recus 8h ce matin p

In [158]:
total_labels = len(y_test)
acc = 0
for i in range(len(y_test)):
    if(y_test[y_test.index[i]]==predictions[i]):
        acc = acc + 1
        
acc = acc / total_labels
print("Accuracy :" , acc)
        
metrics.confusion_matrix(y_test, predictions)

Accuracy : 0.8465608465608465


array([[512, 141],
       [  4, 288]])

### NAIVES BAYES + OVERSAMPLING

In [219]:
data = pd.read_csv("my_csv_clean.csv",sep = ',') #we got that csv after running the Text Preprocessing file
data.columns = ['tweet', 'class']
    

X = data['tweet']
y = data['class'] 


categories = ["NotSexist", "Sexist"]

X_0 = []
X_1 = []
for i in range(len(data)):
    if data['class'][i]==0:
        X_0.append(data['tweet'][i])
        
for i in range(len(data)):
    if data['class'][i]==1:
        X_1.append(data['tweet'][i])

vec_0 = CountVectorizer()
X_c0 = vec_0.fit_transform(X_0)
tdm_0 = pd.DataFrame(X_c0.toarray(), columns=vec_0.get_feature_names())


vec_1 = CountVectorizer()
X_c1 = vec_1.fit_transform(X_1)
tdm_1 = pd.DataFrame(X_c1.toarray(), columns=vec_1.get_feature_names())

tdm_1

Unnamed: 0,02,04,06,10,100,1000,102,12,120,12000,...,yqb,yves,zaimerais,zamzonite,zenash,zeniss,zetais,zlatan,zohrabitan,zoom
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
984,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
985,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
986,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
987,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [220]:
word_list_0 = vec_0.get_feature_names();    
count_list_0 = X_c0.toarray().sum(axis=0) 
freq_0 = dict(zip(word_list_0,count_list_0))
freq_0

{'000': 4,
 '10': 7,
 '100': 10,
 '1000': 1,
 '102': 2,
 '10h': 2,
 '10h00': 1,
 '10h30': 1,
 '11': 4,
 '1100': 1,
 '11000': 1,
 '117': 1,
 '12': 8,
 '1200': 1,
 '12000': 1,
 '120battementsparminute': 1,
 '1215': 1,
 '12h': 1,
 '12joursdaction': 1,
 '13': 7,
 '13h50': 1,
 '14': 9,
 '14eme': 1,
 '14h': 1,
 '14h00': 1,
 '14h15': 2,
 '15': 11,
 '157garlamd': 1,
 '15min': 1,
 '15x': 1,
 '16': 4,
 '169': 1,
 '16days': 2,
 '16h': 3,
 '16jours': 3,
 '17': 7,
 '1700': 1,
 '1703': 1,
 '1730': 1,
 '18': 9,
 '180': 1,
 '18006675005': 1,
 '18006686868': 1,
 '1825': 1,
 '18711955': 1,
 '18h': 2,
 '18h00': 1,
 '18h30': 1,
 '18h55': 1,
 '19': 6,
 '1900': 1,
 '1904': 1,
 '1905': 1,
 '1920': 2,
 '1927': 2,
 '1929': 2,
 '195': 2,
 '1954': 1,
 '1962': 1,
 '1965': 1,
 '1970': 1,
 '1979': 1,
 '1986': 1,
 '1987': 1,
 '1992': 1,
 '1999': 1,
 '19e': 1,
 '19h': 1,
 '19h30': 1,
 '19hruthelkrief': 2,
 '1a0maiscpasperdu': 1,
 '1dexmab': 1,
 '1dexmag': 1,
 '1e': 1,
 '1er': 10,
 '1ere': 17,
 '1iran': 1,
 '1les': 1,

In [221]:
word_list_1 = vec_1.get_feature_names();    
count_list_1 = X_c1.toarray().sum(axis=0) 
freq_1 = dict(zip(word_list_1,count_list_1))
freq_1

{'02': 3,
 '04': 3,
 '06': 1,
 '10': 6,
 '100': 2,
 '1000': 1,
 '102': 1,
 '12': 5,
 '120': 2,
 '12000': 1,
 '12f': 1,
 '13': 4,
 '14': 5,
 '14amp15': 1,
 '14ans': 1,
 '15': 2,
 '150eur': 1,
 '15ans': 1,
 '16': 2,
 '16ans': 1,
 '16days': 2,
 '16jours': 3,
 '17': 1,
 '1964': 1,
 '1990': 1,
 '1992': 2,
 '1997': 2,
 '1998': 1,
 '1999': 1,
 '1cm': 1,
 '1dexmag': 1,
 '1er': 1,
 '1ere': 3,
 '20': 7,
 '2004': 1,
 '2005': 1,
 '2006': 1,
 '2011': 1,
 '20122017': 1,
 '2014': 1,
 '2015': 2,
 '2016': 2,
 '2017': 9,
 '2018': 15,
 '20ans': 1,
 '20hfrance2': 1,
 '20minutes': 2,
 '21': 2,
 '21eme': 2,
 '22': 4,
 '23': 1,
 '24': 1,
 '2499': 1,
 '24hpujadas': 1,
 '25': 2,
 '25janvier': 2,
 '26': 1,
 '26caroll': 1,
 '27': 2,
 '28': 1,
 '2choses': 1,
 '2h': 1,
 '30': 4,
 '31': 1,
 '32': 1,
 '36': 1,
 '3600x': 1,
 '364': 1,
 '3784e': 1,
 '38': 2,
 '3eme': 1,
 '3min': 1,
 '40': 1,
 '40t': 1,
 '44': 1,
 '44t': 1,
 '4h': 1,
 '4meur': 1,
 '50': 5,
 '500': 1,
 '5000': 1,
 '50100500': 1,
 '50euros': 1,
 '50nuanc

In [222]:
prob_0 = []
for word,count in zip(word_list_0,count_list_0):
    prob_0.append(count/len(word_list_0))
dict(zip(word_list_0,prob_0))


{'000': 0.00043677658877484165,
 '10': 0.0007643590303559729,
 '100': 0.0010919414719371041,
 '1000': 0.00010919414719371041,
 '102': 0.00021838829438742082,
 '10h': 0.00021838829438742082,
 '10h00': 0.00010919414719371041,
 '10h30': 0.00010919414719371041,
 '11': 0.00043677658877484165,
 '1100': 0.00010919414719371041,
 '11000': 0.00010919414719371041,
 '117': 0.00010919414719371041,
 '12': 0.0008735531775496833,
 '1200': 0.00010919414719371041,
 '12000': 0.00010919414719371041,
 '120battementsparminute': 0.00010919414719371041,
 '1215': 0.00010919414719371041,
 '12h': 0.00010919414719371041,
 '12joursdaction': 0.00010919414719371041,
 '13': 0.0007643590303559729,
 '13h50': 0.00010919414719371041,
 '14': 0.0009827473247433938,
 '14eme': 0.00010919414719371041,
 '14h': 0.00010919414719371041,
 '14h00': 0.00010919414719371041,
 '14h15': 0.00021838829438742082,
 '15': 0.0012011356191308146,
 '157garlamd': 0.00010919414719371041,
 '15min': 0.00010919414719371041,
 '15x': 0.000109194147193

In [223]:
prob_1 = []
for word,count in zip(word_list_1,count_list_1):
    prob_1.append(count/len(word_list_1))
dict(zip(word_list_1,prob_1))


{'02': 0.0005337128624799857,
 '04': 0.0005337128624799857,
 '06': 0.0001779042874933286,
 '10': 0.0010674257249599715,
 '100': 0.0003558085749866572,
 '1000': 0.0001779042874933286,
 '102': 0.0001779042874933286,
 '12': 0.000889521437466643,
 '120': 0.0003558085749866572,
 '12000': 0.0001779042874933286,
 '12f': 0.0001779042874933286,
 '13': 0.0007116171499733144,
 '14': 0.000889521437466643,
 '14amp15': 0.0001779042874933286,
 '14ans': 0.0001779042874933286,
 '15': 0.0003558085749866572,
 '150eur': 0.0001779042874933286,
 '15ans': 0.0001779042874933286,
 '16': 0.0003558085749866572,
 '16ans': 0.0001779042874933286,
 '16days': 0.0003558085749866572,
 '16jours': 0.0005337128624799857,
 '17': 0.0001779042874933286,
 '1964': 0.0001779042874933286,
 '1990': 0.0001779042874933286,
 '1992': 0.0003558085749866572,
 '1997': 0.0003558085749866572,
 '1998': 0.0001779042874933286,
 '1999': 0.0001779042874933286,
 '1cm': 0.0001779042874933286,
 '1dexmag': 0.0001779042874933286,
 '1er': 0.00017790

In [250]:
from sklearn.feature_extraction.text import CountVectorizer

X_train, X_test, y_train , y_test = train_test_split(X , y ,test_size=0.3)

# instantiating the random over sampler 
ros = RandomOverSampler()
# resampling X, y
X_ros_train, y_ros_train = ros.fit_resample(np.array(X_train).reshape(-1,1), y_train)# new class distribution 
X_ros_test, y_ros_test = ros.fit_resample(np.array(X_test).reshape(-1,1), y_test)# new class distribution 

X_train_0 = []
X_train_1 = []

for i in range(len(X_ros_train)):
    if (y_ros_train[i]==0):
        X_train_0.append(X_ros_train[i].tolist()[0])
    else:
        X_train_1.append(X_ros_train[i].tolist()[0])

In [252]:
vec_0 = CountVectorizer()
X_vec_0 = vec_0.fit_transform(X_train_0)

total_features0 = len(vec_0.get_feature_names())
total_features0


7175

In [253]:
vec_1 = CountVectorizer()
X_vec_1 = vec_1.fit_transform(X_train_1)

total_features1 = len(vec_1.get_feature_names())
total_features1

proba0 = total_features0 / (total_features0+total_features1)
proba1 = total_features1 / (total_features0+total_features1)

print(proba0)
print(proba1)

0.6156156156156156
0.3843843843843844


In [254]:
total_cnts_features_0 = count_list_0.sum(axis=0)
total_cnts_features_1 = count_list_1.sum(axis=0)

In [255]:
from nltk.tokenize import word_tokenize
new_sentence = 'what is the price of the book'
new_word_list = word_tokenize(new_sentence)

In [256]:
def proba_sentence_class0(sentence):
    new_word_list = word_tokenize(sentence)
    prob_s_with_ls = []
    for word in new_word_list:
        if word in freq_0.keys():
            count = freq_0[word]
        else:
            count = 0
        prob_s_with_ls.append((count + 1)/(total_cnts_features_0 + total_features0))
    d = dict(zip(new_word_list,prob_s_with_ls))
    res = 1
    for word in sentence.split():
        res = res * d[word]

    res = res * proba0
    return res


In [257]:
def proba_sentence_class1(sentence):
    new_word_list = word_tokenize(sentence)
    prob_s_with_ls = []
    for word in new_word_list:
        if word in freq_1.keys():
            count = freq_1[word]
        else:
            count = 0
        prob_s_with_ls.append((count + 1)/(total_cnts_features_1 + total_features1))
    d = dict(zip(new_word_list,prob_s_with_ls))
    res = 1
    for word in sentence.split():
        res = res * d[word]

    res = res * proba1
    return res

print(proba_sentence_class1("what is the price of the book"))

2.0626937950927426e-31


In [282]:
predictions = []
for i in X_ros_test:
    print(i)
    proba_class0 = proba_sentence_class0(i[0])
    proba_class1 = proba_sentence_class1(i[0])
    if proba_class0 > proba_class1:
        predictions.append(0)
    else:
        predictions.append(1)
        
print(predictions)

['balancetonporc  sandra muller attaquee diffamation']
['apres avoir parler violences faites aux femmes tf1 toute logique zoom sur fesses miss est juste incroyable  balancetonporc missfrance2018']
['balancetonporc  responsabilite collective  by sophiebarel']
['mehdi ils ont truc plus cm']
['bonne premiere mitemps deuxiemes on na domine beau match amelmajri7 bouhaddisarah els9france wrenard aller bleues']
['lisez notre texte  etmaintenant  solidaires derriere moiaussi tlmep']
['cedricrs francoisfillon fillon est tue tout seulcostumesemployer femme enfantcheque du senatest argent qui a tue']
['carla mais carla putin mais quest ce quelle conne cette pauvre fille perdu de societe  lma lmea w9']
['vp2 | apres metoo balancetonporc ou eston  gtegalitefh journeedesdroitsdesfemmes 8mars lt3']
['waterpolo len europa cup oosterhout j1 bleues inclinent 195 face aux paysbas rdv demain face espagne allezlesbleues']
['annedesevrant loreal lorealparisfr toujours probleme qd shampoo lance ds bain de po

['cc paulineboyer33 personnalite de annee et autres annees aussi balancetonporc']
['fin d39une semaine eprouvante durant laquelle segolene royal demontre capacite de depassement de courage de femme d39etat wow']
['ocombey photosdeheyce peu importe situation mais eglise devoir de preparer jeunes soeurs par billets veuves homme peut aider femme mais mariage pour femmes matures pretes cuisine particulier domaine femmes']
['jeanmichel apathie considere robe aurore berge non adaptee parole politique il convient donc desormais de faire distinction entre ce qui bien a gauche ce qui nest pas bien a droite bienvenue dans monde merveilleux neoprogressistes ']
['pere il a dit aujourhui truc 3600x lui fais oui papa sais merci il me chuchote dans oreille  va te faire foutre grosse connasse  eh mercee papa']
['leasalame  christiane taubira vous sentezvous visee quand emmanuel macron parle de lien abime avec eglise  chtaubira ']
['enceinte sur marche du travail   est handicap notoire   bienvenue 2018

In [283]:
total_labels = len(y_ros_test)
acc = 0
for i in range(len(y_ros_test)):
    if(y_ros_test[i]==predictions[i]):
        acc = acc + 1
        
acc = acc / total_labels
print("Accuracy :" , acc)
        
metrics.confusion_matrix(y_ros_test, predictions)

Accuracy : 0.8823529411764706


array([[512, 151],
       [  5, 658]])

### NAIVES BAYES + UNDERSAMPLING

In [261]:
data = pd.read_csv("my_csv_clean.csv",sep = ',') #we got that csv after running the Text Preprocessing file
data.columns = ['tweet', 'class']
    

X = data['tweet']
y = data['class'] 


categories = ["NotSexist", "Sexist"]

X_0 = []
X_1 = []
for i in range(len(data)):
    if data['class'][i]==0:
        X_0.append(data['tweet'][i])
        
for i in range(len(data)):
    if data['class'][i]==1:
        X_1.append(data['tweet'][i])

vec_0 = CountVectorizer()
X_c0 = vec_0.fit_transform(X_0)
tdm_0 = pd.DataFrame(X_c0.toarray(), columns=vec_0.get_feature_names())


vec_1 = CountVectorizer()
X_c1 = vec_1.fit_transform(X_1)
tdm_1 = pd.DataFrame(X_c1.toarray(), columns=vec_1.get_feature_names())

tdm_1

Unnamed: 0,02,04,06,10,100,1000,102,12,120,12000,...,yqb,yves,zaimerais,zamzonite,zenash,zeniss,zetais,zlatan,zohrabitan,zoom
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
984,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
985,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
986,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
987,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [262]:
word_list_0 = vec_0.get_feature_names();    
count_list_0 = X_c0.toarray().sum(axis=0) 
freq_0 = dict(zip(word_list_0,count_list_0))
freq_0

{'000': 4,
 '10': 7,
 '100': 10,
 '1000': 1,
 '102': 2,
 '10h': 2,
 '10h00': 1,
 '10h30': 1,
 '11': 4,
 '1100': 1,
 '11000': 1,
 '117': 1,
 '12': 8,
 '1200': 1,
 '12000': 1,
 '120battementsparminute': 1,
 '1215': 1,
 '12h': 1,
 '12joursdaction': 1,
 '13': 7,
 '13h50': 1,
 '14': 9,
 '14eme': 1,
 '14h': 1,
 '14h00': 1,
 '14h15': 2,
 '15': 11,
 '157garlamd': 1,
 '15min': 1,
 '15x': 1,
 '16': 4,
 '169': 1,
 '16days': 2,
 '16h': 3,
 '16jours': 3,
 '17': 7,
 '1700': 1,
 '1703': 1,
 '1730': 1,
 '18': 9,
 '180': 1,
 '18006675005': 1,
 '18006686868': 1,
 '1825': 1,
 '18711955': 1,
 '18h': 2,
 '18h00': 1,
 '18h30': 1,
 '18h55': 1,
 '19': 6,
 '1900': 1,
 '1904': 1,
 '1905': 1,
 '1920': 2,
 '1927': 2,
 '1929': 2,
 '195': 2,
 '1954': 1,
 '1962': 1,
 '1965': 1,
 '1970': 1,
 '1979': 1,
 '1986': 1,
 '1987': 1,
 '1992': 1,
 '1999': 1,
 '19e': 1,
 '19h': 1,
 '19h30': 1,
 '19hruthelkrief': 2,
 '1a0maiscpasperdu': 1,
 '1dexmab': 1,
 '1dexmag': 1,
 '1e': 1,
 '1er': 10,
 '1ere': 17,
 '1iran': 1,
 '1les': 1,

In [263]:
word_list_1 = vec_1.get_feature_names();    
count_list_1 = X_c1.toarray().sum(axis=0) 
freq_1 = dict(zip(word_list_1,count_list_1))
freq_1

{'02': 3,
 '04': 3,
 '06': 1,
 '10': 6,
 '100': 2,
 '1000': 1,
 '102': 1,
 '12': 5,
 '120': 2,
 '12000': 1,
 '12f': 1,
 '13': 4,
 '14': 5,
 '14amp15': 1,
 '14ans': 1,
 '15': 2,
 '150eur': 1,
 '15ans': 1,
 '16': 2,
 '16ans': 1,
 '16days': 2,
 '16jours': 3,
 '17': 1,
 '1964': 1,
 '1990': 1,
 '1992': 2,
 '1997': 2,
 '1998': 1,
 '1999': 1,
 '1cm': 1,
 '1dexmag': 1,
 '1er': 1,
 '1ere': 3,
 '20': 7,
 '2004': 1,
 '2005': 1,
 '2006': 1,
 '2011': 1,
 '20122017': 1,
 '2014': 1,
 '2015': 2,
 '2016': 2,
 '2017': 9,
 '2018': 15,
 '20ans': 1,
 '20hfrance2': 1,
 '20minutes': 2,
 '21': 2,
 '21eme': 2,
 '22': 4,
 '23': 1,
 '24': 1,
 '2499': 1,
 '24hpujadas': 1,
 '25': 2,
 '25janvier': 2,
 '26': 1,
 '26caroll': 1,
 '27': 2,
 '28': 1,
 '2choses': 1,
 '2h': 1,
 '30': 4,
 '31': 1,
 '32': 1,
 '36': 1,
 '3600x': 1,
 '364': 1,
 '3784e': 1,
 '38': 2,
 '3eme': 1,
 '3min': 1,
 '40': 1,
 '40t': 1,
 '44': 1,
 '44t': 1,
 '4h': 1,
 '4meur': 1,
 '50': 5,
 '500': 1,
 '5000': 1,
 '50100500': 1,
 '50euros': 1,
 '50nuanc

In [264]:
prob_0 = []
for word,count in zip(word_list_0,count_list_0):
    prob_0.append(count/len(word_list_0))
dict(zip(word_list_0,prob_0))


{'000': 0.00043677658877484165,
 '10': 0.0007643590303559729,
 '100': 0.0010919414719371041,
 '1000': 0.00010919414719371041,
 '102': 0.00021838829438742082,
 '10h': 0.00021838829438742082,
 '10h00': 0.00010919414719371041,
 '10h30': 0.00010919414719371041,
 '11': 0.00043677658877484165,
 '1100': 0.00010919414719371041,
 '11000': 0.00010919414719371041,
 '117': 0.00010919414719371041,
 '12': 0.0008735531775496833,
 '1200': 0.00010919414719371041,
 '12000': 0.00010919414719371041,
 '120battementsparminute': 0.00010919414719371041,
 '1215': 0.00010919414719371041,
 '12h': 0.00010919414719371041,
 '12joursdaction': 0.00010919414719371041,
 '13': 0.0007643590303559729,
 '13h50': 0.00010919414719371041,
 '14': 0.0009827473247433938,
 '14eme': 0.00010919414719371041,
 '14h': 0.00010919414719371041,
 '14h00': 0.00010919414719371041,
 '14h15': 0.00021838829438742082,
 '15': 0.0012011356191308146,
 '157garlamd': 0.00010919414719371041,
 '15min': 0.00010919414719371041,
 '15x': 0.000109194147193

In [265]:
prob_1 = []
for word,count in zip(word_list_1,count_list_1):
    prob_1.append(count/len(word_list_1))
dict(zip(word_list_1,prob_1))


{'02': 0.0005337128624799857,
 '04': 0.0005337128624799857,
 '06': 0.0001779042874933286,
 '10': 0.0010674257249599715,
 '100': 0.0003558085749866572,
 '1000': 0.0001779042874933286,
 '102': 0.0001779042874933286,
 '12': 0.000889521437466643,
 '120': 0.0003558085749866572,
 '12000': 0.0001779042874933286,
 '12f': 0.0001779042874933286,
 '13': 0.0007116171499733144,
 '14': 0.000889521437466643,
 '14amp15': 0.0001779042874933286,
 '14ans': 0.0001779042874933286,
 '15': 0.0003558085749866572,
 '150eur': 0.0001779042874933286,
 '15ans': 0.0001779042874933286,
 '16': 0.0003558085749866572,
 '16ans': 0.0001779042874933286,
 '16days': 0.0003558085749866572,
 '16jours': 0.0005337128624799857,
 '17': 0.0001779042874933286,
 '1964': 0.0001779042874933286,
 '1990': 0.0001779042874933286,
 '1992': 0.0003558085749866572,
 '1997': 0.0003558085749866572,
 '1998': 0.0001779042874933286,
 '1999': 0.0001779042874933286,
 '1cm': 0.0001779042874933286,
 '1dexmag': 0.0001779042874933286,
 '1er': 0.00017790

In [278]:
X_train, X_test, y_train , y_test = train_test_split(X , y ,test_size=0.3)

# instantiating the random over sampler 
rus = RandomUnderSampler()
# resampling X, y
X_rus_train, y_rus_train = rus.fit_resample(np.array(X_train).reshape(-1,1), y_train)# new class distribution 
X_rus_test, y_rus_test = rus.fit_resample(np.array(X_test).reshape(-1,1), y_test)# new class distribution 

X_train_0 = []
X_train_1 = []

for i in range(len(X_rus_train)):
    if (y_ros_train[i]==0):
        X_train_0.append(X_rus_train[i].tolist()[0])
    else:
        X_train_1.append(X_rus_train[i].tolist()[0])

In [268]:
vec_0 = CountVectorizer()
X_vec_0 = vec_0.fit_transform(X_train_0)

total_features0 = len(vec_0.get_feature_names())
total_features0


5518

In [269]:
vec_1 = CountVectorizer()
X_vec_1 = vec_1.fit_transform(X_train_1)

total_features1 = len(vec_1.get_feature_names())
total_features1

proba0 = total_features0 / (total_features0+total_features1)
proba1 = total_features1 / (total_features0+total_features1)

print(proba0)
print(proba1)

0.6384357283350689
0.36156427166493116


In [270]:
total_cnts_features_0 = count_list_0.sum(axis=0)
total_cnts_features_1 = count_list_1.sum(axis=0)

In [271]:
new_sentence = 'what is the price of the book'
new_word_list = word_tokenize(new_sentence)

In [280]:
predictions = []
for i in X_rus_test:
    print(i)
    proba_class0 = proba_sentence_class0(i[0])
    proba_class1 = proba_sentence_class1(i[0])
    if proba_class0 > proba_class1:
        predictions.append(0)
    else:
        predictions.append(1)
        
print(predictions)

['comment ca sur insta y gens qui postent par theme mode photos bleues samedi prochain pis apres est rose']
['sexisme ordinaire harcelement comment parler  on donne rendezvous jeudi 1er fevrier de 9h 10h30 pour conference theatralisee animee par unroleajouer equilibreseu pour vous inscrire ']
['angela merkel bientot paris pour parler europe | scoopit']
['tw agression sexuelle le fait nous avions epoque 14 15 ans mais apparemment ca ne a pas derange plus ca balancetonporc metoo']
['jaime video youtube  taharrush gamea  ou viol autorise adresse']
['seule realisatrice apres guerre jacqueline audry donna films tonalite feministe ils sont decouvrir sur cine+ classic ']
['mise jour concernant scandale windrush  gouvernement soutien theresa may apres demission amber rudd ministre de interieur il faut dire may facheuse posture  cet  environnement hostile  est elle qui a cree']
['montreal quotidien sorti de presse tlmep moiaussi']
['de gauche droite  angela merkel chanceliere allemande jena has

In [281]:
total_labels = len(y_rus_test)
acc = 0
for i in range(len(y_rus_test)):
    if(y_rus_test[i]==predictions[i]):
        acc = acc + 1
        
acc = acc / total_labels
print("Accuracy :" , acc)
        
metrics.confusion_matrix(y_rus_test, predictions)

Accuracy : 0.8956228956228957


array([[237,  60],
       [  2, 295]])

In [194]:
# ==> ajouter les poids
# ==> matrice de confusion
# ==> meler plusieurs methodes
# ==> tester avec des paramètres différents 
# ==> faire un joli fichier