### TUTORIAL 

In [30]:
import pandas as pd
import numpy as np
import csv
from sklearn.utils import Bunch
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split


from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

from gensim.models import Word2Vec
import nltk
from gensim.models import KeyedVectors

from nltk.cluster import KMeansClusterer
import numpy as np 

from sklearn import cluster
from sklearn import metrics

In [3]:
categories = ["not_sexist", "sexist"]
#               2161           989

In [31]:
data = pd.read_csv("my_csv.csv",sep = ',')
data.columns = ['tweet', 'class']

def load_my_dataset(string):
    with open(r'my_csv.csv',encoding="mbcs") as csv_file:
        data_reader = csv.reader(csv_file)
        feature_names =  ['tweet', 'class']#next(data_reader)[:-1]
        data = []
        target = []
        for row in data_reader:
            features = row[:-1]
            label = row[-1]
            data.append([str for str in features])
            target.append(int(label))
        
        data = np.array(data)
        target = np.array(target)
    return Bunch(data=data, target=target,target_names = categories, feature_names=feature_names, subset=string, categories=categories, shuffle=True)
dataset = load_my_dataset('train')
#compter tweets
#tester algos différents et représentations différentes

### Tokenizing text with scikit-learn

In [6]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(dataset.data.ravel())
X_train_counts.shape

(3150, 14102)

In [7]:
count_vect.vocabulary_.get(u'femme')

4989

### From occurrences to frequencies

In [8]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(3150, 14102)

### Training a classifier

In [9]:
clf = MultinomialNB().fit(X_train_tfidf, dataset.target)

In [41]:
docs_new = ["Les hommes c'est tous les mêmes", 'Il est étudiant.'] #marche pas du tout
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

for doc, category in zip(docs_new, predicted):
     print('%r => %s' % (doc, dataset.target_names[category]))

"Les hommes c'est tous les mêmes" => not_sexist
'Il est étudiant.' => not_sexist


### Building a pipeline

In [11]:
text_clf = Pipeline([
...     ('vect', CountVectorizer()),
...     ('tfidf', TfidfTransformer()),
...     ('clf', MultinomialNB()),
... ])
text_clf.fit(dataset.data.ravel(), dataset.target)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB())])

### Evaluation of the performance on the test set

In [42]:
test = load_my_dataset('test')
docs_test = test.data.ravel()
predicted = text_clf.predict(docs_test)
np.mean(predicted == test.target)

0.7796825396825396

### SVM

In [40]:
text_clf = Pipeline([
...     ('vect', CountVectorizer()),
...     ('tfidf', TfidfTransformer()),
...     ('clf', SGDClassifier(loss='hinge', penalty='l2',
...                           alpha=1e-3, random_state=42,
...                           max_iter=5, tol=None)),
... ])
text_clf.fit(dataset.data.ravel(), dataset.target)
predicted = text_clf.predict(docs_test)
np.mean(predicted == test.target)

0.8285714285714286

In [41]:
print(metrics.classification_report(test.target, predicted,
...     target_names=test.target_names))

              precision    recall  f1-score   support

  not_sexist       0.81      0.98      0.89      2161
      sexist       0.92      0.49      0.64       989

    accuracy                           0.83      3150
   macro avg       0.87      0.74      0.77      3150
weighted avg       0.85      0.83      0.81      3150



In [42]:
metrics.confusion_matrix(test.target, predicted)

array([[2121,   40],
       [ 500,  489]], dtype=int64)

### Parameter tuning using grid search

In [43]:
parameters = {
...     'vect__ngram_range': [(1, 1), (1, 2)],
...     'tfidf__use_idf': (True, False),
...     'clf__alpha': (1e-2, 1e-3),
... }

In [44]:
gs_clf = GridSearchCV(text_clf, parameters, cv=5, n_jobs=-1)

In [45]:
gs_clf = gs_clf.fit(dataset.data.ravel()[:400], dataset.target[:400])
#ameliorer vecteur et algo et representation
#-> plus de poids sur les mots sexistes

40 fits failed out of a total of 40.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\evara\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\evara\anaconda3\lib\site-packages\sklearn\pipeline.py", line 394, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\evara\anaconda3\lib\site-packages\sklearn\linear_model\_stochastic_gradient.py", line 890, in fit
    return self._fit(
  File "C:\Users\evara\anaconda3\lib\site-packages\sklearn\linear_model\_stochastic_gradient.py", line 684, in _fit
    self._pa

ValueError: The number of classes has to be greater than one; got 1 class

### Oversampling

In [43]:
# instantiating the random over sampler 
ros = RandomOverSampler()
# resampling X, y
X_ros, y_ros = ros.fit_resample(dataset.data, dataset.target)# new class distribution 
print(Counter(y_ros))

Counter({0: 2161, 1: 2161})


In [49]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_ros.ravel())
X_train_counts.shape

(4322, 14102)

In [50]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(4322, 14102)

In [51]:
clf = MultinomialNB().fit(X_train_tfidf, y_ros)
docs_new = ["y'a que les femmes qui pleurent", "C'est un homme."] 
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

for doc, category in zip(docs_new, predicted):
     print('%r => %s' % (doc, dataset.target_names[category]))
        
text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB()),])
text_clf.fit(X_ros.ravel(), y_ros)


# resampling X, y
test = load_my_dataset('test')
X_ros_test, y_ros_test = ros.fit_resample(test.data, test.target)# new class distribution 
docs_test = X_ros_test.ravel()
predicted = text_clf.predict(docs_test)
np.mean(predicted == y_ros_test)
print("Accuracy : ", np.mean(predicted == y_ros_test))
metrics.confusion_matrix(y_ros_test, predicted)

"y'a que les femmes qui pleurent" => sexist
"C'est un homme." => sexist
Accuracy :  0.9113836186950486


array([[1815,  346],
       [  37, 2124]], dtype=int64)

### Undersampling

In [52]:
# instantiating the random over sampler 
ros = RandomUnderSampler()
# resampling X, y
X_ros, y_ros = ros.fit_resample(dataset.data, dataset.target)# new class distribution 
print(Counter(y_ros))

Counter({0: 989, 1: 989})


In [53]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_ros.ravel())
X_train_counts.shape

(1978, 10229)

In [54]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(1978, 10229)

In [55]:
clf = MultinomialNB().fit(X_train_tfidf, y_ros)
docs_new = ["y'a que les femmes qui pleurent", "C'est un homme."] 
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

for doc, category in zip(docs_new, predicted):
     print('%r => %s' % (doc, dataset.target_names[category]))
        
text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB()),])
text_clf.fit(X_ros.ravel(), y_ros)


# resampling X, y
test = load_my_dataset('test')
X_ros_test, y_ros_test = ros.fit_resample(test.data, test.target)# new class distribution 
docs_test = X_ros_test.ravel()
predicted = text_clf.predict(docs_test)
print("Accuracy : ", np.mean(predicted == y_ros_test))
metrics.confusion_matrix(y_ros_test, predicted)


"y'a que les femmes qui pleurent" => sexist
"C'est un homme." => sexist
Accuracy :  0.8559150657229525


array([[712, 277],
       [  8, 981]], dtype=int64)

### Oversampling and Undersampling

In [56]:
over = RandomOverSampler(sampling_strategy=0.5)
under = RandomUnderSampler(sampling_strategy=0.8)

In [57]:
X_over, y_over = over.fit_resample(dataset.data, dataset.target)
print(f"Oversampled: {Counter(y_over)}")

Oversampled: Counter({0: 2161, 1: 1080})


In [58]:
# now to comine under sampling 
X_ros, y_ros = under.fit_resample(X_over, y_over)
print(f"Combined Random Sampling: {Counter(y_ros)}")

Combined Random Sampling: Counter({0: 1350, 1: 1080})


In [59]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_ros.ravel())
X_train_counts.shape

(2430, 11484)

In [60]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(2430, 11484)

In [61]:
clf = MultinomialNB().fit(X_train_tfidf, y_ros)
docs_new = ["y'a que les femmes qui pleurent", "C'est un homme."]
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

for doc, category in zip(docs_new, predicted):
     print('%r => %s' % (doc, dataset.target_names[category]))
        
text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB()),])
text_clf.fit(X_ros.ravel(), y_ros)


# resampling X, y
test = load_my_dataset('test')
X_ros_test, y_ros_test = ros.fit_resample(test.data, test.target)# new class distribution 
docs_test = X_ros_test.ravel()
predicted = text_clf.predict(docs_test)
print("Accuracy : ", np.mean(predicted == y_ros_test))
metrics.confusion_matrix(y_ros_test, predicted)


"y'a que les femmes qui pleurent" => sexist
"C'est un homme." => sexist
Accuracy :  0.9150657229524772


array([[878, 111],
       [ 57, 932]], dtype=int64)

### Word Embedding

In [62]:
#1st step = getting cleaned data 
data = pd.read_csv("my_csv_clean.csv",sep = ',') #we got that csv after running the preprocessing.py file
data.columns = ['tweet', 'class']
