### TUTORIAL 

In [136]:
import pandas as pd
import numpy as np
import csv
from sklearn.utils import Bunch
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

In [132]:
categories = ["not_sexist", "sexist"]
#               2161           989

In [133]:
data = pd.read_csv("my_csv.csv",sep = ',')
data.columns = ['tweet', 'class']

def load_my_dataset(string):
    with open(r'my_csv.csv') as csv_file:
        data_reader = csv.reader(csv_file)
        feature_names =  ['tweet', 'class']#next(data_reader)[:-1]
        data = []
        target = []
        for row in data_reader:
            features = row[:-1]
            label = row[-1]
            data.append([str for str in features])
            target.append(int(label))
        
        data = np.array(data)
        target = np.array(target)
    return Bunch(data=data, target=target,target_names = categories, feature_names=feature_names, subset=string, categories=categories, shuffle=True)
dataset = load_my_dataset('train')
#compter tweets
#tester algos différents et représentations différentes

Counter({0: 2161, 1: 2161})


### Tokenizing text with scikit-learn

In [75]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(dataset.data.ravel())
X_train_counts.shape

(3150, 14332)

In [78]:
count_vect.vocabulary_.get(u'femme')

5112

### From occurrences to frequencies

In [81]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(3150, 14332)

### Training a classifier

In [85]:
clf = MultinomialNB().fit(X_train_tfidf, dataset.target)

In [99]:
docs_new = ["Les hommes c'est tous les mêmes", 'Il est étudiant.'] #marche pas du tout
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

for doc, category in zip(docs_new, predicted):
     print('%r => %s' % (doc, dataset.target_names[category]))

"Les hommes c'est tous les mêmes" => not_sexist
'Il est étudiant.' => not_sexist


### Building a pipeline

In [103]:
text_clf = Pipeline([
...     ('vect', CountVectorizer()),
...     ('tfidf', TfidfTransformer()),
...     ('clf', MultinomialNB()),
... ])
text_clf.fit(dataset.data.ravel(), dataset.target)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB())])

### Evaluation of the performance on the test set

In [150]:
test = load_my_dataset('test')
docs_test = test.data.ravel()
predicted = text_clf.predict(docs_test)
np.mean(predicted == test.target)

0.8936507936507937

### SVM

In [111]:
text_clf = Pipeline([
...     ('vect', CountVectorizer()),
...     ('tfidf', TfidfTransformer()),
...     ('clf', SGDClassifier(loss='hinge', penalty='l2',
...                           alpha=1e-3, random_state=42,
...                           max_iter=5, tol=None)),
... ])
text_clf.fit(dataset.data.ravel(), dataset.target)
predicted = text_clf.predict(docs_test)
np.mean(predicted == test.target)

0.8215873015873015

In [113]:
print(metrics.classification_report(test.target, predicted,
...     target_names=test.target_names))

              precision    recall  f1-score   support

  not_sexist       0.80      0.98      0.88      2161
      sexist       0.92      0.47      0.62       989

    accuracy                           0.82      3150
   macro avg       0.86      0.73      0.75      3150
weighted avg       0.84      0.82      0.80      3150



In [115]:
metrics.confusion_matrix(test.target, predicted)

array([[2123,   38],
       [ 524,  465]])

### Parameter tuning using grid search

In [117]:
parameters = {
...     'vect__ngram_range': [(1, 1), (1, 2)],
...     'tfidf__use_idf': (True, False),
...     'clf__alpha': (1e-2, 1e-3),
... }

In [118]:
gs_clf = GridSearchCV(text_clf, parameters, cv=5, n_jobs=-1)

In [125]:
gs_clf = gs_clf.fit(dataset.data.ravel()[:400], dataset.target[:400])
#ameliorer vecteur et algo et representation
#-> plus de poids sur les mots sexistes



ValueError: The number of classes has to be greater than one; got 1 class

### Oversampling

In [139]:
# instantiating the random over sampler 
ros = RandomOverSampler()
# resampling X, y
X_ros, y_ros = ros.fit_resample(dataset.data, dataset.target)# new class distribution 
print(Counter(y_ros))

Counter({0: 2161, 1: 2161})


In [140]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_ros.ravel())
X_train_counts.shape

(4322, 14332)

In [141]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(4322, 14332)

In [157]:
clf = MultinomialNB().fit(X_train_tfidf, y_ros)
docs_new = ["y'a que les femmes qui pleurent", "C'est un homme."] #marche pas du tout
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

for doc, category in zip(docs_new, predicted):
     print('%r => %s' % (doc, dataset.target_names[category]))
        
text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB()),])
text_clf.fit(X_ros.ravel(), y_ros)


# resampling X, y
test = load_my_dataset('test')
X_ros_test, y_ros_test = ros.fit_resample(test.data, test.target)# new class distribution 
docs_test = X_ros_test.ravel()
predicted = text_clf.predict(docs_test)
np.mean(predicted == y_ros_test)
metrics.confusion_matrix(y_ros_test, predicted)
#prochaine fois tester lautre oversampling

"y'a que les femmes qui pleurent" => sexist
"C'est un homme." => sexist


array([[1837,  324],
       [  24, 2137]])