### Grid Search

In [17]:
import pandas as pd
import numpy as np
import csv
import tensorflow as tf
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

from collections import Counter

from nltk.cluster import KMeansClusterer
from nltk.tokenize import word_tokenize
from sklearn import cluster
from sklearn import metrics




In [18]:
    categories = ["not_sexist", "sexist"]
    #               2161           989

In [19]:
data = pd.read_csv("my_csv_clean.csv",sep = ',')
data.columns = ['tweet', 'class']

X = data['tweet']
y = data['class']

X_train, X_test, y_train , y_test = train_test_split(X , y ,test_size=0.3)

### SVM

In [20]:
text_clf = Pipeline([
...     ('vect', CountVectorizer()),
...     ('tfidf', TfidfTransformer()),
...     ('clf', SGDClassifier(loss='hinge', penalty='l2',
...                           alpha=1e-3, random_state=42,
...                           max_iter=5, tol=None)),
... ])
text_clf.fit(X_train, y_train)

0.7566137566137566

### Parameter tuning using grid search

In [23]:
parameters = {
...     'vect__ngram_range': [(1, 1), (1, 2)],
...     'tfidf__use_idf': (True, False),
...     'clf__alpha': (1e-2, 1e-3),
... }

In [24]:
gs_clf = GridSearchCV(text_clf, parameters, cv=5, n_jobs=-1)

In [25]:
gs_clf = gs_clf.fit(X_train[:100], y_train[:100])

In [26]:
example_1 = 'La femme'
example_2 = "L'homme"
example_3 = "La femme doit être dans la cuisine"

print(example_1 + "  => " + str(categories[gs_clf.predict([example_1])[0]]))
print(example_2 + "  => " + str(categories[gs_clf.predict([example_2])[0]]))
print(example_3 + "  => " + str(categories[gs_clf.predict([example_3])[0]]))

La femme  => not_sexist
L'homme  => not_sexist
La femme doit être dans la cuisine  => not_sexist


In [27]:
gs_clf.best_score_

0.68

In [28]:
for param_name in sorted(parameters.keys()):
     print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

clf__alpha: 0.01
tfidf__use_idf: False
vect__ngram_range: (1, 2)


In [29]:
predicted = gs_clf.predict(X_test)
np.mean(predicted == y_test)
print("Accuracy : ", np.mean(predicted == y_test))
metrics.confusion_matrix(y_test, predicted)

Accuracy :  0.7037037037037037


array([[525, 126],
       [154, 140]], dtype=int64)