In [96]:
import pandas as pd
import numpy as np
import csv
from sklearn.utils import Bunch
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [97]:
# Display names of the two target classes, in the order they are handed to
# classification_report as `target_names` (first name -> lowest label value).
# Figures noted by the author, presumably the number of tweets per class
# (TODO confirm against the CSV): not_sexist = 2161, sexist = 989.
categories = [
    "not_sexist",
    "sexist",
]

In [98]:
# Load the cleaned tweet corpus and give its two columns fixed names:
# 'tweet' holds the text, 'class' holds the label.
data = pd.read_csv("my_csv_clean.csv", sep=',')
data.columns = ['tweet', 'class']

X = data['tweet']
y = data['class']

# Hold out 30% of the tweets for evaluation. The seed is pinned so that the
# split, and therefore every metric computed from it, is identical on each
# Restart & Run All instead of changing from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)

### SVM

In [99]:
# Text-classification pipeline: raw tweets -> token counts -> TF-IDF weights
# -> linear classifier trained by SGD on the hinge loss (a linear SVM).
# Because the steps live in a Pipeline, fit() learns the vocabulary and the
# TF-IDF statistics from the training tweets only.
#
# Continuation lines are written without REPL "..." prompts so the cell is
# valid plain Python (e.g. when exported as a script), not only IPython input.
model = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                          alpha=1e-3, random_state=42,
                          # tol=None disables early stopping: all 5 epochs run
                          max_iter=5, tol=None)),
])

model.fit(X_train, y_train)
predicted = model.predict(X_test)
print("\nMatrice de confusion :  \n" , metrics.confusion_matrix(y_test, predicted))

# Fraction of held-out tweets whose predicted label equals the true label.
print("Accuracy : ", np.mean(predicted == y_test), "\n")

print(metrics.classification_report(y_test, predicted,
                                    target_names=categories))



Matrice de confusion :  
 [[595  47]
 [200 103]]
Accuracy :  0.7386243386243386 

              precision    recall  f1-score   support

  not_sexist       0.75      0.93      0.83       642
      sexist       0.69      0.34      0.45       303

    accuracy                           0.74       945
   macro avg       0.72      0.63      0.64       945
weighted avg       0.73      0.74      0.71       945



### TEST DE PARAMÈTRES DIFFÉRENTS

In [100]:
# Creating the Bag of Words model
print(data)
cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(data['tweet'])
y = data['class']

# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)


                                                  tweet  class
0     suis femme materialiste superficielle qui ne j...      0
1     mise ligne de article sur payetashnek pour dee...      0
2     achat du jour  tres bon livre payetashnek  sin...      0
3     hommage journeedelafemme toute vie elle vendu ...      0
4     lustre bois flotte feuilles exotiques bleues a...      0
...                                                 ...    ...
3143  segoleneroyal participe activement declin de f...      1
3144  depuis affaire dsk feministes ne veulent etre ...      1
3145  analogie est comme dire femme vous etes bien b...      1
3146  si as bite place du coeur etonne pas avoir put...      1
3147  admirfdlrc cardosolisa nadinemorano bout un mo...      1

[3148 rows x 2 columns]


In [101]:
# Support-vector classifier with an RBF kernel: fit on the training split,
# then score on the held-out split.
SVC_classifier = SVC(kernel="rbf").fit(X_train, y_train)
y_pred_SVC = SVC_classifier.predict(X_test)
cm_SVC = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred_SVC)

# Report layout: confusion matrix, blank line, accuracy, blank line,
# per-class precision / recall / F1.
print("Matrice de confusion : ", cm_SVC, "", sep="\n")
print("Accuracy : ", metrics.accuracy_score(y_test, y_pred_SVC), end="\n\n")
print(metrics.classification_report(y_test, y_pred_SVC, target_names=categories))

Matrice de confusion : 
[[413  24]
 [115  78]]

Accuracy :  0.7793650793650794

              precision    recall  f1-score   support

  not_sexist       0.78      0.95      0.86       437
      sexist       0.76      0.40      0.53       193

    accuracy                           0.78       630
   macro avg       0.77      0.67      0.69       630
weighted avg       0.78      0.78      0.76       630



In [102]:
# Support-vector classifier with a sigmoid kernel: fit on the training split,
# then score on the held-out split.
SVC_classifier = SVC(kernel="sigmoid").fit(X_train, y_train)
y_pred_SVC = SVC_classifier.predict(X_test)
cm_SVC = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred_SVC)

# Report layout: confusion matrix, blank line, accuracy, blank line,
# per-class precision / recall / F1.
print("Matrice de confusion : ", cm_SVC, "", sep="\n")
print("Accuracy : ", metrics.accuracy_score(y_test, y_pred_SVC), end="\n\n")
print(metrics.classification_report(y_test, y_pred_SVC, target_names=categories))

Matrice de confusion : 
[[374  63]
 [105  88]]

Accuracy :  0.7333333333333333

              precision    recall  f1-score   support

  not_sexist       0.78      0.86      0.82       437
      sexist       0.58      0.46      0.51       193

    accuracy                           0.73       630
   macro avg       0.68      0.66      0.66       630
weighted avg       0.72      0.73      0.72       630



In [103]:
# Support-vector classifier with a degree-2 polynomial kernel: fit on the
# training split, then score on the held-out split.
SVC_classifier = SVC(kernel="poly", degree=2).fit(X_train, y_train)
y_pred_SVC = SVC_classifier.predict(X_test)
cm_SVC = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred_SVC)

# Report layout: confusion matrix, blank line, accuracy, blank line,
# per-class precision / recall / F1.
print("Matrice de confusion : ", cm_SVC, "", sep="\n")
print("Accuracy : ", metrics.accuracy_score(y_test, y_pred_SVC), end="\n\n")
print(metrics.classification_report(y_test, y_pred_SVC, target_names=categories))

Matrice de confusion : 
[[415  22]
 [137  56]]

Accuracy :  0.7476190476190476

              precision    recall  f1-score   support

  not_sexist       0.75      0.95      0.84       437
      sexist       0.72      0.29      0.41       193

    accuracy                           0.75       630
   macro avg       0.73      0.62      0.63       630
weighted avg       0.74      0.75      0.71       630



In [104]:
# Support-vector classifier with a linear kernel: fit on the training split,
# then score on the held-out split.
SVC_classifier = SVC(kernel="linear").fit(X_train, y_train)
y_pred_SVC = SVC_classifier.predict(X_test)
cm_SVC = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred_SVC)

# Report layout: confusion matrix, blank line, accuracy, blank line,
# per-class precision / recall / F1.
print("Matrice de confusion : ", cm_SVC, "", sep="\n")
print("Accuracy : ", metrics.accuracy_score(y_test, y_pred_SVC), end="\n\n")
print(metrics.classification_report(y_test, y_pred_SVC, target_names=categories))

Matrice de confusion : 
[[364  73]
 [ 81 112]]

Accuracy :  0.7555555555555555

              precision    recall  f1-score   support

  not_sexist       0.82      0.83      0.83       437
      sexist       0.61      0.58      0.59       193

    accuracy                           0.76       630
   macro avg       0.71      0.71      0.71       630
weighted avg       0.75      0.76      0.75       630



In [105]:
# Random forest of 10 trees using the entropy split criterion; the seed is
# fixed so the fitted forest is the same on every run.
rf_classifier = RandomForestClassifier(
    n_estimators=10,
    criterion="entropy",
    random_state=0,
).fit(X_train, y_train)
y_pred_rf = rf_classifier.predict(X_test)
cm_RandFor = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred_rf)

# Report layout: confusion matrix, blank line, accuracy, blank line,
# per-class precision / recall / F1.
print("Matrice de confusion : ", cm_RandFor, "", sep="\n")
print("Accuracy : ", metrics.accuracy_score(y_test, y_pred_rf), end="\n\n")
print(metrics.classification_report(y_test, y_pred_rf, target_names=categories))

Matrice de confusion : 
[[403  34]
 [114  79]]

Accuracy :  0.765079365079365

              precision    recall  f1-score   support

  not_sexist       0.78      0.92      0.84       437
      sexist       0.70      0.41      0.52       193

    accuracy                           0.77       630
   macro avg       0.74      0.67      0.68       630
weighted avg       0.75      0.77      0.74       630

