# SVM Models

In [23]:
import pandas as pd
import numpy as np 
import csv
from sklearn.utils import Bunch
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier 
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [24]:
categories = ["not_sexist", "sexist"]

In [25]:
data = pd.read_csv("my_csv_clean.csv",sep = ',')
data.columns = ['tweet', 'class']

X = data['tweet']
y = data['class']

X_train, X_test, y_train , y_test = train_test_split(X , y ,test_size=0.3)

### SVM

In [26]:
model = Pipeline([
...     ('vect', CountVectorizer()),
...     ('tfidf', TfidfTransformer()),
...     ('clf', SGDClassifier(loss='hinge', penalty='l2',
...                           alpha=1e-3, random_state=42,
...                           max_iter=5, tol=None)),
... ])

model.fit(X_train, y_train)
predicted = model.predict(X_test)
print("\nMatrice de confusion :  \n" , metrics.confusion_matrix(y_test, predicted))
print()
print("Accuracy : ", np.mean(predicted == y_test), "\n")
print("Balanced accuracy : ", metrics.balanced_accuracy_score(y_test, predicted), "\n")

print(metrics.classification_report(y_test, predicted,
...     target_names=categories))



Matrice de confusion :  
 [[608  26]
 [214  97]]

Accuracy :  0.746031746031746 

Balanced accuracy :  0.6354438211934637 

              precision    recall  f1-score   support

  not_sexist       0.74      0.96      0.84       634
      sexist       0.79      0.31      0.45       311

    accuracy                           0.75       945
   macro avg       0.76      0.64      0.64       945
weighted avg       0.76      0.75      0.71       945



### TEST DE PARAMETRES DIFFERENTS 

In [27]:
# Creating the Bag of Words model
print(data)
cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(data['tweet'])
y = data['class']

# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)


                                                  tweet  class
0     femm material superficiel ne jug quau physiqu ...      0
1     mis lign articl payetashnek deermag soon  paye...      0
2     achat jour  tre bon livr payetashnek  sinon check      0
3     hommag journeedelafemm tout vi elle vendu fleu...      0
4     lustr bois flott feuill exot bleu abat jour vi...      0
...                                                 ...    ...
3143  segoleneroyal particip activ declin franc sous...      1
3144  depuis affair dsk femin ne veulent etre ni put...      1
3145  analogie est comm dir femme ete bien bel aujou...      1
3146  si as bit plac coeur eton pas avoir put plac u...      1
3147  admirfdlrc cardosolis nadinemorano bout un mom...      1

[3148 rows x 2 columns]


In [28]:
#Support Vector : rbf
SVC_classifier = SVC(kernel = 'rbf')
SVC_classifier.fit(X_train, y_train)
y_pred_SVC = SVC_classifier.predict(X_test)
cm_SVC = confusion_matrix(y_test, y_pred_SVC)

print("Matrice de confusion : ")
print(cm_SVC)
print()
print("Accuracy : ", accuracy_score(y_test, y_pred_SVC))
print()
print("Balanced accuracy : ", metrics.balanced_accuracy_score(y_test, y_pred_SVC), "\n")
print(metrics.classification_report(y_test, y_pred_SVC,target_names=categories))

Matrice de confusion : 
[[404  33]
 [122  71]]

Accuracy :  0.753968253968254

Balanced accuracy :  0.6461803867632587 

              precision    recall  f1-score   support

  not_sexist       0.77      0.92      0.84       437
      sexist       0.68      0.37      0.48       193

    accuracy                           0.75       630
   macro avg       0.73      0.65      0.66       630
weighted avg       0.74      0.75      0.73       630



In [29]:
#Support Vector : sigmoid
SVC_classifier = SVC(kernel = 'sigmoid')
SVC_classifier.fit(X_train, y_train)
y_pred_SVC = SVC_classifier.predict(X_test)
cm_SVC = confusion_matrix(y_test, y_pred_SVC)

print("Matrice de confusion : ")
print(cm_SVC)
print()
print("Accuracy : ", accuracy_score(y_test, y_pred_SVC))
print()
print("Balanced accuracy : ", metrics.balanced_accuracy_score(y_test, y_pred_SVC), "\n")
print(metrics.classification_report(y_test, y_pred_SVC,target_names=categories))

Matrice de confusion : 
[[387  50]
 [107  86]]

Accuracy :  0.7507936507936508

Balanced accuracy :  0.6655896894748699 

              precision    recall  f1-score   support

  not_sexist       0.78      0.89      0.83       437
      sexist       0.63      0.45      0.52       193

    accuracy                           0.75       630
   macro avg       0.71      0.67      0.68       630
weighted avg       0.74      0.75      0.74       630



In [30]:
#Support Vector : poly
SVC_classifier = SVC(kernel = 'poly', degree = 2)
SVC_classifier.fit(X_train, y_train)
y_pred_SVC = SVC_classifier.predict(X_test)
cm_SVC = confusion_matrix(y_test, y_pred_SVC)

print("Matrice de confusion : ")
print(cm_SVC)
print()
print("Accuracy : ", accuracy_score(y_test, y_pred_SVC))
print()
print("Balanced accuracy : ", metrics.balanced_accuracy_score(y_test, y_pred_SVC), "\n")
print(metrics.classification_report(y_test, y_pred_SVC,target_names=categories))

Matrice de confusion : 
[[415  22]
 [143  50]]

Accuracy :  0.7380952380952381

Balanced accuracy :  0.6043620540425179 

              precision    recall  f1-score   support

  not_sexist       0.74      0.95      0.83       437
      sexist       0.69      0.26      0.38       193

    accuracy                           0.74       630
   macro avg       0.72      0.60      0.61       630
weighted avg       0.73      0.74      0.69       630



In [31]:
#Support Vector : linear
SVC_classifier = SVC(kernel = 'linear')
SVC_classifier.fit(X_train, y_train)
y_pred_SVC = SVC_classifier.predict(X_test)
cm_SVC = confusion_matrix(y_test, y_pred_SVC)

print("Matrice de confusion : ")
print(cm_SVC)
print()
print("Accuracy : ", accuracy_score(y_test, y_pred_SVC))
print()
print("Balanced accuracy : ", metrics.balanced_accuracy_score(y_test, y_pred_SVC), "\n")
print(metrics.classification_report(y_test, y_pred_SVC,target_names=categories))

Matrice de confusion : 
[[359  78]
 [ 98  95]]

Accuracy :  0.7206349206349206

Balanced accuracy :  0.6568691383787244 

              precision    recall  f1-score   support

  not_sexist       0.79      0.82      0.80       437
      sexist       0.55      0.49      0.52       193

    accuracy                           0.72       630
   macro avg       0.67      0.66      0.66       630
weighted avg       0.71      0.72      0.72       630

