Chargement des librairies

In [1]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

In [2]:
import os

Chargement des données d'apprentissage et de test

In [3]:

directory_path = os.path.abspath('')
data_train = pd.read_csv(directory_path + '/data_deft2017/task1-train.csv', names = ['id', 'text', 'class'], sep = '\t', skiprows=10)
data_test = pd.read_csv(directory_path + '/data_deft2017/task1-testGold.csv', names = ['id', 'text', 'class'], sep = '\t', skiprows=10)

data_train

Unnamed: 0,id,text,class
0,1,Rencontre avec Dodo La Saumure : «Je ne connai...,objective
1,2,"Galère du dimanche matin: sur DSK,Marthe, 6 an...",negative
2,3,"2011 année noire pour la PQR, seuls 2 quotidie...",objective
3,4,"Le procès civil de l'affaire #DSK, dans le Bro...",objective
4,5,Heureusement que le candidat n'est pas DSK car...,negative
...,...,...,...
3882,3902,J'étais content d'avoir enfin un jour de repos...,mixed
3883,3903,"Pour #TPMP, la télé #transphobe, @guillaumeple...",negative
3884,3904,"Tellement mérité pour RABIOT, très content pou...",positive
3885,3905,Très content pour Rabiot. Enfin l'immunité dip...,positive


Gestion des valeurs manquantes dans les données

In [4]:
print(data_train.isnull().sum())

id       0
text     0
class    0
dtype: int64


Gestion des valeurs manquantes dans les données

In [5]:
print(data_test.isnull().sum())

id       0
text     0
class    0
dtype: int64


Vectorisation avec TF.IDF des données textes

In [6]:
vectorizer = TfidfVectorizer()

x_train_vectorized = vectorizer.fit_transform(list(data_train['text']))

Apprentissage du SVM

In [7]:
svm_model = SVC(kernel='linear')
svm_model.fit(x_train_vectorized, list(data_train['class']))

Inférence avec le SVM sur les données de test

In [8]:
x_test_vectorized = vectorizer.transform(list(data_test['text']))
y_prediction = svm_model.predict(x_test_vectorized)
print(y_prediction)

['objective' 'negative' 'objective' 'objective' 'objective' 'negative'
 'objective' 'objective' 'objective' 'negative' 'objective' 'objective'
 'negative' 'objective' 'objective' 'negative' 'negative' 'negative'
 'negative' 'negative' 'negative' 'objective' 'objective' 'objective'
 'objective' 'negative' 'negative' 'negative' 'objective' 'negative'
 'negative' 'negative' 'negative' 'objective' 'objective' 'negative'
 'negative' 'objective' 'objective' 'mixed' 'objective' 'negative'
 'negative' 'negative' 'objective' 'objective' 'objective' 'negative'
 'negative' 'negative' 'objective' 'objective' 'negative' 'mixed'
 'objective' 'negative' 'negative' 'negative' 'objective' 'negative'
 'objective' 'objective' 'negative' 'negative' 'negative' 'negative'
 'objective' 'negative' 'objective' 'negative' 'objective' 'negative'
 'negative' 'objective' 'negative' 'negative' 'negative' 'negative'
 'negative' 'objective' 'negative' 'negative' 'negative' 'objective'
 'negative' 'negative' 'negative

Evaluation du modèle sur les résultats de l'inférence

In [16]:
accuracy = accuracy_score(list(data_test['class']), y_prediction)
print(f'Accuracy: {accuracy:.2f}')

print(svm_model.coef_)


Accuracy: 0.66
  (0, 2777)	-0.023897112380835764
  (0, 4247)	-0.02083456911066779
  (0, 5777)	-0.023897112380835764
  (0, 7747)	-0.023897112380835764
  (0, 2782)	-0.24524611186786402
  (0, 3928)	-0.24524611186786402
  (0, 5921)	-0.24524611186786402
  (0, 6221)	-0.23364637939353647
  (0, 7880)	-0.5504846558309303
  (0, 115)	-0.244258207615755
  (0, 1955)	-0.2934719150928127
  (0, 6291)	-0.40975881190777963
  (0, 7382)	-0.244258207615755
  (0, 11806)	-0.244258207615755
  (0, 10423)	-0.2415313761212183
  (0, 12402)	-0.480432241294861
  (0, 1834)	-0.16095252062189566
  (0, 4563)	-0.16095252062189566
  (0, 9013)	-0.16095252062189566
  (0, 10695)	-0.30375011342891434
  (0, 7640)	-0.4013948810810617
  (0, 2190)	-0.21990591433729678
  (0, 5498)	-0.47458426660675135
  (0, 8025)	-0.25149081613137736
  (0, 8765)	-0.09044943655123948
  :	:
  (5, 11793)	0.3997609086989434
  (5, 4602)	0.4708020501397293
  (5, 12581)	0.4708020501397293
  (5, 247)	0.12495480998086488
  (5, 1665)	0.12495480998086488
  