In [1]:
import pandas as pd

df = pd.read_csv('./../data/train_split.csv')

In [2]:
df.lang.unique()

array(['GER', 'TUR', 'CHI', 'TEL', 'ARA', 'SPA', 'HIN', 'JPN', 'KOR',
       'FRE', 'ITA'], dtype=object)

In [3]:
X, y = df["text"], df["lang"]

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, stratify=y)

In [5]:
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline

text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, max_iter=5, tol=None)),
])
text_clf.fit(X_train, y_train)

In [6]:
import numpy as np

# Prédictions sur le dataset train
predicted = text_clf.predict(X_train)
np.mean(predicted == y_train)

0.9166105499438832

In [7]:
# Prédictions sur le dataset test
predicted = text_clf.predict(X_test)
np.mean(predicted == y_test)

0.7252525252525253

In [8]:
from sklearn import metrics

print(metrics.classification_report(y_test, predicted))

              precision    recall  f1-score   support

         ARA       0.73      0.77      0.75        90
         CHI       0.72      0.84      0.78        90
         FRE       0.75      0.68      0.71        90
         GER       0.67      0.91      0.77        90
         HIN       0.70      0.69      0.70        90
         ITA       0.71      0.80      0.75        90
         JPN       0.75      0.67      0.71        90
         KOR       0.71      0.61      0.66        90
         SPA       0.66      0.54      0.60        90
         TEL       0.76      0.73      0.75        90
         TUR       0.82      0.73      0.78        90

    accuracy                           0.73       990
   macro avg       0.73      0.73      0.72       990
weighted avg       0.73      0.73      0.72       990

