In [1]:
import pandas as pd

df = pd.read_csv('./../data/train_split.csv')
X, y = df["text"], df["lang"]

In [2]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, stratify=y)

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Feature engineering
tfidf = TfidfVectorizer(ngram_range=(1, 2), min_df=5, max_df=0.9)
features = tfidf.fit_transform(X_train)

In [4]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier

# Hyperparameter tuning
param_grid = {
    'loss': ['hinge', 'log'],
    'penalty': ['l2', 'l1', 'elasticnet'],
    'alpha': [0.0001, 0.001, 0.01, 0.1],
    'max_iter': [100, 500, 1000]
}
model = SGDClassifier()
grid_search = GridSearchCV(model, param_grid, scoring='f1_macro', n_jobs=-1, cv=5)
grid_search.fit(features, y_train)

In [5]:
# Get the best hyperparameters and model
best_hyperparams = grid_search.best_params_
best_model = grid_search.best_estimator_

In [10]:
best_hyperparams, best_model

({'alpha': 0.0001, 'loss': 'hinge', 'max_iter': 1000, 'penalty': 'l2'},
 SGDClassifier())

In [8]:
import numpy as np

# Prédictions sur le dataset test
test_features = tfidf.transform(X_test)
predicted = grid_search.predict(test_features)
np.mean(predicted == y_test)

0.7818181818181819

In [9]:
from sklearn import metrics

print(metrics.classification_report(y_test, predicted))

              precision    recall  f1-score   support

         ARA       0.75      0.77      0.76        90
         CHI       0.77      0.88      0.82        90
         FRE       0.82      0.81      0.82        90
         GER       0.85      0.91      0.88        90
         HIN       0.67      0.73      0.70        90
         ITA       0.79      0.83      0.81        90
         JPN       0.81      0.77      0.79        90
         KOR       0.77      0.66      0.71        90
         SPA       0.75      0.69      0.72        90
         TEL       0.76      0.71      0.74        90
         TUR       0.86      0.84      0.85        90

    accuracy                           0.78       990
   macro avg       0.78      0.78      0.78       990
weighted avg       0.78      0.78      0.78       990

