# II- Build ML Model

## 4- Training and Testing Data Split

In [1]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, make_scorer, precision_score, confusion_matrix, classification_report
import joblib

In [16]:
X = joblib.load('../../artifacts/x.pkl')

y = joblib.load('../../artifacts/y.pkl')

target_classes = joblib.load('../../artifacts/label_classes.pkl')

In [17]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42 ,stratify=y)

In [18]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)


(119999, 10000)
(30000, 10000)
(119999,)
(30000,)


## 5- Choose Appropriate ML Algorithm

I choose Stochastic Gradient Descent (SGD) because it is fast, memory-efficient, and works well for large datasets.

## 6- Build ML Model With Training Data:

In [19]:
model = SGDClassifier(random_state=42)
model.fit(x_train, y_train)


0,1,2
,loss,'hinge'
,penalty,'l2'
,alpha,0.0001
,l1_ratio,0.15
,fit_intercept,True
,max_iter,1000
,tol,0.001
,shuffle,True
,verbose,0
,epsilon,0.1


# III- Tune Model With Testing Data:

## 7- Evaluate Model With Testing Data:

In [24]:
y_pred = model.predict(x_test)

## 8- Compute Performance Metrics:

In [25]:
acc = accuracy_score(y_test, y_pred)
print("Accuracy: ", acc)

conf = confusion_matrix(y_test, y_pred)
print("Confusion Matrix: ", conf)

class_rep = classification_report(y_test, y_pred, target_names=target_classes)
print("Classification Report:", class_rep)

Accuracy:  0.7471666666666666
Confusion Matrix:  [[8310 1358  332]
 [2237 5641 2122]
 [ 395 1141 8464]]
Classification Report:               precision    recall  f1-score   support

    Negative       0.76      0.83      0.79     10000
      Neutre       0.69      0.56      0.62     10000
    Positive       0.78      0.85      0.81     10000

    accuracy                           0.75     30000
   macro avg       0.74      0.75      0.74     30000
weighted avg       0.74      0.75      0.74     30000



### ==> Overfitting Verification:

In [23]:
y_pred_train = model.predict(x_train)
y_pred_test = model.predict(x_test)

train_acc = accuracy_score(y_train, y_pred_train)
test_acc = accuracy_score(y_test, y_pred_test)

print("Accuracy sur train :", train_acc)
print("Accuracy sur test  :", test_acc)

if train_acc - test_acc > 0.1:  
    print(" Possible overfitting (écart trop grand entre train et test)")
else:
    print(" Pas d'overfitting significatif")


Accuracy sur train : 0.7608396736639472
Accuracy sur test  : 0.7471666666666666
 Pas d'overfitting significatif


## 9- Perform Hyperparameter Tuning:

In [30]:
param_grid = {
        'penalty': ['l2','l1','elasticnet'],   
        'alpha': [1e-5, 1e-4, 1e-3, 1e-2, 0.1],       
        'loss': ['hinge', 'log_loss', 'squared_hinge']    
}
scoring = {
    'precision': make_scorer(precision_score   , average='macro')
}

grid_search = GridSearchCV(SGDClassifier(random_state=42), param_grid=param_grid, cv=5, n_jobs=1, scoring=scoring, refit='precision', verbose=2 )

grid_search.fit(x_train, y_train)

print(f"Best parameters: {grid_search.best_params_}")


best_estimator = grid_search.best_estimator_


Fitting 5 folds for each of 45 candidates, totalling 225 fits
[CV] END ................alpha=1e-05, loss=hinge, penalty=l2; total time=   2.9s
[CV] END ................alpha=1e-05, loss=hinge, penalty=l2; total time=   1.4s
[CV] END ................alpha=1e-05, loss=hinge, penalty=l2; total time=   1.5s
[CV] END ................alpha=1e-05, loss=hinge, penalty=l2; total time=   1.4s
[CV] END ................alpha=1e-05, loss=hinge, penalty=l2; total time=   1.3s
[CV] END ................alpha=1e-05, loss=hinge, penalty=l1; total time=   2.6s
[CV] END ................alpha=1e-05, loss=hinge, penalty=l1; total time=   2.8s
[CV] END ................alpha=1e-05, loss=hinge, penalty=l1; total time=   2.8s
[CV] END ................alpha=1e-05, loss=hinge, penalty=l1; total time=   2.6s
[CV] END ................alpha=1e-05, loss=hinge, penalty=l1; total time=   2.8s
[CV] END ........alpha=1e-05, loss=hinge, penalty=elasticnet; total time=   2.9s
[CV] END ........alpha=1e-05, loss=hinge, penal



[CV] END ........alpha=1e-05, loss=squared_hinge, penalty=l2; total time= 1.3min




[CV] END ........alpha=1e-05, loss=squared_hinge, penalty=l2; total time= 1.3min




[CV] END ........alpha=1e-05, loss=squared_hinge, penalty=l2; total time= 1.2min




[CV] END ........alpha=1e-05, loss=squared_hinge, penalty=l2; total time= 1.2min




[CV] END ........alpha=1e-05, loss=squared_hinge, penalty=l2; total time= 1.2min




[CV] END ........alpha=1e-05, loss=squared_hinge, penalty=l1; total time= 2.8min




[CV] END ........alpha=1e-05, loss=squared_hinge, penalty=l1; total time= 2.8min




[CV] END ........alpha=1e-05, loss=squared_hinge, penalty=l1; total time= 2.8min




[CV] END ........alpha=1e-05, loss=squared_hinge, penalty=l1; total time= 2.9min




[CV] END ........alpha=1e-05, loss=squared_hinge, penalty=l1; total time= 3.2min




[CV] END alpha=1e-05, loss=squared_hinge, penalty=elasticnet; total time= 2.8min




[CV] END alpha=1e-05, loss=squared_hinge, penalty=elasticnet; total time= 2.9min




[CV] END alpha=1e-05, loss=squared_hinge, penalty=elasticnet; total time= 3.0min




[CV] END alpha=1e-05, loss=squared_hinge, penalty=elasticnet; total time= 2.8min




[CV] END alpha=1e-05, loss=squared_hinge, penalty=elasticnet; total time= 2.8min
[CV] END ...............alpha=0.0001, loss=hinge, penalty=l2; total time=   0.8s
[CV] END ...............alpha=0.0001, loss=hinge, penalty=l2; total time=   0.8s
[CV] END ...............alpha=0.0001, loss=hinge, penalty=l2; total time=   0.8s
[CV] END ...............alpha=0.0001, loss=hinge, penalty=l2; total time=   0.8s
[CV] END ...............alpha=0.0001, loss=hinge, penalty=l2; total time=   1.0s
[CV] END ...............alpha=0.0001, loss=hinge, penalty=l1; total time=   1.6s
[CV] END ...............alpha=0.0001, loss=hinge, penalty=l1; total time=   1.5s
[CV] END ...............alpha=0.0001, loss=hinge, penalty=l1; total time=   1.1s
[CV] END ...............alpha=0.0001, loss=hinge, penalty=l1; total time=   1.3s
[CV] END ...............alpha=0.0001, loss=hinge, penalty=l1; total time=   1.3s
[CV] END .......alpha=0.0001, loss=hinge, penalty=elasticnet; total time=   1.8s
[CV] END .......alpha=0.0001



[CV] END .......alpha=0.0001, loss=squared_hinge, penalty=l2; total time= 1.3min




[CV] END .......alpha=0.0001, loss=squared_hinge, penalty=l2; total time= 1.4min




[CV] END .......alpha=0.0001, loss=squared_hinge, penalty=l2; total time= 1.7min




[CV] END .......alpha=0.0001, loss=squared_hinge, penalty=l2; total time= 1.4min




[CV] END .......alpha=0.0001, loss=squared_hinge, penalty=l2; total time= 1.2min




[CV] END .......alpha=0.0001, loss=squared_hinge, penalty=l1; total time= 2.8min




[CV] END .......alpha=0.0001, loss=squared_hinge, penalty=l1; total time= 3.1min




[CV] END .......alpha=0.0001, loss=squared_hinge, penalty=l1; total time= 3.5min




[CV] END .......alpha=0.0001, loss=squared_hinge, penalty=l1; total time= 3.0min




[CV] END .......alpha=0.0001, loss=squared_hinge, penalty=l1; total time= 2.8min




[CV] END alpha=0.0001, loss=squared_hinge, penalty=elasticnet; total time= 2.7min




[CV] END alpha=0.0001, loss=squared_hinge, penalty=elasticnet; total time= 2.7min




[CV] END alpha=0.0001, loss=squared_hinge, penalty=elasticnet; total time= 2.6min




[CV] END alpha=0.0001, loss=squared_hinge, penalty=elasticnet; total time= 2.5min




[CV] END alpha=0.0001, loss=squared_hinge, penalty=elasticnet; total time= 2.6min
[CV] END ................alpha=0.001, loss=hinge, penalty=l2; total time=   0.8s
[CV] END ................alpha=0.001, loss=hinge, penalty=l2; total time=   0.7s
[CV] END ................alpha=0.001, loss=hinge, penalty=l2; total time=   0.7s
[CV] END ................alpha=0.001, loss=hinge, penalty=l2; total time=   0.8s
[CV] END ................alpha=0.001, loss=hinge, penalty=l2; total time=   0.7s
[CV] END ................alpha=0.001, loss=hinge, penalty=l1; total time=   0.9s
[CV] END ................alpha=0.001, loss=hinge, penalty=l1; total time=   0.9s
[CV] END ................alpha=0.001, loss=hinge, penalty=l1; total time=   1.2s
[CV] END ................alpha=0.001, loss=hinge, penalty=l1; total time=   1.2s
[CV] END ................alpha=0.001, loss=hinge, penalty=l1; total time=   1.0s
[CV] END ........alpha=0.001, loss=hinge, penalty=elasticnet; total time=   1.2s
[CV] END ........alpha=0.00

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


[CV] END .................alpha=0.01, loss=hinge, penalty=l1; total time=   0.7s


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


[CV] END .................alpha=0.01, loss=hinge, penalty=l1; total time=   0.8s


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


[CV] END .................alpha=0.01, loss=hinge, penalty=l1; total time=   0.8s


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


[CV] END .................alpha=0.01, loss=hinge, penalty=l1; total time=   0.8s


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


[CV] END .................alpha=0.01, loss=hinge, penalty=l1; total time=   0.9s
[CV] END .........alpha=0.01, loss=hinge, penalty=elasticnet; total time=   0.8s
[CV] END .........alpha=0.01, loss=hinge, penalty=elasticnet; total time=   0.8s
[CV] END .........alpha=0.01, loss=hinge, penalty=elasticnet; total time=   0.8s
[CV] END .........alpha=0.01, loss=hinge, penalty=elasticnet; total time=   0.9s
[CV] END .........alpha=0.01, loss=hinge, penalty=elasticnet; total time=   0.9s
[CV] END ..............alpha=0.01, loss=log_loss, penalty=l2; total time=   0.8s
[CV] END ..............alpha=0.01, loss=log_loss, penalty=l2; total time=   0.8s
[CV] END ..............alpha=0.01, loss=log_loss, penalty=l2; total time=   0.8s
[CV] END ..............alpha=0.01, loss=log_loss, penalty=l2; total time=   0.7s
[CV] END ..............alpha=0.01, loss=log_loss, penalty=l2; total time=   0.7s


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


[CV] END ..............alpha=0.01, loss=log_loss, penalty=l1; total time=   1.0s


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


[CV] END ..............alpha=0.01, loss=log_loss, penalty=l1; total time=   1.1s


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


[CV] END ..............alpha=0.01, loss=log_loss, penalty=l1; total time=   1.1s


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


[CV] END ..............alpha=0.01, loss=log_loss, penalty=l1; total time=   1.1s


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


[CV] END ..............alpha=0.01, loss=log_loss, penalty=l1; total time=   1.1s
[CV] END ......alpha=0.01, loss=log_loss, penalty=elasticnet; total time=   1.1s
[CV] END ......alpha=0.01, loss=log_loss, penalty=elasticnet; total time=   1.1s
[CV] END ......alpha=0.01, loss=log_loss, penalty=elasticnet; total time=   1.1s
[CV] END ......alpha=0.01, loss=log_loss, penalty=elasticnet; total time=   1.2s
[CV] END ......alpha=0.01, loss=log_loss, penalty=elasticnet; total time=   1.4s
[CV] END .........alpha=0.01, loss=squared_hinge, penalty=l2; total time=   0.8s
[CV] END .........alpha=0.01, loss=squared_hinge, penalty=l2; total time=   0.7s
[CV] END .........alpha=0.01, loss=squared_hinge, penalty=l2; total time=   0.8s
[CV] END .........alpha=0.01, loss=squared_hinge, penalty=l2; total time=   0.8s
[CV] END .........alpha=0.01, loss=squared_hinge, penalty=l2; total time=   0.7s
[CV] END .........alpha=0.01, loss=squared_hinge, penalty=l1; total time=   1.4s
[CV] END .........alpha=0.01

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


[CV] END ..................alpha=0.1, loss=hinge, penalty=l1; total time=   2.5s


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


[CV] END ..................alpha=0.1, loss=hinge, penalty=l1; total time=   2.8s


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


[CV] END ..................alpha=0.1, loss=hinge, penalty=l1; total time=   2.8s


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


[CV] END ..................alpha=0.1, loss=hinge, penalty=l1; total time=   2.9s


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


[CV] END ..................alpha=0.1, loss=hinge, penalty=l1; total time=   2.8s
[CV] END ..........alpha=0.1, loss=hinge, penalty=elasticnet; total time=   2.7s


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


[CV] END ..........alpha=0.1, loss=hinge, penalty=elasticnet; total time=   2.6s


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


[CV] END ..........alpha=0.1, loss=hinge, penalty=elasticnet; total time=   2.7s


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


[CV] END ..........alpha=0.1, loss=hinge, penalty=elasticnet; total time=   2.7s


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


[CV] END ..........alpha=0.1, loss=hinge, penalty=elasticnet; total time=   2.5s
[CV] END ...............alpha=0.1, loss=log_loss, penalty=l2; total time=   1.4s
[CV] END ...............alpha=0.1, loss=log_loss, penalty=l2; total time=   0.9s
[CV] END ...............alpha=0.1, loss=log_loss, penalty=l2; total time=   0.9s
[CV] END ...............alpha=0.1, loss=log_loss, penalty=l2; total time=   1.0s
[CV] END ...............alpha=0.1, loss=log_loss, penalty=l2; total time=   1.0s


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


[CV] END ...............alpha=0.1, loss=log_loss, penalty=l1; total time=   1.3s


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


[CV] END ...............alpha=0.1, loss=log_loss, penalty=l1; total time=   1.3s


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


[CV] END ...............alpha=0.1, loss=log_loss, penalty=l1; total time=   1.3s


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


[CV] END ...............alpha=0.1, loss=log_loss, penalty=l1; total time=   1.3s


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


[CV] END ...............alpha=0.1, loss=log_loss, penalty=l1; total time=   1.0s


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


[CV] END .......alpha=0.1, loss=log_loss, penalty=elasticnet; total time=   1.2s


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


[CV] END .......alpha=0.1, loss=log_loss, penalty=elasticnet; total time=   1.2s


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


[CV] END .......alpha=0.1, loss=log_loss, penalty=elasticnet; total time=   1.1s


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


[CV] END .......alpha=0.1, loss=log_loss, penalty=elasticnet; total time=   1.1s


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


[CV] END .......alpha=0.1, loss=log_loss, penalty=elasticnet; total time=   1.4s
[CV] END ..........alpha=0.1, loss=squared_hinge, penalty=l2; total time=   0.8s
[CV] END ..........alpha=0.1, loss=squared_hinge, penalty=l2; total time=   0.7s
[CV] END ..........alpha=0.1, loss=squared_hinge, penalty=l2; total time=   0.8s
[CV] END ..........alpha=0.1, loss=squared_hinge, penalty=l2; total time=   0.7s
[CV] END ..........alpha=0.1, loss=squared_hinge, penalty=l2; total time=   2.5s


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


[CV] END ..........alpha=0.1, loss=squared_hinge, penalty=l1; total time=   1.0s


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


[CV] END ..........alpha=0.1, loss=squared_hinge, penalty=l1; total time=   1.2s


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


[CV] END ..........alpha=0.1, loss=squared_hinge, penalty=l1; total time=   1.2s


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


[CV] END ..........alpha=0.1, loss=squared_hinge, penalty=l1; total time=   1.1s


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


[CV] END ..........alpha=0.1, loss=squared_hinge, penalty=l1; total time=   1.1s
[CV] END ..alpha=0.1, loss=squared_hinge, penalty=elasticnet; total time=   1.2s
[CV] END ..alpha=0.1, loss=squared_hinge, penalty=elasticnet; total time=   1.0s
[CV] END ..alpha=0.1, loss=squared_hinge, penalty=elasticnet; total time=   1.1s
[CV] END ..alpha=0.1, loss=squared_hinge, penalty=elasticnet; total time=   1.1s


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


[CV] END ..alpha=0.1, loss=squared_hinge, penalty=elasticnet; total time=   1.0s
Best parameters: {'alpha': 1e-05, 'loss': 'log_loss', 'penalty': 'l1'}


In [31]:
y_pred = best_estimator.predict(x_test)

acc = accuracy_score(y_test, y_pred)
print("Accuracy: ", acc)

conf = confusion_matrix(y_test, y_pred)
print("Confusion Matrix: ", conf)

class_rep = classification_report(y_test, y_pred, target_names=target_classes)
print("Classification Report:", class_rep)

Accuracy:  0.7583666666666666
Confusion Matrix:  [[8106 1619  275]
 [1914 6317 1769]
 [ 308 1364 8328]]
Classification Report:               precision    recall  f1-score   support

    Negative       0.78      0.81      0.80     10000
      Neutre       0.68      0.63      0.65     10000
    Positive       0.80      0.83      0.82     10000

    accuracy                           0.76     30000
   macro avg       0.76      0.76      0.76     30000
weighted avg       0.76      0.76      0.76     30000



In [32]:
# 4. Évaluer sur train et test
y_pred_train = best_estimator.predict(x_train)
y_pred_test = best_estimator.predict(x_test)

train_acc = accuracy_score(y_train, y_pred_train)
test_acc = accuracy_score(y_test, y_pred_test)

print("Accuracy sur train :", train_acc)
print("Accuracy sur test  :", test_acc)

# Vérification overfitting
if train_acc - test_acc > 0.1:  # seuil arbitraire
    print(" Possible overfitting (écart trop grand entre train et test)")
else:
    print(" Pas d'overfitting significatif")


Accuracy sur train : 0.7750397919982667
Accuracy sur test  : 0.7583666666666666
 Pas d'overfitting significatif


# IV- Choose ML Model:

“I first trained a baseline model using SGDClassifier(random_state=42) and evaluated its performance on the test set. Then, I performed hyperparameter tuning using GridSearchCV with refit on the primary metric (precision). The best model from GridSearchCV was selected automatically. The performance of the baseline model was compared visually using metrics like accuracy and precision, without additional fine-tuning.”

In [33]:
joblib.dump(best_estimator, 'SGD.pkl')

['SGD.pkl']

## ==> Test Model:

In [3]:
load_model = joblib.load('SGD.pkl')
load_vectorizer = joblib.load('../../artifacts/TfidfVectorizer.pkl')
load_label_encoder = joblib.load('../../artifacts/LabelEncoder.pkl')

# Exemple de texte pour la prédiction
example_text = ["The food was okay, nothing special but not bad either."]

# Vectorisation du texte
example_tfidf = load_vectorizer.transform(example_text)

# Prédiction
predicted_rating_encoded = load_model.predict(example_tfidf)

predicted_rating_label = load_label_encoder.inverse_transform(predicted_rating_encoded)



print(f"Predicted classe : {predicted_rating_label[0]}")

Predicted classe : Neutre


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
