Notebook 3 : Modèle simple


---


In [4]:
import sys
from pathlib import Path
import joblib

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    classification_report,
    confusion_matrix,
    roc_auc_score,
)
from sklearn.pipeline import Pipeline

import mlflow
import mlflow.sklearn

from tqdm.auto import tqdm

# Project directories, all anchored on ROOT: the resolved parent of the
# current working directory (assumed to be the notebook's folder).
ROOT = Path("..").resolve()
DATA_PATH = ROOT / "data"
OUT_PATH = ROOT / "out"
SCRIPTS_PATH = ROOT / "scripts"
OUT_PATH.mkdir(exist_ok=True)

# Make the project's helper scripts importable. The membership check keeps
# the cell idempotent: re-running it no longer appends duplicate entries.
if str(SCRIPTS_PATH) not in sys.path:
    sys.path.append(str(SCRIPTS_PATH))

# Must come after the sys.path tweak above: `preprocessing` lives in scripts/.
from preprocessing import preprocess_simple, drop_short_texts

# Registers `Series.progress_apply`, used later for the preprocessing pass.
tqdm.pandas(desc="Preprocessing")
# Show up to 200 characters per cell so tweet text is readable in previews.
pd.set_option("display.max_colwidth", 200)

In [5]:
# Point MLflow at a local file store under <ROOT>/mlruns.
# `Path.as_uri()` yields a well-formed `file:///...` URI (forward slashes,
# percent-encoded spaces) on every OS; interpolating the native path into
# an f-string produced `file:C:\...` on Windows, which is not a valid URI.
mlflow.set_tracking_uri((ROOT / "mlruns").as_uri())

# Creates the experiment on first use, otherwise re-activates the existing one.
mlflow.set_experiment("sentiment_airparadis_modele_simple")

  return FileStore(store_uri, store_uri)


<Experiment: artifact_location=('file:C:\\Users\\Gui\\Desktop\\AAA_doc\\Openclassroom school\\Python '
 'project\\proj_proj\\proj7\\mlruns/854533176105273696'), creation_time=1764515973439, experiment_id='854533176105273696', last_update_time=1764515973439, lifecycle_stage='active', name='sentiment_airparadis_modele_simple', tags={'mlflow.experimentKind': 'custom_model_development'}>

In [6]:
# The CSV has no header row (header=None), so column names are supplied here.
col_names = ["target", "ids", "date", "flag", "user", "text"]

df = pd.read_csv(
    DATA_PATH / "training.1600000.processed.noemoticon.csv",
    encoding="latin-1",
    header=None,
    names=col_names,
)

# Binary label: 1 where target == 4, 0 otherwise.
# NOTE(review): presumably 4 is the positive-sentiment code of this dataset -- confirm.
df["label"] = (df["target"] == 4).astype(int)

# Only the last expression of a cell is rendered, so the preview has to be
# displayed explicitly; a bare `df.head()` in the middle is silently dropped.
# `display` is provided by the IPython kernel.
display(df.head())
len(df)

1600000

In [7]:
# Build the cleaned-text column with a progress bar, one tweet at a time.
simple_col = "text_simple"
df[simple_col] = df["text"].progress_apply(preprocess_simple)

# Filter out rows whose cleaned text is too short (helper logs how many rows
# it removed). NOTE(review): unit of `min_len` (characters vs tokens) is
# defined in scripts/preprocessing.py -- confirm there.
df = drop_short_texts(df, simple_col, min_len=2)

# Fixed-seed sample to eyeball raw vs cleaned text side by side.
preview = df[["text", simple_col]].sample(n=5, random_state=0)
preview

Preprocessing:   0%|          | 0/1600000 [00:00<?, ?it/s]

Preprocessing: 100%|██████████| 1600000/1600000 [02:32<00:00, 10482.77it/s]


[drop_short_texts] Colonne 'text_simple': 72684 lignes supprimées sur 1600000 (4.5427%). Min len = 2


Unnamed: 0,text,text_simple
917361,"@PotFace Yeah, you're right",yeah right
803513,@airnchoo Good survival skill,good survival skill
193831,"I am dealing with a lot of jealousy in my life, which is something I wasn't really feeling before",dealing lot jealousy life something really feeling
1151078,@AgentIceCream who me? have you been talking to my wife? remember she is biased and thinks I make good looking kids,talking wife remember biased think make good looking kid
115287,@xoxoangelrose ok haha. I'm excited i haven't been to disney since i was little. I miss Florida. So much.,haha excited disney since little miss florida much


In [8]:
# Features are the cleaned tweets, targets the binary sentiment label.
X, y = df["text_simple"], df["label"]

# 80/20 split, stratified on the label so both parts keep the class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

n_train, n_test = len(X_train), len(X_test)

# Persist which row went to which split.
# NOTE: the "ids" column holds the DataFrame index labels of the rows
# (train rows first, then test rows), not the dataset's own `ids` column.
split_index = X_train.index.append(X_test.index)
split_labels = ["train"] * n_train + ["test"] * n_test
split_info = pd.DataFrame({"ids": split_index, "split": split_labels})

split_info.to_csv(OUT_PATH / "split.csv", index=False)

n_train, n_test

(1221852, 305464)

In [9]:
# Hyper-parameters of the baseline, kept as module-level names because the
# training cell logs them to MLflow.
max_features = 50000
ngram_range = (1, 2)
C = 1.0

# Step 1: TF-IDF over unigrams and bigrams, vocabulary capped at max_features.
tfidf_step = TfidfVectorizer(max_features=max_features, ngram_range=ngram_range)
# Step 2: logistic regression, using all available cores (n_jobs=-1).
logreg_step = LogisticRegression(max_iter=1000, C=C, n_jobs=-1)

pipe = Pipeline(steps=[("tfidf", tfidf_step), ("clf", logreg_step)])

In [10]:
# Parameters recorded for this run. test_size / random_state mirror the
# literals used in the train/test split cell.
run_params = {
    "model_type": "logreg_tfidf",
    "max_features": max_features,
    "ngram_range": ngram_range,
    "C": C,
    "test_size": 0.2,
    "random_state": 42,
}

with mlflow.start_run(run_name="logreg_tfidf_baseline"):
    mlflow.log_params(run_params)

    # Fit the TF-IDF + logistic-regression pipeline on the training tweets.
    pipe.fit(X_train, y_train)

    # Hard predictions for threshold metrics; column 1 of predict_proba
    # (probability of label 1) for ROC AUC.
    y_pred = pipe.predict(X_test)
    y_proba = pipe.predict_proba(X_test)[:, 1]

    acc = accuracy_score(y_test, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test,
        y_pred,
        average="binary",
    )
    roc_auc = roc_auc_score(y_test, y_proba)

    mlflow.log_metrics(
        {
            "accuracy": acc,
            "precision": precision,
            "recall": recall,
            "f1": f1,
            "roc_auc": roc_auc,
        }
    )

    # Store the fitted pipeline as a run artifact named "model".
    mlflow.sklearn.log_model(pipe, "model")

# Headline numbers, echoed in the notebook once the run has been closed.
for caption, value in (
    ("Accuracy :", acc),
    ("F1-score :", f1),
    ("ROC AUC  :", roc_auc),
):
    print(caption, value)



Accuracy : 0.7931016420920305
F1-score : 0.7951975112608963
ROC AUC  : 0.8743315727541261


In [11]:
# Per-class precision / recall / F1 on the test set, 4 decimal places.
report_text = classification_report(y_test, y_pred, digits=4)
print(report_text)

# Confusion matrix: rows are true labels, columns are predicted labels.
cm = confusion_matrix(y_test, y_pred)
cm

              precision    recall  f1-score   support

           0     0.8035    0.7788    0.7910    153528
           1     0.7832    0.8075    0.7952    151936

    accuracy                         0.7931    305464
   macro avg     0.7934    0.7932    0.7931    305464
weighted avg     0.7934    0.7931    0.7931    305464



array([[119569,  33959],
       [ 29241, 122695]])

In [12]:
# Persist the fitted pipeline under <ROOT>/models.
# Anchored on ROOT (resolved once at notebook start) like DATA_PATH and
# OUT_PATH, instead of a cwd-relative "..", so the target directory does not
# depend on the working directory at the time this cell runs.
MODELS_PATH = ROOT / "models"
MODELS_PATH.mkdir(exist_ok=True)

MODEL_PATH = MODELS_PATH / "tfidf_logreg.joblib"

# joblib serialises the whole Pipeline (vectorizer + classifier) in one file.
joblib.dump(pipe, MODEL_PATH)
print(f"Modèle sauvegardé dans : {MODEL_PATH}")

Modèle sauvegardé dans : ..\models\tfidf_logreg.joblib
