In [3]:
import os
import mlflow
import numpy as np
import pandas as pd
import mlflow.sklearn
from mlflow.tracking import MlflowClient
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report


# Project root = parent directory of the current working directory
# (the notebook is expected to run from "notebooks").
PROJECT_ROOT = str(Path.cwd().parent)

# Pin the MLflow backend store to the SQLite DB at the project root,
# addressed through an absolute path.
mlflow_db_file = Path(PROJECT_ROOT, "mlflow.db")
mlflow.set_tracking_uri(f"sqlite:///{mlflow_db_file}")

# Echo both values so a wrong working directory is spotted immediately.
print("cwd:", os.getcwd())
print("tracking_uri:", mlflow.get_tracking_uri())


cwd: /home/aurelien/formation_openclassrooms/projet_7/tweet_reaction_prediction/notebooks
tracking_uri: sqlite:////home/aurelien/formation_openclassrooms/projet_7/tweet_reaction_prediction/mlflow.db


In [4]:
EXPERIMENT_NAME = "sentiment_tweets_clean"
# Metric used to rank the runs of the experiment (highest value first).
RANKING_METRIC = "val_auc"

# Anchor data/artifact locations on PROJECT_ROOT instead of the current working
# directory: the notebook runs from "notebooks/", so bare relative paths would
# point to (and create) "notebooks/artifacts" rather than the project-level
# "artifacts" directory that the later cells read from via "../artifacts".
VAL_PATH = Path(PROJECT_ROOT) / "data" / "processed" / "val.csv"
OUT_DIR = Path(PROJECT_ROOT) / "artifacts"
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Look the experiment up by name on the tracking store configured earlier.
client = MlflowClient()
exp = client.get_experiment_by_name(EXPERIMENT_NAME)
if exp is None:
    raise RuntimeError(f"MLflow experiment '{EXPERIMENT_NAME}' not found.")

# Fetch at most 10 runs, ordered by the ranking metric in descending order.
runs = client.search_runs(
    experiment_ids=[exp.experiment_id],
    order_by=[f"metrics.{RANKING_METRIC} DESC"],
    max_results=10,
)
if not runs:
    raise RuntimeError(f"No runs found in experiment '{EXPERIMENT_NAME}'.")

# The first result is the best run according to RANKING_METRIC.
best_run = runs[0]
# A run that never logged the ranking metric cannot be called "best":
# fail loudly instead of silently selecting an arbitrary run.
if RANKING_METRIC not in best_run.data.metrics:
    raise RuntimeError(
        f"No run in experiment '{EXPERIMENT_NAME}' logged metric '{RANKING_METRIC}'."
    )

2026/01/23 08:46:48 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.schemas
2026/01/23 08:46:48 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.tables
2026/01/23 08:46:48 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.types
2026/01/23 08:46:48 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.constraints
2026/01/23 08:46:48 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.defaults
2026/01/23 08:46:48 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.comments
2026/01/23 08:46:48 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2026/01/23 08:46:48 INFO mlflow.store.db.utils: Updating database tables
2026/01/23 08:46:48 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2026/01/23 08:46:48 INFO alembic.runtime.migration: Will assume non-transactional DDL.
2026/01/23 08:46:48 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2026/01/23 08:46:48 INFO alembic.runtime

In [5]:
# Display the top-ranked run (first result of the val_auc-descending search).
best_run


<Run: data=<RunData: metrics={'fit_time_sec': 25.19512441199913,
 'test_accuracy': 0.818615625,
 'test_auc': 0.8984187907421876,
 'test_f1': 0.8165964667258601,
 'test_precision': 0.8257890936048109,
 'test_predict_time_sec': 4.021737162001955,
 'test_recall': 0.80760625,
 'val_accuracy': 0.81890234375,
 'val_accuracy_at_0.5': 0.81890234375,
 'val_auc': 0.8982090605773927,
 'val_f1': 0.8171300770356462,
 'val_f1_at_0.5': 0.8171300770356462,
 'val_precision': 0.8252057457436722,
 'val_precision_at_0.5': 0.8252057457436722,
 'val_predict_time_sec': 3.415660733997356,
 'val_recall': 0.8092109375,
 'val_recall_at_0.5': 0.8092109375}, params={'classifier': 'logreg',
 'dataset': 'sentiment140_processed_v1',
 'max_features': '100000',
 'max_iter': '1000',
 'min_df': '2',
 'ngram_range': '1-2',
 'split': 'train/val/test',
 'threshold': '0.5',
 'vectorizer': 'tfidf'}, tags={'mlflow.runName': 'baseline_tfidf_logreg',
 'mlflow.source.git.commit': '24917a2a4a4c57d0be46c6fe33a5db772bedc5cb',
 'mlfl

In [6]:

def run_to_row(
    r,
    metric_keys=("accuracy", "f1", "f1_score", "val_accuracy", "val_f1", "val_loss", "loss"),
    param_keys=(
        "use_glove",
        "embedding_type",
        "glove",
        "tokenizer_type",
        "trainable_embedding",
        "max_len",
        "vocab_size",
    ),
):
    """Flatten one MLflow run into a dict suitable as a DataFrame row.

    Args:
        r: Run-like object exposing ``info.run_id``, ``info.start_time``,
            ``data.tags``, ``data.metrics`` and ``data.params``.
        metric_keys: Names of the metrics to copy into the row. A metric the
            run did not log is skipped (no key is created for it).
        param_keys: Names of the params to copy into the row, with the same
            skip-if-absent rule.

    Returns:
        dict with ``run_id``, ``run_name`` (the ``mlflow.runName`` tag, or ""
        when missing), ``start_time``, then the selected metrics followed by
        the selected params, in the order the keys were given.
    """
    row = {
        "run_id": r.info.run_id,
        "run_name": (r.data.tags or {}).get("mlflow.runName", ""),
        "start_time": r.info.start_time,
    }
    # Treat missing containers like empty ones, as is done for tags above.
    logged_metrics = r.data.metrics or {}
    logged_params = r.data.params or {}
    # Metrics first, params second: on a name clash the param value wins,
    # which matches the insertion order of the two original loops.
    row.update({key: logged_metrics[key] for key in metric_keys if key in logged_metrics})
    row.update({key: logged_params[key] for key in param_keys if key in logged_params})
    return row

# One summary row per retrieved run, gathered into a single comparison table.
run_rows = [run_to_row(run) for run in runs]
df_runs = pd.DataFrame(run_rows)
df_runs.head(20)


Unnamed: 0,run_id,run_name,start_time,val_accuracy,val_f1
0,40c53c9d38274903a1f77e6bf0213fac,baseline_tfidf_logreg,1769085228657,0.818902,0.81713


In [7]:
# Validation error-analysis exports, read from ../artifacts (relative to the
# notebook's working directory).
false_negatives_csv = "../artifacts/val_errors_false_negatives.csv"
false_positives_csv = "../artifacts/val_errors_false_positives.csv"
threshold_sweep_csv = "../artifacts/val_threshold_sweep.csv"

val_fn = pd.read_csv(false_negatives_csv)
val_fp = pd.read_csv(false_positives_csv)
val_threshold_sweep = pd.read_csv(threshold_sweep_csv)

In [10]:
# Ten false-negative rows with the lowest "proba_negative" score.
(
    val_fn
    .sort_values("proba_negative")
    .iloc[:10]
)

Unnamed: 0,text,target,proba_negative,pred_05
0,thanks federico,1,0.000697,0
1,thanks,1,0.000697,0
2,thanks,1,0.000697,0
3,thanks kacie,1,0.000697,0
4,thanks,1,0.000697,0
5,thanks,1,0.000697,0
6,thanks,1,0.000697,0
7,awesome,1,0.001325,0
8,hello renesme ur cute,1,0.001413,0
9,haha thanx,1,0.001821,0


In [11]:
# Ten false-positive rows with the highest "proba_negative" score.
(
    val_fp
    .sort_values("proba_negative", ascending=False)
    .iloc[:10]
)

Unnamed: 0,text,target,proba_negative,pred_05
0,i miss my stephie poo,0,0.99977,1
1,i wish,0,0.999477,1
2,wish hbd,0,0.999477,1
3,wish hbd,0,0.999477,1
4,i wish,0,0.999477,1
5,awww miley why so sad ur making me sad now i l...,0,0.999344,1
6,kristennnnnaaaaaaaaaa stop itaaaaaaa miss youu...,0,0.999309,1
7,lost outofthecloset,0,0.999086,1
8,its almost supper here im not hungery not feel...,0,0.99906,1
9,i feel like crying but i don t wanna cry get it,0,0.998677,1
