In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score

# Baseline experiment: train and evaluate a random forest on the cleaned
# income dataset. All names below are notebook globals that later cells may
# read (e.g. y_test / y_pred), so they are kept unchanged.

# ========= 1. Load data =========
df = pd.read_csv("income_cleaned.csv")

# ========= 2. Target = income =========
TARGET_COL = "income"

X = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL]

# ========= 3. Split numeric / categorical features =========
# Partition on "is it numeric?" so every feature lands in exactly one group.
# Matching exact dtype names ("int64"/"float64"/"object") would leave a column
# of any other dtype (e.g. int32, bool, category) in neither list, and
# ColumnTransformer drops unlisted columns by default (remainder="drop"),
# silently removing that feature from the model.
numeric_cols = X.select_dtypes(include="number").columns.tolist()
cat_cols = X.select_dtypes(exclude="number").columns.tolist()
assert len(numeric_cols) + len(cat_cols) == X.shape[1], "feature left unassigned"

print("Numériques :", numeric_cols)
print("Catégorielles :", cat_cols)

# ========= 4. Preprocessing =========
# Numeric features are standardised; categorical features are one-hot encoded.
# handle_unknown="ignore" lets the encoder accept categories at predict time
# that were not present in the training split.
# NOTE(review): random forests are generally insensitive to feature scaling;
# the scaler is kept so the pipeline behaves exactly as before.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ]
)

# ========= 5. Model =========
# Fixed random_state for reproducible forests; n_jobs=-1 uses all CPU cores.
model = RandomForestClassifier(
    n_estimators=300,
    random_state=42,
    n_jobs=-1
)

# ========= 6. Full pipeline =========
# Preprocessing lives inside the pipeline, so it is fitted on the training
# split only when clf.fit is called.
clf = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", model)
])

# ========= 7. Train/test split =========
# 80/20 split, stratified on the target so both splits keep the class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# ========= 8. Training =========
print("\nEntraînement...")
clf.fit(X_train, y_train)

# ========= 9. Evaluation =========
y_pred = clf.predict(X_test)

print("\nAccuracy :", accuracy_score(y_test, y_pred))
print("\nClassification report :\n")
print(classification_report(y_test, y_pred))

# ========= 10. Example predictions =========
# Show the first three held-out rows together with their predicted labels.
print("\nExemple de prédictions :")
sample = X_test.iloc[:3]
print(sample)
print("\nPrédictions :", clf.predict(sample))


Numériques : ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
Catégorielles : ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']

Entraînement...

Accuracy : 0.8435599778883361

Classification report :

              precision    recall  f1-score   support

           0       0.88      0.92      0.90      6803
           1       0.71      0.62      0.66      2242

    accuracy                           0.84      9045
   macro avg       0.80      0.77      0.78      9045
weighted avg       0.84      0.84      0.84      9045


Exemple de prédictions :
       age  workclass     education  education-num      marital-status  \
36919   27    Private  Some-college             10       Never-married   
17947   43  Local-gov   Prof-school             15  Married-civ-spouse   
3173    49  Local-gov     Bachelors             13            Divorced   

           occupation relationship   race     sex  capital-gain 

In [5]:
# Second experiment: same pipeline as the baseline cell, trained on the
# feature-augmented dataset ("income_boosted.csv").
# NOTE: this cell rebinds the same global names as the baseline cell (df, X, y,
# clf, X_test, y_test, y_pred, ...). Any later cell reading those names sees
# the values from whichever training cell was executed last.

# ========= 1. Load data =========
df = pd.read_csv("income_boosted.csv")

# ========= 2. Target = income =========
TARGET_COL = "income"

X = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL]

# ========= 3. Split numeric / categorical features =========
# Partition on "is it numeric?" so every feature lands in exactly one group.
# Matching exact dtype names ("int64"/"float64"/"object") would leave a column
# of any other dtype (e.g. int32, bool, category) in neither list, and
# ColumnTransformer drops unlisted columns by default (remainder="drop"),
# silently removing that feature from the model.
numeric_cols = X.select_dtypes(include="number").columns.tolist()
cat_cols = X.select_dtypes(exclude="number").columns.tolist()
assert len(numeric_cols) + len(cat_cols) == X.shape[1], "feature left unassigned"

print("Numériques :", numeric_cols)
print("Catégorielles :", cat_cols)

# ========= 4. Preprocessing =========
# Numeric features are standardised; categorical features are one-hot encoded.
# handle_unknown="ignore" lets the encoder accept categories at predict time
# that were not present in the training split.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ]
)

# ========= 5. Model =========
# Same hyper-parameters and seed as the baseline so the two runs differ only
# in their input data.
model = RandomForestClassifier(
    n_estimators=300,
    random_state=42,
    n_jobs=-1
)

# ========= 6. Full pipeline =========
clf = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", model)
])

# ========= 7. Train/test split =========
# 80/20 split, stratified on the target, with the same seed as the baseline.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# ========= 8. Training =========
print("\nEntraînement...")
clf.fit(X_train, y_train)

# ========= 9. Evaluation =========
y_pred = clf.predict(X_test)

print("\nAccuracy :", accuracy_score(y_test, y_pred))
print("\nClassification report :\n")
print(classification_report(y_test, y_pred))

# ========= 10. Example predictions =========
# Show the first three held-out rows together with their predicted labels.
print("\nExemple de prédictions :")
sample = X_test.iloc[:3]
print(sample)
print("\nPrédictions :", clf.predict(sample))


Numériques : ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week', 'estimation_carriere_age', 'is_married']
Catégorielles : ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']

Entraînement...

Accuracy : 0.8468767274737424

Classification report :

              precision    recall  f1-score   support

           0       0.88      0.92      0.90      6803
           1       0.72      0.62      0.67      2242

    accuracy                           0.85      9045
   macro avg       0.80      0.77      0.78      9045
weighted avg       0.84      0.85      0.84      9045


Exemple de prédictions :
       age  workclass     education  education-num      marital-status  \
36919   27    Private  Some-college             10       Never-married   
17947   43  Local-gov   Prof-school             15  Married-civ-spouse   
3173    49  Local-gov     Bachelors             13            Divorced   

           occupation r

In [6]:
# Support-weighted average of the per-class F1 scores.
# f1_score is imported with the other sklearn.metrics names in the notebook's
# top import cell.
# y_test / y_pred are the globals left behind by the most recently executed
# training cell, so this number describes that run only.
f1_global = f1_score(y_test, y_pred, average="weighted")
print("\nF1-score global (weighted) :", round(f1_global, 4))


F1-score global (weighted) : 0.8426
