In [1]:
import ssl
import pandas as pd
import numpy as np
from sklearn import set_config
from sklearn.pipeline import make_pipeline, FunctionTransformer
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split

set_config(transform_output = "pandas")

ssl._create_default_https_context = ssl._create_unverified_context
df = pd.read_csv('https://raw.githubusercontent.com/ElieLECAS/ML_avance/fil_rouge/activite_finale/data/census_us.csv')
df.drop(['fnlwgt'], axis=1, inplace=True)
display(df.head(5))

Unnamed: 0,age,workclass,education,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,income
0,39,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,<=50K
1,50,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,<=50K
2,38,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,<=50K
3,53,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,<=50K
4,37,Private,Masters,Married-civ-spouse,Exec-managerial,Wife,White,Female,0.0,0.0,40.0,<=50K


In [2]:
print(f'nb dupe {df.loc[df.duplicated()].shape[0]}')
df = df.drop_duplicates(ignore_index=True)
df.describe()

nb dupe 6340


Unnamed: 0,age,capital_gain,capital_loss,hours_per_week
count,37492.0,37492.0,37492.0,37492.0
mean,39.644618,1257.112931,102.925024,40.70255
std,13.878898,7993.566016,434.941795,13.013938
min,17.0,0.0,0.0,1.0
25%,29.0,0.0,0.0,38.0
50%,38.0,0.0,0.0,40.0
75%,49.0,0.0,0.0,45.0
max,90.0,99999.0,4356.0,99.0


# Modélisation

## Split

In [3]:
X = df.drop(columns=['income'])
y = df['income']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, train_size=.85, shuffle=True)

## Encodage
### Income (target)

In [4]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_test  = le.fit_transform(y_test)
y_train = le.fit_transform(y_train)

### Education

In [5]:
from sklearn.preprocessing import OrdinalEncoder

df.education.value_counts()
# Créer un tableau de données contenant les niveaux d'éducation
data = [[
    "Preschool",
    "1st-4th grade",
    "5th-6th grade",
    "7th-8th grade",
    "9th grade",
    "10th grade",
    "11th grade",
    "12th grade",
    "Assoc-voc",
    "Assoc-acdm",
    "Some-college",
    "Bachelors",
    "Masters",
    "Doctorate",
    "Prof-school"
]]

education_ordinal_encoder_column = ['education']

# Créer un ordinalencoder
education_ordinal_encoder = OrdinalEncoder(categories=data, handle_unknown='use_encoded_value', unknown_value=-1)

### OHE

In [6]:
from sklearn.preprocessing import OneHotEncoder

binary_columns = ['sex']
nominal_columns = ['workclass', 'marital_status', 'occupation', 'relationship', 'race']

binary_encoder = OneHotEncoder(drop='if_binary', handle_unknown='ignore', sparse_output=False)
nominal_encoder = OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)

### Création du pipeline

In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler

def create_transformers(add_transformers = None):
    init_transformers = [
        ('education', education_ordinal_encoder, education_ordinal_encoder_column),
        ('binary var', binary_encoder, binary_columns),
        ('nominal var', nominal_encoder, nominal_columns),
    ]
    if add_transformers is not None:
        init_transformers = init_transformers + add_transformers
            
    return ColumnTransformer(
        transformers=init_transformers,
        remainder='passthrough'
    )

pipe = make_pipeline(create_transformers(), MinMaxScaler())
display(pipe.fit(X_train, y_train))
X_transform = pipe.fit_transform(X_train, y_train)
display(X_transform.head())



Unnamed: 0,education__education,binary var__sex_Male,nominal var__workclass_Federal-gov,nominal var__workclass_Local-gov,nominal var__workclass_Never-worked,nominal var__workclass_Private,nominal var__workclass_Self-emp-inc,nominal var__workclass_Self-emp-not-inc,nominal var__workclass_State-gov,nominal var__workclass_Without-pay,...,nominal var__relationship_Unmarried,nominal var__relationship_Wife,nominal var__race_Asian-Pac-Islander,nominal var__race_Black,nominal var__race_Other,nominal var__race_White,remainder__age,remainder__capital_gain,remainder__capital_loss,remainder__hours_per_week
36449,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.041096,0.0,0.0,0.397959
32827,0.866667,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.178082,1.0,0.0,0.397959
34728,0.866667,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.356164,0.0,0.0,0.653061
25226,0.733333,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.287671,0.0,0.0,0.55102
17192,0.6,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.191781,0.0,0.0,0.397959


### Test de performance

In [17]:
import time
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
import warnings
warnings.simplefilter(action='ignore', category=Warning)

pipe = make_pipeline(
            create_transformers(),
            MinMaxScaler()).fit_transform(X_train, y_train)
print('Nb colonnes: ', len(pipe.columns))

start = time.time()
pipe = make_pipeline(
            create_transformers(),
            MinMaxScaler(),
            GradientBoostingClassifier(max_depth=5, random_state=42))

pipe.fit(X_train, y_train)
stop = time.time()
print(f"Training time: {round(stop - start, 2)}s")

y_pred = pipe.predict(X_test)

print(classification_report(y_test, y_pred))

Nb colonnes:  43
Training time: 3.03s
              precision    recall  f1-score   support

           0       0.89      0.94      0.91      4195
           1       0.80      0.65      0.71      1429

    accuracy                           0.87      5624
   macro avg       0.84      0.80      0.81      5624
weighted avg       0.86      0.87      0.86      5624



In [13]:
from sklearn.feature_selection import SelectKBest, f_classif, chi2, VarianceThreshold
variance_treshold = .0001
k = 25
kbest_test = chi2

pipe = make_pipeline(
            create_transformers(),
            MinMaxScaler(),
            SelectKBest(kbest_test, k = k),
            VarianceThreshold(variance_treshold),
            ).fit_transform(X_train, y_train)
print('Nb colonnes: ', len(pipe.columns))

start = time.time()
pipe = make_pipeline(
            create_transformers(),
            MinMaxScaler(),
            SelectKBest(kbest_test, k = k),
            VarianceThreshold(variance_treshold),
            GradientBoostingClassifier(max_depth=5, random_state=42))

pipe.fit(X_train, y_train)
stop = time.time()
print(f"Training time: {round(stop - start, 2)}s")

y_pred = pipe.predict(X_test)

print(classification_report(y_test, y_pred))

Nb colonnes:  25
Training time: 2.51s
              precision    recall  f1-score   support

           0       0.88      0.95      0.91      4195
           1       0.80      0.63      0.71      1429

    accuracy                           0.87      5624
   macro avg       0.84      0.79      0.81      5624
weighted avg       0.86      0.87      0.86      5624

