Experimentation phase to explore preprocessing type, compare model and iterate on features selection. To simplify, binary classification

In [44]:
#import libraries
import pandas as pd

from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from category_encoders import TargetEncoder

from sklearn.preprocessing import RobustScaler, MinMaxScaler

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import cross_val_score,train_test_split, GridSearchCV

In [2]:
#import dataset
df = pd.read_csv('clean_training_set.csv', index_col='id')

In [3]:
df.drop(['Unnamed: 0'], axis=1, inplace=True)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 59400 entries, 69572 to 26348
Data columns (total 21 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   amount_tsh         59400 non-null  float64
 1   gps_height         59400 non-null  float64
 2   longitude          59400 non-null  float64
 3   latitude           59400 non-null  float64
 4   basin              59400 non-null  object 
 5   region             59400 non-null  object 
 6   district_code      59400 non-null  int64  
 7   ward               59400 non-null  object 
 8   population         59400 non-null  float64
 9   scheme_management  59400 non-null  object 
 10  extraction_type    59400 non-null  object 
 11  management         59400 non-null  object 
 12  payment_type       59400 non-null  object 
 13  water_quality      59400 non-null  object 
 14  quantity           59400 non-null  object 
 15  source             59400 non-null  object 
 16  waterpoint_type    5940

In [5]:
#drop repairs labels
df_binary = df[df['status_group'] != 'functional needs repair']

In [6]:
#encode labels (binaire)
df_binary['status_group'] = df_binary['status_group'].map({'functional': 1,'non functional': 0})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_binary['status_group'] = df_binary['status_group'].map({'functional': 1,'non functional': 0})


In [7]:
df_binary['status_group'].unique()

array([1, 0])

In [8]:
features = df_binary.columns.drop(['status_group'])

In [9]:
features = features.to_list()

In [10]:
X = df_binary[features]

In [11]:
y = df_binary['status_group']

# Compare Preprocessing method

Robust Scaler & One hot encoder

In [12]:
#encode categorical features
categorical = X.select_dtypes('object').columns

In [13]:
ohe = OneHotEncoder(handle_unknown='ignore')

In [14]:
#scaling
num_col = X.select_dtypes(['int','float']).columns
rs = RobustScaler()

In [17]:
#make pipeline
num_transformer = make_pipeline(rs)
cat_transformer = make_pipeline(ohe)

In [18]:
preprocessor = ColumnTransformer(
      transformers=[('num', num_transformer, num_col),
                    ('cat', cat_transformer, categorical)])

In [19]:
lr = LogisticRegression(class_weight = 'balanced', solver = 'lbfgs',max_iter=500)

In [20]:
pipe = make_pipeline(preprocessor,lr)

In [21]:
cv = cross_val_score(pipe, X, y, cv=5,scoring='roc_auc')

In [22]:
cv.mean()

0.8919496205157884

TargetEncoder & MinMax Scaler

In [24]:
scaler = MinMaxScaler()
te = TargetEncoder(cols=categorical)

In [25]:
#make pipeline
num_transformer = make_pipeline(scaler)
cat_transformer = make_pipeline(te)

In [26]:
preprocessor = ColumnTransformer(
      transformers=[('num', num_transformer, num_col),
                    ('cat', cat_transformer, categorical)])

In [27]:
pipe = make_pipeline(preprocessor,lr)

In [28]:
cv = cross_val_score(pipe, X, y, cv=5,scoring='roc_auc')

In [29]:
cv.mean()

0.8586442066549441

TargetEncoder & Robust Scaler

In [30]:
scaler = RobustScaler()
te = TargetEncoder(cols=categorical)
#make pipeline
num_transformer = make_pipeline(scaler)
cat_transformer = make_pipeline(te)
preprocessor = ColumnTransformer(
      transformers=[('num', num_transformer, num_col),
                    ('cat', cat_transformer, categorical)])
pipe = make_pipeline(preprocessor,lr)

In [31]:
cv = cross_val_score(pipe, X, y, cv=5,scoring='roc_auc')
cv.mean()

0.8587321832375583

MinMax & One Hot

In [36]:
scaler = MinMaxScaler()
ohe = OneHotEncoder(handle_unknown='ignore')
#make pipeline
num_transformer = make_pipeline(scaler)
cat_transformer = make_pipeline(ohe)
preprocessor = ColumnTransformer(
      transformers=[('num', num_transformer, num_col),
                    ('cat', cat_transformer, categorical)])
pipe = make_pipeline(preprocessor,lr)

In [37]:
cv = cross_val_score(pipe, X, y, cv=5,scoring='roc_auc')
cv.mean()

0.8917535283696519

--> Best perform with Ohe + Robust Scaler

# Compare Models

Decision Tree Classifier

In [38]:
preprocessor

In [40]:
dt = DecisionTreeClassifier(class_weight='balanced')

pipe = make_pipeline(preprocessor,dt)

In [41]:
cv = cross_val_score(pipe, X, y, cv=5, scoring='roc_auc')

In [42]:
cv.mean()

0.8186097404944983

Random forest

In [45]:
rf = RandomForestClassifier()
pipe = make_pipeline(preprocessor,rf)

In [46]:
cv = cross_val_score(pipe, X, y, cv=5, scoring='roc_auc')
cv.mean()

0.9235757716628952

Best model is Random Forest

# Finetuning

In [54]:
rf.estimator_params

('criterion',
 'max_depth',
 'min_samples_split',
 'min_samples_leaf',
 'min_weight_fraction_leaf',
 'max_features',
 'max_leaf_nodes',
 'min_impurity_decrease',
 'random_state',
 'ccp_alpha',
 'monotonic_cst')

In [65]:
pipe.get_params()

{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(transformers=[('num',
                                    Pipeline(steps=[('minmaxscaler',
                                                     MinMaxScaler())]),
                                    Index(['amount_tsh', 'gps_height', 'longitude', 'latitude', 'district_code',
          'population', 'operation_time'],
         dtype='object')),
                                   ('cat',
                                    Pipeline(steps=[('onehotencoder',
                                                     OneHotEncoder(handle_unknown='ignore'))]),
                                    Index(['basin', 'region', 'ward', 'scheme_management', 'extraction_type',
          'management', 'payment_type', 'water_quality', 'quantity', 'source',
          'waterpoint_type', 'top_installer', 'top_funders'],
         dtype='object'))])),
  ('randomforestclassifier', RandomForestClassifier())],
 'transform_input': None,
 'verbose'

In [76]:
param_grid = {'randomforestclassifier__n_estimators': [10, 50, 100],  
    'randomforestclassifier__max_depth': [None, 10, 20],  
    'randomforestclassifier__min_samples_split': [2, 5, 10]}

In [77]:
grid_search = GridSearchCV(pipe,param_grid,n_jobs=1,cv=5,scoring='roc_auc')

In [78]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [79]:
grid_search.fit(X_train, y_train)

In [80]:
grid_search.best_score_

0.923876927919823

In [81]:
grid_search.best_params_

{'randomforestclassifier__max_depth': None,
 'randomforestclassifier__min_samples_split': 5,
 'randomforestclassifier__n_estimators': 100}

Selection preprocessing + estimator + hyperparameters
- Ohe
- RobustScaler
- Random Forest
- Params: max_depth': None,min_samples_split': 5,n_estimators': 100