# Tutorial: Hyperparameter Tuning and Feature Engineering for a Classification Model

This tutorial demonstrates the `tuning_hyperparams` and `feature_engineering` modules of mlutils. 
The workflow is to import the required functions and then apply each one for its purpose.

In [None]:
!pip install mlutils

In [1]:
import pandas as pd

from mlutils.feature_engineering import feature_selection_filter
from mlutils.feature_engineering import feature_selection_wrapper
from mlutils.feature_engineering import feature_selection_embedded

from mlutils.tuning_hyperparams import tuning_hyperparams

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [2]:

# Column labels are supplied explicitly through `names`; 'class' is the last one.
column_names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data_url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.csv'

# Read the remote CSV into a DataFrame and preview the first rows.
df = pd.read_csv(data_url, names=column_names)
df.head()

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


#### Applying each feature selection method from feature_engineering

In [3]:
# Ask the filter-based selector for 4 features, passing 'class' as the label column.
# The call's return value is displayed as the cell output.
feature_selection_filter(
    df,
    'class',
    num_feats=4,
)

['preg', 'plas', 'pedi', 'age']

In [4]:
# Ask the wrapper-based selector for 4 features, passing 'class' as the label column.
# `step=10` is forwarded as given; its exact meaning is defined by mlutils (not shown here).
feature_selection_wrapper(
    df,
    'class',
    num_feats=4,
    step=10,
)

['preg', 'plas', 'mass', 'pedi']

In [5]:
# Ask the embedded selector for 4 features, passing 'class' as the label column.
# `n_estimators=50` is forwarded as given; presumably the size of the underlying
# ensemble -- confirm against the mlutils documentation.
feature_selection_embedded(
    df,
    'class',
    num_feats=4,
    n_estimators=50,
)

['plas', 'mass', 'pedi', 'age']

#### Applying hyperparameter tuning to a DataFrame restricted to the relevant columns

In [6]:
# Keep the four features returned by the embedded selection above, plus the
# 'class' label column. Note that this rebinds `df` to the reduced frame.
relevant_columns = ['plas', 'mass', 'pedi', 'age', 'class']
df = df.loc[:, relevant_columns]
df.head()

Unnamed: 0,plas,mass,pedi,age,class
0,148,33.6,0.627,50,1
1,85,26.6,0.351,31,0
2,183,23.3,0.672,32,1
3,89,28.1,0.167,21,0
4,137,43.1,2.288,33,1


In [7]:
# Search space: one dict per hyperparameter, each declared as an "Integer"
# with "low" and "high" bounds.
search_space = [
    {"name": "min_samples_leaf", "type": "Integer", "low": 50, "high": 75},
    {"name": "max_depth", "type": "Integer", "low": 12, "high": 24},
]

# Tune a RandomForestClassifier on `df` with 'class' as the target, scoring
# with accuracy_score (to be maximized) over 20 trials. The call's return
# value is displayed as the cell output.
tuning_hyperparams(
    df=df,
    target='class',
    parameters=search_space,
    algorithm=RandomForestClassifier,
    metric=accuracy_score,
    scoring_option="maximize",
    n_trials=20,
)

[32m[I 2021-11-30 16:39:28,379][0m A new study created in memory with name: no-name-6008e356-3c80-430d-b814-1819504f2e4c[0m
[32m[I 2021-11-30 16:39:29,396][0m Trial 0 finished with value: 0.7681305536568694 and parameters: {'min_samples_leaf': 59, 'max_depth': 24}. Best is trial 0 with value: 0.7681305536568694.[0m
[32m[I 2021-11-30 16:39:30,374][0m Trial 1 finished with value: 0.775974025974026 and parameters: {'min_samples_leaf': 69, 'max_depth': 19}. Best is trial 1 with value: 0.775974025974026.[0m
[32m[I 2021-11-30 16:39:31,363][0m Trial 2 finished with value: 0.7654818865345182 and parameters: {'min_samples_leaf': 54, 'max_depth': 14}. Best is trial 1 with value: 0.775974025974026.[0m
[32m[I 2021-11-30 16:39:32,317][0m Trial 3 finished with value: 0.7655160628844839 and parameters: {'min_samples_leaf': 51, 'max_depth': 23}. Best is trial 1 with value: 0.775974025974026.[0m
[32m[I 2021-11-30 16:39:33,234][0m Trial 4 finished with value: 0.7720779220779221 and para

{'min_samples_leaf': 65, 'max_depth': 13}