# Tunability testing

In [19]:
import pandas as pd

random_state = pd.read_csv("../../data/vars.csv")['random_state'].iloc[0]
n_iter = pd.read_csv("../../data/vars.csv")['n_iter'].iloc[0]

In [4]:
df_college = pd.read_csv("../../data/processed/college.csv")
df_credit = pd.read_csv("../../data/processed/credit.csv")
df_diabetes = pd.read_csv("../../data/processed/diabetes.csv")
df_penguins = pd.read_csv("../../data/processed/penguins.csv")

In [5]:
print(df_college.info())
print()
print(df_credit.info())
print()
print(df_diabetes.info())
print()
print(df_penguins.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   type_school            1000 non-null   object 
 1   school_accreditation   1000 non-null   object 
 2   gender                 1000 non-null   object 
 3   interest               1000 non-null   object 
 4   residence              1000 non-null   object 
 5   parent_age             1000 non-null   int64  
 6   parent_salary          1000 non-null   int64  
 7   house_area             1000 non-null   float64
 8   average_grades         1000 non-null   float64
 9   parent_was_in_college  1000 non-null   bool   
 10  will_go_to_college     1000 non-null   int64  
dtypes: bool(1), float64(2), int64(3), object(5)
memory usage: 79.2+ KB
None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype 

# Credit dataset

In [6]:
X = df_credit.iloc[:, 0:-1]
y = df_credit.iloc[:, -1]

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import RandomizedSearchCV
import numpy as np


num_pipeline = Pipeline(steps=[
    ('num_impute', SimpleImputer(strategy='mean')),
    ('scale', MinMaxScaler())
])

cat_pipeline = Pipeline(steps=[
    ('cat_impute', SimpleImputer(strategy='most_frequent')),
    ('one-hot', OneHotEncoder(handle_unknown='ignore'))
])

col_trans = ColumnTransformer([
    ('num_pipeline', num_pipeline, make_column_selector(dtype_include = np.number)),
    ('cat_pipeline', cat_pipeline, make_column_selector(dtype_include = np.object_))
])

model_pipeline = Pipeline(steps=[
    ('preprocessing', col_trans),
    ('model', DecisionTreeClassifier())
])

tree_params = {
    'model__criterion' : ['gini', 'entropy', 'log_loss'],
    'model__max_depth': [None] + list(range(2, 20)),
    'model__min_samples_split': range(2, 21),                  
    'model__min_samples_leaf': range(1, 21),                         
    'model__max_leaf_nodes': [None] + list(range(10, 100, 10))
}

In [8]:
random_search_model = RandomizedSearchCV(estimator = model_pipeline,
                                   param_distributions = tree_params,
                                   n_iter = n_iter,
                                   cv = 5,
                                   scoring = 'f1',
                                   random_state = random_state)

In [9]:
random_search_model.fit(X, y)

In [10]:
best_f1_credit = random_search_model.best_score_
best_f1_credit

0.8648002281347171

In [11]:
tested_params = random_search_model.cv_results_['params']

# Data frame with params, results and mean result

In [12]:
df_decision_tree = pd.DataFrame(tested_params)
df_decision_tree['credit_f1'] = random_search_model.cv_results_['mean_test_score']
df_decision_tree

Unnamed: 0,model__min_samples_split,model__min_samples_leaf,model__max_leaf_nodes,model__max_depth,model__criterion,credit_f1
0,3,4,80.0,5.0,log_loss,0.789703
1,12,2,90.0,12.0,entropy,0.796568
2,16,8,80.0,11.0,gini,0.816832
3,4,9,20.0,2.0,gini,0.864800
4,4,14,50.0,14.0,log_loss,0.828465
...,...,...,...,...,...,...
95,10,13,20.0,18.0,entropy,0.832254
96,11,5,80.0,18.0,log_loss,0.810284
97,12,14,60.0,,entropy,0.828465
98,2,18,,16.0,log_loss,0.842258


# College dataset

In [13]:
from sklearn.model_selection import GridSearchCV

X = df_college.iloc[:, 0:-1]
y = df_college.iloc[:, -1]

formatted_params = []
for params in tested_params:
    formatted_param = {key: [value] for key, value in params.items()}
    formatted_params.append(formatted_param)

grid_search_model = GridSearchCV(model_pipeline,
                                 param_grid=formatted_params,
                                 scoring='f1',
                                 cv = 5)

grid_search_model.fit(X, y)

df_decision_tree['college_f1'] = grid_search_model.cv_results_['mean_test_score']

best_f1_college = grid_search_model.best_score_

print(f"Best f1 score: {best_f1_college}")
print(f"Best params: {grid_search_model.best_params_}")
grid_search_model.best_estimator_

Best f1 score: 0.8627268515134437
Best params: {'model__criterion': 'log_loss', 'model__max_depth': 17, 'model__max_leaf_nodes': 50, 'model__min_samples_leaf': 16, 'model__min_samples_split': 19}


# Diabetes dataset

In [14]:
from sklearn.model_selection import GridSearchCV

X = df_diabetes.iloc[:, 0:-1]
y = df_diabetes.iloc[:, -1]

formatted_params = []
for params in tested_params:
    formatted_param = {key: [value] for key, value in params.items()}
    formatted_params.append(formatted_param)

grid_search_model = GridSearchCV(model_pipeline,
                                 param_grid=formatted_params,
                                 scoring='f1',
                                 cv = 5)

grid_search_model.fit(X, y)

df_decision_tree['diabetes_f1'] = grid_search_model.cv_results_['mean_test_score']

best_f1_diabetes = grid_search_model.best_score_

print(f"Best f1 score: {best_f1_diabetes}")
print(f"Best params: {grid_search_model.best_params_}")
grid_search_model.best_estimator_

Best f1 score: 0.6417007553711286
Best params: {'model__criterion': 'gini', 'model__max_depth': 16, 'model__max_leaf_nodes': 70, 'model__min_samples_leaf': 19, 'model__min_samples_split': 14}


# Penguins dataset

In [15]:
from sklearn.model_selection import GridSearchCV

X = df_penguins.iloc[:, 0:-1]
y = df_penguins.iloc[:, -1]

formatted_params = []
for params in tested_params:
    formatted_param = {key: [value] for key, value in params.items()}
    formatted_params.append(formatted_param)

grid_search_model = GridSearchCV(model_pipeline,
                                 param_grid=formatted_params,
                                 scoring='f1',
                                 cv = 5)

grid_search_model.fit(X, y)

df_decision_tree['penguins_f1'] = grid_search_model.cv_results_['mean_test_score']

best_f1_penguins = grid_search_model.best_score_


print(f"Best f1 score: {best_f1_penguins}")
print(f"Best params: {grid_search_model.best_params_}")
grid_search_model.best_estimator_

Best f1 score: 0.9967213114754099
Best params: {'model__criterion': 'log_loss', 'model__max_depth': None, 'model__max_leaf_nodes': 40, 'model__min_samples_leaf': 1, 'model__min_samples_split': 17}


# Tunability 

In [16]:
df_decision_tree['mean_f1'] = df_decision_tree[['credit_f1', 'college_f1', 'diabetes_f1', 'penguins_f1']].mean(axis=1)
df_decision_tree

Unnamed: 0,model__min_samples_split,model__min_samples_leaf,model__max_leaf_nodes,model__max_depth,model__criterion,credit_f1,college_f1,diabetes_f1,penguins_f1,mean_f1
0,3,4,80.0,5.0,log_loss,0.789703,0.856695,0.610900,0.989944,0.811810
1,12,2,90.0,12.0,entropy,0.796568,0.828469,0.574138,0.982807,0.795495
2,16,8,80.0,11.0,gini,0.816832,0.830681,0.576898,0.986554,0.802741
3,4,9,20.0,2.0,gini,0.864800,0.729040,0.546090,0.986554,0.781621
4,4,14,50.0,14.0,log_loss,0.828465,0.854927,0.596049,0.986554,0.816499
...,...,...,...,...,...,...,...,...,...,...
95,10,13,20.0,18.0,entropy,0.832254,0.860352,0.590895,0.986554,0.817514
96,11,5,80.0,18.0,log_loss,0.810284,0.836283,0.540896,0.989944,0.794352
97,12,14,60.0,,entropy,0.828465,0.857257,0.601461,0.986554,0.818434
98,2,18,,16.0,log_loss,0.842258,0.860310,0.620838,0.986554,0.827490


In [17]:
# sigma star
sigma_star_row = df_decision_tree[df_decision_tree['mean_f1'] == max(df_decision_tree['mean_f1'])]
sigma_star_row

Unnamed: 0,model__min_samples_split,model__min_samples_leaf,model__max_leaf_nodes,model__max_depth,model__criterion,credit_f1,college_f1,diabetes_f1,penguins_f1,mean_f1
16,14,19,70.0,16.0,gini,0.841165,0.847157,0.641701,0.986554,0.829144


# 3.3 Measuring Overall Tunability of a ML Algorithm

In [18]:
d_credit = sigma_star_row['credit_f1'].iloc[0] - best_f1_credit
d_college = sigma_star_row['college_f1'].iloc[0] - best_f1_college
d_diabetes = sigma_star_row['diabetes_f1'].iloc[0] - best_f1_diabetes
d_penguins = sigma_star_row['penguins_f1'].iloc[0] - best_f1_penguins

d = pd.Series([d_credit, d_college, d_diabetes, d_penguins])
print(d)
print(f"Mean: {d.mean()}")


0   -0.023635
1   -0.015570
2    0.000000
3   -0.010168
dtype: float64
Mean: -0.012343366965756275
