# Text-based Model: Training Models: Tuning

-------   
> In this last part, we will **tune the hyperparameters** of the chosen model (**RandomForestClassifier**) to find the set of parameters that gives the best results.

----------

<pre>
📝 <b>Note</b>
<div style="background-color:#C2F2ED;">
The term <b>best</b> algorithm refers to the algorithm that gives the best results <b>among the algorithms evaluated</b> in this project, not among all classification algorithms.
</div> </pre> 


In [1]:
#Generic libs
import pandas as pd
import numpy as np

# predefined modules
from modules import Train_Functions as Train_F

# ML libs 
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier

# --- Notebook-wide settings ---------------------------------------------
# Input CSV read in the "Load Data" section
dataset_path = 'data/autism_with_metadata.csv'

# Output locations; both currently point to the same directory
model_path = report_path = 'models/'

# val_ratio is handed to the train/validation split helper,
# seed is used as the classifier's random_state
val_ratio, seed = 0.2, 0

## Load Data

In [2]:
# Read the dataset from the configured CSV path and preview the first five rows.
# NOTE(review): the preview contains an 'Unnamed: 0' column, which suggests the
# CSV was written together with its index; that column is never selected below.
data = pd.read_csv(filepath_or_buffer=dataset_path)
data.head(n=5)

Unnamed: 0,name,age,sex,speech,ASD,abs_age,clean_annotated_speech,lemmatized_speech,meaningful_speech,structured_speech,...,n_uni,n_rep,n_inq,n_ono,n_hes,n_mis,n_disf,age_in_months,n_diff_words,density
0,Eigsti,5;03.10,0,\tokay .,1,5.0,okay,okay,okay,okay,...,0,0,0,0,0,0,0,63.0,1,4
1,Eigsti,5;03.10,0,\tdid you see this ?,1,5.0,did you see this,do you see this,do you see this,do you see this,...,0,0,0,0,0,0,0,63.0,4,13
2,Eigsti,5;03.10,0,\tyeah .,1,5.0,yeah,yeah,yeah,yeah,...,0,0,0,0,0,0,0,63.0,1,4
3,Eigsti,5;03.10,0,\txxx let's see +...,1,5.0,uni let's see inq,uni let us see inq,uni let us see inq,uni let we see inq,...,1,0,1,0,0,0,0,63.0,3,8
4,Eigsti,5;03.10,0,\txxx .,1,5.0,uni,uni,uni,uni,...,1,0,0,0,0,0,0,63.0,0,0


In [3]:
# Optional data-quality check, currently disabled.
# NOTE(review): presumably reports missing values in `data` — confirm in modules/Train_Functions.
#Train_F.missing(data)

## Features and Target

In [4]:
# Non-text columns that are passed to the model together with the text
numerical_features = [
    'sex', 'age_in_months',
    'len_clean_annotated_speech', 'len_meaningful_speech', 'len_structured_speech',
    'n_bab', 'n_gue', 'n_uni', 'n_rep', 'n_inq', 'n_ono', 'n_hes', 'n_mis', 'n_disf',
    'n_diff_words', 'density',
]

# Text column that is vectorised by the pipeline
nlp_features = ['clean_annotated_speech']

# Keep only the rows that actually have text in the NLP column
# (rebinding instead of inplace=True; the resulting frame is the same)
data = data.dropna(subset=nlp_features)

# Target: the ASD label as a NumPy array
y = data['ASD'].values

## Model Pipeline

In [5]:
# Feature matrix: the numerical columns followed by the raw text column
cols = numerical_features + nlp_features
X = data[cols]

# Split the data into training and validation subsets
# (val_ratio is forwarded to the project helper)
X_train, X_val, y_train, y_val = Train_F.train_val(X, y, val_ratio)

# Preprocessing: TF-IDF encodes the text column; all remaining selected
# columns are passed through unchanged.
preprocessor = ColumnTransformer(
    [('tfidfV', TfidfVectorizer(), 'clean_annotated_speech')],
    remainder='passthrough'
)

# Full training pipeline. The step names ('preprocessor', 'tfidfV', 'classifier')
# are referenced by the hyperparameter grid, so they must stay as they are.
# verbose=0: this estimator is refitted for every candidate of the grid search;
# a non-zero forest verbosity logs progress for every fit/predict (and every tree
# when > 1), which floods the notebook output without affecting the results.
combined_clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", RandomForestClassifier(n_jobs=-1, random_state=seed, verbose=0))]
)

## Tuning Chosen Model

In [6]:
# Show the parameter names available for the pipeline's 'classifier' step;
# these are the keys that may be used in the hyperparameter grid.
classifier_params = Train_F.check_params_exist(combined_clf, 'classifier')
print(classifier_params)

['classifier', 'classifier__bootstrap', 'classifier__ccp_alpha', 'classifier__class_weight', 'classifier__criterion', 'classifier__max_depth', 'classifier__max_features', 'classifier__max_leaf_nodes', 'classifier__max_samples', 'classifier__min_impurity_decrease', 'classifier__min_impurity_split', 'classifier__min_samples_leaf', 'classifier__min_samples_split', 'classifier__min_weight_fraction_leaf', 'classifier__n_estimators', 'classifier__n_jobs', 'classifier__oob_score', 'classifier__random_state', 'classifier__verbose', 'classifier__warm_start']


In [None]:
# TF-IDF hyperparameters (step 'tfidfV' inside the 'preprocessor' step)
# NOTE(review): float min_df/max_df values are fractions of documents; with
# min_df between 0.1 and 0.4 most of the vocabulary may be pruned on short
# utterances (making max_features moot) — confirm this range is intended.
tfidf_grid = {
    # ignore terms that appear in more than x% of the documents
    'preprocessor__tfidfV__max_df': np.linspace(0.6, 1.0, 3),
    # ignore terms that appear in less than x% of the documents
    'preprocessor__tfidfV__min_df': np.linspace(0.1, 0.4, 3),
    # only consider the top max_features terms ordered by term frequency across the corpus
    'preprocessor__tfidfV__max_features': [5_000, 10_000],
}

# Random forest hyperparameters (step 'classifier')
forest_grid = {
    'classifier__max_depth': [5,15],
    'classifier__max_features': [2, 3],
    'classifier__min_samples_leaf': [3, 4, 5],
    'classifier__min_samples_split': [8, 10, 12],
    'classifier__n_estimators': [10, 50, 100, 200],
}

# Single search space: TF-IDF keys first, then the classifier keys
param_grid = {**tfidf_grid, **forest_grid}

# Run the project's grid-search helper; its return value is used as the tuned model below
best_clf = Train_F.grid_search(
    combined_clf,
    X_train, X_val,
    y_train, y_val,
    param_grid,
)

## Evaluating Best Classifier

In [None]:
# Evaluate the tuned model on the train/validation split with the project helper.
# NOTE(review): the meaning of `stage` and `intermediate` is defined in
# modules/Train_Functions — confirm there.
Train_F.classifier(
    best_clf,
    'RandomForestClassifier',
    X_train, y_train,
    X_val, y_val,
    stage='tuned',
    intermediate=False,
)

------------
## Conclusion 
<pre><div style="background-color:#F7D819;">
The model can detect <b>74 out of 100 children with ASD</b> with a <b>precision of 86%</b>. As a first attempt, these results are encouraging.
</div></pre>
------------