******************************************************************************


#### Tuning the model

##### Case Example - Predicting customer churn problem (File Name: telco.csv)

Model used for example: Random Forest

Step 1 
- Create a dictionary for the hyperparameters

Step 2
- Tuning 'n_estimators' - Number of trees in the forest.
- Tuning 'max_features' - Number of features the RF should consider when looking for the best split in each decision tree.
- Tuning other hyperparameters - let the algorithm try out all possible combinations of hyperparameters (grid search) to identify the best combination
- Randomised Search - shortens the execution time by sampling only some of the combinations (trade-off - may miss the best combination)

Step 3
- Create labels for the features/attributes
- Determine which features are most important

********************************************************************************

In [None]:
# Third-party imports used by this cell.
import pandas as pd
from sklearn.svm import SVC

# Read the churn dataset; the path is relative to the working directory.
telco = pd.read_csv("telco.csv")

# Fit a support vector classifier with default settings.
# NOTE(review): 'data' / 'target' look like scikit-learn Bunch keys rather
# than CSV column names, and the section header names Random Forest as the
# model for this example - confirm this cell is what was intended.
svc = SVC()
svc.fit(telco['data'], telco['target'])

1. Tuning 'n_estimators' 

In [None]:
# Tuning 'n_estimators' - the number of trees in the forest.
# numpy and RandomForestClassifier are imported in this cell because it uses
# both and no visible cell imports them; re-importing is harmless if an
# earlier cell already did.
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Candidate forest sizes: every integer from 10 to 50 inclusive.
param_grid = {'n_estimators': np.arange(10, 51)}

# Cross-validated search over a fresh, default-configured random forest.
clf_cv = GridSearchCV(RandomForestClassifier(), param_grid)
clf_cv.fit(X, y)

# Best forest size found (shown as the cell output).
clf_cv.best_params_


In [None]:
# Mean cross-validated score of the best 'n_estimators' setting found by
# clf_cv (no `scoring` was passed, so the estimator's default scorer is used).
# NOTE: this is a model-quality score, not a per-feature contribution.
clf_cv.best_score_

2. Tuning 'max_features' 

In [None]:
# Tuning 'max_features' - the number of features the random forest considers
# when looking for the best split in each decision tree.

# Import GridSearchCV
from sklearn.model_selection import GridSearchCV

# Create the hyperparameter grid.
# 'auto' is intentionally left out: assuming clf is a RandomForestClassifier
# (as the section header states), 'auto' was only an alias of 'sqrt', so it
# added no new candidate. It was deprecated in scikit-learn 1.1 and is
# rejected from 1.3 onwards.
param_grid = {'max_features': ['sqrt', 'log2']}

# Call GridSearchCV on the existing estimator
grid_search = GridSearchCV(clf, param_grid)

# Fit the model
grid_search.fit(X, y)

# Print the optimal parameters
print(grid_search.best_params_)

3. Tuning other hyperparameters

In [None]:
# Exhaustive grid search: every combination of the values listed below is
# evaluated so that the best-scoring combination can be identified.

from sklearn.model_selection import GridSearchCV

# 2 depths x 3 feature counts x 2 bootstrap settings x 2 criteria
# = 24 candidate combinations.
param_grid = {
    "max_depth": [3, None],
    "max_features": [1, 3, 10],
    "bootstrap": [True, False],
    "criterion": ["gini", "entropy"],
}

# Wrap the existing estimator `clf` in the search.
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid)

# Run the search on X and y.
grid_search.fit(X, y)

4. Using 'Random Search'

In [None]:
# Randomised search - samples only some hyperparameter combinations instead
# of trying all of them, which shortens the execution time
# (trade-off: the best combination may never be sampled).

# randint is scipy's discrete uniform distribution. It is imported in this
# cell because the cell uses it and no visible cell imports it.
from scipy.stats import randint

# Import RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Create the hyperparameter distributions.
# Lists are sampled uniformly; randint(1, 11) draws integers from 1 to 10
# (the upper bound is exclusive).
param_dist = {"max_depth": [3, None],
              "max_features": randint(1, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# Call RandomizedSearchCV on the existing estimator
random_search = RandomizedSearchCV(clf, param_dist)

# Fit the model
random_search.fit(X, y)

5. Label the feature / Determine which features are important drivers of churn


In [None]:
# Per-feature importance scores read from the estimator.
# NOTE(review): this reads `clf` directly, so `clf` must already be fitted;
# the searches above are given `clf` but this cell does not use their results
# - confirm which model is meant here.
importances = clf.feature_importances_

# Indices that order the importances from smallest to largest.
sorted_index = importances.argsort()

# Column names of X arranged in that same order.
labels = X.columns[sorted_index]

# Start from an empty figure.
plt.clf()

# Horizontal bar chart: one bar per feature, labelled with its column name.
plt.barh(
    y=range(len(X.columns)),
    width=importances[sorted_index],
    tick_label=labels,
)
plt.show()