# Step 4 and Final - Model Selection and fine tuning

## Upload latest pickle

In [1]:
import matplotlib as plt
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sb
import warnings
import pickle
from scipy.stats import zscore, ks_2samp

#pulling Flat File PKL format

pickle_file = 'tmdb_EDA_feature_eng_file.pkl'

#Read a pickle file and load the data
with open(pickle_file, 'rb') as file:
    loaded_data = pickle.load(file)

print(type(loaded_data))

if not isinstance(loaded_data, pd.DataFrame):
    print('Problem with desrielizing TMDB Pickle file')
else: 
    df = loaded_data.copy()

<class 'pandas.core.frame.DataFrame'>


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71370 entries, 0 to 71369
Data columns (total 57 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              71370 non-null  float64
 1   number_of_episodes              71370 non-null  float64
 2   vote_count                      71370 non-null  float64
 3   vote_average                    71370 non-null  float64
 4   adult                           71370 non-null  float64
 5   episode_run_time                71370 non-null  float64
 6   genres_Family                   71370 non-null  float64
 7   genres_War & Politics           71370 non-null  float64
 8   genres_Talk                     71370 non-null  float64
 9   genres_Crime                    71370 non-null  float64
 10  genres_Animation                71370 non-null  float64
 11  genres_Empty                    71370 non-null  float64
 12  genres_Documentary              

In [3]:
df.describe()

Unnamed: 0,id,number_of_episodes,vote_count,vote_average,adult,episode_run_time,genres_Family,genres_War & Politics,genres_Talk,genres_Crime,...,original_language_ja,original_language_zh,origin_continent_Africa,origin_continent_Asia,origin_continent_Europe,origin_continent_North America,origin_continent_Oceania,origin_continent_South America,origin_continent_Other,popularity
count,71370.0,71370.0,71370.0,71370.0,71370.0,71370.0,71370.0,71370.0,71370.0,71370.0,...,71370.0,71370.0,71370.0,71370.0,71370.0,71370.0,71370.0,71370.0,71370.0,71370.0
mean,150890.174891,17.405254,16.142931,2.731535,0.011798,18.097118,0.036668,0.010186,0.025599,0.055009,...,0.079543,0.115679,0.010831,0.34579,0.264859,0.188342,0.015273,0.029256,0.145649,5.339953
std,60144.160984,42.779086,212.217426,3.620145,0.107975,23.189481,0.187947,0.100413,0.157937,0.228,...,0.270587,0.319842,0.103507,0.475628,0.441261,0.390988,0.122636,0.168524,0.352757,40.716793
min,344.0,0.0,0.0,0.0,0.0,-8.357729,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,97683.5,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6
50%,132952.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.054
75%,214330.25,17.0,1.0,6.7,0.0,40.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,2.744
max,251181.0,2107.0,17836.0,10.0,1.0,105.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3707.008


In [4]:
X = df.loc[:,df.columns != 'popularity']
y = df['popularity']

In [5]:
# check that 'popularity' doesn't include negative values 
if (y < 0).any():
    print("There are negative values in the 'popularity' column.")
else:
    print("All values in the 'popularity' column are non-negative.")

All values in the 'popularity' column are non-negative.


## Regression Models

In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
#!pip install xgboost
import xgboost as xgb
import sklearn.metrics as metrics 
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split

In [7]:
def regressionMetrics(y, yhat):
    res = {'MSE': metrics.mean_squared_error(y,yhat),
           'RMSE': np.sqrt(metrics.mean_squared_error(y,yhat)),
           'MAE': metrics.mean_absolute_error(y,yhat),
           'RMSLE': np.sqrt(metrics.mean_squared_log_error(y,yhat))
          }
    return res

##### Splitting data (Train = 0.7, Test = 0.15, Dev (Valid) = 0.15

In [8]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.30, random_state=42)
X_dev, X_test, y_dev, y_test = train_test_split(X_temp, y_temp, test_size=0.50, random_state=42)

In [9]:
models_list = pd.DataFrame()

In [10]:
def add_train_and_test_to_model_list(model_name_str, model, df_model_list):
    y_train_pred = model.predict(X_train)
        
    if (y_train_pred < 0).any():
        y_train_pred = np.array([max(0, pred) for pred in y_train_pred])
    
    model_dict = {'model': model_name_str}
    new_row = pd.DataFrame([{**model_dict, **regressionMetrics(y_train, y_train_pred)}])
    df_model_list = pd.concat([df_model_list, new_row], ignore_index=True)


    return df_model_list

''''
    y_test_pred = model.predict(X_test)

    if (y_test_pred < 0).any():
        y_test_pred = np.array([max(0, pred) for pred in y_test_pred])

    model_name_str_test = model_name_str + ' Test'
    model_dict = {'model': model_name_str_test}
    new_row = pd.DataFrame([{**model_dict, **regressionMetrics(y_test, y_test_pred)}])
    df_model_list = pd.concat([df_model_list, new_row], ignore_index=True)
'''

def add_dev_to_model_list(model_name_str, model, df_model_list):
    y_dev_pred = model.predict(X_dev)
        
    if (y_dev_pred < 0).any():
        y_dev_pred = np.array([max(0, pred) for pred in y_dev_pred])
    
    model_dict = {'model': model_name_str}
    new_row = pd.DataFrame([{**model_dict, **regressionMetrics(y_dev, y_dev_pred)}])
    df_model_list = pd.concat([df_model_list, new_row], ignore_index=True)


    return df_model_list

### Liner Regression

In [11]:
mod1 = LinearRegression()
mod1.fit(X_train,y_train)

models_list = add_train_and_test_to_model_list("Linear Regression", mod1, models_list)

print (models_list) 

               model          MSE       RMSE       MAE     RMSLE
0  Linear Regression  1218.205141  34.902796  5.828314  0.918692


### Decision Tree

In [12]:
mod2 = DecisionTreeRegressor(random_state=1)
mod2.fit(X_train,y_train)

models_list = add_train_and_test_to_model_list("Decision Tree", mod2, models_list)

print (models_list) 

               model          MSE       RMSE       MAE     RMSLE
0  Linear Regression  1218.205141  34.902796  5.828314  0.918692
1      Decision Tree     1.594394   1.262693  0.055189  0.066744


### Random Forest

In [13]:
mod3 = RandomForestRegressor(random_state=1)
mod3.fit(X_train,y_train)

models_list = add_train_and_test_to_model_list("Random Forest", mod3, models_list)

print (models_list) 

               model          MSE       RMSE       MAE     RMSLE
0  Linear Regression  1218.205141  34.902796  5.828314  0.918692
1      Decision Tree     1.594394   1.262693  0.055189  0.066744
2      Random Forest   156.815355  12.522594  1.556532  0.264730


### Adaptive Boosting (ADABoost)

In [14]:
mod4 = AdaBoostRegressor(random_state=1)
mod4.fit(X_train,y_train)

models_list = add_train_and_test_to_model_list("Ada Boost", mod4, models_list)

print (models_list) 

               model          MSE       RMSE        MAE     RMSLE
0  Linear Regression  1218.205141  34.902796   5.828314  0.918692
1      Decision Tree     1.594394   1.262693   0.055189  0.066744
2      Random Forest   156.815355  12.522594   1.556532  0.264730
3          Ada Boost  3298.285542  57.430702  34.442867  2.260733


### Gradient Boosting Machine (GBM)

In [15]:
mod5 = GradientBoostingRegressor(random_state=1)
mod5.fit(X_train,y_train)

models_list = add_train_and_test_to_model_list("Gradient Boosting", mod5, models_list)

print (models_list) 

               model          MSE       RMSE        MAE     RMSLE
0  Linear Regression  1218.205141  34.902796   5.828314  0.918692
1      Decision Tree     1.594394   1.262693   0.055189  0.066744
2      Random Forest   156.815355  12.522594   1.556532  0.264730
3          Ada Boost  3298.285542  57.430702  34.442867  2.260733
4  Gradient Boosting   653.622646  25.566045   4.020585  0.648504


### Support Vector Machine (SVM)

In [16]:
mod6 = SVR()
mod6.fit(X_train,y_train)

models_list = add_train_and_test_to_model_list("Support Vector Machine", mod6, models_list)

print (models_list) 

                    model          MSE       RMSE        MAE     RMSLE
0       Linear Regression  1218.205141  34.902796   5.828314  0.918692
1           Decision Tree     1.594394   1.262693   0.055189  0.066744
2           Random Forest   156.815355  12.522594   1.556532  0.264730
3               Ada Boost  3298.285542  57.430702  34.442867  2.260733
4       Gradient Boosting   653.622646  25.566045   4.020585  0.648504
5  Support Vector Machine  1460.429104  38.215561   4.658155  0.893035


### XGBoost Regressor

In [17]:
mod7= xgb.XGBRegressor()
mod7.fit(X_train,y_train)

models_list = add_train_and_test_to_model_list("XGBoost Regressor", mod7, models_list)

print (models_list) 

                    model          MSE       RMSE        MAE     RMSLE
0       Linear Regression  1218.205141  34.902796   5.828314  0.918692
1           Decision Tree     1.594394   1.262693   0.055189  0.066744
2           Random Forest   156.815355  12.522594   1.556532  0.264730
3               Ada Boost  3298.285542  57.430702  34.442867  2.260733
4       Gradient Boosting   653.622646  25.566045   4.020585  0.648504
5  Support Vector Machine  1460.429104  38.215561   4.658155  0.893035
6       XGBoost Regressor    53.491300   7.313775   2.217207  0.509906


## Model Selection

Metrics:
MSE - Mean Squared Error
RMSE Root Mean Squared Error
MAE Mean Absolute Error Calculates the average of the absolute differences between predicted and actual values.
RMSLE Root Mean Squared Logarithmic Error

In [20]:
models_list.sort_values('MAE')

Unnamed: 0,model,MSE,RMSE,MAE,RMSLE
1,Decision Tree,1.594394,1.262693,0.055189,0.066744
2,Random Forest,156.815355,12.522594,1.556532,0.26473
6,XGBoost Regressor,53.4913,7.313775,2.217207,0.509906
4,Gradient Boosting,653.622646,25.566045,4.020585,0.648504
5,Support Vector Machine,1460.429104,38.215561,4.658155,0.893035
0,Linear Regression,1218.205141,34.902796,5.828314,0.918692
3,Ada Boost,3298.285542,57.430702,34.442867,2.260733


In [21]:
models_list.sort_values('RMSE')

Unnamed: 0,model,MSE,RMSE,MAE,RMSLE
1,Decision Tree,1.594394,1.262693,0.055189,0.066744
6,XGBoost Regressor,53.4913,7.313775,2.217207,0.509906
2,Random Forest,156.815355,12.522594,1.556532,0.26473
4,Gradient Boosting,653.622646,25.566045,4.020585,0.648504
0,Linear Regression,1218.205141,34.902796,5.828314,0.918692
5,Support Vector Machine,1460.429104,38.215561,4.658155,0.893035
3,Ada Boost,3298.285542,57.430702,34.442867,2.260733


## Run base model = DecisionTree as in mod2

# START FINE TUNING PROCESS

In [22]:
# Define the parameter grid to search for Decision Tree
param_grid = {
    'criterion': ['squared_error', 'friedman_mse'],  # Reduced to 2 options
    'splitter': ['best'],  # Fixed to 'best'
    'max_depth': [10, 20, 30],  # Reduced to 3 options
    'min_samples_split': [5, 10],  # Reduced to 2 options
    'min_samples_leaf': [1, 4],  # Reduced to 2 options
    'max_features': ['sqrt', 'log2'],  # Reduced to 2 options
    'max_leaf_nodes': [None, 50],  # Reduced to 2 options
    'ccp_alpha': [0.0, 0.01]  # Reduced to 2 options
}

# Create a base model of Decision-Tree (the best modewl you had)
rf = mod2

# Instantiate the grid search model
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, 
                           cv=3, n_jobs=-1, verbose=2)

#Verbose - level of details that will be provided in each iteration
# Fit the grid search to the data
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 192 candidates, totalling 576 fits


In [23]:
# Print the best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", -grid_search.best_score_)

Best Parameters: {'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': 10, 'max_features': 'log2', 'max_leaf_nodes': 50, 'min_samples_leaf': 4, 'min_samples_split': 10, 'splitter': 'best'}
Best Score: -0.021448884211068237


In [24]:
grid_search.best_estimator_

In [25]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    errors = abs(predictions - test_labels)
    mae = 100 * np.mean(errors)
    print('Model Performance')
    print('Mean Absolute Error: {:0.4f}'.format(np.mean(errors)))
    return mae

## Compare fine-tunned model (base-model vs. fine_tuned model)

In [27]:
best_grid = grid_search.best_estimator_
base_accuracy = evaluate(mod2, X_dev, y_dev)
grid_accuracy = evaluate(best_grid, X_dev, y_dev)



Model Performance
Mean Absolute Error: 5.2166
Model Performance
Mean Absolute Error: 5.4635


In [28]:
print('Improvement of {:0.2f}%.'.format( 100 * (base_accuracy - grid_accuracy) / base_accuracy))

Improvement of -4.73%.


## Conclution:
## No improvements have been achieved with the above hyperparameter grid search and random search.

# Run X_test and y_test on original DecisionTree

In [35]:
y_test_pred = mod2.predict(X_test)
res = regressionMetrics(y_test, y_test_pred)

print (res)

{'MSE': 4552.83015847978, 'RMSE': 67.4746630853373, 'MAE': 5.603265489756523, 'RMSLE': 0.6198808341746612}
