# Gradient Boosting Setup and Application

We will apply gradient boosting with both CatBoost and scikit-learn (XGBoost is imported but not used in the cells below).

## Imports

In [24]:
import json
from typing import (
    List,
    Dict,
    Any,
    Union,
)

import numpy as np
import pandas as pd
import sklearn as sk
import catboost as cb
import xgboost as xgb

from custom_functions import (
    store_json,
    load_json,
)

### Get Data Ready for the Model

In [2]:
# Read the cleaned dataset, then discard the index/identifier/date columns together with
# the features currently left out of the model (that second group might change over time).
df = pd.read_csv("./data/new_data_no_missingvalues.csv").drop(
    columns=[
        "Unnamed: 0", "id", "unique_id", "date",
        "total_calls", "total_sms", "average_com",
        "average_soc", "average_finance", "average_entertainment",
    ]
)

In [3]:
df

Unnamed: 0,average_mood,mood_next_day,average_valence,average_arousal,average_office,average_game
0,6.25,6.25,0.750000,-0.250000,0.000,0.0
1,6.25,6.40,0.200000,0.200000,57.402,0.0
2,6.40,6.80,0.698649,0.600000,0.000,0.0
3,6.80,6.00,0.800000,0.200000,0.000,0.0
4,6.00,6.75,0.000000,0.800000,3.010,0.0
...,...,...,...,...,...,...
1210,6.25,5.40,0.481111,-0.750000,0.000,0.0
1211,5.40,6.20,0.000000,-0.400000,130.530,0.0
1212,6.20,8.20,0.400000,-0.600000,0.000,0.0
1213,8.20,7.00,1.200000,0.000000,0.000,0.0


In [4]:
def prep_data(raw_df: pd.DataFrame, target_columns: List, drop_first: bool = True, make_na_col: bool = True) -> pd.DataFrame:
    """One-hot encode selected columns of a dataframe.

    Thin wrapper around ``pd.get_dummies``.

    Args:
        raw_df: Dataframe to encode.
        target_columns: Names of the columns to replace by dummy columns.
        drop_first: Forwarded to ``pd.get_dummies`` as ``drop_first``.
        make_na_col: Forwarded to ``pd.get_dummies`` as ``dummy_na``.

    Returns:
        The dataframe produced by ``pd.get_dummies``.
    """
    encoding_options = {
        "columns": target_columns,
        "drop_first": drop_first,
        "dummy_na": make_na_col,
    }
    return pd.get_dummies(raw_df, **encoding_options)

### Splitting the Data

In [5]:
from sklearn.model_selection import train_test_split

In [7]:
# Hold out 35% of the rows as the test set; `mood_next_day` is the target.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=["mood_next_day"]),
    df["mood_next_day"],
    random_state=58,
    test_size=0.35,
)

In [8]:
# Carve a validation set (35%) out of the training data so the test data stays untouched
X_tset, X_val, y_tset, y_val = train_test_split(
    X_train, y_train, random_state=58, test_size=0.35
)

### Apply the Model

In [9]:
# CatBoost-specific data containers
train_dataset = cb.Pool(data=X_tset, label=y_tset)  # training part of the second split
test_dataset = cb.Pool(data=X_val, label=y_val)  # validation part of the second split

# Full training portion (training + validation rows)
entire_train = cb.Pool(data=X_train, label=y_train)

In [25]:
# New grid search setup so it doesn't take 48 hrs...
# All three stages share the same candidate values; every stage gets its own fresh copy.
grid_params = {
    stage: {
        'l2_leaf_reg': [1, 3, 5, 7, 9, 11, 20],
        'depth': [6, 8, 10],
        'random_strength': [0.01, 0.05, 0.1],
    }
    for stage in ('one', 'two', 'three')
}

In [16]:
# Fixed seed so the search gives the same result on every Restart & Run All
# (the seeds used to be drawn from the unseeded global NumPy RNG).
SEARCH_SEED = 58

# Learning rate used by each grid stage; any other stage name falls back to 0.05.
STAGE_LEARNING_RATES = {'one': 0.1, 'two': 0.05, 'three': 0.01}

# Maps a stage name to the result dict returned by CatBoost's grid_search
searched_cat_objects = dict()

for grid_selection in grid_params:

    # Stage 'three' (learning rate 0.01) is skipped when stages 'one' (0.1) and
    # 'two' (0.05) already agree on the best parameters.
    if grid_selection == 'three':
        if searched_cat_objects['one']['params'] == searched_cat_objects['two']['params']:
            break
        print("Did not find similar parameters, trying a learning rate of 0.01...")

    learning_rate = STAGE_LEARNING_RATES.get(grid_selection, 0.05)

    # Set general object params
    cbc_params = {
        'iterations': 1000,
        'learning_rate': learning_rate,
        'random_seed': SEARCH_SEED,
        'verbose': 100,
        'loss_function': 'RMSE'}

    CBC = cb.CatBoostRegressor(**cbc_params)

    # Grid search uses CV internally, so the entire training set can be passed in
    CBC_grid = CBC.grid_search(
        grid_params[grid_selection],
        X=entire_train,
        partition_random_seed=SEARCH_SEED,
    )

    searched_cat_objects[grid_selection] = CBC_grid

0:	learn: 6.3669580	test: 6.4124144	best: 6.4124144 (0)	total: 1.38ms	remaining: 1.38s
100:	learn: 0.4628282	test: 0.7255587	best: 0.7239214 (69)	total: 137ms	remaining: 1.22s
200:	learn: 0.4066202	test: 0.7340570	best: 0.7239214 (69)	total: 293ms	remaining: 1.17s
300:	learn: 0.3623531	test: 0.7454386	best: 0.7239214 (69)	total: 428ms	remaining: 994ms
400:	learn: 0.3342876	test: 0.7471520	best: 0.7239214 (69)	total: 565ms	remaining: 844ms
500:	learn: 0.3133881	test: 0.7546182	best: 0.7239214 (69)	total: 698ms	remaining: 695ms
600:	learn: 0.2974735	test: 0.7605364	best: 0.7239214 (69)	total: 833ms	remaining: 553ms
700:	learn: 0.2833934	test: 0.7654172	best: 0.7239214 (69)	total: 971ms	remaining: 414ms
800:	learn: 0.2725558	test: 0.7700281	best: 0.7239214 (69)	total: 1.11s	remaining: 275ms
900:	learn: 0.2630875	test: 0.7741288	best: 0.7239214 (69)	total: 1.25s	remaining: 138ms
999:	learn: 0.2552534	test: 0.7786442	best: 0.7239214 (69)	total: 1.4s	remaining: 0us

bestTest = 0.72392141
bes

In [23]:
# Persist the grid-search results, but only when they are held in a dict
if isinstance(searched_cat_objects, dict):
    store_json(searched_cat_objects, "./data/new_stored_cat_hyper")

In [None]:
# NOTE(review): `CBR` is not assigned in any cell shown in this notebook — presumably a
# fitted CatBoostRegressor left over from an earlier session; confirm and define it explicitly.
feature_df = CBR.get_feature_importance(prettified=True)
# Keep only the features whose importance exceeds 1
feature_df = feature_df[feature_df["Importances"] > 1]
feature_df

Unnamed: 0,Feature Id,Importances
0,average_mood,37.553716
1,average_arousal,12.705544
2,average_com,10.155628
3,average_entertainment,6.97637
4,average_valence,6.5323
5,average_office,5.906204
6,average_soc,5.72953
7,total_calls,4.15941
8,average_finance,3.215705
9,average_game,2.328309


This shows we do not actually need the dates to determine any type of mood.

In [None]:
# Predict on the validation features.
# NOTE(review): `CBR` is not assigned in any cell shown in this notebook — confirm where it comes from.
y_pred = CBR.predict(X_val)

In [None]:
# CatBoost Specific Optimization
# train_dataset = cb.Pool(X_tset, y_tset) 
# test_dataset = cb.Pool(X_val, y_val)

In [None]:
# Count the validation predictions that are strictly less than 0.5 away from the true value
CBR_percentage = sum(
    1
    for predicted, actual in zip(y_pred, y_val)
    if -0.5 < predicted - actual < 0.5
)

In [None]:
CBR_percentage/len(y_val) *100

67.14801444043322

Sadly, this is quite low at the moment, most likely due to the limited size of the dataset.

### Cheating to Check how the Model Improves

In [None]:
# CatBoost-specific data containers for the full train / test split
trainset = cb.Pool(data=X_train, label=y_train)
testset = cb.Pool(data=X_test, label=y_test)

In [None]:
# NOTE(review): `params` is not assigned in any cell shown above this one (the only
# assignment visible in the notebook is commented out, further down) — define it before this cell.
CBRC = cb.CatBoostRegressor(**params)

In [None]:
# Fit on the training pool, evaluating against the test pool with use_best_model enabled.
# plot=True is one of CatBoost's most powerful features, but it isn't supported in Deepnote.
CBRC.fit(
    trainset,
    eval_set=testset,
    use_best_model=True,
)

Learning rate set to 0.044059
0:	learn: 0.7172516	test: 0.7178727	best: 0.7178727 (0)	total: 2.05ms	remaining: 2.05s
100:	learn: 0.4217718	test: 0.5143571	best: 0.5143571 (100)	total: 216ms	remaining: 1.93s
200:	learn: 0.3728832	test: 0.5100793	best: 0.5096232 (189)	total: 434ms	remaining: 1.72s
300:	learn: 0.3262427	test: 0.5125615	best: 0.5096232 (189)	total: 684ms	remaining: 1.59s
400:	learn: 0.2964895	test: 0.5157831	best: 0.5096232 (189)	total: 902ms	remaining: 1.35s
500:	learn: 0.2685912	test: 0.5185992	best: 0.5096232 (189)	total: 1.19s	remaining: 1.18s
600:	learn: 0.2432462	test: 0.5220345	best: 0.5096232 (189)	total: 1.4s	remaining: 931ms
700:	learn: 0.2230288	test: 0.5249131	best: 0.5096232 (189)	total: 1.65s	remaining: 706ms
800:	learn: 0.2053769	test: 0.5268063	best: 0.5096232 (189)	total: 1.85s	remaining: 461ms
900:	learn: 0.1902339	test: 0.5282712	best: 0.5096232 (189)	total: 2.11s	remaining: 232ms
999:	learn: 0.1769719	test: 0.5302190	best: 0.5096232 (189)	total: 2.34s	r

<catboost.core.CatBoostRegressor at 0x7f2c9928fc50>

Still very poor...

# Gradient Boosting using scikit-learn

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
# hard_params = {'n_estimators': 1000, 'max_leaf_nodes': 4, 'max_depth': None, 'random_state': 2,
#                 'min_samples_split': 5}

# # intermediate params

# params = dict(hard_params)
# # params.update(inter_params)

In [None]:
# Fit scikit-learn's gradient boosting regressor with default hyperparameters on the
# training part of the second split (X_tset, y_tset); X_val / y_val are kept for evaluation.
# A fixed random_state keeps the fit reproducible across reruns.
GBC = GradientBoostingRegressor(random_state=58)
GBC.fit(X_tset, y_tset)

GradientBoostingRegressor()

In [None]:
GBC.score(X_val, y_val)

0.1491325422363745

An R² of roughly 0.15 on the validation set is poor.

In [None]:
ynew = GBC.predict(X_val)

In [None]:
# Count the validation predictions that are strictly less than 0.5 away from the true value
percentage = sum(
    1
    for predicted, actual in zip(ynew, y_val)
    if -0.5 < predicted - actual < 0.5
)

In [None]:
percentage/len(y_val) *100

63.898916967509024

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=d79defa5-2359-4699-82dc-d6bf5eddd7a7' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>