# Gradient Boosting using CatBoost

This is a propriatory setup for the coming gradient boosting. After we finished the preprocessing, this might be a very powerfull step to create a predictor!

In [25]:
from typing import (
    List,
    Dict,
    Any,
    Union,
)

In [26]:
import numpy as np
import pandas as pd
import sklearn as sk
import catboost as cb

### Get Data Ready for the Model

In [32]:
df = (pd
    .read_csv("./data/new_data_no_missingvalues.csv")
    .drop(columns=["Unnamed: 0", "id", "unique_id", "date"])
    .drop(columns=["total_calls", "total_sms", "average_com", "average_soc", "average_finance", "average_entertainment"]) # Might change overtime
)

In [33]:
df

Unnamed: 0,average_mood,mood_next_day,average_valence,average_arousal,average_office,average_game
0,6.25,6.25,0.750000,-0.250000,0.000,0.0
1,6.25,6.40,0.200000,0.200000,57.402,0.0
2,6.40,6.80,0.698649,0.600000,0.000,0.0
3,6.80,6.00,0.800000,0.200000,0.000,0.0
4,6.00,6.75,0.000000,0.800000,3.010,0.0
...,...,...,...,...,...,...
1210,6.25,5.40,0.481111,-0.750000,0.000,0.0
1211,5.40,6.20,0.000000,-0.400000,130.530,0.0
1212,6.20,8.20,0.400000,-0.600000,0.000,0.0
1213,8.20,7.00,1.200000,0.000000,0.000,0.0


In [34]:
def prep_data(raw_df: pd.DataFrame, target_columns: List, drop_first: bool = True, make_na_col: bool = True) -> pd.DataFrame:
    """Dummify a pandas dataframe programmatically
    """
    dataframe_dummy = pd.get_dummies(raw_df, columns=target_columns, 
                                        drop_first=drop_first, 
                                        dummy_na=make_na_col)
    return (dataframe_dummy)

### Splitting the Data

In [35]:
from sklearn.model_selection import train_test_split

In [36]:
X_train, X_test, y_train, y_test = train_test_split(
    dfp.drop(columns=["mood_next_day"]), 
    dfp.mood_next_day, 
    test_size=0.35, random_state=58)

In [37]:
# Validation data so we can keep the test data untouched
X_tset, X_val, y_tset, y_val = train_test_split(
    X_train, 
    y_train, 
    test_size=0.35, random_state=58)

### Apply the Model

In [38]:
# CatBoost Specific Optimization
train_dataset = cb.Pool(X_tset, y_tset) 
test_dataset = cb.Pool(X_val, y_val)

entire_train = cb.Pool(X_train, y_train)

In [11]:
# Set grid search params
cbc_params = {
    'random_seed': 88,
    'verbose': 10,
    'loss_function': 'MultiClass'}

CBC = cb.CatBoostRegressor(**cbc_params)

grid_params ={
    'iterations': [100,500,1000],
    'learning_rate': [0.01,0.05,0.1],
    'l2_leaf_reg': [1,3,5,7,9,11,20],
    'depth': [6, 8, 10],
    'random_strength': [0.01, 0.05, 0.1]}

# Grid Search Uses CV internally, allowing me to parse the entire training set more effectively
CBC_grid = CBC.grid_search(grid_params, X=entire_train, partition_random_seed=42)

0:	learn: 4.0777937	test: 4.0864269	best: 4.0864269 (0)	total: 377ms	remaining: 37.3s


In [12]:
CBR.fit(train_dataset, 
eval_set=test_dataset,
use_best_model=True,
# plot=True # One of CatBoost's most powerfull features but isnt supported in DeepNote
)

Learning rate set to 0.040601
0:	learn: 0.7001830	test: 0.7003748	best: 0.7003748 (0)	total: 48.5ms	remaining: 48.5s
100:	learn: 0.5265611	test: 0.6197855	best: 0.6197855 (100)	total: 336ms	remaining: 2.99s
200:	learn: 0.4716525	test: 0.6182995	best: 0.6172107 (140)	total: 600ms	remaining: 2.38s
300:	learn: 0.4159971	test: 0.6211584	best: 0.6172107 (140)	total: 948ms	remaining: 2.2s
400:	learn: 0.3719100	test: 0.6244920	best: 0.6172107 (140)	total: 1.3s	remaining: 1.94s
500:	learn: 0.3337030	test: 0.6280538	best: 0.6172107 (140)	total: 1.52s	remaining: 1.51s
600:	learn: 0.3002727	test: 0.6328362	best: 0.6172107 (140)	total: 1.75s	remaining: 1.16s
700:	learn: 0.2737452	test: 0.6385015	best: 0.6172107 (140)	total: 1.97s	remaining: 841ms
800:	learn: 0.2514705	test: 0.6437407	best: 0.6172107 (140)	total: 2.3s	remaining: 571ms
900:	learn: 0.2287498	test: 0.6485190	best: 0.6172107 (140)	total: 2.58s	remaining: 284ms
999:	learn: 0.2104614	test: 0.6525608	best: 0.6172107 (140)	total: 2.81s	rem

<catboost.core.CatBoostRegressor at 0x7fb87094a890>

In [13]:
feature_df = CBR.get_feature_importance(prettified=True)
feature_df = feature_df[feature_df["Importances"] > 1]
feature_df

Unnamed: 0,Feature Id,Importances
0,average_mood,37.553716
1,average_arousal,12.705544
2,average_com,10.155628
3,average_entertainment,6.97637
4,average_valence,6.5323
5,average_office,5.906204
6,average_soc,5.72953
7,total_calls,4.15941
8,average_finance,3.215705
9,average_game,2.328309


This shows we do not actually need the date's to determine any type of mood.

In [14]:
y_pred = CBR.predict(X_val)

In [None]:
# CatBoost Specific Optimization
# train_dataset = cb.Pool(X_tset, y_tset) 
# test_dataset = cb.Pool(X_val, y_val)

In [15]:
CBR_percentage = 0 
for x, y in zip(y_pred, y_val): 
    difference = x - y
    if difference < 0.5 and difference > -0.5:
        CBR_percentage += 1

In [16]:
CBR_percentage/len(y_val) *100

67.14801444043322

Sadly it's quite low atm. This is most likely due to the limited dataset.

### Cheating to Check how the Model Improves

In [None]:
# CatBoost Specific Optimization
trainset = cb.Pool(X_train, y_train) 
testset = cb.Pool(X_test, y_test)

In [None]:
CBRC = cb.CatBoostRegressor(**params)

In [None]:
CBRC.fit(trainset, 
eval_set=testset,
use_best_model=True,
# plot=True # One of CatBoost's most powerfull features but isnt supported in DeepNote
)

Learning rate set to 0.044059
0:	learn: 0.7172516	test: 0.7178727	best: 0.7178727 (0)	total: 2.05ms	remaining: 2.05s
100:	learn: 0.4217718	test: 0.5143571	best: 0.5143571 (100)	total: 216ms	remaining: 1.93s
200:	learn: 0.3728832	test: 0.5100793	best: 0.5096232 (189)	total: 434ms	remaining: 1.72s
300:	learn: 0.3262427	test: 0.5125615	best: 0.5096232 (189)	total: 684ms	remaining: 1.59s
400:	learn: 0.2964895	test: 0.5157831	best: 0.5096232 (189)	total: 902ms	remaining: 1.35s
500:	learn: 0.2685912	test: 0.5185992	best: 0.5096232 (189)	total: 1.19s	remaining: 1.18s
600:	learn: 0.2432462	test: 0.5220345	best: 0.5096232 (189)	total: 1.4s	remaining: 931ms
700:	learn: 0.2230288	test: 0.5249131	best: 0.5096232 (189)	total: 1.65s	remaining: 706ms
800:	learn: 0.2053769	test: 0.5268063	best: 0.5096232 (189)	total: 1.85s	remaining: 461ms
900:	learn: 0.1902339	test: 0.5282712	best: 0.5096232 (189)	total: 2.11s	remaining: 232ms
999:	learn: 0.1769719	test: 0.5302190	best: 0.5096232 (189)	total: 2.34s	r

<catboost.core.CatBoostRegressor at 0x7f2c9928fc50>

Still very Poor...

# Gradient Boosting using SKlearn

In [19]:
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
# hard_params = {'n_estimators': 1000, 'max_leaf_nodes': 4, 'max_depth': None, 'random_state': 2,
#                 'min_samples_split': 5}

# # intermediate params

# params = dict(hard_params)
# # params.update(inter_params)

In [20]:
# X_tset, X_val, y_tset, y_val
GBC = GradientBoostingRegressor(
    # **params
    )
GBC.fit(X_tset, y_tset)

GradientBoostingRegressor()

In [21]:
GBC.score(X_val, y_val)

0.1491325422363745

Rippppp

In [22]:
ynew = GBC.predict(X_val)

In [23]:
percentage = 0 
for x, y in zip(ynew, y_val): 
    difference = x - y
    if difference < 0.5 and difference > -0.5:
        percentage += 1

In [24]:
percentage/len(y_val) *100

63.898916967509024

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=d79defa5-2359-4699-82dc-d6bf5eddd7a7' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>