# Gradient Boosting using CatBoost

This is a preparatory setup for the upcoming gradient boosting. Now that the preprocessing is finished, this could be a very powerful step towards creating a predictor!

In [8]:
from typing import (
    List,
    Dict,
    Any,
    Union,
    
)

In [1]:
import numpy as np
import pandas as pd
import sklearn as sk
import catboost as cb

### Get Data Ready for the Model

In [12]:
# Read the preprocessed CSV and discard the leftover index column together
# with the two identifier columns, which are not used as model inputs.
df = pd.read_csv("./data/new_data_no_missingvalues.csv").drop(
    ["Unnamed: 0", "id", "unique_id"], axis="columns"
)

In [13]:
# Show the loaded frame as a quick sanity check of its columns and row count.
df

Unnamed: 0,date,average_mood,total_calls,total_sms,average_com,mood_next_day,average_soc,average_valence,average_arousal,average_finance,average_office,average_entertainment,average_game
0,2014-02-26,6.25,0.0,0.0,0.000000,6.25,0.000000,0.750000,-0.250000,0.000000,0.000,0.000000,0.0
1,2014-03-21,6.25,6.0,0.0,55.095526,6.40,75.141667,0.200000,0.200000,6.193000,57.402,53.024000,0.0
2,2014-03-22,6.40,0.0,0.0,51.697063,6.80,36.636000,0.698649,0.600000,7.025333,0.000,46.662000,0.0
3,2014-03-23,6.80,0.0,0.0,53.442031,6.00,50.046611,0.800000,0.200000,10.850750,0.000,31.448667,0.0
4,2014-03-24,6.00,0.0,0.0,47.541687,6.75,74.968047,0.000000,0.800000,5.684333,3.010,122.121375,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1210,2014-05-25,6.25,3.0,8.0,34.829867,5.40,45.534460,0.481111,-0.750000,0.000000,0.000,64.738667,0.0
1211,2014-05-26,5.40,10.0,3.0,34.320472,6.20,111.046204,0.000000,-0.400000,0.000000,130.530,68.876900,0.0
1212,2014-05-27,6.20,1.0,2.0,59.382943,8.20,50.259100,0.400000,-0.600000,0.000000,0.000,62.347000,0.0
1213,2014-05-28,8.20,10.0,1.0,37.238750,7.00,99.281685,1.200000,0.000000,0.000000,0.000,43.907571,0.0


In [14]:
def prep_data(raw_df: pd.DataFrame, target_columns: List, drop_first: bool = True, make_na_col: bool = True) -> pd.DataFrame:
    """One-hot encode selected columns of a dataframe.

    Parameters
    ----------
    raw_df : pd.DataFrame
        Frame to encode; it is passed unchanged to ``pd.get_dummies``.
    target_columns : List
        Names of the columns to replace with indicator (dummy) columns.
    drop_first : bool, default True
        Forwarded to ``pd.get_dummies`` as ``drop_first``.
    make_na_col : bool, default True
        Forwarded to ``pd.get_dummies`` as ``dummy_na``.

    Returns
    -------
    pd.DataFrame
        The frame produced by ``pd.get_dummies`` for the given options.
    """
    encoding_options = {
        "columns": target_columns,
        "drop_first": drop_first,
        "dummy_na": make_na_col,
    }
    return pd.get_dummies(raw_df, **encoding_options)

In [15]:
# Columns that get replaced by indicator (dummy) columns before modelling.
target_columns = ["date"]
dfp = prep_data(raw_df=df, target_columns=target_columns)

### Splitting the Data

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
# Hold out 35% of the rows as the final test set; the fixed seed keeps the
# split reproducible.
# NOTE(review): `mood_next_day` stays among the features while `average_mood`
# is the target — confirm this is the intended prediction setup.
X_train, X_test, y_train, y_test = train_test_split(
    dfp.drop("average_mood", axis="columns"),
    dfp["average_mood"],
    test_size=0.35,
    random_state=58,
)

In [20]:
# Split the training portion once more so a validation set is available
# while the test data stays untouched.
X_tset, X_val, y_tset, y_val = train_test_split(
    X_train, y_train, test_size=0.35, random_state=58
)

### Apply the Model

In [45]:
# Wrap the training and validation splits in CatBoost's Pool container.
# NOTE: `test_dataset` holds the validation split (X_val, y_val), not X_test.
train_dataset = cb.Pool(data=X_tset, label=y_tset)
test_dataset = cb.Pool(data=X_val, label=y_val)

# Settings shared by the CatBoost regressors below.
params = dict(verbose=100, random_seed=46)

In [42]:
# Build the CatBoost regressor from the shared `params` dictionary.
CBR = cb.CatBoostRegressor(**params)

In [29]:
# Train on the training pool and monitor the validation pool as eval_set.
CBR.fit(
    train_dataset,
    eval_set=test_dataset,
    use_best_model=True,
    # plot=True  # One of CatBoost's most powerful features, but it isn't supported in Deepnote
)

Learning rate set to 0.040601
0:	learn: 0.6830160	test: 0.7840774	best: 0.7840774 (0)	total: 601us	remaining: 600ms
100:	learn: 0.4171239	test: 0.5526877	best: 0.5526877 (100)	total: 174ms	remaining: 1.55s
200:	learn: 0.3575185	test: 0.5316921	best: 0.5315901 (197)	total: 377ms	remaining: 1.5s
300:	learn: 0.3135491	test: 0.5320198	best: 0.5304701 (232)	total: 605ms	remaining: 1.41s
400:	learn: 0.2761319	test: 0.5339120	best: 0.5304701 (232)	total: 810ms	remaining: 1.21s
500:	learn: 0.2483145	test: 0.5364827	best: 0.5304701 (232)	total: 1.01s	remaining: 1s
600:	learn: 0.2235363	test: 0.5387043	best: 0.5304701 (232)	total: 1.21s	remaining: 802ms
700:	learn: 0.2008271	test: 0.5416215	best: 0.5304701 (232)	total: 1.41s	remaining: 602ms
800:	learn: 0.1804462	test: 0.5435100	best: 0.5304701 (232)	total: 1.63s	remaining: 405ms
900:	learn: 0.1635631	test: 0.5442284	best: 0.5304701 (232)	total: 1.83s	remaining: 201ms
999:	learn: 0.1494318	test: 0.5458645	best: 0.5304701 (232)	total: 2.02s	remai

<catboost.core.CatBoostRegressor at 0x7f2c9cd79ed0>

In [36]:
# Keep only the features whose importance is greater than 1.
feature_df = (
    CBR.get_feature_importance(prettified=True)
    .loc[lambda importances: importances["Importances"] > 1]
)
feature_df

Unnamed: 0,Feature Id,Importances
0,average_valence,47.507017
1,mood_next_day,17.914918
2,average_entertainment,7.132984
3,average_soc,5.320121
4,average_com,5.265219
5,average_arousal,4.660912
6,average_game,2.48015
7,average_office,2.225003
8,total_calls,1.600627
9,average_finance,1.102605


None of the one-hot encoded date columns reaches an importance above 1, which suggests that we do not actually need the dates to determine any type of mood.

In [37]:
# Score the fitted model on the validation pool (stored as `test_dataset`).
# NOTE(review): presumably this is the R² score — confirm against the CatBoost docs.
CBR.score(test_dataset)

0.5523780111135459

Sadly, the score is quite low at the moment. This is most likely due to the limited size of the dataset.

### Cheating to Check how the Model Improves

In [46]:
# Pools built from the full training split and the held-out test split.
trainset = cb.Pool(data=X_train, label=y_train)
testset = cb.Pool(data=X_test, label=y_test)

In [47]:
# Second regressor with the same `params`, used for the "cheating" experiment.
CBRC = cb.CatBoostRegressor(**params)

In [48]:
# Train on the full training split and monitor the held-out test split as
# eval_set. The test data therefore influences which iteration is kept, so
# this run is only a check, not an unbiased evaluation.
CBRC.fit(
    trainset,
    eval_set=testset,
    use_best_model=True,
    # plot=True  # One of CatBoost's most powerful features, but it isn't supported in Deepnote
)

Learning rate set to 0.044059
0:	learn: 0.7172516	test: 0.7178727	best: 0.7178727 (0)	total: 2.05ms	remaining: 2.05s
100:	learn: 0.4217718	test: 0.5143571	best: 0.5143571 (100)	total: 216ms	remaining: 1.93s
200:	learn: 0.3728832	test: 0.5100793	best: 0.5096232 (189)	total: 434ms	remaining: 1.72s
300:	learn: 0.3262427	test: 0.5125615	best: 0.5096232 (189)	total: 684ms	remaining: 1.59s
400:	learn: 0.2964895	test: 0.5157831	best: 0.5096232 (189)	total: 902ms	remaining: 1.35s
500:	learn: 0.2685912	test: 0.5185992	best: 0.5096232 (189)	total: 1.19s	remaining: 1.18s
600:	learn: 0.2432462	test: 0.5220345	best: 0.5096232 (189)	total: 1.4s	remaining: 931ms
700:	learn: 0.2230288	test: 0.5249131	best: 0.5096232 (189)	total: 1.65s	remaining: 706ms
800:	learn: 0.2053769	test: 0.5268063	best: 0.5096232 (189)	total: 1.85s	remaining: 461ms
900:	learn: 0.1902339	test: 0.5282712	best: 0.5096232 (189)	total: 2.11s	remaining: 232ms
999:	learn: 0.1769719	test: 0.5302190	best: 0.5096232 (189)	total: 2.34s	r

<catboost.core.CatBoostRegressor at 0x7f2c9928fc50>

Still very poor...

# Gradient Boosting using scikit-learn

In [51]:
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
# hard_params = {'n_estimators': 1000, 'max_leaf_nodes': 4, 'max_depth': None, 'random_state': 2,
#                 'min_samples_split': 5}

# # intermediate params

# params = dict(hard_params)
# # params.update(inter_params)

In [52]:
# Fit scikit-learn's gradient boosting regressor on the training subset
# (X_tset, y_tset); X_val / y_val are kept for scoring afterwards.
# All hyperparameters stay at their defaults except random_state, which is
# fixed (same seed as the data splits) so that repeated runs produce the same
# model and therefore the same score.
GBC = GradientBoostingRegressor(random_state=58)
GBC.fit(X_tset, y_tset)

GradientBoostingRegressor()

In [53]:
# Score the scikit-learn model on the validation split.
GBC.score(X_val, y_val)

0.5476870746911642

Unfortunately, this score is just as poor as the one obtained with CatBoost.

In [54]:
# Predictions for the validation split, compared against y_val below.
ynew = GBC.predict(X_val)

In [61]:
# Count the validation predictions that land strictly within 0.5 of the
# true value.
# NOTE: despite its name, `percentage` holds a raw count at this point.
percentage = sum(1 for x, y in zip(ynew, y_val) if abs(x - y) < 0.5)

In [63]:
# Turn the count of close predictions into a percentage of the validation set.
percentage/len(y_val) *100

71.48014440433214

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=d79defa5-2359-4699-82dc-d6bf5eddd7a7' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>