
# Install and Import libraries

In [1]:
!pip install catboost



In [6]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor
from catboost import CatBoostClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import AdaBoostRegressor
import lightgbm as lgb
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Load the dataset and give it the desired format

In [7]:
from sklearn.datasets import load_breast_cancer, load_boston
bc = load_breast_cancer()
boston = load_boston()

df_classification = pd.DataFrame(data=np.c_[bc['data'], bc['target']], columns=list(bc['feature_names'])+['target'])
df_regression = pd.DataFrame(data=np.c_[boston['data'], boston['target']], columns=list(boston['feature_names'])+['target'])

In [8]:
df_classification.tail()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
564,21.56,22.39,142.0,1479.0,0.111,0.1159,0.2439,0.1389,0.1726,0.05623,...,26.4,166.1,2027.0,0.141,0.2113,0.4107,0.2216,0.206,0.07115,0.0
565,20.13,28.25,131.2,1261.0,0.0978,0.1034,0.144,0.09791,0.1752,0.05533,...,38.25,155.0,1731.0,0.1166,0.1922,0.3215,0.1628,0.2572,0.06637,0.0
566,16.6,28.08,108.3,858.1,0.08455,0.1023,0.09251,0.05302,0.159,0.05648,...,34.12,126.7,1124.0,0.1139,0.3094,0.3403,0.1418,0.2218,0.0782,0.0
567,20.6,29.33,140.1,1265.0,0.1178,0.277,0.3514,0.152,0.2397,0.07016,...,39.42,184.6,1821.0,0.165,0.8681,0.9387,0.265,0.4087,0.124,0.0
568,7.76,24.54,47.92,181.0,0.05263,0.04362,0.0,0.0,0.1587,0.05884,...,30.37,59.16,268.6,0.08996,0.06444,0.0,0.0,0.2871,0.07039,1.0


In [7]:
df_regression.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


# Train / Test split

In [8]:
X = df_classification.drop('target', axis=1)
y = df_classification['target']
X_train_classification, X_val_classification, y_train_classification, y_val_classification = train_test_split(X, y, test_size = 0.3, random_state = 101)

X = df_regression.drop('target', axis=1)
y = df_regression['target']
X_train_regression, X_val_regression, y_train_regression, y_val_regression = train_test_split(X, y, test_size = 0.3, random_state = 101)

# Models

Define parameters

In [9]:
n_trees = 1000
lr = 0.05
rs = 101

#Decision Tree

Regression:

In [10]:
from sklearn.tree import DecisionTreeRegressor
dtree_reg = DecisionTreeRegressor()
dtree_reg.fit(X_train_regression, y_train_regression)
acc_dt_reg = dtree_reg.score(X_val_regression, y_val_regression)
print(acc_dt_reg)

0.7809901988337643


Classification:

In [11]:
from sklearn.tree import DecisionTreeClassifier
dtree_class = DecisionTreeClassifier()
dtree_class.fit(X_train_classification, y_train_classification)
acc_dt_classification = dtree_class.score(X_val_classification, y_val_classification)
print(acc_dt_classification)

0.9064327485380117


# Random Forest

Regression:

In [12]:
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor(n_estimators=n_trees, random_state=rs, n_jobs=-1)
rfr.fit(X_train_regression, y_train_regression)
acc_rf_reg = rfr.score(X_val_regression, y_val_regression)
print(acc_rf_reg)

0.8643991426525824


Classification:

In [13]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=n_trees, random_state=rs, n_jobs=-1)
rfc.fit(X_train_classification, y_train_classification)
acc_rf_classification = rfc.score(X_val_classification, y_val_classification)
print(acc_rf_classification)

0.9590643274853801


# XGBOOST Model



XGBoost stands for "Extreme Gradient Boosting". It is a Random Forest model with Boosting and optimised for speed.

**What is Boosting?**

I'ts very similar to a Random Forest algorithm but instead of creating the new trees at random, you create them looking to minimize the error.

More info about Gradient Boosting Machines:
https://towardsdatascience.com/understanding-gradient-boosting-machines-9be756fe76ab

Regression:

In [14]:
model=xgb.XGBRegressor(objective ='reg:squarederror',n_estimators=n_trees, learning_rate=lr, random_state=rs, n_jobs=-1) #for the best model, high number of estimators, low learning rate
model.fit(X_train_regression, y_train_regression)
acc_xgb_reg = model.score(X_val_regression,y_val_regression)
print(acc_xgb_reg)

0.8787903725114736


Classification:

In [15]:
model = xgb.XGBClassifier(n_estimators=n_trees, learning_rate=lr, random_state=rs,  n_jobs=-1) #for the best model, high number of estimators, low learning rate
model.fit(X_train_classification, y_train_classification)
acc_xgb_classification = model.score(X_val_classification,y_val_classification)
print(acc_xgb_classification)

0.9649122807017544


# CatBoost Model
Very useful when dealing with categorical features. With the datasets in this session we can't use the most powerful feature of catboost which is its processing of categorical features. When dealing with categorical features you should use the parameter *cat_features* — This parameter is a must in order to leverage Catboost preprocessing of categorical features, if you encode the categorical features yourself and don’t pass the columns indices as cat_features you are missing the essence of Catboost.

There is an example of code in the comments of how to use the categorical features

More info:
https://towardsdatascience.com/https-medium-com-talperetz24-mastering-the-new-generation-of-gradient-boosting-db04062a7ea2

https://medium.com/@hanishsidhu/whats-so-special-about-catboost-335d64d754ae



Regression:

In [16]:
model = CatBoostRegressor(silent=True, n_estimators=n_trees, learning_rate=lr, random_state=rs)
#categorical_features_indices = np.where(df.dtypes != np.float)[0]
model.fit(X_train_regression, y_train_regression) #Add parameter cat_features=categorical_features_indices
acc_cb_reg = model.score(X_val_regression, y_val_regression)
print(acc_cb_reg)

0.8911988874756677


Classification:

In [17]:
model = CatBoostClassifier(silent=True, n_estimators=n_trees, learning_rate=lr, random_state=rs)
#categorical_features_indices = np.where(df.dtypes != np.float)[0]
model.fit(X_train_classification, y_train_classification) #Add parameter cat_features=categorical_features_indices
#acc_cb_classification = model.score(X_val_classification, y_val_classification) #For some reason not working
acc_cb_classification = 1-sum(model.predict(X_val_classification)-y_val_classification)/len(y_val_classification)
print(acc_cb_classification)

0.9883040935672515


# AdaBoost

Another way of boosting.

More info:
https://towardsdatascience.com/boosting-and-adaboost-clearly-explained-856e21152d3e

Regression:

In [18]:
model = AdaBoostRegressor(random_state=101, n_estimators=n_trees, learning_rate=lr)
model.fit(X_train_regression, y_train_regression)
acc_ab_reg = model.score(X_val_regression, y_val_regression)
print(acc_ab_reg)

0.8279691982291295


Classification:

In [19]:
model = AdaBoostClassifier(random_state=101, n_estimators=n_trees, learning_rate=lr)
model.fit(X_train_classification, y_train_classification)
acc_ab_classification = model.score(X_val_classification,y_val_classification)
print(acc_ab_classification)

0.9590643274853801


# Model performance comparison

In [20]:
models_regression = pd.DataFrame({'Model': ['Decision Tree', 'Random Forest', 'XGBoost', 'CatBoost', 'AdaBoost'],
                       'Score': [acc_dt_reg, acc_rf_reg, acc_xgb_reg, acc_cb_reg, acc_ab_reg]})
models_regression.sort_values(by='Score', ascending=False)

Unnamed: 0,Model,Score
3,CatBoost,0.891199
2,XGBoost,0.87879
1,Random Forest,0.864399
4,AdaBoost,0.827969
0,Decision Tree,0.78099


In [21]:
models_categorical = pd.DataFrame({'Model': ['Decision Tree', 'Random Forest', 'XGBoost', 'CatBoost', 'AdaBoost'],
                       'Score': [acc_dt_classification, acc_rf_classification, acc_xgb_classification, acc_cb_classification, acc_ab_classification]})
models_categorical.sort_values(by='Score', ascending=False)

Unnamed: 0,Model,Score
3,CatBoost,0.988304
2,XGBoost,0.964912
1,Random Forest,0.959064
4,AdaBoost,0.959064
0,Decision Tree,0.906433


In [22]:
#Load Boston Data Set
