## AdaBoost Classifier

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

### Import and check out data

In [3]:
# Read heart.csv from the local ./data directory into a DataFrame.
df_heart = pd.read_csv('./data/heart.csv')

In [4]:
# Check column dtypes and non-null counts before modelling.
df_heart.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
age         303 non-null int64
sex         303 non-null int64
cp          303 non-null int64
trestbps    303 non-null int64
chol        303 non-null int64
fbs         303 non-null int64
restecg     303 non-null int64
thalach     303 non-null int64
exang       303 non-null int64
oldpeak     303 non-null float64
slope       303 non-null int64
ca          303 non-null int64
thal        303 non-null int64
target      303 non-null int64
dtypes: float64(1), int64(13)
memory usage: 33.2 KB


In [5]:
# Summary statistics, transposed so that each feature is a row.
df_heart.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,303.0,54.366337,9.082101,29.0,47.5,55.0,61.0,77.0
sex,303.0,0.683168,0.466011,0.0,0.0,1.0,1.0,1.0
cp,303.0,0.966997,1.032052,0.0,0.0,1.0,2.0,3.0
trestbps,303.0,131.623762,17.538143,94.0,120.0,130.0,140.0,200.0
chol,303.0,246.264026,51.830751,126.0,211.0,240.0,274.5,564.0
fbs,303.0,0.148515,0.356198,0.0,0.0,0.0,0.0,1.0
restecg,303.0,0.528053,0.52586,0.0,0.0,1.0,1.0,2.0
thalach,303.0,149.646865,22.905161,71.0,133.5,153.0,166.0,202.0
exang,303.0,0.326733,0.469794,0.0,0.0,0.0,1.0,1.0
oldpeak,303.0,1.039604,1.161075,0.0,0.0,0.8,1.6,6.2


### train/test split

In [6]:
# Feature matrix: every column except the label.
X = df_heart.drop(columns = 'target')

In [7]:
# Label vector: the 'target' column as a Series.
y = df_heart.loc[:, 'target']

In [8]:
# Hold out a test set; stratifying on y keeps the class balance the same in
# both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify = y,
    random_state = 42,
)

### Instantiate, fit, and score model

In [9]:
# AdaBoost with library defaults; only the seed is fixed for reproducibility.
ada_class = AdaBoostClassifier(random_state = 12)

In [10]:
# Fit on the training split only.
ada_class.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=50, random_state=12)

In [11]:
# Mean accuracy on the training split.
ada_class.score(X_train, y_train)

0.933920704845815

In [12]:
# Mean accuracy on the held-out test split; compare with the training score to gauge overfitting.
ada_class.score(X_test, y_test)

0.7368421052631579

#### Compared to Random Forest

In [13]:
# Random forest baseline with library defaults and the same seed as the AdaBoost model.
rf_class = RandomForestClassifier(random_state=12)

In [14]:
# Fit the baseline on the same training split used for AdaBoost.
rf_class.fit(X_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=12, verbose=0,
                       warm_start=False)

In [15]:
# Mean accuracy on the training split.
rf_class.score(X_train, y_train)

0.9911894273127754

In [16]:
# Mean accuracy on the held-out test split.
rf_class.score(X_test, y_test)

0.7236842105263158

### Gridsearching with AdaBoost

In [17]:
from sklearn.model_selection import GridSearchCV

In [18]:
# Unfitted AdaBoost instance used as the grid-search template.
# NOTE: this rebinds `ada_class`, so the model fitted earlier is no longer reachable by name.
ada_class = AdaBoostClassifier()

#### Create our set of estimators

In [19]:
# Candidate weak learners for AdaBoost at depths 1, 2, 3 and 10, all seeded with 24.

# Single-tree random forests (n_estimators = 1).
rf_class_1 = RandomForestClassifier(n_estimators = 1, max_depth = 1, random_state = 24)
rf_class_2 = RandomForestClassifier(n_estimators = 1, max_depth = 2, random_state = 24)
rf_class_3 = RandomForestClassifier(n_estimators = 1, max_depth = 3, random_state = 24)
rf_class_10 = RandomForestClassifier(n_estimators = 1, max_depth = 10, random_state = 24)

# Plain decision trees at the same depths.
dt_class_1 = DecisionTreeClassifier(random_state = 24, max_depth = 1)
dt_class_2 = DecisionTreeClassifier(random_state = 24, max_depth = 2)
dt_class_3 = DecisionTreeClassifier(random_state = 24, max_depth = 3)
dt_class_10 = DecisionTreeClassifier(random_state = 24, max_depth = 10)

#### Create our set of AdaBoost parameters to gridsearch through

In [20]:
# Search space: 8 base estimators x 4 ensemble sizes x 3 learning rates
# (96 combinations); random_state is pinned to a single value.
params = {
    'base_estimator' : [
        rf_class_1, dt_class_1,
        rf_class_2, dt_class_2,
        rf_class_3, dt_class_3,
        rf_class_10, dt_class_10,
    ],
    'n_estimators' : [50, 100, 150, 200],
    'learning_rate' : [1, 0.9, 0.8],
    'random_state' : [24],
}

#### Instantiate and fit our GridSearch model

In [21]:
# Exhaustive search over `params` with 3-fold cross-validation.
gs = GridSearchCV(ada_class, params, cv = 3)

In [22]:
# Run the search on the training split only; the test split stays untouched for the final check.
gs.fit(X_train, y_train)



GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=AdaBoostClassifier(algorithm='SAMME.R',
                                          base_estimator=None,
                                          learning_rate=1.0, n_estimators=50,
                                          random_state=None),
             iid='warn', n_jobs=None,
             param_grid={'base_estimator': [RandomForestClassifier(bootstrap=True,
                                                                   class_weight=None,
                                                                   criterion='gini',
                                                                   max_depth=1,
                                                                   max_features='auto',
                                                                   max_leaf_nodes=None,...
                                                                   max_leaf_nodes=None,
                                           

In [27]:
# Returns the best mean cross-validated score found by the grid search

gs.best_score_

0.8458149779735683

In [24]:
# Parameter combination that achieved the best cross-validated score.
gs.best_params_

{'base_estimator': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                        max_depth=3, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, n_estimators=1,
                        n_jobs=None, oob_score=False, random_state=24, verbose=0,
                        warm_start=False),
 'learning_rate': 0.8,
 'n_estimators': 50,
 'random_state': 24}

In [25]:
# Score of the best estimator found by the search on the training split.
gs.score(X_train, y_train)

0.9911894273127754

In [26]:
# Score of the best estimator found by the search on the held-out test split.
gs.score(X_test, y_test)

0.75

#### How about a Pipeline to adjust preprocessing and the model together?

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
steps = 

In [None]:
ada_pipeline = Pipeline()

## Gradient Boost Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Gradient boosting classifier with library defaults and a fixed seed.
grb_class = GradientBoostingClassifier(random_state = 12)

In [None]:
# Fit on the training split.
grb_class.fit(X_train, y_train)

In [None]:
# Mean accuracy on the training split.
grb_class.score(X_train, y_train)

In [None]:
# Mean accuracy on the held-out test split.
grb_class.score(X_test, y_test)

## XGBoost - eXtreme Gradient Boost Classifier

In [29]:
!pip install xgboost

Collecting xgboost
  Downloading https://files.pythonhosted.org/packages/5e/49/b95c037b717b4ceadc76b6e164603471225c27052d1611d5a2e832757945/xgboost-0.90-py2.py3-none-win_amd64.whl (18.3MB)
Installing collected packages: xgboost
Successfully installed xgboost-0.90


The system cannot find the path specified.


In [28]:
from xgboost import XGBClassifier, XGBRFClassifier

ModuleNotFoundError: No module named 'xgboost'

In [None]:
# Two XGBoost classifiers with the same seed:
# the random-forest variant ...
xgb_rf_class = XGBRFClassifier(random_state = 12)
# ... and the gradient-boosted variant.
xgb_class = XGBClassifier(random_state = 12)

In [None]:
# Fit the boosted XGBoost classifier on the training split.
xgb_class.fit(X_train, y_train)

In [None]:
# Score on the training split.
xgb_class.score(X_train, y_train)

In [None]:
# Score on the held-out test split.
xgb_class.score(X_test, y_test)

In [None]:
# Fit the XGBoost random-forest classifier on the training split.
xgb_rf_class.fit(X_train, y_train)

In [None]:
# Score on the training split.
xgb_rf_class.score(X_train, y_train)

In [None]:
# Score on the held-out test split.
xgb_rf_class.score(X_test, y_test)

## GradientBoost Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Read superconductor.csv from the local data directory; used for the regression examples below.
df = pd.read_csv('data/superconductor.csv')

In [None]:
# Peek at the first rows to sanity-check the load.
df.head()

In [None]:
# Check column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics, transposed so that each feature is a row.
df.describe().T

### train / test split

In [None]:
# Regression target: the 'critical_temp' column.
# NOTE: rebinds `y`, replacing the classification labels defined earlier.
y = df.loc[:, 'critical_temp']

In [None]:
# Feature matrix: every column except the regression target.
# NOTE: rebinds `X`, replacing the classification features defined earlier.
X = df.drop(columns = 'critical_temp')

In [None]:
# Hold out a test set for the regression data with a fixed seed.
# No stratification here because the target is continuous.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state = 42,
)

### Instantiate, fit, and score model

In [None]:
# Gradient boosting regressor with library defaults. The seed is fixed, as in
# the classifier cells above, so that scores are reproducible across runs.
grb_model = GradientBoostingRegressor(random_state = 12)

In [None]:
# Fit on the training split.
grb_model.fit(X_train, y_train)

In [None]:
# R^2 on the training split.
grb_model.score(X_train, y_train)

In [None]:
# R^2 on the held-out test split.
grb_model.score(X_test, y_test)

### Random Forest Comparison

In [None]:
# Random forest regressor for comparison (100 trees). The seed is fixed, as in
# the classifier cells above, so that scores are reproducible across runs.
rf_model = RandomForestRegressor(n_estimators = 100, random_state = 12)

In [None]:
# Fit on the training split.
rf_model.fit(X_train, y_train)

In [None]:
# R^2 on the training split.
rf_model.score(X_train, y_train)

In [None]:
# R^2 on the held-out test split.
rf_model.score(X_test, y_test)

## XGB Regressor

In [None]:
# The notebook only does `from xgboost import XGBClassifier, XGBRFClassifier`,
# so the bare module name `xgboost` must be imported before it can be used here.
import xgboost

# Seeded like the XGBoost classifiers above so that scores are reproducible.
xgb_model = xgboost.XGBRegressor(random_state = 12)

In [None]:
# Fit on the training split.
xgb_model.fit(X_train, y_train)

In [None]:
# Score on the training split.
xgb_model.score(X_train, y_train)

In [None]:
# Score on the held-out test split.
xgb_model.score(X_test, y_test)

#### Max Depth Change

In [None]:
# The notebook only does `from xgboost import XGBClassifier, XGBRFClassifier`,
# so the bare module name `xgboost` must be imported before it can be used here.
import xgboost

# Random-forest flavour of XGBoost with deeper trees (max_depth = 10);
# seeded like the XGBoost classifiers above so that scores are reproducible.
xgbrf_model = xgboost.XGBRFRegressor(max_depth = 10, random_state = 12)

In [None]:
# Fit on the training split.
xgbrf_model.fit(X_train, y_train)

In [None]:
# Score on the training split.
xgbrf_model.score(X_train, y_train)

In [None]:
# Score on the held-out test split.
xgbrf_model.score(X_test, y_test)

## AdaBoost Regressor

In [None]:
from sklearn.linear_model import LinearRegression, LassoCV

In [None]:
# Small, shallow random forest (10 trees, depth 5) to serve as the AdaBoost
# base learner. Seeded for reproducibility, like the classifier cells above.
rf_reg = RandomForestRegressor(n_estimators = 10, max_depth = 5, random_state = 12)

In [None]:
# AdaBoostRegressor is not imported anywhere else in the notebook, so import it
# here; without this the cell raises NameError.
from sklearn.ensemble import AdaBoostRegressor

# Boost 100 copies of the shallow random forest; seeded for reproducibility.
# NOTE: `base_estimator` was renamed to `estimator` in scikit-learn 1.2; the
# original keyword is kept to match the scikit-learn version this notebook was run with.
ada_model = AdaBoostRegressor(base_estimator = rf_reg, n_estimators = 100, random_state = 12)

In [None]:
# Fit on the training split.
ada_model.fit(X_train, y_train)

In [None]:
# R^2 on the training split.
ada_model.score(X_train, y_train)

In [None]:
# R^2 on the held-out test split.
ada_model.score(X_test, y_test)