# Exercise - Ensemble

In this exercise, we will focus on underage drinking. The data set contains data about high school students. Each row represents a single student. The columns include the characteristics of deidentified students. This is a binary classification task: predict whether a student drinks alcohol or not (this is the **alc** column: 1=Yes, 0=No). This is an important prediction task to detect underage drinking and deploy intervention techniques. 

## Description of Variables

The description of variables are provided in "Alcohol - Data Dictionary.docx"

## Goal

Use the **alcohol.csv** data set and build a model to predict **alc**. 

# Read and Prepare the Data

In [76]:
# Common imports

import pandas as pd
import numpy as np

np.random.seed(42)

# Get the data

In [77]:
#We will predict the "price" value in the data set:

alcohol = pd.read_csv("alcohol.csv")
alcohol.head()

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,health,absences,gender,alc
0,18,2,1,4,2,0,5,4,2,5,2,M,1
1,18,4,3,1,0,0,4,4,2,3,9,M,1
2,15,4,3,2,3,0,5,3,4,5,0,F,0
3,15,3,3,1,4,0,4,3,3,3,10,F,0
4,17,3,2,1,2,0,5,3,5,5,2,M,1


# Split data (train/test)

In [78]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(alcohol, test_size=0.3)

# Data Prep

In [79]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer

## Separate the target variable 

In [80]:
train_target = train['alc']
test_target = test['alc']

train_inputs = train.drop(['alc'], axis=1)
test_inputs = test.drop(['alc'], axis=1)

## Feature Engineering: Derive a new column

Examples:
- Ratio of study time to travel time
- Student is younger than 18 or not
- Average of father's and mother's level of education
- (etc.)

In [81]:
def new_col(df):
    #Create a copy so that we don't overwrite the existing dataframe
    df1 = df.copy()
    
    df1['traveltime'] = np.where(df1['studytime'] > 0, 1, 0)
    
    return df1[['traveltime']]
    # You can use this to check whether the calculation is made correctly:
    #return df1

In [82]:
#Let's test the new function:

# Send train set to the function we created
new_col(train)

Unnamed: 0,traveltime
12759,1
4374,1
8561,1
10697,1
19424,1
...,...
16850,1
6265,1
11284,1
860,1


##  Identify the numeric, binary, and categorical columns

In [83]:
# Identify the numerical columns
numeric_columns = train_inputs.select_dtypes(include=[np.number]).columns.to_list()

# Identify the categorical columns
categorical_columns = train_inputs.select_dtypes('object').columns.to_list()

In [84]:
numeric_columns

['age',
 'Medu',
 'Fedu',
 'traveltime',
 'studytime',
 'failures',
 'famrel',
 'freetime',
 'goout',
 'health',
 'absences']

In [85]:
categorical_columns

['gender']

In [86]:
feat_eng_columns = ['studytime']

# Pipeline

In [87]:
numeric_transformer = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler())])

In [88]:
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [89]:
# Create a pipeline for the transformed column here
my_new_column = Pipeline(steps=[('my_new_column', FunctionTransformer(new_col))])

In [90]:
preprocessor = ColumnTransformer([
        ('num', numeric_transformer, numeric_columns),
        ('cat', categorical_transformer, categorical_columns),
        ('trans', my_new_column, feat_eng_columns)
        ],
        remainder='passthrough')

#passtrough is an optional step. You don't have to use it.

# Transform: fit_transform() for TRAIN

In [91]:
#Fit and transform the train data
train_x = preprocessor.fit_transform(train_inputs)

train_x

array([[ 0.66643886,  0.96597412,  0.90362635, ...,  1.        ,
         0.        ,  1.        ],
       [ 0.66643886, -0.93881619, -1.68666277, ...,  0.        ,
         1.        ,  1.        ],
       [ 0.66643886,  0.33104402,  0.04019664, ...,  0.        ,
         1.        ,  1.        ],
       ...,
       [ 0.66643886, -2.20867639, -2.55009248, ...,  1.        ,
         0.        ,  1.        ],
       [ 1.6195814 , -0.30388608, -1.68666277, ...,  0.        ,
         1.        ,  1.        ],
       [ 1.6195814 , -0.30388608, -2.55009248, ...,  0.        ,
         1.        ,  1.        ]])

In [92]:
train_x.shape

(23800, 14)

# Tranform: transform() for TEST

In [93]:
# Transform the test data
test_x = preprocessor.transform(test_inputs)

test_x

array([[-1.23984621,  0.33104402,  1.76705606, ...,  1.        ,
         0.        ,  1.        ],
       [-1.23984621, -0.30388608,  0.04019664, ...,  0.        ,
         1.        ,  1.        ],
       [-0.28670367,  0.33104402,  0.04019664, ...,  0.        ,
         1.        ,  1.        ],
       ...,
       [ 0.66643886, -0.30388608,  0.04019664, ...,  1.        ,
         0.        ,  1.        ],
       [-1.23984621, -0.93881619,  0.04019664, ...,  0.        ,
         1.        ,  1.        ],
       [-1.23984621,  0.96597412,  0.04019664, ...,  1.        ,
         0.        ,  1.        ]])

In [94]:
test_x.shape

(10200, 14)

# Calculate the Baseline

In [95]:
from sklearn.dummy import DummyClassifier

dummy_clf = DummyClassifier(strategy="most_frequent")

dummy_clf.fit(train_x, train_target)

In [96]:
from sklearn.metrics import accuracy_score

In [97]:
#Baseline Train Accuracy
dummy_train_pred = dummy_clf.predict(train_x)

baseline_train_acc = accuracy_score(train_target, dummy_train_pred)

print('Baseline Train Accuracy: {}' .format(baseline_train_acc))

Baseline Train Accuracy: 0.5234873949579832


In [98]:
#Baseline Test Accuracy
dummy_test_pred = dummy_clf.predict(test_x)

baseline_test_acc = accuracy_score(test_target, dummy_test_pred)

print('Baseline Test Accuracy: {}' .format(baseline_test_acc))

Baseline Test Accuracy: 0.5194117647058824


# Train a voting classifier 

In [142]:
#Each model should have predict_proba() function. Otherwise, you can't use it for soft voting
#We can't use sgd, because it doesn't have predict_proba() function.

from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier 
from sklearn.linear_model import LogisticRegression

dtree_clf = DecisionTreeClassifier(max_depth=10)
log_clf = LogisticRegression(multi_class='multinomial', solver = 'lbfgs', C=10, max_iter=1000)

voting_clf = VotingClassifier(
            estimators=[('dt', dtree_clf), 
                        ('lr', log_clf)],
            voting='soft')

voting_clf.fit(train_x, train_target)

In [143]:
#Train accuracy

train_y_pred = voting_clf.predict(train_x)

train_acc = accuracy_score(train_target, train_y_pred)

print('Train acc: {}' .format(train_acc))

Train acc: 0.8441596638655462


In [144]:
#Test accuracy

test_y_pred = voting_clf.predict(test_x)

test_acc = accuracy_score(test_target
                          , test_y_pred)

print('Test acc: {}' .format(test_acc))

Test acc: 0.8156862745098039


In [145]:
for clf in (dtree_clf, log_clf, voting_clf):
    clf.fit(train_x, train_target.ravel())
    test_y_pred = reg.predict(test_x)
    print(clf.__class__.__name__, 'Test acc=', (accuracy_score(test_target, test_y_pred)))

DecisionTreeClassifier Test acc= 0.756078431372549
LogisticRegression Test acc= 0.756078431372549
VotingClassifier Test acc= 0.756078431372549


# Train a bagging classifier

In [102]:
from sklearn.ensemble import BaggingClassifier 


#If you want to do pasting, change "bootstrap=False"
#n_jobs=-1 means use all CPU cores
#bagging automatically performs soft voting

bag_clf = BaggingClassifier( 
            SGDClassifier(), n_estimators=50, 
            max_samples=1000, bootstrap=True, n_jobs=-1) 

bag_clf.fit(train_x, train_target)

    

In [103]:
#Train accuracy

train_y_pred = bag_clf.predict(train_x)

train_acc = accuracy_score(train_target, train_y_pred)

print('Train acc: {}' .format(train_acc))

Train acc: 0.8213865546218487


In [104]:
#Test accuracy

test_y_pred = bag_clf.predict(test_x)

test_acc = accuracy_score(test_target, test_y_pred)

print('Test acc: {}' .format(test_acc))

Test acc: 0.8196078431372549


# Train a random forest classifier

In [105]:
from sklearn.ensemble import RandomForestClassifier 

rnd_clf = RandomForestClassifier(n_estimators=500, max_depth=10, n_jobs=-1) 

rnd_clf.fit(train_x, train_target)

In [106]:
#Train accuracy

train_y_pred = rnd_clf.predict(train_x)

train_acc = accuracy_score(train_target, train_y_pred)

print('Train acc: {}' .format(train_acc))

Train acc: 0.8450420168067226


In [107]:
#Test accuracy

test_y_pred = rnd_clf.predict(test_x)

test_acc = accuracy_score(test_target, test_y_pred)

print('Test acc: {}' .format(test_acc))

Test acc: 0.8129411764705883


In [108]:
rnd_clf.feature_importances_

array([0.12457402, 0.27787848, 0.10215276, 0.10149182, 0.17348636,
       0.00296765, 0.04832718, 0.02507193, 0.01898882, 0.02872073,
       0.03671622, 0.02769015, 0.0265439 , 0.00538999])

In [109]:
# Round to two decimals
np.round(rnd_clf.feature_importances_,2)

array([0.12, 0.28, 0.1 , 0.1 , 0.17, 0.  , 0.05, 0.03, 0.02, 0.03, 0.04,
       0.03, 0.03, 0.01])

# Train an adaboost classifier

In [110]:
from sklearn.ensemble import AdaBoostClassifier 


ada_clf = AdaBoostClassifier( 
            DecisionTreeClassifier(max_depth=5), n_estimators=50, 
            learning_rate=0.1) 


ada_clf.fit(train_x, train_target)

In [111]:
#Train accuracy

train_y_pred = ada_clf.predict(train_x)

train_acc = accuracy_score(train_target, train_y_pred)

print('Train acc: {}' .format(train_acc))

Train acc: 0.8365126050420169


In [112]:
#Test accuracy

test_y_pred = ada_clf.predict(test_x)

test_acc = accuracy_score(test_target, test_y_pred)

print('Test acc: {}' .format(test_acc))

Test acc: 0.8238235294117647


# Train a gradient boosting classifier

In [113]:
#Use GradientBoosting

from sklearn.ensemble import GradientBoostingClassifier

gbclf = GradientBoostingClassifier(max_depth=2, n_estimators=100, learning_rate=0.1) 

gbclf.fit(train_x, train_target)

In [114]:
#Train accuracy

train_y_pred = gbclf.predict(train_x)

train_acc = accuracy_score(train_target, train_y_pred)

print('Train acc: {}' .format(train_acc))

Train acc: 0.8190336134453782


In [115]:
#Test accuracy

test_y_pred = gbclf.predict(test_x)

test_acc = accuracy_score(test_target, test_y_pred)

print('Test acc: {}' .format(test_acc))

Test acc: 0.8116666666666666


In [119]:
for x in range(1,30):
    gbclf = GradientBoostingClassifier(max_depth=3, n_estimators=x, learning_rate=1.0) 
    gbclf.fit(train_x, train_target.ravel())
    
    train_predictions = gbclf.predict(train_x)
    test_predictions = gbclf.predict(test_x)
    
    train_accuracy = round(accuracy_score(train_target, train_predictions),4)
    test_accuracy = round(accuracy_score(test_target, test_predictions),4)
    
    print('# Estimators = {}'.format(x) + "     " + 'Train accuracy = {}'.format(train_accuracy) + "   "
         'Test accuracy = {}'.format(test_accuracy))

# Estimators = 1     Train accuracy = 0.7613   Test accuracy = 0.764
# Estimators = 2     Train accuracy = 0.7738   Test accuracy = 0.7756
# Estimators = 3     Train accuracy = 0.7893   Test accuracy = 0.7899
# Estimators = 4     Train accuracy = 0.8007   Test accuracy = 0.798
# Estimators = 5     Train accuracy = 0.8036   Test accuracy = 0.8015
# Estimators = 6     Train accuracy = 0.8099   Test accuracy = 0.808
# Estimators = 7     Train accuracy = 0.8145   Test accuracy = 0.8099
# Estimators = 8     Train accuracy = 0.815   Test accuracy = 0.8129
# Estimators = 9     Train accuracy = 0.8187   Test accuracy = 0.8125
# Estimators = 10     Train accuracy = 0.8213   Test accuracy = 0.8147
# Estimators = 11     Train accuracy = 0.8213   Test accuracy = 0.8167
# Estimators = 12     Train accuracy = 0.8216   Test accuracy = 0.8164
# Estimators = 13     Train accuracy = 0.8223   Test accuracy = 0.8174
# Estimators = 14     Train accuracy = 0.8244   Test accuracy = 0.8182
# Estimators = 15  