# Voting

**Every model has its strengths and weaknesses. Ensemble models can be beneficial by combining individual models 
to help hide the weaknesses of an individual model.**

**I will use the Voting Classifier to get the Titanic predictions**

In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split


In [2]:
raw_train_data = pd.read_csv('./../../dataset/titanic/train.csv')
raw_test_data = pd.read_csv('./../../dataset/titanic/test.csv')
original_test_data = pd.read_csv('./../../dataset/titanic/test.csv')

### The 4C's Correcting, Completing, Creating and Converting

#### Correcting: delete unreasonable values or useless columns

In [3]:
delete_columns = ['PassengerId', 'Ticket', 'Cabin']
raw_train_data.drop(delete_columns, axis=1, inplace=True)
raw_test_data.drop(delete_columns, axis=1, inplace=True)

#### Completing: filling null values

In [4]:
# Null Age values filled with the median value 
raw_train_data['Age'].fillna(raw_train_data['Age'].median(), inplace=True)
raw_test_data['Age'].fillna(raw_train_data['Age'].median(), inplace=True)

# Null Fare values filled with the median value
raw_train_data['Fare'].fillna(raw_train_data['Fare'].median(), inplace=True)
raw_test_data['Fare'].fillna(raw_train_data['Fare'].median(), inplace=True)



# Null Embarked values filled with the mode
raw_train_data['Embarked'].fillna(raw_train_data['Embarked'].mode()[0], inplace=True)
raw_test_data['Embarked'].fillna(raw_train_data['Embarked'].mode()[0], inplace=True)

#### Creating: Feature creation

- **Title:** extract the title from the name column
- **Bins:** create bins for features like Age, or Fare
- **FamilySize:** use columns like **SibSp** and **Parch** to know the number of family members
- **IsAlone:** if the passenger had any family member aboard

In [5]:
raw_train_data['Title'] = raw_train_data['Name'].str.split(', ', expand=True)[1].str.split('.', expand=True)[0]
title_filter = (raw_train_data['Title'].value_counts() < 10)
raw_train_data['Title'] = raw_train_data['Title'].apply(lambda x: 'Misc' if title_filter.loc[x] == True else x)

raw_train_data['AgeBin'] = pd.cut(raw_train_data['Age'], 5)
raw_train_data['FareBin'] = pd.qcut(raw_train_data['Fare'], 4)

raw_train_data['FamilySize'] = raw_train_data['SibSp'] + raw_train_data['Parch'] + 1

raw_train_data['IsAlone'] = 1
raw_train_data['IsAlone'].loc[raw_train_data['FamilySize'] > 1] = 0

train_data = raw_train_data.drop(['Name', 'Age', 'Fare'], axis=1)
train_data.head()

Unnamed: 0,Survived,Pclass,Sex,SibSp,Parch,Embarked,Title,AgeBin,FareBin,FamilySize,IsAlone
0,0,3,male,1,0,S,Mr,"(16.336, 32.252]","(-0.001, 7.91]",2,0
1,1,1,female,1,0,C,Mrs,"(32.252, 48.168]","(31.0, 512.329]",2,0
2,1,3,female,0,0,S,Miss,"(16.336, 32.252]","(7.91, 14.454]",1,1
3,1,1,female,1,0,S,Mrs,"(32.252, 48.168]","(31.0, 512.329]",2,0
4,0,3,male,0,0,S,Mr,"(32.252, 48.168]","(7.91, 14.454]",1,1


In [6]:
raw_test_data['Title'] = raw_test_data['Name'].str.split(', ', expand=True)[1].str.split('.', expand=True)[0]
title_filter = (raw_test_data['Title'].value_counts() < 10)
raw_test_data['Title'] = raw_test_data['Title'].apply(lambda x: 'Misc' if title_filter.loc[x] == True else x)

raw_test_data['AgeBin'] = pd.cut(raw_test_data['Age'], 5)
raw_test_data['FareBin'] = pd.qcut(raw_test_data['Fare'], 4)

raw_test_data['FamilySize'] = raw_test_data['SibSp'] + raw_test_data['Parch'] + 1

raw_test_data['IsAlone'] = 1
raw_test_data['IsAlone'].loc[raw_test_data['FamilySize'] > 1] = 0

test_data = raw_test_data.drop(['Name', 'Age', 'Fare'], axis=1)
test_data.head()

Unnamed: 0,Pclass,Sex,SibSp,Parch,Embarked,Title,AgeBin,FareBin,FamilySize,IsAlone
0,3,male,0,0,Q,Mr,"(30.502, 45.668]","(-0.001, 7.896]",1,1
1,3,female,1,0,S,Mrs,"(45.668, 60.834]","(-0.001, 7.896]",2,0
2,2,male,0,0,Q,Mr,"(60.834, 76.0]","(7.896, 14.454]",1,1
3,3,male,0,0,S,Mr,"(15.336, 30.502]","(7.896, 14.454]",1,1
4,3,female,1,1,S,Mrs,"(15.336, 30.502]","(7.896, 14.454]",3,0


#### Converting: Creating Dummy/Encoded Variables

In [7]:
encoder = LabelEncoder()
train_data['Sex_Code'] = encoder.fit_transform(train_data['Sex']) 
train_data['Embarked_Code'] = encoder.fit_transform(train_data['Embarked']) 
train_data['AgeBin_Code'] = encoder.fit_transform(train_data['AgeBin']) 
train_data['FareBin_Code'] = encoder.fit_transform(train_data['FareBin'])
train_data['Title_Code'] = encoder.fit_transform(train_data['Title'])

train_data.drop(['Sex', 'Embarked', 'Title', 'AgeBin', 'FareBin'], axis=1, inplace=True)
train_data.head()

Unnamed: 0,Survived,Pclass,SibSp,Parch,FamilySize,IsAlone,Sex_Code,Embarked_Code,AgeBin_Code,FareBin_Code,Title_Code
0,0,3,1,0,2,0,1,2,1,0,3
1,1,1,1,0,2,0,0,0,2,3,4
2,1,3,0,0,1,1,0,2,1,1,2
3,1,1,1,0,2,0,0,2,2,3,4
4,0,3,0,0,1,1,1,2,2,1,3


In [8]:
encoder2 = LabelEncoder()
test_data['Sex_Code'] = encoder2.fit_transform(test_data['Sex']) 
test_data['Embarked_Code'] = encoder2.fit_transform(test_data['Embarked']) 
test_data['AgeBin_Code'] = encoder2.fit_transform(test_data['AgeBin']) 
test_data['FareBin_Code'] = encoder2.fit_transform(test_data['FareBin'])
test_data['Title_Code'] = encoder2.fit_transform(test_data['Title'])

test_data.drop(['Sex', 'Embarked', 'Title', 'AgeBin', 'FareBin'], axis=1, inplace=True)

test_data.head()

Unnamed: 0,Pclass,SibSp,Parch,FamilySize,IsAlone,Sex_Code,Embarked_Code,AgeBin_Code,FareBin_Code,Title_Code
0,3,0,0,1,1,1,1,2,0,3
1,3,1,0,2,0,0,2,3,0,4
2,2,0,0,1,1,1,1,4,1,3
3,3,0,0,1,1,1,2,1,1,3
4,3,1,1,3,0,0,2,1,1,4


### Train Test Split

In [9]:
X = train_data.drop(['Survived'], axis=1)
y = train_data['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

#### K-Neighbors Classifier

In [10]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

In [11]:
knn = KNeighborsClassifier()
# Cretae a dictionary of all values we want to test for n_neighbors
params_knn = {'n_neighbors' : np.arange(1, 25)}

# Use GridSearch to test all the values for n_neighbors
knn_gs = GridSearchCV(knn, params_knn, cv=5)

# fit the model to training data
knn_gs.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [12]:
# get the best model
knn_best = knn_gs.best_estimator_

# check the best n_neighbors value
print(knn_gs.best_params_)

{'n_neighbors': 11}


#### Random Forest

In [13]:
from sklearn.ensemble import RandomForestClassifier

# create a new RandomForestClassifier
rf_clf = RandomForestClassifier()

# parameters dictionary 
params_rf = {'n_estimators' : [50, 100, 200]}

# use gridSearch to test all values 
rf_gs = GridSearchCV(rf_clf, params_rf, cv=5)

# fit model to training data
rf_gs.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [50, 100, 200]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [14]:
# Save the best model
rf_best = rf_gs.best_estimator_

# check the best n_estimators value
print(rf_gs.best_params_)

{'n_estimators': 200}


#### Logistic Regression

In [15]:
from sklearn.linear_model import LogisticRegression

# create a new LogisticRegression model
log_reg = LogisticRegression()

# fit the model to the training data
log_reg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

#### Checking accuracy scores

In [16]:
print('KNN: {}'.format(knn_best.score(X_test, y_test)))
print('RandomForest: {}'.format(rf_best.score(X_test, y_test)))
print('LogisticRegression: {}'.format(log_reg.score(X_test, y_test)))

KNN: 0.7892376681614349
RandomForest: 0.8071748878923767
LogisticRegression: 0.8026905829596412


<h3><font color='blue'>VotingClassifier <b>Score: 0.76076</b> </font></h3>

In [17]:
from sklearn.ensemble import VotingClassifier

# create a dictionary of our classifiers
estimators = [('knn', knn_best), ('rf', rf_best), ('log_reg', log_reg)]

# create the VotingClassifier
ensemble = VotingClassifier(estimators, voting='hard')

In [18]:
# fit the model to training data
ensemble.fit(X_train, y_train)

# test our model on the test data
ensemble.score(X_test, y_test)

y_predict = ensemble.predict(test_data)

In [19]:
submission = pd.DataFrame({'PassengerId' : original_test_data['PassengerId'], 'Survived':y_predict})
submission.to_csv('./submissions/submission.csv', index=False)

<h3><font color='blue'>Bagging (Boostrap Aggregating) <b>Score:0.76076</b></font></h3>

<img src='./img/bagging.jpeg' height='300' width='500'/>

In [20]:
# get some classifiers to evaluate
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import BaggingClassifier,ExtraTreesClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import SVC

In [21]:
seed = 1075
np.random.seed(seed)

In [22]:
# Create Classifiers
rf = RandomForestClassifier()
ext = ExtraTreesClassifier()
knn = KNeighborsClassifier()
svc = SVC()
rc = RidgeClassifier()

clf_array = [rf, ext, knn, svc, rc]

for clf in clf_array:
    scores = cross_val_score(clf, X, y, cv=10, n_jobs=-1)
    bagging_clf = BaggingClassifier(clf, max_samples=0.4, max_features=10, random_state=seed)
    bagging_scores = cross_val_score(bagging_clf, X, y, cv=10, n_jobs=-1)
    print ('Mean of Classifier: ', clf.__class__.__name__, format(scores.mean(), '.3f'), ' std (+/-): ', format(scores.std(), '.3f'))
    print ('Mean of Bagging: ', bagging_clf.__class__.__name__, format(scores.mean(), '.3f'), ' std (+/-): ', format(scores.std(), '.3f'))
    print('*'*70, '\n')

Mean of Classifier:  RandomForestClassifier 0.806  std (+/-):  0.035
Mean of Bagging:  BaggingClassifier 0.806  std (+/-):  0.035
********************************************************************** 

Mean of Classifier:  ExtraTreesClassifier 0.804  std (+/-):  0.040
Mean of Bagging:  BaggingClassifier 0.804  std (+/-):  0.040
********************************************************************** 

Mean of Classifier:  KNeighborsClassifier 0.791  std (+/-):  0.047
Mean of Bagging:  BaggingClassifier 0.791  std (+/-):  0.047
********************************************************************** 

Mean of Classifier:  SVC 0.831  std (+/-):  0.033
Mean of Bagging:  BaggingClassifier 0.831  std (+/-):  0.033
********************************************************************** 

Mean of Classifier:  RidgeClassifier 0.796  std (+/-):  0.032
Mean of Bagging:  BaggingClassifier 0.796  std (+/-):  0.032
********************************************************************** 



<h4><font color='blue'>Making predictions</font></h4>

In [23]:
svc = SVC()
bg_clf = BaggingClassifier(svc)
param_grid = {'n_estimators' : [10, 50, 100, 200], 'max_samples' : [0.1, 0.3, 0.6, 0.9, 1.0]}
bg_gs = GridSearchCV(bg_clf, param_grid, cv=5)
bg_gs.fit(X_train, y_train)
best_bc = bg_gs.best_estimator_ 
print(best_bc)

BaggingClassifier(base_estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=0.3, n_estimators=200, n_jobs=None, oob_score=False,
         random_state=None, verbose=0, warm_start=False)


In [24]:
print(X_train.shape)
print(test_data.shape)

(668, 10)
(418, 10)


In [25]:
best_bc.fit(X_train, y_train)
y_pred = best_bc.predict(test_data) 
y_predict = pd.DataFrame({'PassengerId' : original_test_data['PassengerId'], 'Survived':y_predict})
submission.to_csv('./submissions/submission_bg.csv', index=False)

<h1><font color='red'>Boosting</font><h1>
    <h2><font color='blue'>AdaBoost (Adaptative Boosting) - <b>Score: 0.77033</b></font><h2>    
    <h4>Converts a set of weak classifiers into a strong one</h4>
    
<h4>In this case we will create an AdaBoost classifier, implementing Decision Trees of depth 1 (a Stump)</h4>

In [31]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

reg_ada = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1))
scores_ada = cross_val_score(reg_ada, X, y, cv=6)
scores_ada.mean()

0.8080672954834028

In [32]:
reg_ada.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          learning_rate=1.0, n_estimators=50, random_state=None)

In [34]:
y_predict = reg_ada.predict(test_data) 
submission = pd.DataFrame({'PassengerId' : original_test_data['PassengerId'], 'Survived':y_predict})
submission.to_csv('./submissions/submission_ada.csv', index=False)

<h1><font color='blue'>XgBoost (Extreme Gradient Boost) <b>Score: 0.7990</b></font></h1>

In [35]:
from xgboost import XGBClassifier
xgb = XGBClassifier()
xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)

In [37]:
scores_xgb = cross_val_score(xgb, X, y, cv=6)
scores_xgb.mean()

0.8238179454622409

In [36]:
y_predict = xgb.predict(test_data) 
submission = pd.DataFrame({'PassengerId' : original_test_data['PassengerId'], 'Survived':y_predict})
submission.to_csv('./submissions/submission_xgb.csv', index=False)

In [42]:
# Score: 0.78947

xgb_clf = XGBClassifier()
param_grid = {'max_depth' : [1,3,6,9], 'n_estimators' : [10, 50, 100, 200], 'learning_rate' : [0.1, 0.3, 0.5, 0.7, 0.9, 1.0]}
xgb_gs = GridSearchCV(xgb_clf, param_grid, cv=5)
xgb_gs.fit(X_train, y_train)
xgb_best = xgb_gs.best_estimator_
print(xgb_best)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=1, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)


In [41]:
y_predict = xgb_best.predict(test_data) 
submission = pd.DataFrame({'PassengerId' : original_test_data['PassengerId'], 'Survived':y_predict})
submission.to_csv('./submissions/submission_xgb_grid_s.csv', index=False)