In [1]:
# importing required libraries
import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import xgboost as xgb

In [6]:
# Read the Titanic training data; the path is relative to the working directory.
titanic = pd.read_csv(filepath_or_buffer='titanic_train.csv')

In [7]:
def impute_age(cols):
    """Return a passenger's age, filling a missing value from the ticket class.

    Parameters
    ----------
    cols : pandas.Series or sequence
        Two values in positional order: the age first, then the passenger
        class. When used with ``DataFrame.apply(..., axis=1)`` this is the
        row Series of the selected columns.

    Returns
    -------
    The age unchanged when it is not null; otherwise 37 for class 1,
    29 for class 2 and 24 for any other class.
    """
    # A row Series is indexed by column label, so ``cols[0]`` relies on the
    # deprecated "integer key means position" fallback of Series.__getitem__.
    # Go through ``.iloc`` when it exists; plain lists/tuples/arrays have no
    # ``.iloc`` and are indexed directly, exactly as before.
    positional = getattr(cols, 'iloc', cols)
    age = positional[0]
    pclass = positional[1]

    # A known age is returned untouched.
    if not pd.isnull(age):
        return age

    # Missing age: substitute the fixed per-class value.
    if pclass == 1:
        return 37
    if pclass == 2:
        return 29
    return 24

In [8]:
# Fill missing ages row by row, using each passenger's class to pick the value.
titanic['Age'] = titanic.loc[:, ['Age', 'Pclass']].apply(impute_age, axis='columns')

In [9]:
# Remove the Cabin column from the frame in place.
titanic.drop(columns=['Cabin'], inplace=True)

In [10]:
# One-hot encode the two categorical columns; drop_first=True leaves out the
# first category of each so the remaining indicators are not redundant.
sex = pd.get_dummies(titanic.Sex, drop_first=True)
embark = pd.get_dummies(titanic.Embarked, drop_first=True)

In [11]:
# Drop the raw categorical columns (encoded separately as `sex` / `embark`)
# together with the Name, Ticket and PassengerId columns, in place.
titanic.drop(
    columns=['Sex', 'Embarked', 'Name', 'Ticket', 'PassengerId'],
    inplace=True,
)

In [12]:
# Append the indicator columns to the remaining columns, side by side.
titanic = pd.concat((titanic, sex, embark), axis='columns')

In [13]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible. Features are every column except the Survived target.
X_train, X_test, y_train, y_test = train_test_split(
    titanic.drop(columns='Survived'),
    titanic.Survived,
    test_size=0.30,
    random_state=101,
)

In [14]:
# Baseline classifier built with no explicit hyper-parameters.
model = XGBClassifier()

# Train on the training split; the call is left as the cell's last expression
# so the fitted estimator is displayed.
model.fit(X=X_train, y=y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [15]:
# Show the fitted model's feature importance scores.
# NOTE(review): presumably one value per column of X_train, in column order — confirm.
print(model.feature_importances_)

[0.2087553  0.04810908 0.0524397  0.01806326 0.04572523 0.5767218
 0.         0.05018559]


In [16]:

# Predicted labels for the rows the model was trained on.
predict_train = model.predict(X_train)
print('\nTarget on train data', predict_train)

# Accuracy score on the training split.
accuracy_train = accuracy_score(y_true=y_train, y_pred=predict_train)
print('\naccuracy_score on train dataset : ', accuracy_train)



Target on train data [1 0 1 1 1 1 0 1 1 0 0 1 0 1 0 0 0 0 0 1 0 0 1 0 0 1 0 0 1 0 1 1 0 0 0 0 0
 0 0 1 1 0 0 1 0 1 0 1 0 0 1 0 0 1 1 0 0 0 0 1 0 1 0 0 1 1 0 0 0 0 0 1 0 0
 0 1 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 1 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1
 0 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0
 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 1 1 1 1 0 0 0 1 0 0 1 0 0 0 0 1 1 0
 0 1 1 1 0 1 0 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 1 0 0 1 1 0
 0 0 1 0 0 1 1 1 0 0 0 1 1 0 1 1 1 0 1 0 0 0 1 1 0 0 0 0 1 0 0 0 0 1 0 1 0
 1 0 1 0 0 1 1 1 0 0 1 0 0 1 0 1 0 0 0 1 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1
 1 0 0 1 0 0 0 0 1 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 1
 0 1 1 1 0 0 0 1 0 0 1 1 1 0 0 1 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0
 0 1 0 0 0 1 0 1 0 1 0 0 0 0 0 1 1 1 0 0 1 1 0 1 1 1 0 0 0 1 0 0 0 0 0 1 0
 1 1 1 1 0 1 1 0 0 1 0 1 1 0 1 1 0 0 0 0 0 1 1 0 1 0 0 1 0 0 0 0 1 1 1 0 0
 0 0 0 1 0 0 1 1 1 1 1 0 0 0 1 0 0 1 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0
 1 

In [None]:

# Predicted labels for the held-out test split.
predict_test = model.predict(X_test)
print('\nTarget on test data', predict_test)

# Accuracy score on the held-out test split.
accuracy_test = accuracy_score(y_true=y_test, y_pred=predict_test)
print('\naccuracy_score on test dataset : ', accuracy_test)


Target on test data [0 1 1 1 0 0 0 0 1 1 0 0 1 0 0 0 1 0 0 1 0 0 1 1 0 0 0 0 0 0 1 1 1 1 0 1 0
 0 0 1 1 1 1 0 0 0 0 0 0 0 1 0 1 1 1 0 0 0 0 1 1 0 0 0 0 0 1 0 1 0 0 0 0 0
 1 1 0 0 0 0 0 1 0 0 1 0 0 1 0 1 0 1 1 1 1 1 1 0 0 1 0 1 0 0 1 0 1 1 1 0 0
 1 1 0 1 0 1 0 0 1 1 0 1 0 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0
 0 0 1 1 0 0 0 0 1 0 1 1 0 0 0 0 1 1 0 1 0 0 1 0 0 1 0 0 1 0 0 0 0 0 1 1 1
 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 1 0
 0 0 0 0 0 0 0 0 1 0 1 1 1 0 1 1 1 0 1 1 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 1
 0 0 1 1 0 1 1 0 1]

accuracy_score on test dataset :  0.8171641791044776


In [17]:
from sklearn.model_selection import GridSearchCV

In [18]:
def algorithm_pipeline(X_train_data, X_test_data, y_train_data, y_test_data,
                       model, param_grid, cv=10, scoring_fit='accuracy',
                       do_probabilities = False):
    """Grid-search ``model`` over ``param_grid`` and predict on the test features.

    Parameters
    ----------
    X_train_data, y_train_data
        Training features and labels the grid search is fitted on.
    X_test_data
        Features the fitted search predicts on.
    y_test_data
        Accepted for call-site symmetry; not used inside this function.
    model
        Estimator handed to ``GridSearchCV``.
    param_grid
        Parameter grid handed to ``GridSearchCV``.
    cv
        Cross-validation setting forwarded to ``GridSearchCV`` (default 10).
    scoring_fit
        Scoring setting forwarded to ``GridSearchCV`` (default ``'accuracy'``).
    do_probabilities
        When true, predictions come from ``predict_proba``; otherwise from
        ``predict``.

    Returns
    -------
    tuple
        ``(fitted_search, predictions)`` where ``fitted_search`` is the value
        returned by ``GridSearchCV.fit``.
    """
    search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring=scoring_fit,
        cv=cv,
        n_jobs=-1,   # fixed here; not configurable by the caller
        verbose=2,   # fixed here; not configurable by the caller
    )
    fitted_search = search.fit(X_train_data, y_train_data)

    # Choose the prediction method once, then apply it to the test features.
    predict = fitted_search.predict_proba if do_probabilities else fitted_search.predict
    return fitted_search, predict(X_test_data)

In [19]:
# Hyper-parameter grid: 2 * 2 * 3 * 3 * 3 * 3 = 324 candidate combinations.
param_grid = {
    'n_estimators': [400, 700],      # number of trees
    'colsample_bytree': [0.7, 0.8],  # ratio of columns sampled to the original set of columns
    'max_depth': [15, 20, 25],       # maximum depth of a tree
    'reg_alpha': [1.1, 1.2, 1.3],    # L1 regularization
    'reg_lambda': [1.1, 1.2, 1.3],   # L2 regularization
    'subsample': [0.7, 0.8, 0.9],    # ratio of the training dataset sampled
}

# Search over a fresh, unfitted classifier rather than the global `model`.
# `model` is rebound on this line to the fitted grid search, so passing it in
# as the estimator would wrap a grid search inside a grid search (with
# parameter names it does not accept) if this cell were executed again.
model, pred = algorithm_pipeline(X_train, X_test, y_train, y_test, XGBClassifier(),
                                 param_grid, cv=5, scoring_fit='accuracy')


Fitting 5 folds for each of 324 candidates, totalling 1620 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:    6.4s
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed:   24.8s
[Parallel(n_jobs=-1)]: Done 361 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 644 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 1009 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 1454 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done 1620 out of 1620 | elapsed:  5.0min finished


In [22]:
# Report the best cross-validated score and the parameter set that produced it,
# one per line.
print(model.best_score_, model.best_params_, sep='\n')

0.833083870967742
{'colsample_bytree': 0.8, 'max_depth': 15, 'n_estimators': 400, 'reg_alpha': 1.2, 'reg_lambda': 1.1, 'subsample': 0.8}


In [21]:
# Predicted labels for the held-out test split.
# NOTE(review): `model` is expected to be the fitted grid search returned by
# algorithm_pipeline at this point — confirm the cell execution order.
predict_test = model.predict(X_test)
print('\nTarget on test data', predict_test)

# Accuracy score on the held-out test split.
accuracy_test = accuracy_score(y_true=y_test, y_pred=predict_test)
print('\naccuracy_score on test dataset : ', accuracy_test)


Target on test data [0 1 1 1 0 0 0 0 1 1 0 1 1 0 0 0 1 0 0 1 0 0 1 1 0 0 0 0 0 0 1 0 1 1 0 1 0
 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 1 1 1 0 0 0 0 1 1 0 0 1 0 0 1 0 1 0 0 0 0 0
 1 1 0 0 0 0 0 1 0 0 1 0 0 1 0 1 0 1 1 1 1 1 1 0 0 1 0 1 0 1 1 0 1 1 1 0 0
 1 1 0 1 0 1 0 0 1 0 0 1 0 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 1 0 1 0 0 1 0 1 1
 0 0 0 0 1 0 0 1 0 0 1 1 0 0 0 0 1 1 0 1 0 0 1 0 0 1 0 0 1 0 0 0 0 0 1 1 1
 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 1
 0 0 0 0 0 0 0 0 1 1 1 1 1 0 0 1 1 0 1 1 0 0 1 1 0 0 1 0 0 1 0 0 0 0 0 0 1
 0 0 1 1 0 1 1 0 1]

accuracy_score on test dataset :  0.8208955223880597
