In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', 200)

from sklearn.model_selection import cross_val_score
from sklearn import tree
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import precision_score, accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [5]:
# Load data from .csv
titanic_train = pd.read_csv('train.csv', index_col='PassengerId')
titanic_test = pd.read_csv('test.csv', index_col='PassengerId')
y_sub = pd.read_csv('gender_submission.csv', index_col='PassengerId')

In [6]:
# Transform cabin and sex columns for training data, so they would become int type
titanic_train['Cabin'] = np.where(titanic_train['Cabin'].isna(), 0, 1)
titanic_train['Sex'] = np.where(titanic_train['Sex'] == 'male', 0, 1)
# Get column for each place, people embarked on titanic
titanic_train = titanic_train.join(pd.get_dummies(titanic_train['Embarked'], prefix='Embarked'))

In [7]:
# Transform cabin and sex columns for testing data, so they would become int type
titanic_test['Cabin'] = np.where(titanic_test['Cabin'].isna(), 0, 1)
titanic_test['Sex'] = np.where(titanic_test['Sex'] == 'male', 0, 1)
# Get column for each place, people embarked on titanic
titanic_test = titanic_test.join(pd.get_dummies(titanic_test['Embarked'], prefix='Embarked'))

In [8]:
# Drop unnecessary columns
X = titanic_train.drop(['Survived', 'Name', 'Ticket', 'Embarked'], axis=1)
X_val = titanic_test.drop(['Name', 'Ticket', 'Embarked'], axis=1)
# Get target values for training data
y = titanic_train.Survived

In [9]:
# Split train data for training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

In [10]:
# Create XGBoost model and fit it
xgb_model = XGBClassifier(objective='binary:logistic', n_jobs=8, random_state=0, n_estimators=110, max_depth=10,
learning_rate=0.1, early_stopping_rounds=10).fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)

In [8]:
'''parameters = {
    'max_depth': range (5, 15, 1),
    'n_estimators': range(90, 110, 5),
    'learning_rate': [0.05, 0.03, 0.01]
}'''

"parameters = {\n    'max_depth': range (5, 15, 1),\n    'n_estimators': range(90, 110, 5),\n    'learning_rate': [0.05, 0.03, 0.01]\n}"

In [16]:
'''grid_search = GridSearchCV(
    estimator=estimator,
    param_grid=parameters,
    scoring = 'roc_auc',
    n_jobs = 8,
    cv = 10,
    verbose=True)'''

In [36]:
#grid_search.fit(X_train, y_train)

In [9]:
#xgb_model = grid_search.best_estimator_

In [11]:
# Evaluate models accuracy
accuracy_score(y_test, xgb_model.predict(X_test))

0.8305084745762712

In [13]:
# Predict on test.csv
pred = xgb_model.predict(X_val)

In [14]:
pred

array([0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,