# Tabular Playground Series - Apr 2021

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

from xgboost import XGBClassifier

In [2]:
# Load the training data and split it into raw features and target.
train_dataset = pd.read_csv('train.csv')

# .copy() makes X an independent frame: the feature-engineering step assigns
# new columns on it, which would otherwise raise SettingWithCopyWarning.
X = train_dataset[['Pclass','Age','Sex','SibSp','Parch','Fare', 'Cabin','Embarked']].copy()
y = train_dataset['Survived']

In [3]:
def preprocess_data(X):
    """Turn the raw passenger columns into a dense one-hot feature matrix.

    Age, Fare and family size (SibSp + Parch + 1) are binned into labelled
    ranges, Cabin is collapsed into deck groups ('N' when missing), remaining
    gaps are filled with each column's most frequent value, and every column
    is one-hot encoded.

    The one-hot layout comes from a fixed list of levels rather than from the
    levels that happen to occur in ``X``, so separate calls (for example on
    the training and the submission data) return matrices with identical
    columns. Values outside that list are encoded as all zeros and reported
    with a warning.

    Parameters
    ----------
    X : pandas.DataFrame
        Needs the columns 'Pclass', 'Age', 'Sex', 'SibSp', 'Parch', 'Fare',
        'Cabin' and 'Embarked'. The frame is copied, not modified.

    Returns
    -------
    numpy.ndarray
        One row per row of ``X`` and one 0/1 column per known level.
    """
    import warnings

    categorical_cols = ['Age', 'Pclass', 'Sex', 'Cabin', 'Fare', 'Fam_Size', 'Embarked']

    age_edges = [0, 10, 20, 30, 40, 50, 60, 70, 80]
    age_labels = ['0-10', '10-20', '20-30', '30-40', '40-50', '50-60', '60-70', '70-80']
    fare_edges = [0, 7.27, 8.62, 9.76, 10.96, 30.77, 35.32, 72.912, 126.902, 744.66]
    fare_labels = ['0-7.27', '7.27-8.62', '8.62-9.76', '9.76-10.96', '10.96-30.77', '30.77-35.32', '35.32-72.912','72.912-126.902', '126.902-744.66']
    fam_edges = [0, 1, 4, 7, 12, 18]
    fam_labels = ['Alone', 'Small', 'Medium', 'Large', 'Very Large']

    # Every level a column may take after the steps below.
    # NOTE(review): the Pclass / Sex / Embarked levels are assumed from the
    # Titanic schema, not derived from the data — confirm against train.csv.
    known_levels = {
        'Age': age_labels,
        'Pclass': [1, 2, 3],
        'Sex': ['female', 'male'],
        'Cabin': ['ABC', 'DE', 'FG', 'N'],
        'Fare': fare_labels,
        'Fam_Size': fam_labels,
        'Embarked': ['C', 'Q', 'S'],
    }

    # Work on a private copy so the caller's frame is never modified and
    # pandas does not emit SettingWithCopyWarning for the assignments below.
    X = X.copy()

    # pd.cut uses right-closed bins: values outside the edges (e.g. Age > 80
    # or Fare == 0) become NaN and are filled by the imputer further down.
    X['Age'] = pd.cut(X['Age'], age_edges, labels=age_labels)
    X['Fare'] = pd.cut(X['Fare'], fare_edges, labels=fare_labels)
    X['Fam_Size'] = pd.cut(X['SibSp'] + X['Parch'] + 1, fam_edges, labels=fam_labels)

    # Collapse cabin codes into deck groups; a missing cabin is its own level.
    X['Cabin'] = X['Cabin'].replace('[ABCT].+', value = 'ABC', regex = True)
    X['Cabin'] = X['Cabin'].replace('[DE].+', value = 'DE', regex = True)
    X['Cabin'] = X['Cabin'].replace('[FG].+', value = 'FG', regex = True)
    X['Cabin'] = X['Cabin'].fillna('N')

    cat_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent'))
    ])

    # Only categorical_cols are passed through; SibSp and Parch are dropped
    # by the ColumnTransformer's default remainder handling.
    full_pipeline = ColumnTransformer([
        ('cat', cat_pipeline, categorical_cols)
    ])

    imputed = pd.DataFrame(full_pipeline.fit_transform(X))

    # handle_unknown='ignore' zeroes out unexpected values silently, so make
    # them visible instead of losing information without notice.
    for position, col in enumerate(categorical_cols):
        unexpected = set(imputed.iloc[:, position].unique()) - set(known_levels[col])
        if unexpected:
            warnings.warn(
                f"{col}: values {sorted(unexpected, key=str)} are not in the known "
                "levels and will be encoded as all zeros",
                stacklevel=2,
            )

    # Sorted so the column order equals what categories='auto' produces when
    # every level is present in the data.
    levels = [sorted(known_levels[col]) for col in categorical_cols]

    try:
        encoder = OneHotEncoder(handle_unknown='ignore', categories=levels, sparse_output=False)
    except TypeError:
        # scikit-learn < 1.2 only accepts the older `sparse` keyword.
        encoder = OneHotEncoder(handle_unknown='ignore', categories=levels, sparse=False)

    return encoder.fit_transform(imputed)

In [5]:
# Replace the raw feature frame with its encoded feature matrix.
# NOTE(review): X is rebound to the encoder output here, so this cell is
# single-shot — re-run the data-loading cell before running it again.
X = preprocess_data(X)

In [6]:
# Shuffle features and target together (fixed seed), then hold out 10% of the
# rows as a validation set.
X, y = shuffle(X, y, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

In [7]:
# Load the competition test set and keep the passenger ids for the submission.
sub_dataset = pd.read_csv('test.csv')

# .copy() makes X_sub an independent frame: the feature-engineering step
# assigns new columns on it, which would otherwise raise SettingWithCopyWarning.
X_sub = sub_dataset[['Pclass','Age','Sex','SibSp','Parch','Cabin','Fare','Embarked']].copy()
submission_index = sub_dataset['PassengerId']

# Apply the same feature engineering as for the training data.
X_sub = preprocess_data(X_sub)

In [8]:
# Gradient-boosted trees: small learning rate with up to 500 rounds.
xgb_model = XGBClassifier(
    objective='binary:logistic',
    use_label_encoder=False,
    learning_rate=0.01,
    n_jobs=-1,
    n_estimators=500,
    max_depth=6,
)

# Evaluate on the hold-out split after each round, stop once it has not
# improved for 3 rounds, and print the metric every 5th round.
xgb_model.fit(
    X_train,
    y_train,
    early_stopping_rounds=3,
    eval_set=[(X_test, y_test)],
    verbose=5,
)

[0]	validation_0-logloss:0.68956
[5]	validation_0-logloss:0.67266
[10]	validation_0-logloss:0.65732
[15]	validation_0-logloss:0.64338
[20]	validation_0-logloss:0.63068
[25]	validation_0-logloss:0.61909
[30]	validation_0-logloss:0.60849
[35]	validation_0-logloss:0.59878
[40]	validation_0-logloss:0.58987
[45]	validation_0-logloss:0.58170
[50]	validation_0-logloss:0.57421
[55]	validation_0-logloss:0.56732
[60]	validation_0-logloss:0.56099
[65]	validation_0-logloss:0.55516
[70]	validation_0-logloss:0.54978
[75]	validation_0-logloss:0.54481
[80]	validation_0-logloss:0.54029
[85]	validation_0-logloss:0.53611
[90]	validation_0-logloss:0.53225
[95]	validation_0-logloss:0.52868
[100]	validation_0-logloss:0.52541
[105]	validation_0-logloss:0.52240
[110]	validation_0-logloss:0.51964
[115]	validation_0-logloss:0.51707
[120]	validation_0-logloss:0.51470
[125]	validation_0-logloss:0.51250
[130]	validation_0-logloss:0.51049
[135]	validation_0-logloss:0.50863
[140]	validation_0-logloss:0.50692
[145]	v

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.01, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=500, n_jobs=-1, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [9]:
# Accuracy on the hold-out split.
# NOTE(review): the same split served as the early-stopping eval_set during
# fitting, so this number is likely optimistic.
y_pred = xgb_model.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.771

In [10]:
# Predict the class label for every row of the submission feature matrix.
y_sub = xgb_model.predict(X_sub)
y_sub

array([0, 0, 1, ..., 0, 1, 1])

In [11]:
# Wrap ids and predictions as single-column frames with explicit integer dtypes.
submission_index = pd.DataFrame(data=submission_index, dtype=np.int64)
y_sub = pd.DataFrame(
    data=y_sub,
    columns=['Survived'],
    dtype=np.int8,
)

In [12]:
# Put ids and predictions side by side (rows are aligned on the index).
submission_data = pd.concat(objs=[submission_index, y_sub], axis='columns')
submission_data

Unnamed: 0,PassengerId,Survived
0,100000,0
1,100001,0
2,100002,1
3,100003,0
4,100004,1
...,...,...
99995,199995,1
99996,199996,0
99997,199997,0
99998,199998,1


In [None]:
# Write the submission file; index=False keeps the row index out of the CSV.
submission_data.to_csv('submission.csv', index=False)