In [1]:
! pip install kaggle



In [2]:
! kaggle competitions download -c titanic

titanic.zip: Skipping, found more recently modified local copy (use --force to force download)


In [3]:
import pandas as pd
import numpy as np
import sklearn

In [4]:
df = pd.read_csv('train.csv')

In [5]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [7]:
X = df.drop(['Survived', 'PassengerId', 'Name', 'Ticket', 'Cabin'], axis = 1).values

y = df['Survived'].values


In [8]:
from sklearn.impute import SimpleImputer

imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

imp.fit(X)

X = imp.transform(X)

X = pd.DataFrame(X, columns= ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'])

In [9]:
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.25,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.925,S
3,1,female,35.0,1,0,53.1,S
4,3,male,35.0,0,0,8.05,S


In [10]:
X.isna().sum()

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [11]:
X['Sex'] = X['Sex'].apply(lambda x: 0 if x == 'male' else 1)

X['Sex'].unique()


array([0, 1])

In [12]:
X['Embarked'] = X['Embarked'].replace({'S': 0, 'C': 1, 'Q': 2})

X['Embarked'].unique()

array([0, 1, 2])

In [13]:
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,0,22.0,1,0,7.25,0
1,1,1,38.0,1,0,71.2833,1
2,3,1,26.0,0,0,7.925,0
3,1,1,35.0,1,0,53.1,0
4,3,0,35.0,0,0,8.05,0


In [14]:
X.var()

Pclass         0.699015
Sex            0.228475
Age          174.228695
SibSp          1.216043
Parch          0.649728
Fare        2469.436846
Embarked       0.404081
dtype: float64

In [15]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X = scaler.fit_transform(X)


In [16]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False, random_state=1)

In [17]:
from sklearn.linear_model import LogisticRegression

log_model = LogisticRegression()

log_model.fit(X_train, y_train)

print(log_model.score(X_train, y_train))
print(log_model.score(X_test, y_test))

0.7879213483146067
0.8324022346368715


In [18]:
from sklearn.tree import DecisionTreeClassifier

tree_model = DecisionTreeClassifier()

tree_model.fit(X_train, y_train)

print(tree_model.score(X_train, y_train))
print(tree_model.score(X_test, y_test))

0.9817415730337079
0.7988826815642458


In [19]:
from xgboost import XGBClassifier

xgb = XGBClassifier()

xgb.fit(X_train, y_train)

print(xgb.score(X_train, y_train))
print(xgb.score(X_test, y_test))

0.9676966292134831
0.8547486033519553


In [20]:
from sklearn.model_selection import GridSearchCV

xgb_tuned = XGBClassifier()

params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }


clf = GridSearchCV(xgb_tuned, params, n_jobs=5, 
                   cv=3, 
                   scoring='roc_auc',
                   verbose=2, refit=True
                  )

clf.fit(X_train, y_train)

Fitting 3 folds for each of 405 candidates, totalling 1215 fits
[CV] END colsample_bytree=0.6, gamma=0.5, max_depth=3, min_child_weight=1, subsample=0.8; total time=   0.0s
[CV] END colsample_bytree=0.6, gamma=0.5, max_depth=3, min_child_weight=5, subsample=0.8; total time=   0.0s
[CV] END colsample_bytree=0.6, gamma=0.5, max_depth=3, min_child_weight=5, subsample=1.0; total time=   0.0s
[CV] END colsample_bytree=0.6, gamma=0.5, max_depth=3, min_child_weight=10, subsample=0.6; total time=   0.0s
[CV] END colsample_bytree=0.6, gamma=0.5, max_depth=3, min_child_weight=10, subsample=0.8; total time=   0.0s
[CV] END colsample_bytree=0.6, gamma=0.5, max_depth=4, min_child_weight=1, subsample=0.6; total time=   0.0s
[CV] END colsample_bytree=0.6, gamma=0.5, max_depth=4, min_child_weight=1, subsample=1.0; total time=   0.0s
[CV] END colsample_bytree=0.6, gamma=0.5, max_depth=4, min_child_weight=5, subsample=0.6; total time=   0.0s
[CV] END colsample_bytree=0.6, gamma=0.5, max_depth=4, min_chi

[CV] END colsample_bytree=0.6, gamma=0.5, max_depth=3, min_child_weight=1, subsample=0.8; total time=   0.0s
[CV] END colsample_bytree=0.6, gamma=0.5, max_depth=4, min_child_weight=1, subsample=0.6; total time=   0.0s
[CV] END colsample_bytree=0.6, gamma=0.5, max_depth=4, min_child_weight=1, subsample=0.8; total time=   0.0s
[CV] END colsample_bytree=0.6, gamma=0.5, max_depth=4, min_child_weight=5, subsample=0.6; total time=   0.0s
[CV] END colsample_bytree=0.6, gamma=0.5, max_depth=4, min_child_weight=5, subsample=1.0; total time=   0.0s
[CV] END colsample_bytree=0.6, gamma=0.5, max_depth=4, min_child_weight=5, subsample=1.0; total time=   0.0s
[CV] END colsample_bytree=0.6, gamma=0.5, max_depth=5, min_child_weight=1, subsample=0.6; total time=   0.0s
[CV] END colsample_bytree=0.6, gamma=0.5, max_depth=5, min_child_weight=1, subsample=0.6; total time=   0.0s
[CV] END colsample_bytree=0.6, gamma=0.5, max_depth=5, min_child_weight=5, subsample=0.8; total time=   0.0s
[CV] END colsample_

GridSearchCV(cv=3,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     callbacks=None, colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                     feature_types=None, gamma=None,
                                     gpu_id=None, grow_policy=None,
                                     importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None,...
                                     max_delta_step=None, max_depth=None,
                                     max_leaves=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                   

In [21]:
print(clf.best_params_)

print(clf.score(X_train, y_train))
print(clf.score(X_test, y_test))

{'colsample_bytree': 0.8, 'gamma': 1.5, 'max_depth': 4, 'min_child_weight': 1, 'subsample': 0.6}
0.9609828266419124
0.890625


In [22]:
submission_df = pd.read_csv('test.csv')

submission_df

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [23]:
submission_df = submission_df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis = 1).values

submission_df = imp.transform(submission_df)

submission_df = pd.DataFrame(submission_df, columns= ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'])

submission_df['Sex'] = submission_df['Sex'].apply(lambda x: 0 if x == 'male' else 1)

submission_df['Embarked'] = submission_df['Embarked'].replace({'S': 0, 'C': 1, 'Q': 2})

submission_df = scaler.transform(submission_df)

submission_df

array([[ 0.82737724, -0.73769513,  0.44973902, ..., -0.47367361,
        -0.49078316,  2.57919938],
       [ 0.82737724,  1.35557354,  1.39727132, ..., -0.47367361,
        -0.50747884, -0.56883712],
       [-0.36936484, -0.73769513,  2.53431007, ..., -0.47367361,
        -0.45336687,  2.57919938],
       ...,
       [ 0.82737724, -0.73769513,  0.75294936, ..., -0.47367361,
        -0.50244517, -0.56883712],
       [ 0.82737724, -0.73769513, -0.3461881 , ..., -0.47367361,
        -0.48633742, -0.56883712],
       [ 0.82737724, -0.73769513, -0.3461881 , ...,  0.76762988,
        -0.19824428,  1.00518113]])

In [24]:
predictions = clf.predict(submission_df)

predictions

array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [25]:
submission = pd.DataFrame({'PassengerId': pd.read_csv('test.csv')['PassengerId'], 'Survived': predictions})

submission.to_csv('submission.csv', index=False)

[CV] END colsample_bytree=0.8, gamma=5, max_depth=4, min_child_weight=5, subsample=1.0; total time=   0.0s
[CV] END colsample_bytree=0.8, gamma=5, max_depth=4, min_child_weight=5, subsample=1.0; total time=   0.0s
[CV] END colsample_bytree=0.8, gamma=5, max_depth=4, min_child_weight=10, subsample=0.6; total time=   0.0s
[CV] END colsample_bytree=0.8, gamma=5, max_depth=4, min_child_weight=10, subsample=0.6; total time=   0.0s
[CV] END colsample_bytree=0.8, gamma=5, max_depth=4, min_child_weight=10, subsample=0.6; total time=   0.0s
[CV] END colsample_bytree=0.8, gamma=5, max_depth=4, min_child_weight=10, subsample=0.8; total time=   0.0s
[CV] END colsample_bytree=0.8, gamma=5, max_depth=4, min_child_weight=10, subsample=0.8; total time=   0.0s
[CV] END colsample_bytree=0.8, gamma=5, max_depth=4, min_child_weight=10, subsample=0.8; total time=   0.0s
[CV] END colsample_bytree=0.8, gamma=5, max_depth=5, min_child_weight=5, subsample=0.8; total time=   0.0s
[CV] END colsample_bytree=0.8, 

[CV] END colsample_bytree=0.8, gamma=5, max_depth=3, min_child_weight=5, subsample=0.8; total time=   0.0s
[CV] END colsample_bytree=0.8, gamma=5, max_depth=3, min_child_weight=5, subsample=0.8; total time=   0.0s
[CV] END colsample_bytree=0.8, gamma=5, max_depth=3, min_child_weight=5, subsample=1.0; total time=   0.0s
[CV] END colsample_bytree=0.8, gamma=5, max_depth=3, min_child_weight=5, subsample=1.0; total time=   0.0s
[CV] END colsample_bytree=0.8, gamma=5, max_depth=3, min_child_weight=5, subsample=1.0; total time=   0.0s
[CV] END colsample_bytree=0.8, gamma=5, max_depth=3, min_child_weight=10, subsample=0.6; total time=   0.0s
[CV] END colsample_bytree=0.8, gamma=5, max_depth=4, min_child_weight=10, subsample=1.0; total time=   0.0s
[CV] END colsample_bytree=0.8, gamma=5, max_depth=4, min_child_weight=10, subsample=1.0; total time=   0.0s
[CV] END colsample_bytree=0.8, gamma=5, max_depth=4, min_child_weight=10, subsample=1.0; total time=   0.0s
[CV] END colsample_bytree=0.8, ga

[CV] END colsample_bytree=0.8, gamma=5, max_depth=3, min_child_weight=10, subsample=1.0; total time=   0.0s
[CV] END colsample_bytree=0.8, gamma=5, max_depth=3, min_child_weight=10, subsample=1.0; total time=   0.0s
[CV] END colsample_bytree=0.8, gamma=5, max_depth=5, min_child_weight=1, subsample=0.8; total time=   0.0s
[CV] END colsample_bytree=0.8, gamma=5, max_depth=5, min_child_weight=1, subsample=1.0; total time=   0.0s
[CV] END colsample_bytree=0.8, gamma=5, max_depth=5, min_child_weight=1, subsample=1.0; total time=   0.0s
[CV] END colsample_bytree=0.8, gamma=5, max_depth=5, min_child_weight=1, subsample=1.0; total time=   0.0s
[CV] END colsample_bytree=0.8, gamma=5, max_depth=5, min_child_weight=5, subsample=0.6; total time=   0.0s
[CV] END colsample_bytree=0.8, gamma=5, max_depth=5, min_child_weight=5, subsample=0.6; total time=   0.0s
[CV] END colsample_bytree=0.8, gamma=5, max_depth=5, min_child_weight=5, subsample=0.6; total time=   0.0s
[CV] END colsample_bytree=0.8, gamm