In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

In [33]:
train_data=pd.read_csv('data/train.csv')
test_data=pd.read_csv('data/test.csv')


In [3]:
train_data.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,1,1,"Oconnor, Frankie",male,,2,0,209245,27.14,C12239,S
1,1,0,3,"Bryan, Drew",male,,0,0,27323,13.35,,S
2,2,0,3,"Owens, Kenneth",male,0.33,1,2,CA 457703,71.29,,S
3,3,0,3,"Kramer, James",male,19.0,0,0,A. 10866,13.04,,S
4,4,1,3,"Bond, Michael",male,25.0,0,0,427635,7.76,,S


In [34]:
def create_extra_features(data):
    data['Ticket_type'] = data['Ticket'].map(lambda x: str(x)[:3].replace('.', '').replace('/', '') if len(str(x).split()) > 1 else 'X').astype('category')
    data['Ticket_number'] = data['Ticket'].map(lambda x: str(x)[-1] if len(str(x).split()) > 1 else -1).astype('float')
    data['Cabin_type']=data['Cabin'].map(lambda x: str(x)[0] if not pd.isnull(x) else 'X').astype('category')
    data['Cabin_number']=data['Cabin'].map(lambda x: str(x)[1:] if not pd.isnull(x) else -1).astype('float')
    data['Family_size']=data['SibSp']+data['Parch']+1
    data['Categorical_age']=pd.cut(data['Age'],bins=[0,10,20,30,40,50,60,70,80,90,100],labels=[1,2,3,4,5,6,7,8,9,10])
    data['Categorical_fare']=pd.cut(data['Fare'],bins=[0,10,20,30,40,50,60,70,80,90,100,np.inf],labels=[1,2,3,4,5,6,7,8,9,10,11])
    return data

mean=train_data['Age'].mean()
train_data['Age'].fillna(mean, inplace=True)
train_data['Embarked'].fillna('S', inplace=True)
train_data['Embarked']=train_data['Embarked'].map({'S':0, 'C':1, 'Q':2})
train_data['Fare'].fillna(train_data['Fare'].mean(), inplace=True)
#test
mean=test_data['Age'].mean()
test_data['Age'].fillna(mean, inplace=True)
test_data['Embarked'].fillna('S', inplace=True)
test_data['Embarked']=test_data['Embarked'].map({'S':0, 'C':1, 'Q':2})
test_data['Fare'].fillna(test_data['Fare'].mean(), inplace=True)
train_data=create_extra_features(train_data)
test_data=create_extra_features(test_data)
train_data=train_data.drop(['PassengerId','Name','Ticket','Cabin'],axis=1)
test_data=test_data.drop(['PassengerId','Name','Ticket','Cabin'],axis=1)

In [38]:
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
# train_data['Embarked']=le.fit_transform(train_data['Embarked'])
train_data['Sex']=le.fit_transform(train_data['Sex'])
test_data['Sex']=le.fit_transform(test_data['Sex'])

# split

cat_columns=['Sex','Ticket_type','Ticket_number','Cabin_type','Cabin_number','Categorical_age','Categorical_fare']
# cast all categorical type as object

for col in cat_columns:
    train_data[col]=le.fit_transform(train_data[col])
    test_data[col]=le.fit_transform(test_data[col])
X=train_data.drop(['Survived'],axis=1)
y=train_data['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [39]:
%%time
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

params={'learning_rate':0.01,
        'max_depth':5,
        'n_estimators':1000,
        'tree_method':'gpu_hist',
        'enable_categorical':True,
        'random_state':0}
xgb=XGBClassifier(**params)
xgb.fit(X_train,y_train,eval_set=[(X_test,y_test)],early_stopping_rounds=50,verbose=False)
y_pred=xgb.predict(X_test)
accuracy_score(y_test,y_pred)

CPU times: user 2.65 s, sys: 376 ms, total: 3.03 s
Wall time: 2.52 s


0.7833

In [7]:
train_data.isna().sum()

Survived            0
Pclass              0
Sex                 0
Age                 0
SibSp               0
Parch               0
Fare                0
Embarked            0
Ticket_type         0
Ticket_number       0
Cabin_type          0
Cabin_number        0
Family_size         0
Categorical_age     0
Categorical_fare    0
dtype: int64

In [40]:
%%time
# lightgbm

from lightgbm import LGBMClassifier
params={'learning_rate':0.07,

        'max_depth':5,
        'n_estimators':1000,
        'n_jobs':-1,
        }
lgb=LGBMClassifier(**params)
lgb.fit(X_train,y_train,eval_set=[(X_test,y_test)],early_stopping_rounds=50,verbose=False)
y_pred=lgb.predict(X_test)
accuracy_score(y_test,y_pred)

CPU times: user 3.34 s, sys: 20.6 ms, total: 3.36 s
Wall time: 866 ms


0.78365

In [32]:
submission=pd.read_csv('data/sample_submission.csv')
submission['Survived']=lgb.predict(test_data)
submission.to_csv('submission.csv',index=False)

In [None]:
# lightgbm get 0.797 and 0.799 with this feature engineering!

In [41]:
import optuna
from lightgbm import LGBMClassifier

# lightgbm
def objective(trial):
    params={'learning_rate':trial.suggest_loguniform('learning_rate',1e-3,1e-1),
        'max_depth':trial.suggest_int('max_depth',3,13),
        'n_estimators':trial.suggest_int('n_estimators',100,3000),
        'n_jobs':-1,
        'random_state':0,
        }
    lgb=LGBMClassifier(**params)
    lgb.fit(X_train,y_train,eval_set=[(X_test,y_test)],early_stopping_rounds=50,verbose=False)
    y_pred=lgb.predict(X_test)
    return accuracy_score(y_test,y_pred)
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=25)

[32m[I 2022-07-13 03:39:45,700][0m A new study created in memory with name: no-name-21c35556-2158-4502-9499-608366b7867d[0m
[32m[I 2022-07-13 03:39:49,087][0m Trial 0 finished with value: 0.7836 and parameters: {'learning_rate': 0.011729725838599308, 'max_depth': 13, 'n_estimators': 1009}. Best is trial 0 with value: 0.7836.[0m
[32m[I 2022-07-13 03:39:57,430][0m Trial 1 finished with value: 0.7826 and parameters: {'learning_rate': 0.004681345896645719, 'max_depth': 4, 'n_estimators': 2123}. Best is trial 0 with value: 0.7836.[0m
[32m[I 2022-07-13 03:39:58,988][0m Trial 2 finished with value: 0.7837 and parameters: {'learning_rate': 0.005621302860535888, 'max_depth': 7, 'n_estimators': 257}. Best is trial 2 with value: 0.7837.[0m
[32m[I 2022-07-13 03:40:00,099][0m Trial 3 finished with value: 0.78435 and parameters: {'learning_rate': 0.04149376134265786, 'max_depth': 8, 'n_estimators': 1140}. Best is trial 3 with value: 0.78435.[0m
[32m[I 2022-07-13 03:40:01,844][0m Tri

In [24]:
params = study.best_params
params['n_jobs']=-1
params['random_state']=0
lgb=LGBMClassifier(**params)
lgb.fit(X_train,y_train,eval_set=[(X_test,y_test)],early_stopping_rounds=50,verbose=False)
y_pred=lgb.predict(X_test)
accuracy_score(y_test,y_pred)



0.7806

In [21]:
%%time
# catboost 
from catboost import CatBoostClassifier
params={
      'cat_features':cat_columns,
    'task_type': 'GPU',
        'random_state':0}
cb=CatBoostClassifier(**params)
cb.fit(X_train,y_train,eval_set=[(X_test,y_test)],early_stopping_rounds=50,verbose=False)
y_pred=cb.predict(X_test)
accuracy_score(y_test,y_pred)

CPU times: user 18.7 s, sys: 4.66 s, total: 23.3 s
Wall time: 15.4 s


0.7804

In [107]:
submission=pd.read_csv('data/sample_submission.csv')
submission['Survived']=cb.predict(test_data)
submission.to_csv('submission.csv',index=False)


In [None]:
import optuna
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score

# catboost
def objective(trial):
    params={'learning_rate':trial.suggest_loguniform('learning_rate',1e-3,1e-1),'n_estimators':trial.suggest_int('n_estimators',100,3500),'max_depth':trial.suggest_int('max_depth',3,13), 'max_bin': trial.suggest_int('max_bin', 200, 400),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 300), 'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 0.0001, 1.0, log = True),'random_seed': 42,
'task_type': 'GPU',
        'loss_function': 'Logloss' }
    cat=CatBoostClassifier(**params)
    cat.fit(X_train,y_train,eval_set=[(X_test,y_test)],early_stopping_rounds=50,verbose=False)
    y_pred=cat.predict(X_test)
    return accuracy_score(y_test,y_pred)
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=25)
print(study.best_params)

[32m[I 2022-07-13 03:41:20,632][0m A new study created in memory with name: no-name-fe6c7446-9cbf-43cb-8370-be44dd16e2d4[0m
[32m[I 2022-07-13 03:41:21,979][0m Trial 0 finished with value: 0.78505 and parameters: {'learning_rate': 0.09921111190183465, 'n_estimators': 276, 'max_depth': 5, 'max_bin': 386, 'min_data_in_leaf': 264, 'l2_leaf_reg': 0.22108578233701476}. Best is trial 0 with value: 0.78505.[0m
[32m[I 2022-07-13 03:41:27,863][0m Trial 1 finished with value: 0.78305 and parameters: {'learning_rate': 0.003990814250827218, 'n_estimators': 2111, 'max_depth': 3, 'max_bin': 395, 'min_data_in_leaf': 38, 'l2_leaf_reg': 0.06851789642269748}. Best is trial 0 with value: 0.78505.[0m
[32m[I 2022-07-13 03:41:34,269][0m Trial 2 finished with value: 0.78375 and parameters: {'learning_rate': 0.011172832166959563, 'n_estimators': 2559, 'max_depth': 5, 'max_bin': 311, 'min_data_in_leaf': 63, 'l2_leaf_reg': 0.00250503331006581}. Best is trial 0 with value: 0.78505.[0m
[32m[I 2022-07-

{'learning_rate': 0.09921111190183465, 'n_estimators': 276, 'max_depth': 5, 'max_bin': 386, 'min_data_in_leaf': 264, 'l2_leaf_reg': 0.22108578233701476}


In [14]:
print(study.best_params)

{'learning_rate': 0.006993536952992658, 'n_estimators': 2587, 'max_depth': 6, 'max_bin': 327, 'min_data_in_leaf': 235, 'l2_leaf_reg': 0.22612238365483142}


In [43]:
# train on best params
params=study.best_params
params['task_type']='GPU'
params['loss_function']='Logloss'
cat=CatBoostClassifier(**params)
cat.fit(X_train,y_train,eval_set=[(X_test,y_test)],early_stopping_rounds=50,verbose=False)
y_pred=cat.predict(X_test)
accuracy_score(y_test,y_pred)

0.7846

In [17]:
submission=pd.read_csv('data/sample_submission.csv')
submission['Survived']=cat.predict(test_data)
submission.to_csv('submission.csv',index=False)

In [None]:
import optuna
from xgboost import XGBClassifier
def objective(trial):
    params={'learning_rate':trial.suggest_loguniform('learning_rate',1e-3,1e-1),
        'max_depth':trial.suggest_int('max_depth',3,13),
        'n_estimators':trial.suggest_int('n_estimators',100,3000),
        'tree_method':'gpu_hist',
        'random_state':0,
        }
    xgb=XGBClassifier(**params)
    xgb.fit(X_train,y_train,eval_set=[(X_test,y_test)],early_stopping_rounds=50,verbose=False)
    y_pred=xgb.predict(X_test)
    return accuracy_score(y_test,y_pred)
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=25)

[32m[I 2022-07-13 03:47:53,718][0m A new study created in memory with name: no-name-5ceae411-775f-4814-a14b-5ea844d09103[0m
[32m[I 2022-07-13 03:47:54,533][0m Trial 0 finished with value: 0.78195 and parameters: {'learning_rate': 0.057330895103874714, 'max_depth': 4, 'n_estimators': 372}. Best is trial 0 with value: 0.78195.[0m
[32m[I 2022-07-13 03:47:59,766][0m Trial 1 finished with value: 0.78195 and parameters: {'learning_rate': 0.008797228625690387, 'max_depth': 9, 'n_estimators': 288}. Best is trial 0 with value: 0.78195.[0m
[32m[I 2022-07-13 03:48:02,342][0m Trial 2 finished with value: 0.7838 and parameters: {'learning_rate': 0.016699646816801694, 'max_depth': 7, 'n_estimators': 806}. Best is trial 2 with value: 0.7838.[0m
[32m[I 2022-07-13 03:48:05,529][0m Trial 3 finished with value: 0.78015 and parameters: {'learning_rate': 0.044956966160650966, 'max_depth': 10, 'n_estimators': 1042}. Best is trial 2 with value: 0.7838.[0m
[32m[I 2022-07-13 03:48:06,500][0m T

In [14]:
study.best_params

{'learning_rate': 0.03728054777542706, 'max_depth': 3, 'n_estimators': 2814}

In [45]:
params=study.best_params
params['tree_method']='gpu_hist'
params['random_state']=0
xgb=XGBClassifier(**params)
xgb.fit(X_train,y_train,eval_set=[(X_test,y_test)],early_stopping_rounds=50,verbose=False)
y_pred=xgb.predict(X_test)
accuracy_score(y_test,y_pred)


0.78445

In [46]:
import joblib
joblib.dump(xgb,'xgb.pkl')
joblib.dump(lgb,'lgb.pkl')
joblib.dump(cat,'cat.pkl')


['cat.pkl']

In [65]:
# loading models
xgb=joblib.load('xgb.pkl')
lgb=joblib.load('lgb.pkl')
cat=joblib.load('cat.pkl')

In [None]:
# voting classifier
from sklearn.ensemble import VotingClassifier
voting=VotingClassifier(estimators=[('xgb',xgb),('lgb',lgb),('cat',cat)],voting='soft',verbose=0)
voting.fit(X_train,y_train)
y_pred=voting.predict(X_test)
accuracy_score(y_test,y_pred)


In [54]:
submission=pd.read_csv('data/sample_submission.csv')
submission['Survived']=voting.predict(test_data)
submission.to_csv('submission.csv',index=False)


It improves scores by at least 0.1 or something. scores are 0.79924
0.80015

In [None]:
# stacking
from sklearn.ensemble import StackingClassifier
clf=StackingClassifier(estimators=[('xgb',xgb),('lgb',lgb),('cat',cat)],stack_method='predict')
clf.fit(X_train,y_train)
y_pred=clf.predict(X_test)
accuracy_score(y_test,y_pred)

In [68]:
submission=pd.read_csv('data/sample_submission.csv')
submission['Survived']=clf.predict(test_data)
submission.to_csv('submission.csv',index=False)

Now scores are 0.80107
0.80152