In [1]:
import lightgbm as lgb
import numpy as np
import pandas as pd

In [2]:
# Seed NumPy's global RNG so NumPy-driven randomness repeats across runs.
np.random.seed(42)

# Lift NumPy's summarisation threshold so arrays are printed in full.
np.set_printoptions(threshold=np.inf)

# Lift both pandas display limits so frames render without truncation.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

In [3]:
# Load the labelled and unlabelled CSVs, both keyed by PassengerId.
df = pd.read_csv('data/train.csv', index_col='PassengerId')
df_test = pd.read_csv('data/test.csv', index_col='PassengerId')

# Stack the training features (target column removed) on top of the test rows
# so statistics can be computed over every passenger at once.
# `columns=` replaces the positional axis argument (`drop('Survived', 1)`),
# which pandas 2.0 no longer accepts.
combine = pd.concat([df.drop(columns='Survived'), df_test])

# Feature engineering

In [4]:
# Inspired by https://www.kaggle.com/headsortails/pytanic

# Maps the raw title token extracted from `Name` onto the coarser title groups
# used as a feature. Tokens that are not keys here become NaN after `.map`.
titles_dict = {'Capt.': 'Other',
               'Major.': 'Other',
               'Jonkheer.': 'Other',
               'Don.': 'Other',
               'Sir.': 'Other',
               'Dr.': 'Other',
               'Rev.': 'Other',
               'Countess.': 'Other',
               'Dona.': 'Other',
               'Mme.': 'Mrs',
               'Mlle.': 'Miss',
               'Ms.': 'Miss',
               'Mr.': 'Mr',
               'Mrs.': 'Mrs',
               'Miss.': 'Miss',
               'Master.': 'Master',
               'Lady.': 'Other'}

# Hand-set the embarkation port of two training passengers
# (presumably the rows where it is missing -- TODO confirm against the raw data).
df.loc[62, 'Embarked'] = "C"
df.loc[830, 'Embarked'] = "C"

# Give test passenger 1044 the median 3rd-class fare. `combine` at this point
# is still the frame built right after loading the CSVs.
df_test.loc[1044, 'Fare'] = combine['Fare'][combine['Pclass'] == 3].dropna().median()

# Rebuild the stacked frame so it includes the manual fixes above.
# `columns=` replaces the positional axis argument removed in pandas 2.0.
combine = pd.concat([df.drop(columns='Survived'), df_test])
y = df['Survived']

# --- Age / cabin / family indicators ---
combine['Child'] = combine['Age'] <= 10
combine['Cabin_known'] = combine['Cabin'].notnull()
combine['Age_known'] = combine['Age'].notnull()
combine['Family'] = combine['SibSp'] + combine['Parch']
combine['Alone'] = combine['Family'] == 0
combine['Large_Family'] = (combine['SibSp'] > 2) | (combine['Parch'] > 3)

# --- Deck (first cabin character, 'U' when the cabin is missing) and ticket prefix ---
combine['Deck'] = combine['Cabin'].str[0].fillna(value='U')
combine['Ttype'] = combine['Ticket'].str[0]

# --- Title: the first token after the comma in `Name`, grouped via titles_dict ---
combine['Title'] = combine['Name'].apply(lambda x: x.split(',')[1].split(' ')[1])
combine['Title'] = combine['Title'].map(titles_dict)

# Order of magnitude of the fare: floor(log10(fare + 1)).
combine['Fare_cat'] = np.floor(np.log10(combine['Fare'] + 1)).astype('int')
combine['Bad_ticket'] = combine['Ttype'].isin(['3','4','5','6','7','8','A','L','W'])
combine['Young'] = (combine['Age'] <= 30) | (combine['Title'].isin(['Master','Miss','Mlle']))

# --- Ticket sharing: number of passengers (train + test) on the same ticket ---
# Computed once and reused; the columns are still created in the original order.
ticket_size = combine.groupby('Ticket')['Name'].transform('count')
combine['Shared_ticket'] = np.where(ticket_size > 1, 1, 0)
combine['Ticket_group'] = ticket_size

# Fare per passenger on the ticket, bucketed: 0 (< 8.5), 1 (otherwise), 2 (> 16.0).
combine['Fare_eff'] = combine['Fare'] / combine['Ticket_group']
combine['Fare_eff_cat'] = np.where(combine['Fare_eff'] > 16.0, 2, 1)
combine['Fare_eff_cat'] = np.where(combine['Fare_eff'] < 8.5, 0, combine['Fare_eff_cat'])

In [5]:
# Hand-assigned titles, keyed by PassengerId (presumably the passengers whose
# extracted title token is not a key of `titles_dict` -- TODO confirm).
title_overrides = {760: 'Mrs', 648: 'Mr', 695: 'Mr', 1023: 'Mr', 1094: 'Mr'}
for passenger_id, fixed_title in title_overrides.items():
    combine.loc[passenger_id, 'Title'] = fixed_title

In [6]:
from sklearn.preprocessing import OrdinalEncoder, Normalizer
from sklearn.impute import SimpleImputer

# Columns handed to LightGBM as categorical features. Each name appears once
# (the list previously repeated 'Title').
cat_features = ['Pclass', 'Sex', 'Embarked', 'Title',
                'Child', 'Cabin_known', 'Age_known', 'Alone', 'Large_Family',
                'Deck', 'Ttype', 'Bad_ticket', 'Young', 'Shared_ticket',
                'Fare_cat', 'Fare_eff_cat']
cont_features = ['Age', 'Fare', 'Family', 'SibSp', 'Parch',
                 'Ticket_group', 'Fare_eff']

# Disabled alternative: scikit-learn based imputation / encoding.
# imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
# norm = Normalizer()
# imp_cat = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
# enc = OrdinalEncoder()

# combine[cont_features] = imp_median.fit_transform(combine[cont_features])
# # combine[cont_features] = norm.fit_transform(combine[cont_features])
# # combine[cat_features] = imp_cat.fit_transform(combine[cat_features])
# combine[cat_features] = enc.fit_transform(combine[cat_features])

# Replace each label column with integer codes 0..n-1, numbered in the order
# pandas lists the categories. The number of levels is taken from the data
# rather than hard-coded, and `rename_categories` is used because assigning to
# `.cat.categories` is not supported by pandas 2.x. A missing value still makes
# the final `astype("int")` raise, as before.
for label_col in ['Sex', 'Embarked', 'Deck', 'Title', 'Ttype']:
    as_category = combine[label_col].astype("category")
    level_codes = list(range(len(as_category.cat.categories)))
    combine[label_col] = as_category.cat.rename_categories(level_codes).astype("int")

# Boolean indicator columns become 0/1 integers.
for flag_col in ['Child', 'Cabin_known', 'Age_known', 'Alone', 'Large_Family', 'Bad_ticket', 'Young']:
    combine[flag_col] = combine[flag_col].astype("int")

# Free-text columns are not fed to the model.
combine = combine.drop(columns=['Name', 'Ticket', 'Cabin'])

# Split the stacked frame back into train / test by PassengerId label.
# `.loc` slices include both end labels; this assumes training ids end at 891
# and test ids start at 892 -- TODO confirm if the input files change.
df = combine.loc[:891]
df_test = combine.loc[892:]

In [15]:
# Preview the first rows of the encoded training frame.
df.head()

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Child,Cabin_known,Age_known,Family,Alone,Large_Family,Deck,Ttype,Title,Fare_cat,Bad_ticket,Young,Shared_ticket,Ticket_group,Fare_eff,Fare_eff_cat
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
1,3,1,22.0,1,0,7.25,2,0,0,1,1,0,0,8,9,2,0,1,1,0,1,7.25,0
2,1,0,38.0,1,0,71.2833,0,0,1,1,1,0,0,2,13,3,1,0,0,1,2,35.64165,2
3,3,0,26.0,0,0,7.925,2,0,0,1,0,1,0,8,14,1,0,0,1,0,1,7.925,0
4,1,0,35.0,1,0,53.1,2,0,1,1,1,0,0,2,0,3,1,0,0,1,2,26.55,2
5,3,1,35.0,0,0,8.05,2,0,0,1,0,1,0,8,2,2,0,1,0,0,1,8.05,0


In [34]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for validation; the fixed random_state
# keeps the split identical between runs.
df_train, df_val, y_train, y_val = train_test_split(
    df, y, test_size=0.2, random_state=42
)

# LightGBM dataset wrappers: every labelled row, the training split, the
# unlabelled test rows, and the validation split (tied to `train_data` via
# `reference`).
all_data = lgb.Dataset(df, label=y, categorical_feature=cat_features)
train_data = lgb.Dataset(
    df_train,
    label=y_train,
    categorical_feature=cat_features,
    free_raw_data=False,
)
test_data = lgb.Dataset(df_test)
validation_data = lgb.Dataset(
    df_val,
    label=y_val,
    reference=train_data,
    free_raw_data=False,
)

# Callbacks

In [9]:
def _accuracy_by_threshold(preds, target):
    """Tabulate classification accuracy for each distinct score used as a cut-off.

    Rows are ordered by descending score. Using a row's ``pred`` as the
    threshold means "predict 1 when score >= pred", so for that row:

    * ``TP`` is the number of positives with score >= threshold,
    * ``TN`` is the number of negatives with score < threshold,
    * ``ACC`` is ``(TP + TN) / len(target)``.

    Only thresholds equal to an observed score are evaluated; the
    "predict everything as 0" cut-off is not part of the table.

    Parameters
    ----------
    preds : 1-D sequence of scores.
    target : 1-D sequence of 0/1 labels, same length as ``preds``.

    Returns
    -------
    pandas.DataFrame with columns ``pred``, ``target``, ``TP``, ``TN``, ``ACC``,
    one row per distinct score.
    """
    scored = pd.DataFrame({'pred': preds,
                           'target': target})
    num_1 = int(scored['target'].sum())
    num_0 = scored.shape[0] - num_1
    scored = scored.sort_values(by='pred', ascending=False)

    # Within a run of tied scores only the last row carries the complete
    # cumulative counts, so every earlier row of the run is dropped below.
    tied_not_last = scored.duplicated('pred', keep='last')

    scored = scored.assign(TP=scored['target'].cumsum(),
                           TN=num_0 - (scored['target'] == 0).cumsum())
    scored = scored.loc[~tied_not_last, :]

    return scored.assign(ACC=(scored.TP + scored.TN) / len(target))


def find_best_acc(preds, target):
    """Return the highest accuracy reachable by thresholding ``preds``.

    Parameters
    ----------
    preds : 1-D sequence of scores.
    target : 1-D sequence of 0/1 labels, same length as ``preds``.

    Returns
    -------
    float : maximum of ``ACC`` over all candidate thresholds.
    """
    return _accuracy_by_threshold(preds, target).ACC.max()


def find_best_tresh(preds, target):
    """Return the score threshold that maximises accuracy.

    When several thresholds tie for the best accuracy the largest one is
    returned (the table is ordered by descending score and the first maximum
    wins). The row is picked by position, so the result is a scalar even when
    ``target`` is a Series whose index contains duplicate labels.

    Parameters
    ----------
    preds : 1-D sequence of scores.
    target : 1-D sequence of 0/1 labels, same length as ``preds``.

    Returns
    -------
    float : the chosen threshold; predict 1 when ``score >= threshold``.
    """
    table = _accuracy_by_threshold(preds, target)
    best_row = table['ACC'].values.argmax()
    return table['pred'].values[best_row]
    
def optimal_accuracy(preds, train_data):
    """Custom evaluation callback reporting the best thresholded accuracy.

    Parameters
    ----------
    preds : scores for the rows of ``train_data``.
    train_data : object exposing ``get_label()``; it is passed to
        ``lgb.train`` as ``feval``, so presumably LightGBM supplies the
        dataset being evaluated -- TODO confirm.

    Returns
    -------
    tuple : ``('o_acc', accuracy, True)`` -- metric name, metric value, and a
        flag that is always ``True`` (presumably "higher is better").
    """
    best_accuracy = find_best_acc(preds, train_data.get_label())
    return 'o_acc', best_accuracy, True

# Train

In [38]:
from sklearn.model_selection import GridSearchCV

# Booster settings: binary objective, DART boosting, at most 10 leaves per
# tree, no depth cap (-1), learning rate 0.05.
param = {
    'num_leaves': 10,
    'max_depth': -1,
    'objective': 'binary',
    'learning_rate': 0.05,
    'boosting': 'dart',
}
num_round = 200

# Train for `num_round` rounds, evaluating on the validation split with the
# custom 'o_acc' metric.
# Options that were tried and left disabled: categorical_feature=cat_features,
# nfold=4, verbose_eval=True.
bst = lgb.train(
    param,
    train_data,
    num_round,
    valid_sets=[validation_data],
    feval=optimal_accuracy,
)

# model = lgb.LGBMClassifier(num_leaves=31, objective='binary')

# param_grid = {
#     'learning_rate': [0.01, 0.1, 0.05],
#     'boosting_type': ['gbdt', 'dart'],
#     'num_leaves': [31, 15, 10, 5, 25, 50],
#     'max_depth': [-1, 2, 3, 4, 5],
#     'n_estimators': [20, 40, 100]
# }

# gbm = GridSearchCV(model, param_grid, cv=4)
# gbm.fit(df, y)

# print('Best parameters found by grid search are:', gbm.best_params_)

[1]	valid_0's binary_logloss: 0.657036	valid_0's o_acc: 0.837989
[2]	valid_0's binary_logloss: 0.63689	valid_0's o_acc: 0.837989
[3]	valid_0's binary_logloss: 0.617415	valid_0's o_acc: 0.849162
[4]	valid_0's binary_logloss: 0.600558	valid_0's o_acc: 0.843575
[5]	valid_0's binary_logloss: 0.585435	valid_0's o_acc: 0.837989
[6]	valid_0's binary_logloss: 0.572506	valid_0's o_acc: 0.837989
[7]	valid_0's binary_logloss: 0.560155	valid_0's o_acc: 0.837989
[8]	valid_0's binary_logloss: 0.565026	valid_0's o_acc: 0.837989
[9]	valid_0's binary_logloss: 0.552522	valid_0's o_acc: 0.837989
[10]	valid_0's binary_logloss: 0.54214	valid_0's o_acc: 0.837989
[11]	valid_0's binary_logloss: 0.532822	valid_0's o_acc: 0.832402
[12]	valid_0's binary_logloss: 0.536387	valid_0's o_acc: 0.832402
[13]	valid_0's binary_logloss: 0.527125	valid_0's o_acc: 0.832402
[14]	valid_0's binary_logloss: 0.518832	valid_0's o_acc: 0.832402
[15]	valid_0's binary_logloss: 0.510819	valid_0's o_acc: 0.832402
[16]	valid_0's binary

[134]	valid_0's binary_logloss: 0.418741	valid_0's o_acc: 0.832402
[135]	valid_0's binary_logloss: 0.418654	valid_0's o_acc: 0.837989
[136]	valid_0's binary_logloss: 0.4194	valid_0's o_acc: 0.837989
[137]	valid_0's binary_logloss: 0.419764	valid_0's o_acc: 0.837989
[138]	valid_0's binary_logloss: 0.419416	valid_0's o_acc: 0.837989
[139]	valid_0's binary_logloss: 0.419035	valid_0's o_acc: 0.837989
[140]	valid_0's binary_logloss: 0.419507	valid_0's o_acc: 0.837989
[141]	valid_0's binary_logloss: 0.41982	valid_0's o_acc: 0.837989
[142]	valid_0's binary_logloss: 0.420166	valid_0's o_acc: 0.837989
[143]	valid_0's binary_logloss: 0.419502	valid_0's o_acc: 0.837989
[144]	valid_0's binary_logloss: 0.419782	valid_0's o_acc: 0.837989
[145]	valid_0's binary_logloss: 0.4196	valid_0's o_acc: 0.837989
[146]	valid_0's binary_logloss: 0.419961	valid_0's o_acc: 0.837989
[147]	valid_0's binary_logloss: 0.420338	valid_0's o_acc: 0.837989
[148]	valid_0's binary_logloss: 0.420752	valid_0's o_acc: 0.837989


In [39]:
# Persist the trained booster; the file name matches the settings used above
# (num_leaves=10, learning_rate=0.05, 200 rounds).
bst.save_model('lf10-lr0.05-ep200')

<lightgbm.basic.Booster at 0x2e8fe1b4dc8>

In [40]:
# Choose the decision threshold that maximises accuracy on the validation split.
val_preds = bst.predict(df_val)
tresh = find_best_tresh(val_preds, y_val)

# Score the test rows and turn the scores into 0/1 labels with that threshold.
preds = (bst.predict(df_test) >= tresh).astype(int)

In [41]:
# Two-column submission file: PassengerId and the predicted Survived label.
submission = pd.DataFrame({
    'PassengerId': df_test.index,
    'Survived': preds,
})
submission.to_csv('submission.csv', index=False)