In [1]:
import gc
import os
import logging
import datetime
import warnings
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import StratifiedKFold
warnings.filterwarnings('ignore')

PATH = "./dataset/"

train_df = pd.read_csv(PATH+"train.csv")
test_df = pd.read_csv(PATH+"test.csv")

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [8]:
train = train_df.drop(labels=['Ticket', 'Cabin', 'Embarked','Name'], axis=1)
test = test_df.drop(labels=['Ticket', 'Cabin', 'Embarked','Name'], axis=1)


train = train.replace('male', 1)
train = train.replace('female', 0)

test = test.replace('male', 1)
test = test.replace('female', 0)

train_mean_age = train.Age.mean()
test_mean_age = test.Age.mean()
test_mean_fare = test.Fare.mean()

train.Age = train.Age.fillna(train_mean_age)
test.Age = test.Age.fillna(test_mean_age)
test.Fare = test.Fare.fillna(test_mean_fare)

In [14]:
features = [c for c in train.columns if c not in ['PassengerId', 'Survived','Name']]
target = train['Survived']

In [15]:
target.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [36]:
param = {
    'bagging_freq': 5,
    'bagging_fraction': 0.4,
    'boost_from_average':'false',
    'boost': 'gbdt',
    'feature_fraction': 0.05,
    'learning_rate': 0.01,
    'max_depth': -1,  
    'metric':'auc',
    'min_data_in_leaf': 5,
    'min_sum_hessian_in_leaf': 3.0,
    #'num_leaves': 13,
    'device_type': 'cpu',
    #'max_bin': 63,
    'tree_learner': 'serial',
    'objective': 'binary', 
    'verbosity': 1,
    #'gpu_platform_id': 0,
    #'gpu_device_id': 0
}

folds = StratifiedKFold(n_splits=10, shuffle=False, random_state=44000)
oof = np.zeros(len(train))
predictions = np.zeros(len(test))
feature_importance_df = pd.DataFrame()
MAX_FOLDS = 0
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, target.values)):
    print("Fold {}".format(fold_))
    trn_data = lgb.Dataset(train.iloc[trn_idx][features], label=target.iloc[trn_idx])
    val_data = lgb.Dataset(train.iloc[val_idx][features], label=target.iloc[val_idx])

    num_round = 100000
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=1000, early_stopping_rounds = 3000)
    oof[val_idx] = clf.predict(train.iloc[val_idx][features], num_iteration=clf.best_iteration)
    
    fold_importance_df = pd.DataFrame()
    fold_importance_df["Feature"] = features
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    
    predictions += clf.predict(test[features], num_iteration=clf.best_iteration) / (MAX_FOLDS+1)
    
    if fold_ == MAX_FOLDS:
        break

print("CV score: {:<8.5f}".format(roc_auc_score(target, oof)))

Fold 0
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.893266	valid_1's auc: 0.80987
[2000]	training's auc: 0.902729	valid_1's auc: 0.815584
[3000]	training's auc: 0.907924	valid_1's auc: 0.818182
[4000]	training's auc: 0.911716	valid_1's auc: 0.822857
[5000]	training's auc: 0.914756	valid_1's auc: 0.822857
[6000]	training's auc: 0.916819	valid_1's auc: 0.821299
[7000]	training's auc: 0.919332	valid_1's auc: 0.821818
[8000]	training's auc: 0.920809	valid_1's auc: 0.827013
[9000]	training's auc: 0.922437	valid_1's auc: 0.823896
[10000]	training's auc: 0.923677	valid_1's auc: 0.821299
[11000]	training's auc: 0.924692	valid_1's auc: 0.814545
Early stopping, best iteration is:
[8110]	training's auc: 0.921224	valid_1's auc: 0.828052
CV score: 0.50444 


In [38]:
d = {'PassengerId': test_df.PassengerId, 'Survived': np.round(predictions).astype(int)}
df = pd.DataFrame(data=d)
df.to_csv("./sub.csv", index=False)

In [37]:
MAX_FOLDS

0

In [33]:
df['Survived'].value_counts()

0    273
1    145
Name: Survived, dtype: int64

In [34]:
train['Survived'].value_counts()

0    549
1    342
Name: Survived, dtype: int64

In [35]:
df.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
