In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import StratifiedKFold


# Directory that holds the pre-cleaned CSV files read below.
PATH = "./dataset/"

# Load the train and test splits with the same reader call so both frames are
# parsed identically.
train_df, test_df = (
    pd.read_csv(PATH + csv_name)
    for csv_name in ("clean_train.csv", "clean_test.csv")
)

In [2]:
# Preprocessing: build model-ready `train` / `test` frames from the raw frames.
# Every step produces a new frame, so re-running this cell is idempotent.

# Columns removed from both splits before modelling.
DROPPED_COLUMNS = ['Ticket', 'Name']
# Numeric codes substituted for the strings 'male' / 'female' wherever they
# occur (presumably only the sex column - confirm against the CSV schema).
SEX_CODES = {'male': 1, 'female': 0}

# infer_objects() makes the object -> integer conversion explicit instead of
# relying on replace()'s implicit downcasting, which newer pandas deprecates.
train = train_df.drop(columns=DROPPED_COLUMNS).replace(SEX_CODES).infer_objects()
test = test_df.drop(columns=DROPPED_COLUMNS).replace(SEX_CODES).infer_objects()

# Imputation statistics come from the TRAINING split only and are applied to
# both splits, so train and test go through the same transformation and no
# test-set statistics influence the features.
train_mean_age = train['Age'].mean()
train_mean_fare = train['Fare'].mean()

train = train.assign(Age=train['Age'].fillna(train_mean_age))
test = test.assign(
    Age=test['Age'].fillna(train_mean_age),
    Fare=test['Fare'].fillna(train_mean_fare),
)

In [3]:
# Model inputs are every column of `train` except the row identifier and the
# label; the label itself is kept as a separate series.
excluded_columns = ('PassengerId', 'Survived')
features = [name for name in train.columns if name not in excluded_columns]
target = train.loc[:, 'Survived']

In [4]:
# Display the first rows of the label series as a quick sanity check.
target.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [10]:
# LightGBM hyper-parameters for the binary classifier. AUC is the metric
# reported on the evaluation sets during training.
param = {
    'bagging_freq': 5,
    'bagging_fraction': 0.4,
    'boost_from_average':'false',
    'boost': 'gbdt',
    'feature_fraction': 0.05,
    'learning_rate': 0.01,
    'max_depth': -1,
    'metric':'auc',
    'min_data_in_leaf': 2,
    #'min_sum_hessian_in_leaf': 3.0,
    #'num_leaves': 13,
    'device_type': 'cpu',
    #'max_bin': 63,
    'tree_learner': 'serial',
    'objective': 'binary',
    'verbosity': 1,
    #'gpu_platform_id': 0,
    #'gpu_device_id': 0
}

# random_state is deliberately not passed: it only has an effect when
# shuffle=True, and scikit-learn >= 0.24 raises ValueError for the combination
# shuffle=False + random_state. The fold assignment itself is unchanged.
folds = StratifiedKFold(n_splits=10, shuffle=False)
n_folds = folds.get_n_splits()

# Upper bound on boosting rounds; early stopping normally ends training sooner.
num_round = 100000

oof = np.zeros(len(train))         # out-of-fold prediction for every training row
predictions = np.zeros(len(test))  # test predictions averaged over all folds

# Not used inside this cell; kept defined in case other cells reference them.
feature_importance_df = pd.DataFrame()
MAX_FOLDS = 0

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, target.values)):
    print("Fold {}".format(fold_))
    trn_features = train.iloc[trn_idx][features]
    val_features = train.iloc[val_idx][features]
    trn_data = lgb.Dataset(trn_features, label=target.iloc[trn_idx])
    val_data = lgb.Dataset(val_features, label=target.iloc[val_idx])

    # NOTE(review): `verbose_eval` / `early_stopping_rounds` are the keyword
    # form this notebook was written against; LightGBM >= 4.0 expects the
    # equivalent callbacks instead - confirm against the installed version.
    clf = lgb.train(param, trn_data, num_round, valid_sets=[trn_data, val_data],
                    verbose_eval=1000, early_stopping_rounds=3000)

    # Score the held-out rows with the best iteration found by early stopping.
    oof[val_idx] = clf.predict(val_features, num_iteration=clf.best_iteration)

    # Each fold contributes 1/n_folds of the final test prediction, so
    # `predictions` is already a proper average when the loop finishes.
    predictions += clf.predict(test[features], num_iteration=clf.best_iteration) / n_folds

print("CV score: {:<8.5f}".format(roc_auc_score(target, oof)))

Fold 0
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.928032	valid_1's auc: 0.821299
[2000]	training's auc: 0.939967	valid_1's auc: 0.829091
[3000]	training's auc: 0.945466	valid_1's auc: 0.828052
[4000]	training's auc: 0.948651	valid_1's auc: 0.828052
Early stopping, best iteration is:
[1297]	training's auc: 0.932839	valid_1's auc: 0.834805
Fold 1
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.930983	valid_1's auc: 0.798442
[2000]	training's auc: 0.942898	valid_1's auc: 0.800519
[3000]	training's auc: 0.948747	valid_1's auc: 0.801039
[4000]	training's auc: 0.951391	valid_1's auc: 0.802597
[5000]	training's auc: 0.952413	valid_1's auc: 0.8
[6000]	training's auc: 0.953857	valid_1's auc: 0.8
Early stopping, best iteration is:
[3565]	training's auc: 0.950052	valid_1's auc: 0.808312
Fold 2
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.937451	valid_1's auc: 0.778877

In [11]:
# Build the submission table: one row per test passenger, with the averaged
# probability rounded to the nearest integer class label.
d = dict(
    PassengerId=test_df['PassengerId'],
    Survived=np.rint(predictions).astype(int),
)
df = pd.DataFrame(d)
df.to_csv("./cleaned_sub.csv", index=False)

In [14]:
# The CV loop already divides each fold's test prediction by the number of
# folds while accumulating, so `predictions` is an average at this point.
# Dividing by 10 again here would shrink every probability tenfold, and would
# do so once more on every re-run of this cell. Instead, verify that the
# averaged values are still valid probabilities (small tolerance for
# floating-point accumulation).
assert predictions.min() >= 0.0 and predictions.max() <= 1.0 + 1e-9, \
    "predictions are not in [0, 1]; check the fold averaging"

In [35]:
# Display the first rows of the submission frame to check its columns and labels.
df.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
