<a href="https://colab.research.google.com/github/8131maggie/kaggle/blob/master/titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 下準備
keggleのタイタニックcompetition/data のページ(
https://www.kaggle.com/c/titanic/data) から   
・test.csv   
・train.csv   
上記2ファイルをダウンロードし、Google-colab上にアップロードする(ローカルからドラッグアンドドロップでOK)

後は下のコードを実行すればkaggleにsubmitできる   
・submission.csv   
というファイルが作成されるのでkaggleに提出する


In [0]:
# Use LGBM
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

test_passenger_ids = test_df.pop('PassengerId')
train_df.drop(['PassengerId'], axis=1, inplace=True)

df_list = [train_df, test_df]

for df in df_list:    
    
    # Transform 'Sex'
    df.loc[df['Sex'] == 'female','Sex'] = 0
    df.loc[df['Sex'] == 'male','Sex'] = 1
    df['Sex'] = df['Sex'].astype('int8')

    df.drop(['Name', 'Ticket', 'Cabin', 'Embarked'], axis=1, inplace=True)

y = train_df.pop('Survived')

# Take a hold out set randomly
X_train, X_test, y_train, y_test = train_test_split(train_df, y, test_size=0.2, random_state=42)

# Create an LGBM dataset for training
categorical_features = ['Age', 'Sex', 'Pclass', 'SibSp', 'Parch', 'Fare']
train_data = lgb.Dataset(data=X_train, label=y_train, categorical_feature=categorical_features, free_raw_data=False)

# Create an LGBM dataset from the test
test_data = lgb.Dataset(data=X_test, label=y_test, categorical_feature=categorical_features, free_raw_data=False)

# Finally, create a dataset for the FULL training data to give us maximum amount of data to train on after 
# performance has been calibrate
final_train_set = lgb.Dataset(data=train_df, label=y, 
                               categorical_feature=categorical_features, free_raw_data=False)

lgb_params = {
    'boosting': 'dart',          # dart (drop out trees) often performs better
    'application': 'binary',     # Binary classification
    'learning_rate': 0.05,       # Learning rate, controls size of a gradient descent step
    'min_data_in_leaf': 20,      # Data set is quite small so reduce this a bit
    'feature_fraction': 0.7,     # Proportion of features in each boost, controls overfitting
    'num_leaves': 41,            # Controls size of tree since LGBM uses leaf wise splits
    'metric': 'binary_logloss',  # Area under ROC curve as the evaulation metric
    'drop_rate': 0.2,            # Dropout probability
              }


evaluation_results = {}
clf = lgb.train(train_set=train_data,
                 params=lgb_params,
                 valid_sets=[train_data, test_data], 
                 valid_names=['Train', 'Test'],
                 evals_result=evaluation_results,
                 num_boost_round=500,
                 early_stopping_rounds=100,
                 verbose_eval=50
                )

optimum_boost_rounds = clf.best_iteration

preds = np.round(clf.predict(X_test))
print('Accuracy score = \t {}'.format(accuracy_score(y_test, preds)))
print('Precision score = \t {}'.format(precision_score(y_test, preds)))
print('Recall score =   \t {}'.format(recall_score(y_test, preds)))
print('F1 score =      \t {}'.format(f1_score(y_test, preds)))

clf_final = lgb.train(train_set=final_train_set,
                      params=lgb_params,
                      num_boost_round=500,                    
                      )

y_pred = np.round(clf_final.predict(test_df)).astype(int)

output_df = pd.DataFrame({'PassengerId': test_passenger_ids, 'Survived': y_pred})
output_df.to_csv("submission.csv", index=False)



[50]	Train's binary_logloss: 0.502753	Test's binary_logloss: 0.508898
[100]	Train's binary_logloss: 0.478109	Test's binary_logloss: 0.489976
[150]	Train's binary_logloss: 0.459353	Test's binary_logloss: 0.474709
[200]	Train's binary_logloss: 0.441243	Test's binary_logloss: 0.465009
[250]	Train's binary_logloss: 0.443344	Test's binary_logloss: 0.470668
[300]	Train's binary_logloss: 0.42155	Test's binary_logloss: 0.463775
[350]	Train's binary_logloss: 0.410914	Test's binary_logloss: 0.464199
[400]	Train's binary_logloss: 0.403528	Test's binary_logloss: 0.462805
[450]	Train's binary_logloss: 0.39807	Test's binary_logloss: 0.464639
[500]	Train's binary_logloss: 0.388235	Test's binary_logloss: 0.469024
Accuracy score = 	 0.8156424581005587
Precision score = 	 0.8253968253968254
Recall score =   	 0.7027027027027027
F1 score =      	 0.7591240875912408
