# ライブラリのインポート

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
import xgboost as xgb
import lightgbm as lgb
import shap
from sklearn.metrics import log_loss

# 最低限の特徴量エンジニアリング
1. 使わない特徴量を削除
2. カテゴリカルデータをラベルエンコーディング

In [2]:
df = pd.read_csv("train.csv")

In [3]:
drop_cols = ['PassengerId', 'Cabin', 'Name']
cat_cols = ['HomePlanet', 'CryoSleep', 'Destination','VIP']

In [4]:
df_drop = df.drop(drop_cols, axis=1)

In [5]:
for c in cat_cols:
    le = LabelEncoder()
    le.fit(df_drop[c])
    df_drop[c] = le.transform(df_drop[c])

for c in cat_cols:
    df_drop[c] = df_drop[c].astype('category')

# 訓練データの分割

In [6]:

X = df_drop.drop(['Transported'], axis=1)
y = df_drop['Transported']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=y, random_state=0 )
# 最初にデータを訓練データとテストデータに分割します。例えば、全体の20%をテストデータとします。
# 次に、訓練データをさらに訓練データと検証データに分割します。例えば、訓練データのうちさらに25%を検証データとします。
# ここで、訓練データの75%と検証データの25%の割合になりますが、全データの中ではそれぞれ60%と20%になります。
X_train, X_val, y_train, y_val = train_test_split( X_train, y_train, test_size=0.20, shuffle=True, stratify=y_train, random_state=0 )

"""
最初にデータを訓練データとテストデータに分割
次に訓練データをさらに訓練データと検証データに分割
X_train, y_train : 訓練データ
X_val, y_val : 検証データ
X_test, y_test : テストデータ
"""

'\n最初にデータを訓練データとテストデータに分割\n次に訓練データをさらに訓練データと検証データに分割\nX_train, y_train : 訓練データ\nX_val, y_val : 検証データ\nX_test, y_test : テストデータ\n'

# モデルの構築

In [8]:

lgb_train = lgb.Dataset(X_train, y_train)
lgb_val = lgb.Dataset(X_val, y_val)
lgb_test = lgb.Dataset(X_test, y_test)


# カスタム評価関数の定義
def lgb_accuracy_score(y_true, y_pred):
    y_pred = np.round(y_pred)  # 確率を0または1に丸める
    return 'accuracy', accuracy_score(y_true, y_pred), True



params = {
    'objective': 'binary',
    'num_leaves': 10,
    'seed': 0,
    'verbose': -1,
}

In [9]:
model = lgb.train(params,
                      lgb_train,
                      num_boost_round=10000,
                      valid_sets=[lgb_train, lgb_val],
                      valid_names=['train', 'valid'],
                      callbacks=[lgb.early_stopping(100),
                      lgb.log_evaluation(50)])

Training until validation scores don't improve for 100 rounds
[50]	train's binary_logloss: 0.399923	valid's binary_logloss: 0.433612
[100]	train's binary_logloss: 0.373982	valid's binary_logloss: 0.431466
[150]	train's binary_logloss: 0.355042	valid's binary_logloss: 0.432554
Early stopping, best iteration is:
[86]	train's binary_logloss: 0.380251	valid's binary_logloss: 0.430244


# テストデータで検証

In [10]:
def predict(submi_data,models):
    output = np.zeros_like(models[0].predict(submi_data))
    for i in range(len(models)):
        output += models[i].predict(submi_data)
       
        
    output = output / len(models)
    return np.round(output).astype(bool)

In [11]:
ac_score = accuracy_score(predict(X_test,[model]), y_test)
pr_score = precision_score(predict(X_test,[model]), y_test)
rc_score = recall_score(predict(X_test,[model]),y_test)
f1 = f1_score(predict(X_test,[model]), y_test)

print('accuracy = %.10f' % (ac_score))
print('precision = %.10f' % (pr_score))
print('recall = %.10f' % (rc_score))
print('F1-score = %.10f' % (f1))

accuracy = 0.7843588269
precision = 0.8321917808
recall = 0.7617554859
F1-score = 0.7954173486
