# ライブラリのインポート

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
import xgboost as xgb
import lightgbm as lgb
import shap
from sklearn.metrics import log_loss


# 特徴量エンジニアリング
1. 使わない特徴量を削除
2. カテゴリカルデータをラベルエンコーディング
3. 欠損値を埋める。(LightGBM)

In [2]:
df = pd.read_csv("train.csv")

In [3]:
drop_cols = ['PassengerId', 'Cabin', 'Name']
cat_cols = ['HomePlanet', 'CryoSleep', 'Destination','VIP']

In [4]:
df_drop = df.drop(drop_cols, axis=1)

In [5]:
# RoomService の欠損値を条件に基づいて埋める
df_drop['RoomService'] = df_drop.apply(
    lambda row: 0 if pd.isna(row['RoomService']) and row['CryoSleep'] else row['RoomService'],
    axis=1
)


In [6]:
for c in cat_cols:
    le = LabelEncoder()
    le.fit(df_drop[c])
    df_drop[c] = le.transform(df_drop[c])

for c in cat_cols:
    df_drop[c] = df_drop[c].astype('category')

In [7]:
columns_with_nulls = [col for col in df_drop.columns if df_drop[col].isna().any()]
data = df_drop.drop(['Transported'], axis=1)
for c in columns_with_nulls:
    print('-'*100)
    print(c)

# 欠損値がある行とない行を分ける
    train_data = data[data[c].notna()]
    test_data = data[data[c].isna()]

# 訓練データと検証データ
    X_train, X_val, y_train, y_val = train_test_split(
        train_data.drop(c, axis=1), 
        train_data[c],
        test_size=0.2,
        random_state=42
    )
    objective = 'multiclass' if c in ['HomePlanet', 'CryoSleep', 'Destination','VIP','Deck','Side', 'IsAlone' ,'GroupSize'] else 'regression'

# LightGBMのパラメータ
    params = {
        'objective': objective,  # 目的変数が連続値の場合は'regression', カテゴリの場合は'multiclass'
        'learning_rate': 0.1,
        'lambda_l1' : 0.1,
        'num_leaves': 20,
        'random_state': 4
    }

# 訓練データセットを作成
    dtrain = lgb.Dataset(X_train, label=y_train)
    dval = lgb.Dataset(X_val, label=y_val, reference=dtrain)

# モデルの訓練
    model = lgb.train(params,
                      dtrain,
                      num_boost_round=10000,
                      valid_sets=[dtrain, dval],
                      valid_names=['train', 'valid'],
                      callbacks=[lgb.early_stopping(100),
                      lgb.log_evaluation(500)])

# テストデータで欠損値を予測
    predicted_values = model.predict(test_data.drop(c, axis=1))

# 欠損値の予済結果を元のデータセットに埋める
    data.loc[data[c].isna(), c] = predicted_values

----------------------------------------------------------------------------------------------------
Age
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003913 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1293
[LightGBM] [Info] Number of data points in the train set: 6811, number of used features: 9
[LightGBM] [Info] Start training from score 28.770518
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[32]	train's l2: 160.77	valid's l2: 174.119
----------------------------------------------------------------------------------------------------
RoomService
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000723 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM]

# 訓練データの分割

In [8]:

#X = df_drop.drop(['Transported'], axis=1)
X = data
y = df_drop['Transported']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=y, random_state=0 )
# 最初にデータを訓練データとテストデータに分割します。例えば、全体の20%をテストデータとします。
# 次に、訓練データをさらに訓練データと検証データに分割します。例えば、訓練データのうちさらに25%を検証データとします。
# ここで、訓練データの75%と検証データの25%の割合になりますが、全データの中ではそれぞれ60%と20%になります。
X_train, X_val, y_train, y_val = train_test_split( X_train, y_train, test_size=0.20, shuffle=True, stratify=y_train, random_state=0 )

"""
最初にデータを訓練データとテストデータに分割
次に訓練データをさらに訓練データと検証データに分割
X_train, y_train : 訓練データ
X_val, y_val : 検証データ
X_test, y_test : テストデータ
"""

'\n最初にデータを訓練データとテストデータに分割\n次に訓練データをさらに訓練データと検証データに分割\nX_train, y_train : 訓練データ\nX_val, y_val : 検証データ\nX_test, y_test : テストデータ\n'

# モデルの構築

In [10]:


lgb_train = lgb.Dataset(X_train, y_train)
lgb_val = lgb.Dataset(X_val, y_val)
lgb_test = lgb.Dataset(X_test, y_test)


# カスタム評価関数の定義
def lgb_accuracy_score(y_true, y_pred):
    y_pred = np.round(y_pred)  # 確率を0または1に丸める
    return 'accuracy', accuracy_score(y_true, y_pred), True



params = {
    'objective': 'binary',
    'num_leaves': 10,
    'seed': 0,
    'verbose': -1,
}

In [11]:
model = lgb.train(params,
                      lgb_train,
                      num_boost_round=10000,
                      valid_sets=[lgb_train, lgb_val],
                      valid_names=['train', 'valid'],
                      callbacks=[lgb.early_stopping(100),
                      lgb.log_evaluation(50)])

Training until validation scores don't improve for 100 rounds
[50]	train's binary_logloss: 0.401303	valid's binary_logloss: 0.43261
[100]	train's binary_logloss: 0.376098	valid's binary_logloss: 0.43037
[150]	train's binary_logloss: 0.356845	valid's binary_logloss: 0.434739
Early stopping, best iteration is:
[83]	train's binary_logloss: 0.383278	valid's binary_logloss: 0.42965


# テストデータで検証

In [12]:
def predict(submi_data,models):
    output = np.zeros_like(models[0].predict(submi_data))
    for i in range(len(models)):
        output += models[i].predict(submi_data)
       
        
    output = output / len(models)
    return np.round(output).astype(bool)

In [13]:
ac_score = accuracy_score(predict(X_test,[model]), y_test)
pr_score = precision_score(predict(X_test,[model]), y_test)
rc_score = recall_score(predict(X_test,[model]),y_test)
f1 = f1_score(predict(X_test,[model]), y_test)

print('accuracy = %.10f' % (ac_score))
print('precision = %.10f' % (pr_score))
print('recall = %.10f' % (rc_score))
print('F1-score = %.10f' % (f1))

accuracy = 0.7906843013
precision = 0.8436073059
recall = 0.7650103520
F1-score = 0.8023887079
