# ライブラリのインポート

In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
import xgboost as xgb
import lightgbm as lgb
import shap


# 特徴量エンジニアリング
1. 使わない特徴量を削除
2. カテゴリカルデータをラベルエンコーディング 
3. 欠損値を埋める。(LightGBM)
4. データの特性から特徴量を追加

In [39]:
df = pd.read_csv("train.csv")

In [40]:
cabin_split = df['Cabin'].str.split('/', expand=True)
# Deck 列を作成
df['Deck'] = cabin_split[0]    
# Side 列を作成し、条件に基づいて数値を割り当て
df['Side'] = cabin_split[2].map({'P': True, 'S': False})
df['Side'] = df['Side'].astype(bool)

# PassengerId から GroupID と Person Number を抽出
df['GroupID'] = df['PassengerId'].apply(lambda x: x.split('_')[0])
# Group Size を計算
group_sizes = df.groupby('GroupID').size()
df['GroupSize'] = df['GroupID'].apply(lambda x: group_sizes[x])
# Is Alone 特徴量を追加
df['IsAlone'] = (df['GroupSize'] == 1).astype(int)

In [41]:
drop_cols = ['PassengerId', 'Cabin', 'Name','GroupID']
cat_cols = ['HomePlanet', 'CryoSleep', 'Destination','VIP','Deck','Side', 'IsAlone' ,'GroupSize']

In [42]:
# import seaborn as sns
# import matplotlib.pyplot as plt

# # カラム数に基づいて行と列の数を設定

# cols = 3  # 3列で表示
# rows = (len(cat_cols) + cols - 1) // cols  # 必要な行数を計算

# # サブプロットの設定
# fig, axes = plt.subplots(rows, cols, figsize=(12, 5 * rows))  # 各サブプロットのサイズを調整
# axes = axes.flatten()  # 1Dに変換してインデックスでアクセスしやすくする

# # 各カラムについてプロット
# for i, name in enumerate(cat_cols):
#     sns.countplot(x=name, hue='Transported', data=df, ax=axes[i])
#     axes[i].set_title(f'{name} vs. Transported')
#     axes[i].tick_params(axis='x', rotation=45)  # x軸のラベルを回転して表示

# # 余分なサブプロットを非表示
# for j in range(i + 1, len(axes)):
#     axes[j].axis('off')

# plt.tight_layout()
# plt.show()


In [43]:
df_drop = df.drop(drop_cols, axis=1)

In [44]:
# RoomService の欠損値を条件に基づいて埋める
df_drop['RoomService'] = df_drop.apply(
    lambda row: 0 if pd.isna(row['RoomService']) and row['CryoSleep'] else row['RoomService'],
    axis=1
)


In [45]:
for c in cat_cols:
    le = LabelEncoder()
    le.fit(df_drop[c])
    df_drop[c] = le.transform(df_drop[c])

for c in cat_cols:
    df_drop[c] = df_drop[c].astype('category')

In [46]:
columns_with_nulls = [col for col in df_drop.columns if df_drop[col].isna().any()]
data = df_drop.drop(['Transported'], axis=1)
for c in columns_with_nulls:
    print('-'*100)
    print(c)

# 欠損値がある行とない行を分ける
    train_data = data[data[c].notna()]
    test_data = data[data[c].isna()]

# 訓練データと検証データ
    X_train, X_val, y_train, y_val = train_test_split(
        train_data.drop(c, axis=1), 
        train_data[c],
        test_size=0.2,
        random_state=42
    )
    objective = 'multiclass' if c in ['HomePlanet', 'CryoSleep', 'Destination','VIP','Deck','Side', 'IsAlone' ,'GroupSize'] else 'regression'

# LightGBMのパラメータ
    params = {
        'objective': objective,  # 目的変数が連続値の場合は'regression', カテゴリの場合は'multiclass'
        'learning_rate': 0.1,
        'lambda_l1' : 0.1,
        'num_leaves': 20,
        'random_state': 4
    }

# 訓練データセットを作成
    dtrain = lgb.Dataset(X_train, label=y_train)
    dval = lgb.Dataset(X_val, label=y_val, reference=dtrain)

# モデルの訓練
    model = lgb.train(params,
                      dtrain,
                      num_boost_round=10000,
                      valid_sets=[dtrain, dval],
                      valid_names=['train', 'valid'],
                      callbacks=[lgb.early_stopping(100),
                      lgb.log_evaluation(500)])

# テストデータで欠損値を予測
    predicted_values = model.predict(test_data.drop(c, axis=1))

# 欠損値の予済結果を元のデータセットに埋める
    data.loc[data[c].isna(), c] = predicted_values

----------------------------------------------------------------------------------------------------
Age
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001834 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1317
[LightGBM] [Info] Number of data points in the train set: 6811, number of used features: 13
[LightGBM] [Info] Start training from score 28.770518
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[30]	train's l2: 150.751	valid's l2: 166.763
----------------------------------------------------------------------------------------------------
RoomService
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000984 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGB

In [47]:
data['total_usage'] = data[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa']].sum(axis=1)
data['is_youth'] = np.where(data['Age'] >= 40, 0, 1)

# 訓練データの分割

In [48]:

#X = df_drop.drop(['Transported'], axis=1)
X = data
y = df_drop['Transported']

In [49]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=y, random_state=0 )
"""
最初にデータを訓練データとテストデータに分割
次に訓練データをさらに訓練データと検証データに分割
X_train, y_train : 訓練データ
X_val, y_val : 検証データ
X_test, y_test : テストデータ
"""

'\n最初にデータを訓練データとテストデータに分割\n次に訓練データをさらに訓練データと検証データに分割\nX_train, y_train : 訓練データ\nX_val, y_val : 検証データ\nX_test, y_test : テストデータ\n'

# モデルの構築(クロスバリデーション)

In [50]:


params = {
    'objective': 'binary',
    'num_leaves': 10,
    'seed': 0,
    'verbose': -1,
}

In [51]:

def crossVal(X_train, y_train, params):
    # 格納用データの作成
    valid_scores = []
    models = []
    oof = np.zeros(len(X_train))

    # KFoldを用いて学習データを5分割してモデルを作成
    kf = KFold(n_splits=10, shuffle=True, random_state=0)
    for fold, (tr_idx, va_idx) in enumerate(kf.split(X_train)):
        X_tr = X_train.iloc[tr_idx]
        X_va = X_train.iloc[va_idx]
        y_tr = y_train.iloc[tr_idx]
        y_va = y_train.iloc[va_idx]
    
        lgb_train = lgb.Dataset(X_tr, y_tr)
        lgb_eval = lgb.Dataset(X_va, y_va, reference=lgb_train)

        # 最適化ハイパーパラメータを読み込み
        model = lgb.train(params,
                      lgb_train,
                      num_boost_round=10000,
                      valid_sets=[lgb_train, lgb_eval],
                      valid_names=['train', 'valid'],
                      callbacks=[lgb.early_stopping(100),
                      lgb.log_evaluation(500)])

        y_va_pred = model.predict(X_va, num_iteration=model.best_iteration)
        score = mean_absolute_error(y_va, y_va_pred)
        print(f'fold {fold+1} MAE valid: {score:.2f}')
        print('')

        # スコア、モデル、予測値の格納
        valid_scores.append(score)
        models.append(model)
        oof[va_idx] = y_va_pred
    return valid_scores, models,oof

# クロスバリデーションの平均スコア
valid_scores, models_lg,oof = crossVal(X_train, y_train, params)

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[343]	train's binary_logloss: 0.278369	valid's binary_logloss: 0.377027
fold 1 MAE valid: 0.24

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[167]	train's binary_logloss: 0.328394	valid's binary_logloss: 0.368127
fold 2 MAE valid: 0.25

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[235]	train's binary_logloss: 0.30402	valid's binary_logloss: 0.412744
fold 3 MAE valid: 0.26

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[101]	train's binary_logloss: 0.351977	valid's binary_logloss: 0.415968
fold 4 MAE valid: 0.27

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[175]	train's binary_logloss: 0.327097	valid's binary_logloss: 0.371737
fold 5 MAE valid: 0.25

Training until validation scores don

In [52]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6954 entries, 6617 to 3235
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   HomePlanet    6954 non-null   category
 1   CryoSleep     6954 non-null   category
 2   Destination   6954 non-null   category
 3   Age           6954 non-null   float64 
 4   VIP           6954 non-null   category
 5   RoomService   6954 non-null   float64 
 6   FoodCourt     6954 non-null   float64 
 7   ShoppingMall  6954 non-null   float64 
 8   Spa           6954 non-null   float64 
 9   VRDeck        6954 non-null   float64 
 10  Deck          6954 non-null   category
 11  Side          6954 non-null   category
 12  GroupSize     6954 non-null   category
 13  IsAlone       6954 non-null   category
 14  total_usage   6954 non-null   float64 
 15  is_youth      6954 non-null   int64   
dtypes: category(8), float64(7), int64(1)
memory usage: 544.9 KB


In [53]:
xgb_params ={'n_estimators':5000,
             'max_depth': 10,
             'learning_rate': 0.01,
             'random_state':0,
             }


model = xgb.XGBRegressor(**xgb_params, enable_categorical=True)

model.fit(X_train, y_train,
          eval_set=[(X_train, y_train), (X_test, y_test)],
          early_stopping_rounds=50,
          verbose=100)

[0]	validation_0-rmse:0.49719	validation_1-rmse:0.49774




[100]	validation_0-rmse:0.33559	validation_1-rmse:0.39978
[200]	validation_0-rmse:0.27931	validation_1-rmse:0.38693
[268]	validation_0-rmse:0.25887	validation_1-rmse:0.38728


In [54]:


def crossVal(X_train, y_train, xgb_params):
    valid_scores = []
    models = []
    oof = np.zeros(len(X_train))

    kf = KFold(n_splits=5, shuffle=True, random_state=0)
    for fold, (tr_idx, va_idx) in enumerate(kf.split(X_train)):
        X_tr, X_va = X_train.iloc[tr_idx], X_train.iloc[va_idx]
        y_tr, y_va = y_train.iloc[tr_idx], y_train.iloc[va_idx]

        model = xgb.XGBRegressor(**xgb_params, enable_categorical=True)
        model.fit(X_tr, y_tr, eval_set=[(X_tr, y_tr)], early_stopping_rounds=50, verbose=100)
        y_va_pred = model.predict(X_va)  # 予測値の計算

        score = mean_absolute_error(y_va, y_va_pred)
        print(f'fold {fold+1} MAE valid: {score:.2f}')

        valid_scores.append(score)
        models.append(model)
        oof[va_idx] = y_va_pred

    return valid_scores, models, oof

# クロスバリデーションの平均スコア
valid_scores, models_xg, oof = crossVal(X_train, y_train, xgb_params)
print(f'Average MAE across all folds: {np.mean(valid_scores):.2f}')


[0]	validation_0-rmse:0.49710




[100]	validation_0-rmse:0.33198
[200]	validation_0-rmse:0.27342
[300]	validation_0-rmse:0.24190
[400]	validation_0-rmse:0.21843
[500]	validation_0-rmse:0.20579
[600]	validation_0-rmse:0.19948
[700]	validation_0-rmse:0.19422
[800]	validation_0-rmse:0.18580
[900]	validation_0-rmse:0.17826
[1000]	validation_0-rmse:0.17327
[1100]	validation_0-rmse:0.16916
[1200]	validation_0-rmse:0.16633
[1300]	validation_0-rmse:0.16310
[1400]	validation_0-rmse:0.16058
[1500]	validation_0-rmse:0.15752
[1600]	validation_0-rmse:0.15436
[1700]	validation_0-rmse:0.15186
[1800]	validation_0-rmse:0.15023
[1900]	validation_0-rmse:0.14887
[2000]	validation_0-rmse:0.14700
[2100]	validation_0-rmse:0.14592
[2200]	validation_0-rmse:0.14380
[2300]	validation_0-rmse:0.14149
[2400]	validation_0-rmse:0.13980
[2500]	validation_0-rmse:0.13800
[2600]	validation_0-rmse:0.13688
[2700]	validation_0-rmse:0.13391
[2800]	validation_0-rmse:0.13189
[2900]	validation_0-rmse:0.13040
[3000]	validation_0-rmse:0.12865
[3100]	validation_0



[100]	validation_0-rmse:0.32765
[200]	validation_0-rmse:0.26813
[300]	validation_0-rmse:0.23671
[400]	validation_0-rmse:0.21397
[500]	validation_0-rmse:0.20229
[600]	validation_0-rmse:0.19382
[700]	validation_0-rmse:0.18536
[800]	validation_0-rmse:0.17902
[900]	validation_0-rmse:0.17189
[1000]	validation_0-rmse:0.16647
[1100]	validation_0-rmse:0.16250
[1200]	validation_0-rmse:0.15935
[1300]	validation_0-rmse:0.15655
[1400]	validation_0-rmse:0.15443
[1500]	validation_0-rmse:0.15166
[1600]	validation_0-rmse:0.14856
[1700]	validation_0-rmse:0.14586
[1800]	validation_0-rmse:0.14340
[1900]	validation_0-rmse:0.14164
[2000]	validation_0-rmse:0.13968
[2100]	validation_0-rmse:0.13751
[2200]	validation_0-rmse:0.13501
[2300]	validation_0-rmse:0.13369
[2400]	validation_0-rmse:0.13251
[2500]	validation_0-rmse:0.13059
[2600]	validation_0-rmse:0.12951
[2700]	validation_0-rmse:0.12831
[2800]	validation_0-rmse:0.12690
[2900]	validation_0-rmse:0.12603
[3000]	validation_0-rmse:0.12554
[3100]	validation_0



[100]	validation_0-rmse:0.32984
[200]	validation_0-rmse:0.27034
[300]	validation_0-rmse:0.24216
[400]	validation_0-rmse:0.22400
[500]	validation_0-rmse:0.20891
[600]	validation_0-rmse:0.19643
[700]	validation_0-rmse:0.19044
[800]	validation_0-rmse:0.18267
[900]	validation_0-rmse:0.17517
[1000]	validation_0-rmse:0.17009
[1100]	validation_0-rmse:0.16526
[1200]	validation_0-rmse:0.16011
[1300]	validation_0-rmse:0.15502
[1400]	validation_0-rmse:0.15213
[1500]	validation_0-rmse:0.14919
[1600]	validation_0-rmse:0.14541
[1700]	validation_0-rmse:0.14360
[1800]	validation_0-rmse:0.14123
[1900]	validation_0-rmse:0.13919
[2000]	validation_0-rmse:0.13825
[2100]	validation_0-rmse:0.13567
[2200]	validation_0-rmse:0.13163
[2300]	validation_0-rmse:0.12911
[2400]	validation_0-rmse:0.12817
[2500]	validation_0-rmse:0.12608
[2600]	validation_0-rmse:0.12480
[2700]	validation_0-rmse:0.12358
[2800]	validation_0-rmse:0.12225
[2900]	validation_0-rmse:0.12144
[3000]	validation_0-rmse:0.12071
[3100]	validation_0



[100]	validation_0-rmse:0.32886
[200]	validation_0-rmse:0.27007
[300]	validation_0-rmse:0.23909
[400]	validation_0-rmse:0.21754
[500]	validation_0-rmse:0.20174
[600]	validation_0-rmse:0.19348
[700]	validation_0-rmse:0.18641
[800]	validation_0-rmse:0.18088
[900]	validation_0-rmse:0.17562
[1000]	validation_0-rmse:0.16920
[1100]	validation_0-rmse:0.16567
[1200]	validation_0-rmse:0.16193
[1300]	validation_0-rmse:0.15995
[1400]	validation_0-rmse:0.15688
[1500]	validation_0-rmse:0.15425
[1600]	validation_0-rmse:0.15202
[1700]	validation_0-rmse:0.14954
[1800]	validation_0-rmse:0.14515
[1900]	validation_0-rmse:0.14329
[2000]	validation_0-rmse:0.14151
[2100]	validation_0-rmse:0.13985
[2200]	validation_0-rmse:0.13770
[2300]	validation_0-rmse:0.13600
[2400]	validation_0-rmse:0.13396
[2500]	validation_0-rmse:0.13295
[2600]	validation_0-rmse:0.13139
[2700]	validation_0-rmse:0.12979
[2800]	validation_0-rmse:0.12874
[2900]	validation_0-rmse:0.12783
[3000]	validation_0-rmse:0.12725
[3100]	validation_0



[100]	validation_0-rmse:0.32972
[200]	validation_0-rmse:0.27122
[300]	validation_0-rmse:0.23987
[400]	validation_0-rmse:0.21851
[500]	validation_0-rmse:0.20308
[600]	validation_0-rmse:0.19382
[700]	validation_0-rmse:0.18766
[800]	validation_0-rmse:0.17917
[900]	validation_0-rmse:0.17259
[1000]	validation_0-rmse:0.16739
[1100]	validation_0-rmse:0.16349
[1200]	validation_0-rmse:0.16020
[1300]	validation_0-rmse:0.15794
[1400]	validation_0-rmse:0.15329
[1500]	validation_0-rmse:0.15102
[1600]	validation_0-rmse:0.14878
[1700]	validation_0-rmse:0.14574
[1800]	validation_0-rmse:0.14389
[1900]	validation_0-rmse:0.14226
[2000]	validation_0-rmse:0.14097
[2100]	validation_0-rmse:0.13808
[2200]	validation_0-rmse:0.13591
[2300]	validation_0-rmse:0.13449
[2400]	validation_0-rmse:0.13355
[2500]	validation_0-rmse:0.13312
[2600]	validation_0-rmse:0.13244
[2700]	validation_0-rmse:0.13029
[2800]	validation_0-rmse:0.12857
[2900]	validation_0-rmse:0.12768
[3000]	validation_0-rmse:0.12675
[3100]	validation_0

# テストデータで検証

In [55]:
def predict(submi_data,models):
    output = np.zeros_like(models[0].predict(submi_data))
    for i in range(len(models)):
        output += models[i].predict(submi_data)
       
        
    output = output / len(models)
    return np.round(output).astype(bool)

In [56]:
model_ensemble = models_xg + models_lg

In [59]:
ac_score = accuracy_score(predict(X_test,model_ensemble), y_test)
pr_score = precision_score(predict(X_test,model_ensemble), y_test)
rc_score = recall_score(predict(X_test,model_ensemble), y_test)
f1 = f1_score(predict(X_test,model_ensemble), y_test)

print('accuracy = %.10f' % (ac_score))
print('precision = %.10f' % (pr_score))
print('recall = %.10f' % (rc_score))
print('F1-score = %.10f' % (f1))

accuracy = 0.7975848189
precision = 0.8059360731
recall = 0.7950450450
F1-score = 0.8004535147
