In [1]:
import numpy as np # 今回は明示的には使わない
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import lightgbm as lgb # LightGBM でモデルを作成する
from sklearn.metrics import accuracy_score, roc_auc_score #精度評価に使用
import pickle # モデルの保存・読み込みに使用

In [2]:
# 自作のモジュールを読み込み
import sys
sys.path.append("../input/modeling-and-evaluation/")
import optuna_hypara as my_optuna

In [3]:
# 各種定数の定義
# 乱数シード
RANDOM_SEED = 42
# target となる特徴量
TARGET_COULMN_NAME = "Exited"
# 削除する特徴量のリスト
DROP_COLUMNS = ["id", "CustomerId", "Surname"]
# one-hot encoding する特徴量のリスト
ONE_HOT_ENCODING_COLUMNS = ["Geography", "Gender"]
# ベストモデルのパス
BEST_MODEL_PATH = "../working/best_lgbm_model.pkl"

In [4]:
# 学習データの読み込み
train_df =  pd.read_csv("../input/tdse-inclass-competition-0/train.csv")

## 前処理を行う

### target encodingを行う

In [5]:
from sklearn import base
from sklearn.model_selection import KFold

class KFoldTargetEncoderTrain(base.BaseEstimator,
                               base.TransformerMixin):
    """How to use.
    targetc = KFoldTargetEncoderTrain('Feature','Target',n_fold=5)
    new_train = targetc.fit_transform(train)
    """
    def __init__(
        self,
        colnames,
        targetName,
        n_fold=5,
        verbosity=True,
        discardOriginal_col=False
    ):
        self.colnames = colnames
        self.targetName = targetName
        self.n_fold = n_fold
        self.verbosity = verbosity
        self.discardOriginal_col = discardOriginal_col

    def fit(self, X, y=None):
        return self

    def transform(self,X):        
        assert(type(self.targetName) == str)
        assert(type(self.colnames) == str)
        assert(self.colnames in X.columns)
        assert(self.targetName in X.columns)       

        mean_of_target = X[self.targetName].mean()
        kf = KFold(
            n_splits = self.n_fold,
            # shuffle = False,
            # random_state=2019, # random_state is not working for shuffle=False
            )        
        col_mean_name = self.colnames + '_' + 'Kfold_Target_Enc'
        X[col_mean_name] = np.nan       

        for tr_ind, val_ind in kf.split(X):
            X_tr, X_val = X.iloc[tr_ind], X.iloc[val_ind]
            X.loc[X.index[val_ind], col_mean_name] = X_val[self.colnames].map(X_tr.groupby(self.colnames)[self.targetName].mean())
            # X[col_mean_name].fillna(mean_of_target, inplace = True)  # nanになってしまったところは平均値で埋める --(1)

        if self.verbosity:            
            encoded_feature = X[col_mean_name].values
            print('Correlation between the new feature, {} and, {} is {}.'.format(col_mean_name,self.targetName, 
                                                                                  np.corrcoef(X[self.targetName].values,encoded_feature)[0][1]))
        if self.discardOriginal_col:
            X = X.drop(self.targetName, axis=1)
        return X


class TargetEncoderTest(base.BaseEstimator, base.TransformerMixin):
    """How to use.
    test_targetc = TargetEncoderTest(new_train,
                                      'Feature',
                                      'Feature_Kfold_Target_Enc')
    new_test = test_targetc.fit_transform(test)
    """

    def __init__(self,train,colNames,encodedName):

        self.train = train
        self.colNames = colNames
        self.encodedName = encodedName

    def fit(self, X, y=None):
        return self

    def transform(self,X):       
        mean =  self.train[[self.colNames, self.encodedName]].groupby(self.colNames).mean().reset_index() 

        dd = {}
        for index, row in mean.iterrows():
            dd[row[self.colNames]] = row[self.encodedName]
            X[self.encodedName] = X[self.colNames]
        X = X.replace({self.encodedName: dd})
        return X

In [6]:
# 後でモジュールにしたいので、必要なモジュールとともに関数化しておく
# 明らかに出力がおかしいので使わんほうがよさそう...?
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold

def holdout_ts_wo_smoothing(
    train_df: pd.DataFrame,
    categorical_col: str, # list[str]にもできるらしいがちょっと面倒なので今回は1つのみ
    target_col: str,
    n_splits: int = 5,
    random_seed: int = 42,
) -> pd.DataFrame:
    """
    holdout法を用いたtarget encoding
    cf. https://blog.amedama.jp/entry/target-mean-encoding-types
    
    Args:
        train_df (pd.DataFrame): 学習データ
        random_seed (int): 乱数シード

    Returns:
        pd.DataFrame: target encoding されたデータ
    """
    # 生成した特徴量をいれるpd.Seriesを用意
    generated_feature = pd.Series(
        np.empty(train_df.shape[0]),
        name=f"{categorical_col}_target_encoded",
        index=train_df.index,
    )

    # sumとcountを計算
    agg_df = train_df.groupby(categorical_col).agg({target_col: ["sum", "count"]})

    # holdsオブジェクトを用意する
    folds = StratifiedKFold(
        n_splits=n_splits,
        shuffle=True, random_state=random_seed,
    )

    # K-Foldでtarget encodingを行う
    for _, holdout_idx in folds.split(train_df, train_df[target_col]):
        # holdout法で分割
        holdout_df = train_df.iloc[holdout_idx]

        # holdoutしたデータのsumとcountを取得
        holdout_agg_df = holdout_df.groupby(categorical_col).agg({target_col: ["sum", "count"]})

        # 全体のsumとcountからholdoutしたデータのsumとcountを引く
        train_agg_df = agg_df - holdout_agg_df

        # holdoutしたデータの平均を計算
        # 分母に1を足しているのは、分母が0になるのを防ぐため
        holdout_mean = holdout_df.apply(
            lambda row:
            train_agg_df.loc[row[categorical_col]][(target_col, "sum")] / (1 + train_agg_df.loc[row[categorical_col]][(target_col, "count")]),
            axis=1,
        )

        # 計算値を格納してreturn用のpd.Seriesに追加
        generated_feature[holdout_idx] = holdout_mean

        return generated_feature


In [7]:
targetc = KFoldTargetEncoderTrain(
    "CustomerId",
    "Exited",
    n_fold=5,
)
new_train_df = targetc.fit_transform(train_df)
new_train_df["CustomerId_Kfold_Target_Enc"].isnull().sum()

Correlation between the new feature, CustomerId_Kfold_Target_Enc and, Exited is nan.


10758

あとでsqliteで書き直す

In [8]:
new_train_df.to_csv("../working/new_train_df.csv")

### 不要なカラムを削除し、one-hot encodingを行う

In [9]:
# 特徴量。Surname, id, CustomerIDは削除
X_train_df = new_train_df.drop(
    [TARGET_COULMN_NAME] + DROP_COLUMNS,
    axis=1,
) 
# one-hot encoding
X_train_df = pd.get_dummies(
    X_train_df,
    columns=ONE_HOT_ENCODING_COLUMNS,
)
# 目的変数
y_train_df = new_train_df[TARGET_COULMN_NAME]

In [10]:
# テストデータの読み込み
test_df =  pd.read_csv("../input/tdse-inclass-competition-0/test.csv")
# customer_encodeの読み込み
customer_encode_df = pd.read_csv("../input/modeling-and-evaluation/customer_encode.csv")
# CustomerIdを使ってtrainの"CustomerId_Kfold_Target_Enc"を持ってくる
new_test_df = pd.merge(test_df, customer_encode_df, on="CustomerId", how="left")

In [11]:
len(new_test_df)

110023

In [12]:
len(test_df)

110023

In [13]:
len(new_test_df) == len(test_df)

True

In [14]:
# 学習データに合わせて特徴量を加工
X_test_df = new_test_df.drop(
    DROP_COLUMNS,
    axis=1,
)
# one-hot encoding
X_test_df = pd.get_dummies(
    X_test_df,
    columns=ONE_HOT_ENCODING_COLUMNS,
)

## optunaを用いて最適なモデルを探索する
ベストモデルは `model_path` にpklで吐き出される。

In [15]:
my_optuna.optimize_lgbm_hyperparameters(
    X_train_df,
    y_train_df,
    n_trials=100,
    seed=RANDOM_SEED,
    model_path=BEST_MODEL_PATH,
)

[I 2024-06-11 11:52:08,871] A new study created in memory with name: no-name-5fda85cb-fbe9-42ea-a798-f21602f45e63
[I 2024-06-11 11:52:12,403] Trial 0 finished with value: 0.8906431510147873 and parameters: {'num_leaves': 87, 'feature_fraction': 0.7082781558933926, 'bagging_fraction': 0.7780871916436569, 'bagging_freq': 7, 'min_child_samples': 66}. Best is trial 0 with value: 0.8906431510147873.
[I 2024-06-11 11:52:14,590] Trial 1 finished with value: 0.8907559321438968 and parameters: {'num_leaves': 30, 'feature_fraction': 0.8907295383717303, 'bagging_fraction': 0.555431043776644, 'bagging_freq': 5, 'min_child_samples': 25}. Best is trial 1 with value: 0.8907559321438968.
[I 2024-06-11 11:52:18,868] Trial 2 finished with value: 0.8896535856010148 and parameters: {'num_leaves': 109, 'feature_fraction': 0.7059685869483721, 'bagging_fraction': 0.6430069401829588, 'bagging_freq': 3, 'min_child_samples': 28}. Best is trial 1 with value: 0.8907559321438968.
[I 2024-06-11 11:52:23,771] Trial 

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013384 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1081
[LightGBM] [Info] Number of data points in the train set: 132027, number of used features: 14
[LightGBM] [Info] Start training from score 0.211820
Best trial:
  Value: 0.891648740896794
  Params: 
    num_leaves: 31
    feature_fraction: 0.651779042436904
    bagging_fraction: 0.905720797273426
    bagging_freq: 6
    min_child_samples: 39


<optuna.study.study.Study at 0x7a71a256a470>

## モデルを読み込んで予測する

In [16]:
# モデルの読み込み
with open(BEST_MODEL_PATH, 'rb') as f:
    model = pickle.load(f)

In [17]:
# テストデータに対する予測
y_pred_np = model.predict(X_test_df)

In [18]:
y_pred_np

array([0.01446969, 0.8471564 , 0.02994665, ..., 0.00918885, 0.16377055,
       0.20911039])

In [19]:
# 提出用ファイルの作成
submission_df = pd.DataFrame({
    "id": test_df["id"],
    "Exited": y_pred_np,
})

In [20]:
submission_df.head()

Unnamed: 0,id,Exited
0,165034,0.01447
1,165035,0.847156
2,165036,0.029947
3,165037,0.24144
4,165038,0.359321


In [21]:
submission_df.to_csv("submission.csv", index=False)