Count Encodingのベースライン


## Import Packages

In [None]:
import os
import warnings
from pathlib import Path

import japanize_matplotlib
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

warnings.simplefilter("ignore")


In [None]:
# https://github.com/nyk510/vivid/blob/master/vivid/utils.py
from time import time

def decorate(s: str, decoration=None):
    """Return ``s`` framed by a decoration string on both sides.

    The three pieces are joined with single spaces. When ``decoration`` is
    None, a run of twenty '★' characters is used as the frame.
    """
    border = '★' * 20 if decoration is None else decoration
    return ' '.join((border, str(s), border))

class Timer:
    """Context manager that measures the wall-clock time of a ``with`` block.

    On exit the elapsed seconds are formatted with ``format_str`` and sent to
    ``logger.info`` when a logger was given, otherwise printed.

    Args:
        logger: Optional object with an ``info(str)`` method.
        format_str: ``str.format`` template that receives the duration.
        prefix: Optional text placed before the formatted duration.
        suffix: Optional text placed after the formatted duration.
        sep: Separator inserted between prefix/suffix and the duration.
        verbose: Pass ``None`` to suppress the report on exit; any other
            value (including the default ``0``) reports.
    """

    def __init__(self, logger=None, format_str='{:.3f}[s]', prefix=None, suffix=None, sep=' ', verbose=0):
        if prefix:
            format_str = str(prefix) + sep + format_str
        if suffix:
            format_str = format_str + sep + str(suffix)
        self.format_str = format_str
        self.logger = logger
        self.start = None
        self.end = None
        self.verbose = verbose

    @property
    def duration(self):
        """Elapsed seconds of the last completed run, or 0 while none has finished."""
        if self.end is None:
            return 0
        return self.end - self.start

    def __enter__(self):
        # Clear a previous end time so a reused timer does not report a stale
        # (negative) duration while the new block is still running.
        self.end = None
        self.start = time()
        # Return the timer so ``with Timer() as t`` gives access to ``t.duration``.
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.end = time()
        if self.verbose is None:
            return
        out_str = self.format_str.format(self.duration)
        if self.logger:
            self.logger.info(out_str)
        else:
            print(out_str)


## Define Paths

In [None]:
# All locations are expressed relative to the parent of the working directory.
ROOT_DIR = Path('../')
# Directory the input CSV files are read from.
DATA_DIR = ROOT_DIR.joinpath('data/input')
# Directory the submission file is written to.
OUTPUT_DIR = ROOT_DIR.joinpath('data/outputs')


## Load Data

In [None]:
# Read the three input tables from DATA_DIR.
train_df = pd.read_csv(DATA_DIR / "train.csv")
test_df = pd.read_csv(DATA_DIR / "test.csv")
era_df = pd.read_csv(DATA_DIR / "era.csv")

# Preview the first five rows of each table (head() defaults to 5 rows).
display(train_df.head())
display(test_df.head())
display(era_df.head())


## Merge Data
era_dfとtrain, testをmerge<br>
最低限の前処理を行ってからmergeを行う。

In [None]:
# Bare era names; each one gets period-qualified aliases in era_mapping below.
era_list = [
    '鎌倉', '元', '古墳', '江戸', '高麗',
    '室町', '昭和', '大正', '唐', '桃山',
    '奈良', '縄文', '南宋', '南北朝', '飛鳥',
    '平安', '北宋', '明', '明治', '弥生',
]

# Maps "<era><suffix>" (early / middle / late / end / "period" suffixes) back
# to the bare era name.
era_mapping = {}
for era in era_list:
    era_mapping.update(
        {era + suffix: era for suffix in ('前期', '中期', '後期', '末期', '時代')}
    )


In [None]:
def split_era(row):
    """Add '開始時代' / '終了時代' (start / end era) to ``row`` from its '時代' value.

    A value containing '～' is treated as a range: the text before the first
    '～' becomes the start era and the text after the last '～' becomes the
    end era. Any other value is used for both. ``row`` is modified in place
    and returned.
    """
    label = row['時代']
    if '～' not in label:
        row['開始時代'] = row['終了時代'] = label
        return row

    pieces = label.split('～')
    # First and last piece cover both the two-part and the multi-part case.
    row['開始時代'], row['終了時代'] = pieces[0], pieces[-1]
    return row


def add_era(row, era_df):
    """Fill '開始' / '終了' of a range row from the era lookup table.

    Rows whose start and end era are equal are returned untouched. Otherwise
    '開始' is taken from the era_df row matching the start era and '終了' from
    the era_df row matching the end era; a side whose era is not present in
    ``era_df['時代']`` is left as it was. ``row`` is modified in place and
    returned.
    """
    if row['開始時代'] != row['終了時代']:
        known_eras = era_df['時代'].values
        for era_key, year_col in (('開始時代', '開始'), ('終了時代', '終了')):
            if row[era_key] in known_eras:
                matches = era_df.loc[era_df['時代'] == row[era_key], year_col]
                row[year_col] = matches.iloc[0]
    return row


In [None]:
# Collapse period-qualified labels (era name + 前期/中期/後期/末期/時代) to the bare era name.
# era_mapping has no key containing '～', so range labels are left as they are here
# and are handled afterwards by split_era / add_era.
train_df['時代'] = train_df['時代'].replace(era_mapping)
test_df['時代'] = test_df['時代'].replace(era_mapping)


In [None]:
# Left-join the era table on '時代' so every train/test row is kept; rows whose
# '時代' value has no match in era_df get NaN in the columns that come from era_df.
train_df = train_df.merge(era_df, how="left", on="時代")
test_df = test_df.merge(era_df, how="left", on="時代")


In [None]:
# Label missing eras as '不明' (unknown) so split_era always receives a string.
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: the in-place form is chained assignment, which pandas deprecates
# and which does not modify the parent frame under Copy-on-Write.
train_df['時代'] = train_df['時代'].fillna('不明')
train_df = train_df.apply(split_era, axis=1)

test_df['時代'] = test_df['時代'].fillna('不明')
test_df = test_df.apply(split_era, axis=1)


In [None]:
# Row-wise pass that fills '開始' / '終了' for range rows; era_df is forwarded to
# add_era as a keyword argument by DataFrame.apply.
train_df = train_df.apply(add_era, axis=1, era_df=era_df)
test_df = test_df.apply(add_era, axis=1, era_df=era_df)


## Feature Engineering

In [None]:
def add_split_prefecture(train: pd.DataFrame, test: pd.DataFrame) -> None:
    """Split the combined prefecture column into two new columns, in place.

    For both frames, the text before a full-width parenthesis goes to
    '所有者住所' and the text inside the parentheses (when present) goes to
    '所在都道府県'; the latter is missing when there is no parenthesised part.
    Nothing is returned.
    """
    source_col = '都道府県 ※美工品は「所有者住所（所在都道府県）」'
    pattern = r'([^\（\）]+)(?:（([^）]+)）)?'
    for frame in (train, test):
        frame[['所有者住所', '所在都道府県']] = frame[source_col].str.extract(pattern)


In [None]:
# Adds '所有者住所' and '所在都道府県' to both frames in place (the function returns None).
add_split_prefecture(train_df, test_df)


### Count Encoding

In [None]:
# Raw categorical columns that get count-encoded below.
# NOTE(review): this name is rebound in the Train section to the CE_-prefixed
# column names, and feature_blocks repeats this list literally instead of using it.
categoricals = ["種別2", "所有者名", '所有者住所', '所在都道府県']
class AbstractBaseBlock:
    """Base class for feature blocks exposing ``fit`` and ``transform``."""

    def fit(self, input_df: pd.DataFrame, y=None):
        """Default fit: nothing to learn, so return ``transform(input_df)``."""
        transformed = self.transform(input_df)
        return transformed

    def transform(self, input_df: pd.DataFrame) -> pd.DataFrame:
        """Build the block's features; subclasses must override this."""
        raise NotImplementedError


In [None]:
def read_whole_df():
    """Return the module-level train_df and test_df stacked row-wise.

    The original indexes are discarded (``ignore_index=True``).
    """
    frames = [train_df, test_df]
    return pd.concat(frames, ignore_index=True)

class CountEncodingBlock(AbstractBaseBlock):
    """Block that count-encodes a single column."""

    def __init__(self, column: str):
        self.column = column

    def fit(self, input_df, y=None):
        """Learn value counts for the column, then encode ``input_df``.

        The counts are taken from the frame returned by read_whole_df()
        (train and test stacked), not from ``input_df`` alone.
        """
        whole_df = read_whole_df()
        self.count_ = whole_df[self.column].value_counts()
        return self.transform(input_df)

    def transform(self, input_df):
        """Return a one-column frame 'CE_<column>' holding the learned counts."""
        encoded = input_df[self.column].map(self.count_)
        return encoded.to_frame(name=self.column).add_prefix('CE_')


In [None]:
# One count-encoding block per raw categorical column.
feature_blocks = [
    CountEncodingBlock(column)
    for column in ("種別2", "所有者名", '所有者住所', '所在都道府県')
]


In [None]:
def run_blocks(input_df, blocks, y=None, test=False):
    """Run every feature block on ``input_df`` and join the outputs column-wise.

    Args:
        input_df: Frame handed to each block.
        blocks: Iterable of block objects exposing ``fit(input_df, y=...)``
            and ``transform(input_df)``.
        y: Optional target forwarded to ``fit``.
        test: When False each block is fitted (``fit``); when True only
            ``transform`` is called.

    Returns:
        A DataFrame with the outputs of all blocks concatenated along the
        columns; an empty DataFrame when ``blocks`` is empty.

    Raises:
        AssertionError: If a block returns a different number of rows than
            ``input_df`` has.
    """
    print(decorate('start run blocks...'))

    outputs = []
    with Timer(prefix='run test={}'.format(test)):
        # Iterate the blocks that were passed in, not the module-level
        # feature_blocks list, so the parameter is actually honoured.
        for block in blocks:
            with Timer(prefix='\t- {}'.format(str(block))):
                if test:
                    out_i = block.transform(input_df)
                else:
                    out_i = block.fit(input_df, y=y)

            assert len(input_df) == len(out_i), block
            outputs.append(out_i)

    if not outputs:
        return pd.DataFrame()
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(outputs, axis=1)


In [None]:
# Fit the blocks on the training frame (test defaults to False), then reuse the
# fitted blocks in transform-only mode on the test frame.
train_ce_df = run_blocks(train_df, blocks=feature_blocks)
test_ce_df = run_blocks(test_df, blocks=feature_blocks, test=True)


In [None]:
# Attach the count-encoded columns to the original frames by row index.
train_df_ = train_df.merge(train_ce_df, left_index=True, right_index=True)
test_df_ = test_df.merge(test_ce_df, left_index=True, right_index=True)


## Train

In [None]:
# Name of the label column.
target = "is_kokuhou"

# Model inputs: latitude / longitude, era start / end, and the four
# count-encoded columns.
features = [
    "緯度",
    "経度",
    "開始",
    "終了",
    'CE_種別2',
    'CE_所有者名',
    'CE_所有者住所',
    'CE_所在都道府県',
]


In [None]:
# Columns passed to LightGBM as categorical_feature.
# NOTE(review): these hold count-encoded values rather than category ids;
# confirm that treating them as categorical is intended.
categoricals = [
    'CE_種別2',
    'CE_所有者名',
    'CE_所有者住所',
    'CE_所在都道府県',
]


In [None]:
# LightGBM training parameters shared by every fold.
params = dict(
    n_estimators=50000,  # large round budget; the training loop adds early stopping
    boosting_type="gbdt",
    metric="auc",
    objective="binary",
    n_jobs=-1,
    seed=42,
    learning_rate=0.01,
    verbose=-1,
)

# Out-of-fold predictions for the training rows and the fold-averaged
# predictions for the test rows.
oof_pred = np.zeros(len(train_df))
y_pred = np.zeros(len(test_df))
models = []
cv_scores = {}
skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

# The test matrix is identical for every fold, so build it once.
test = test_df_[features]

for fold, (train_index, test_index) in enumerate(
    skf.split(train_df_[features], train_df_[target])
):

    print(f"====== fold {fold} ======")

    # Positional selection already yields new frames; no full-frame copy needed.
    x_train = train_df_.iloc[train_index][features]
    x_val = train_df_.iloc[test_index][features]
    y_train = train_df_.iloc[train_index][target]
    y_val = train_df_.iloc[test_index][target]

    # create Dataset
    train_set = lgb.Dataset(
        x_train, y_train, categorical_feature=categoricals, free_raw_data=False
    )
    val_set = lgb.Dataset(
        x_val, y_val, categorical_feature=categoricals, free_raw_data=False
    )

    # train
    verbose_eval = 100
    model = lgb.train(
        params,
        train_set,
        valid_sets=[train_set, val_set],
        callbacks=[
            lgb.early_stopping(
                stopping_rounds=100,
                verbose=True),
            lgb.log_evaluation(verbose_eval)]
    )

    models.append(model)

    fold_pred = model.predict(x_val)

    score = roc_auc_score(y_val, fold_pred)
    cv_scores[f"cv{fold}"] = score

    oof_pred[test_index] = fold_pred

    # Average over the splitter's actual fold count rather than a literal 5,
    # so changing n_splits cannot silently mis-scale the test predictions.
    y_pred += model.predict(test) / skf.get_n_splits()

    print(f"cv score: {score}")

oof_score = roc_auc_score(train_df[target], oof_pred)
print(f"OOF score: {oof_score}")


In [None]:
# One row of gain importances per fold model, one column per feature.
feature_importances = np.array(
    [booster.feature_importance(importance_type="gain") for booster in models]
)
feature_importance_df = pd.DataFrame(feature_importances, columns=features)

# Order the features by their median importance across folds, largest first.
sorted_features = (
    feature_importance_df.median()
    .sort_values(ascending=False)
    .index
)
sorted_feature_importance_df = feature_importance_df[sorted_features]

# Horizontal box plot showing the spread of each feature's importance over folds.
plt.figure(figsize=(12, 6))
sns.boxplot(data=sorted_feature_importance_df, orient="h")
plt.xlabel("Importance")
plt.show()


In [None]:
# Store the out-of-fold scores and compare their distribution per class.
# sns.distplot is deprecated and scheduled for removal; histplot with a KDE
# overlay and density normalisation is its documented replacement.
train_df["oof"] = oof_pred
sns.histplot(
    train_df[train_df["is_kokuhou"] == 0]["oof"],
    kde=True, stat="density", kde_kws={"cut": 3},
)
sns.histplot(
    train_df[train_df["is_kokuhou"] == 1]["oof"],
    kde=True, stat="density", kde_kws={"cut": 3},
)


## Predict

In [None]:
def find_optimal_threshold(y_true, y_scores, thresholds=None):
    """Search a grid of cut-offs for the one that maximises the F1 score.

    Args:
        y_true: True binary labels.
        y_scores: Predicted scores; a row is labelled 1 when its score is
            strictly greater than the threshold.
        thresholds: Optional iterable of candidate cut-offs. Defaults to 100
            evenly spaced values from 0 to 1 inclusive.

    Returns:
        ``(best_threshold, best_score)``. Both are 0 when no candidate gives
        an F1 score above 0. On ties the earliest candidate wins.
    """
    if thresholds is None:
        thresholds = np.linspace(0, 1, 100)

    best_threshold = 0
    best_score = 0

    for threshold in thresholds:
        predicted = (y_scores > threshold).astype(int)
        score = f1_score(y_true, predicted)

        # Strict comparison keeps the first (lowest) threshold on ties.
        if score > best_score:
            best_score = score
            best_threshold = threshold

    return best_threshold, best_score

# Choose the cut-off on the out-of-fold predictions of the training rows.
y_true = train_df['is_kokuhou'].values
y_scores = oof_pred

best_threshold, best_f1_score = find_optimal_threshold(y_true, y_scores)
print(f"Best threshold: {best_threshold}") # optimal threshold
print(f"Best F1 Score: {best_f1_score}")


# Binarise the averaged test predictions with that cut-off.
# NOTE: y_pred is rebound from scores to 0/1 labels here, so re-running this
# cell without re-running training would threshold the labels again.
y_pred = (y_pred > best_threshold).astype(int)
print(y_pred)
print(y_pred.shape)


In [None]:
# Store the thresholded 0/1 predictions as the label column of the test frame.
test_df["is_kokuhou"] = y_pred


In [None]:
# Create the output directory on demand so the final write does not fail
# when OUTPUT_DIR does not exist yet.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
# Write only the predicted label column, without the index.
test_df[["is_kokuhou"]].to_csv(OUTPUT_DIR / "submission_ver3_7_ce.csv", index=False)
