# ライブラリ読み込み

In [None]:
from matplotlib import pyplot as plt

import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.model_selection import KFold

import lightgbm as lgbm

# Apply the "ggplot" style sheet to every figure drawn below.
plt.style.use("ggplot")
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

# 関数読み込み

In [None]:
def rmsle(ans, pred):
    """Return the root mean squared logarithmic error.

    Both inputs are passed through ``log1p`` and the root of the mean
    squared difference of the results is returned.
    """
    log_gap = np.log1p(ans) - np.log1p(pred)
    return np.sqrt(np.square(log_gap).mean())

def rmse(ans, pred):
    """Return the root mean squared error between ``ans`` and ``pred``."""
    residual = ans - pred
    mean_squared = np.square(residual).mean()
    return np.sqrt(mean_squared)

In [None]:
def train_test_cat_plot(merged_data, cat_cols, n_col=2):
    """Plot the category shares of train vs. test for each categorical column.

    One subplot is drawn per entry of ``cat_cols``. Within a subplot every
    category gets two bars: its share among the training rows and its share
    among the test rows.

    Parameters
    ----------
    merged_data : pandas.DataFrame
        Concatenated train and test rows. Must contain the columns in
        ``cat_cols``, an ``"age"`` column (only used for counting rows) and
        a ``"train"`` column holding 1 for training rows and 0 for test rows.
    cat_cols : sequence of str
        Names of the categorical columns to plot.
    n_col : int, default 2
        Number of subplot columns in the figure grid.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.

    Raises
    ------
    ValueError
        If ``n_col`` is smaller than 1.
    """
    if n_col < 1:
        raise ValueError("n_col must be at least 1")

    # Ceiling division so that no superfluous empty row is created; keep at
    # least one row so that an empty ``cat_cols`` still yields a valid grid.
    n_row = max(1, -(-len(cat_cols) // n_col))

    # squeeze=False always returns a 2-D array of axes, even for a 1x1 grid.
    fig, axes = plt.subplots(n_row, n_col, figsize=(n_col*6, n_row*4),
                             squeeze=False)
    flat_axes = axes.flatten()

    for cat_col, ax in zip(cat_cols, flat_axes):
        # Row counts per (category, train flag); a category that is absent
        # from one of the two sets counts as 0 instead of NaN.
        tmp = pd.pivot_table(merged_data, index=cat_col, columns="train",
                             values='age', aggfunc="count", fill_value=0)

        left = np.arange(len(tmp))
        # Column 1 holds the training rows, column 0 the test rows.
        train_ = tmp[1].values / tmp[1].values.sum()
        test_ = tmp[0].values / tmp[0].values.sum()

        ax.bar(left, train_, width=0.4, label="train")
        ax.bar(left+0.5, test_, width=0.4, label="test")

        ax.set_ylim(0, max(train_.max(), test_.max())*1.1)
        ax.legend()
        # Centre the tick between the two bars of a category.
        ax.set_xticks(left+0.25)
        ax.set_xticklabels(tmp.index.values)
        ax.set_title(cat_col)

    # Hide grid cells that have no categorical column assigned.
    for unused_ax in flat_axes[len(cat_cols):]:
        unused_ax.set_visible(False)

    plt.show()
    return

In [None]:
def KFold_lgbm_ensemble(train_data, params, num_round=1000, K=5):
    """Train one LightGBM model per fold of a shuffled K-fold split.

    The target is ``log1p(charges)``; each model is fitted on K-1 folds and
    early-stopped / scored on the remaining fold.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Training table. Must contain a ``"charges"`` column (the target);
        every other column is used as a feature.
    params : dict
        LightGBM parameters forwarded to ``lgbm.train``.
    num_round : int, default 1000
        Value passed as ``num_boost_round``.
        NOTE(review): if ``params`` contains an alias of this setting (e.g.
        ``"n_estimators"``), LightGBM is expected to use the alias instead -
        confirm against the installed version.
    K : int, default 5
        Number of folds.

    Returns
    -------
    models : list
        The fitted model of every fold, in split order.
    rmsles : list of float
        Per-fold RMSE between the log1p-transformed validation target and
        the model output, i.e. the RMSLE of the back-transformed prediction.
    """
    # Features and target do not depend on the fold, so build them once.
    features = train_data.drop("charges", axis=1).values
    target = np.log1p(train_data["charges"].values)

    kf = KFold(n_splits=K, random_state=2019, shuffle=True)
    models = []
    rmsles = []
    for train_idx, val_idx in kf.split(train_data):
        train_x, valid_x = features[train_idx], features[val_idx]
        train_y, valid_y = target[train_idx], target[val_idx]

        # Training
        d_train = lgbm.Dataset(train_x, label=train_y)
        d_valid = lgbm.Dataset(valid_x, label=valid_y)

        # Use the ``params`` argument (not a module-level dict) so that the
        # caller actually controls the model configuration.
        # NOTE(review): ``early_stopping_rounds`` / ``verbose_eval`` are
        # keyword arguments of older LightGBM releases - confirm the
        # installed version still accepts them.
        model = lgbm.train(params=params,
                           train_set=d_train,
                           valid_sets=d_valid,
                           num_boost_round=num_round,
                           early_stopping_rounds=20,
                           verbose_eval=50)

        models.append(model)

        valid_pred = model.predict(valid_x)
        rmsles.append(rmse(valid_y, valid_pred))

    return models, rmsles

# データ読み込み

In [None]:
# Directory holding the CSV files; the trailing slash is required because
# file names are appended to it directly.
DATA_PATH = "data/"
train_data, test_data, answer_data = (
    pd.read_csv(f"{DATA_PATH}{csv_name}")
    for csv_name in ("sample_train.csv", "sample_test.csv", "sample_answer.csv")
)

In [None]:
# Preview the first rows of the training table.
train_data.head()

In [None]:
# Preview the first rows of the test table.
test_data.head()

# データの確認

In [None]:
# Check the values: print the DataFrame summary of the training table
train_data.info()

In [None]:
# Null check: number of missing values per column
train_data.isnull().sum()

In [None]:
# Check whether any categorical column contains unusual categories
cat_cols = ["region", "smoker", "sex"]
for cat_col in cat_cols:
    # Frequency of every distinct value in this column
    print(train_data[cat_col].value_counts())

In [None]:
# Summary statistics
train_data.describe()

In [None]:
sns.pairplot(train_data,
                     vars=["charges", "region", "smoker", "children", "bmi", "sex", "age"])
plt.savefig('data/EDA/pairplot.png')
plt.close()

In [None]:
# One pair plot per categorical column, coloured by that column's value.
# Each figure is saved under data/EDA/ and closed right away.
# NOTE(review): assumes the directory data/EDA/ already exists.
for col in cat_cols:
    sns.pairplot(train_data, hue = col,
                         vars=["charges", "children", "bmi", "age"])
    plt.savefig(f'data/EDA/{col}_pairplot.png')
    plt.close()

In [None]:
# Check the train / test distributions
# Tag every row with its origin (1 = train, 0 = test). The marker is added to
# copies via `assign`, so train_data / test_data themselves stay unchanged and
# the marker cannot end up as a model feature in later cells.
merged_data = pd.concat(
    [train_data.assign(train=1), test_data.assign(train=0)], sort=False)

# Pair plot of the numeric columns, coloured by origin; saved and closed.
# NOTE(review): assumes the directory data/EDA/ already exists.
sns.pairplot(merged_data, hue="train",
             vars=["children", "bmi", "age"])
plt.savefig('data/EDA/train_test_pairplot.png')
plt.close()

In [None]:
# Bar charts comparing the train and test share of every category in cat_cols.
train_test_cat_plot(merged_data,  cat_cols)

# ベースライン
KFold Ensemble

In [None]:
# Turn the number of children into strings so that get_dummies one-hot
# encodes it like the other categorical columns.
train_data["children"] = train_data["children"].map(str)
test_data["children"] = test_data["children"].map(str)

# "id" is dropped as before. A "train" marker column (present when an earlier
# cell tagged the rows for the train/test comparison) only encodes the origin
# of a row, so it is removed as well instead of being used as a feature.
train_data = train_data.drop("id", axis=1).drop(columns="train", errors="ignore")
test_data = test_data.drop("id", axis=1).drop(columns="train", errors="ignore")

# One-hot encode the remaining non-numeric columns. The dummy dtype is pinned
# so that `.values` stays a numeric array whatever pandas' default dtype is.
train_data = pd.get_dummies(train_data, dtype=np.uint8)
test_data = pd.get_dummies(test_data, dtype=np.uint8)

# Train and test were encoded independently: give the test table exactly the
# training feature columns, in the same order. A category that never occurs
# in the test set becomes an all-zero column.
feature_cols = train_data.columns.drop("charges")
test_data = test_data.reindex(columns=feature_cols, fill_value=0)

In [None]:
# LightGBM settings shared by the models below.
# The number of boosting rounds is deliberately NOT set here: an
# "n_estimators" entry is an alias of `num_boost_round` and would override
# the value passed to `lgbm.train`, so the rounds are controlled solely by
# that argument.
lgbm_params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'num_leaves': 20,
    'max_depth': 6,
    'learning_rate': 0.01,
    'verbose': -1,
    "num_threads": 10}

## シングルモデル

In [None]:
# Take the first fold of a shuffled 4-fold split as the train/validation split.
kf = KFold(n_splits=4, shuffle=True, random_state=2019)
split = kf.split(train_data)
train_idx, val_idx = next(split)

In [None]:
# Full feature matrix and log1p-transformed target ...
train_x = train_data.drop(columns=["charges"]).values
train_y = train_data["charges"].map(np.log1p).values
# ... then cut both into the validation part and the training part.
valid_x, valid_y = train_x[val_idx], train_y[val_idx]
train_x, train_y = train_x[train_idx], train_y[train_idx]

# Training
d_train = lgbm.Dataset(train_x, label=train_y)
d_valid = lgbm.Dataset(valid_x, label=valid_y)
model = lgbm.train(
    params=lgbm_params,
    train_set=d_train,
    valid_sets=d_valid,
    num_boost_round=1000,
    early_stopping_rounds=20,
    verbose_eval=50,
)

### Private Leaderboard
0.4154697625778434

In [None]:
# Predict on the test features; the model was fitted on log1p(charges),
# so exp(x) - 1 maps the output back to the original scale.
test_pred = model.predict(test_data.values)
test_pred = np.exp(test_pred) - 1
# Score against the answer file.
print(rmsle(answer_data["charges"].values, test_pred))

## K-Fold Ensemble

In [None]:
# NOTE(review): train_data / test_data were already passed through
# pd.get_dummies in the baseline preprocessing cell, so this second pass
# looks like a no-op - confirm before removing.
train_data = pd.get_dummies(train_data)
test_data = pd.get_dummies(test_data)

In [None]:
# Mean validation score of the K-fold ensemble for K = 2 .. 10.
cv_rmsles = []
for i, K in enumerate(range(2, 11)):
    models, rmsles = KFold_lgbm_ensemble(train_data, lgbm_params, K=K)
    cv_rmsles.append(np.mean(rmsles))

In [None]:
# Mean validation score per fold count. The i-th score belongs to K = i + 2,
# so exactly one tick per score is drawn and labelled with its K.
tick_positions = np.arange(len(cv_rmsles))
plt.plot(cv_rmsles)
plt.xticks(tick_positions, tick_positions + 2);

### Private Leaderboard
0.41444564295755526

In [None]:
# Fit the K-fold ensemble with K = 8; `models` holds one model per fold.
K = 8
models, rmsles = KFold_lgbm_ensemble(train_data, lgbm_params, K=K)

In [None]:
# Average the back-transformed predictions of all fold models.
test_pred = np.zeros(len(test_data.values))
for model in models:
    # exp(x) - 1 undoes the log1p applied to the training target.
    test_pred_ = np.exp(model.predict(test_data.values)) - 1
    test_pred = test_pred + test_pred_
test_pred = test_pred / K

print(rmsle(answer_data["charges"].values, test_pred))

# lgbm with Denoising AutoEncoder

In [None]:
from sklearn.preprocessing import QuantileTransformer

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [None]:
def swap_noise(array, noise_level=0.2):
    """Return a copy of ``array`` with column-wise swap noise applied.

    Every element (i, j) is, with probability ``noise_level``, replaced by
    the element (i', j) of a uniformly drawn row i' of the same column.
    The input itself is not modified.

    Parameters
    ----------
    array : array-like
        Two-dimensional data (rows x columns). Any dtype accepted by
        ``numpy.asarray`` works, including object arrays.
    noise_level : float, default 0.2
        Probability with which a single element is replaced.

    Returns
    -------
    numpy.ndarray
        New array with the same shape and dtype as ``numpy.asarray(array)``.

    Raises
    ------
    ValueError
        If ``array`` is not two-dimensional.
    """
    # Work on the argument itself rather than on any module-level table, so
    # the function is usable for arbitrary inputs.
    source = np.asarray(array)
    if source.ndim != 2:
        raise ValueError("swap_noise expects a 2-D array")

    n_row, n_col = source.shape
    if n_row == 0 or n_col == 0:
        # Nothing to swap (and randint(0, 0) would raise).
        return source.copy()

    # Which cells get replaced, and from which row each replacement comes.
    swap_mask = np.random.uniform(0, 1, size=(n_row, n_col)) < noise_level
    donor_rows = np.random.randint(0, n_row, size=(n_row, n_col))
    # donors[i, j] == source[donor_rows[i, j], j]: values stay in their column.
    donors = source[donor_rows, np.arange(n_col)]
    return np.where(swap_mask, donors, source)

In [None]:
class AutoEncoder(nn.Module):
    """Fully connected autoencoder with five linear layers.

    The layers map ``input_size -> 100 -> 100 -> 100 -> 100 -> input_size``
    with a ReLU after each of the first four. The activation after ``fc3``
    is the intermediate representation returned by ``get_representation``.
    """

    def __init__(self, input_size):
        super().__init__()
        hidden = 100
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(input_size, hidden)
        self.fc2 = nn.Linear(hidden, hidden)
        self.fc3 = nn.Linear(hidden, hidden)  # intermediate representation is taken here
        self.fc4 = nn.Linear(hidden, hidden)
        self.fc5 = nn.Linear(hidden, input_size)
        # Initialisation: Kaiming-normal weights, applied in layer order.
        for layer in (self.fc1, self.fc2, self.fc3, self.fc4, self.fc5):
            nn.init.kaiming_normal_(layer.weight)

    def forward(self, x):
        """Return the reconstruction of ``x`` (no activation on the output)."""
        hidden_state = x
        for layer in (self.fc1, self.fc2, self.fc3, self.fc4):
            hidden_state = self.relu(layer(hidden_state))
        return self.fc5(hidden_state)

    def get_representation(self, x):
        """Return the ReLU activation after ``fc3`` for the input ``x``."""
        hidden_state = x
        for layer in (self.fc1, self.fc2, self.fc3):
            hidden_state = self.relu(layer(hidden_state))
        return hidden_state

def train(model, data_loader, loss_func, optimizer, device=torch.device("cpu")):
    """Run one pass over ``data_loader`` and return the mean batch loss.

    Each batch is moved to ``device``, fed through ``model`` and compared
    with itself via ``loss_func`` (the batch is both input and target); the
    optimizer takes one step per batch.

    Returns the sum of the per-batch loss values divided by
    ``len(data_loader)``.
    """
    model.train()
    loss_total = 0
    for row_data in data_loader:
        optimizer.zero_grad()
        batch = row_data.to(device)
        reconstruction = model(batch)
        batch_loss = loss_func(reconstruction, batch)
        loss_total += batch_loss.item()
        batch_loss.backward()
        optimizer.step()

    return loss_total / len(data_loader)

def get_representation(model, array):
    """Return ``model.get_representation`` of ``array`` without gradients.

    The model is switched to evaluation mode and ``array`` is converted with
    ``torch.Tensor`` before being passed to the model.
    """
    model.eval()
    batch = torch.Tensor(array)
    with torch.no_grad():
        return model.get_representation(batch)

In [None]:
class TableData(Dataset):
    """Dataset that serves the rows of an array-like table by index."""

    def __init__(self, data):
        rows = np.array(data)
        self.data = rows
        # Number of rows, i.e. the length reported by ``__len__``.
        self.data_num = rows.shape[0]

    def __len__(self):
        """Return the number of rows stored at construction time."""
        return self.data_num

    def __getitem__(self, idx):
        """Return the row at position ``idx``."""
        row = self.data[idx]
        return row

In [None]:
# Load the raw tables again
train_data = pd.read_csv(f"{DATA_PATH}sample_train.csv")
test_data = pd.read_csv(f"{DATA_PATH}sample_test.csv")
train_y = train_data["charges"].values

num_cols = ["bmi", "age"]
cat_cols = ["children", "region", "smoker", "sex"]

# Merge train and test for the DAE; the "train" flag (1 = train, 0 = test)
# is kept aside in train_flag so the rows can be separated again later.
train_data["train"] = 1
test_data["train"] = 0
merged_data = pd.concat([train_data, test_data], sort=False)
train_flag = merged_data["train"].values
merged_data = merged_data.drop(columns=["id", "charges", "train"])
# Store the number of children as strings.
merged_data["children"] = merged_data["children"].map(str)

In [None]:
# rankgauss: map the numeric columns to a normal output distribution with a
# quantile transform fitted on the merged train + test rows
rankgauss_transformer = QuantileTransformer(n_quantiles = 100, random_state=2019,
                                                                                  output_distribution="normal")
merged_data[num_cols] = rankgauss_transformer.fit_transform(merged_data[num_cols])

# one hot: apply swap noise to the raw values first, then one-hot encode the
# categorical columns of the noised table
noised_merged_data = pd.DataFrame(swap_noise(merged_data.values),
                                                                  columns=merged_data.columns)
noised_merged_data = pd.get_dummies(noised_merged_data, columns=cat_cols)

In [None]:
# Wrap the noised, one-hot encoded table (cast to float32) for batched training.
# NOTE(review): train() uses each batch as both input and target, so with this
# loader the network reconstructs the noised rows rather than the clean ones -
# confirm whether the clean table was meant to be the target.
dataset = TableData(noised_merged_data.values.astype("float32"))
loader = DataLoader(dataset, batch_size=128, shuffle=True)

In [None]:
# gpu/cpu
# Keep the originally configured GPU (index 9) when the machine has it;
# otherwise fall back to the default CUDA device, and to the CPU when CUDA is
# not available, so the cell also runs on smaller machines.
if torch.cuda.is_available() and torch.cuda.device_count() > 9:
    device = torch.device("cuda:9")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Model: input width equals the number of columns of the encoded table
input_size = noised_merged_data.values.shape[1]
dae_model = AutoEncoder(input_size)
dae_model = dae_model.to(device)

# Loss, Optimizer
criterion = nn.MSELoss()
learning_rate = 1e-3
optimizer = torch.optim.Adam(dae_model.parameters(), lr=learning_rate)

In [None]:
# Train for 300 epochs and report the mean epoch loss every 20 epochs.
# train() is called on every iteration; calling it only inside the `if`
# would train for just 15 of the 300 iterations.
for i in range(300):
    epoch_loss = train(dae_model, loader, criterion, optimizer, device)
    if (i+1) % 20 == 0:
        print(epoch_loss)
# Move the trained model back to the CPU for the feature extraction below.
dae_model.to(torch.device("cpu"))

In [None]:
### Transformation by the DAE

# Encode train and test together so that both get the same transformation
merged_data["train"] = train_flag
one_hot_merged_data = pd.get_dummies(merged_data, columns=cat_cols)
train_x = one_hot_merged_data.query("train == 1").drop("train", axis=1)
test_x = one_hot_merged_data.query("train == 0").drop("train", axis=1)

# Transform
# Cast to float32 explicitly, as is done for the DAE training data: a table
# mixing float columns with boolean dummy columns would otherwise yield an
# object array that cannot be converted to a tensor.
train_x = get_representation(dae_model, train_x.values.astype("float32")).numpy()
test_x = get_representation(dae_model, test_x.values.astype("float32")).numpy()
train_data_ = pd.DataFrame(train_x)
train_data_["charges"] = train_y

## シングルモデル

In [None]:
# LightGBM settings for the models trained on the DAE features.
# The number of boosting rounds is deliberately NOT set here: an
# "n_estimators" entry is an alias of `num_boost_round` and would override
# the value passed to `lgbm.train`, so the rounds are controlled solely by
# that argument.
lgbm_params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'num_leaves': 20,
    'max_depth': 6,
    'learning_rate': 0.01,
    'verbose': -1,
    "num_threads": 10}

In [None]:
# Take the first fold of a shuffled 4-fold split as the train/validation split.
kf = KFold(n_splits=4, shuffle=True, random_state=2019)
split = kf.split(train_data_)
train_idx, val_idx = next(split)

# Full DAE feature matrix and log1p-transformed target ...
train_x = train_data_.drop(columns=["charges"]).values
train_y = train_data_["charges"].map(np.log1p).values
# ... then cut both into the validation part and the training part.
valid_x, valid_y = train_x[val_idx], train_y[val_idx]
train_x, train_y = train_x[train_idx], train_y[train_idx]

# Training
d_train = lgbm.Dataset(train_x, label=train_y)
d_valid = lgbm.Dataset(valid_x, label=valid_y)
model = lgbm.train(
    params=lgbm_params,
    train_set=d_train,
    valid_sets=d_valid,
    num_boost_round=1000,
    early_stopping_rounds=20,
    verbose_eval=50,
)

### Private Leaderboard
0.4840280135653281

In [None]:
# Predict on the DAE test features; the model was fitted on log1p(charges),
# so exp(x) - 1 maps the output back to the original scale.
test_pred = model.predict(test_x)
test_pred = np.exp(test_pred) - 1
# Score against the answer file.
print(rmsle(answer_data["charges"].values, test_pred))

## K-Fold Ensemble

In [None]:
# Mean validation score of the K-fold ensemble on DAE features, K = 2 .. 10.
cv_rmsles = []
for i, K in enumerate(range(2, 11)):
    models, rmsles = KFold_lgbm_ensemble(train_data_, lgbm_params, K=K)
    cv_rmsles.append(np.mean(rmsles))

In [None]:
# Mean validation score per fold count. The i-th score belongs to K = i + 2,
# so exactly one tick per score is drawn and labelled with its K.
tick_positions = np.arange(len(cv_rmsles))
plt.plot(cv_rmsles)
plt.xticks(tick_positions, tick_positions + 2);

### Private Leaderboard
0.4649929851114924

In [None]:
# Fit the K-fold ensemble on the DAE features with K = 8;
# `models` holds one model per fold.
K = 8
models, rmsles = KFold_lgbm_ensemble(train_data_, lgbm_params, K=K)

In [None]:
# Average the back-transformed predictions of all fold models.
test_pred = np.zeros(len(test_x))
for model in models:
    # exp(x) - 1 undoes the log1p applied to the training target.
    test_pred_ = np.exp(model.predict(test_x)) - 1
    test_pred = test_pred + test_pred_
test_pred = test_pred / K

print(rmsle(answer_data["charges"].values, test_pred))