# パッケージの読み込み

In [2]:
import os
# 数値処理
import numpy as np
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures

# グラフ描画
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
# Render all matplotlib text with the 'Meiryo' font family.
# NOTE(review): presumably chosen so Japanese labels display correctly; this
# relies on the font being installed on the machine - confirm.
plt.rcParams['font.family'] = 'Meiryo'

# 前処理
## データの読み込み

In [3]:
# Directory that holds the competition's raw CSV files.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs as-is on a machine with this exact directory layout.
input_dir = ('C:/Users/rhira/Documents/GitHub'
             '/signate-competition-Liver-disease-beginner/data/input')
# Training data (includes the 'disease' target column used further down)
train = pd.read_csv(os.path.join(input_dir, 'train.csv'))
# Test data
test = pd.read_csv(os.path.join(input_dir, 'test.csv'))

## 特徴量とターゲットに分離

In [4]:
# Target: the 'disease' column of the training data, kept as a Series
train_target = train['disease']
# Features: every remaining training column (the target column is dropped)
train_feature = train.drop(columns=['disease'])

## 特徴量の結合

In [5]:
# Stack the training features on top of the test rows so that the later
# encoding steps are applied to both sets at once. The original row labels of
# each frame are kept as they are.
feature = pd.concat(
    [train_feature, test],
    axis='index',
)

## ダミー変数化

In [6]:
# One-hot encode the combined features; drop_first=True omits the first level
# of each encoded column so the dummy columns are not perfectly collinear.
feature_dummy = pd.get_dummies(data=feature, drop_first=True)

## 交互作用項の追加

In [11]:
def _polynomial_frame(source, degree):
    """Expand ``source`` into polynomial features of the given degree.

    Parameters
    ----------
    source : pandas.DataFrame
        Numeric feature frame; its column names are used to label the
        generated terms.
    degree : int
        Maximum total degree of the generated terms.

    Returns
    -------
    tuple
        ``(transformer, frame)`` - the fitted ``PolynomialFeatures`` object and
        a new DataFrame of the expanded features. The frame is built from the
        transformed array, so it carries a fresh default index rather than the
        index of ``source``.
    """
    transformer = PolynomialFeatures(degree=degree, include_bias=False)
    values = transformer.fit_transform(source)
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; use its replacement when available and fall back on old releases.
    if hasattr(transformer, 'get_feature_names_out'):
        names = transformer.get_feature_names_out(source.columns)
    else:
        names = transformer.get_feature_names(source.columns)
    return transformer, pd.DataFrame(values, columns=names)


# Degree-2 terms (pairwise products and squares, plus the original columns)
quadratic, feature_quadratic = _polynomial_frame(feature_dummy, 2)
# Degree-3 terms (everything up to total degree 3)
cubic, feature_cubic = _polynomial_frame(feature_dummy, 3)

# データの保存

## 特徴量の分割

In [25]:
# The combined frames hold the training rows first, followed by the test rows,
# so each one is cut positionally at len(train) and re-indexed from zero.

# No interaction terms
train_dummied = feature_dummy.iloc[:len(train)].reset_index(drop=True)
test_dummied = feature_dummy.iloc[len(train):].reset_index(drop=True)
# Degree-2 terms
train_quadratic = feature_quadratic.iloc[:len(train)].reset_index(drop=True)
test_quadratic = feature_quadratic.iloc[len(train):].reset_index(drop=True)
# Degree-3 terms
train_cubic = feature_cubic.iloc[:len(train)].reset_index(drop=True)
test_cubic = feature_cubic.iloc[len(train):].reset_index(drop=True)

## 保存

In [27]:
# Directory that receives the generated feature files.
# NOTE(review): absolute, machine-specific path.
output_dir = ('C:/Users/rhira/Documents/GitHub'
              '/signate-competition-Liver-disease-beginner/features')
# Create the directory on first run so the writes below do not fail on a
# missing folder.
os.makedirs(output_dir, exist_ok=True)

# Output file name -> frame to write.
# NOTE: the training degree-2 file used to be written as
# 'train_qiadratic_feature.feather' (typo). It now matches the naming of the
# other files; any reader of the old name must be updated accordingly.
outputs = {
    # No interaction terms
    'train_dummied_feature.feather': train_dummied,
    'test_dummied_feature.feather': test_dummied,
    # Degree-2 terms
    'train_quadratic_feature.feather': train_quadratic,
    'test_quadratic_feature.feather': test_quadratic,
    # Degree-3 terms
    'train_cubic_feature.feather': train_cubic,
    'test_cubic_feature.feather': test_cubic,
    # Target; to_feather is called on a DataFrame, so the Series is converted
    # to a one-column frame first.
    'train_target.feather': train_target.to_frame(),
}
for file_name, frame in outputs.items():
    frame.to_feather(os.path.join(output_dir, file_name))