In [None]:
# Running this cell prompts you for an authorization code.
# Open the printed link, sign in to your Google account,
# then copy the authorization code and paste it in.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os

# Work from this chapter's folder on the mounted Drive so that the relative
# paths used later in the notebook (e.g. data/scalers/) resolve there.
project = '100knock-process-visualization'
chapter = 7
chapter_dir = f'/content/drive/MyDrive/{project}/chapter-{chapter}/'
os.chdir(chapter_dir)

# chapter 7 機械学習加工10ノック


## ノック81: 機械学習で予測したいデータを分けよう


In [None]:
import seaborn as sns
# Load seaborn's 'titanic' example dataset and display it.
dataset = sns.load_dataset('titanic')
dataset

In [None]:
# Split off the prediction target. pop() removes the 'survived' column from
# `dataset` in place and returns it, so re-running this cell without reloading
# `dataset` raises KeyError.
label = dataset.pop('survived')
label

## ノック82: TrainデータとTestデータに分割しよう

In [None]:
from sklearn.model_selection import train_test_split

# Stratify on the label so both splits keep the class ratio; the fixed seed
# makes the split reproducible.
train_ds, test_ds, train_label, test_label = train_test_split(
    dataset,
    label,
    stratify=label,
    random_state=2021,
)

In [None]:
# Display the training features produced by the split.
train_ds

In [None]:
# Display the held-out test features produced by the split.
test_ds

## ノック83: データを機械学習に適した形式へ変換しよう

In [None]:
# Remove 'embark_town' and 'alive' before encoding.
# NOTE(review): presumably dropped because they duplicate 'embarked' and the
# 'survived' label -- confirm against the dataset.
# Reassign instead of dropping in place: `train_ds` comes out of
# train_test_split, and mutating such a frame in place can raise
# SettingWithCopyWarning.
train_ds = train_ds.drop(columns=['embark_town', 'alive'])
train_ds.head()

In [None]:
import pandas as pd
# One-hot encode the training features; with no `columns` argument
# get_dummies converts the object/category columns and leaves the rest as-is.
one_hot_encoded = pd.get_dummies(train_ds)
one_hot_encoded.head()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Label-encoding demo: build a separate copy of the training features whose
# 'class' column is replaced by the integer codes from LabelEncoder.
# `train_ds` itself is left untouched.
class_encoder = LabelEncoder()
class_codes = class_encoder.fit_transform(train_ds['class'])
label_encoded = train_ds.assign(**{'class': class_codes})
label_encoded.head()

In [None]:
# 'pclass' is still a single column after the first get_dummies call
# (presumably because it is numeric -- confirm), so one-hot encode it explicitly.
one_hot_encoded = pd.get_dummies(one_hot_encoded, columns=['pclass'])
one_hot_encoded.head()

In [None]:
# Map boolean True/False values to 1/0 across the whole frame.
# NOTE(review): recent pandas versions may emit a downcasting FutureWarning
# for this replace call -- confirm on the target version.
one_hot_encoded = one_hot_encoded.replace({True: 1, False: 0})
one_hot_encoded.head()

In [None]:
# Adopt the one-hot encoded frame as the training features. This binds a second
# name to the same object (no copy), so later column assignments on `train_ds`
# are also visible through `one_hot_encoded`.
train_ds = one_hot_encoded

## ノック84: 外れ値の検出をしよう

In [None]:
# Per-column outlier fences from the interquartile range:
# values above q3 + 1.5*IQR or below q1 - 1.5*IQR count as outliers.
q = train_ds.quantile([1 / 4, 3 / 4])
q1 = q.loc[1 / 4]
q3 = q.loc[3 / 4]
iqr = q3 - q1
mn = q1 - 1.5 * iqr
mx = q3 + 1.5 * iqr

In [None]:
# Count, per column, how many values lie above `mx` or below `mn`.
((train_ds > mx) | (train_ds < mn)).sum()

## ノック85: データ分布を見てスケーリング手法を考えよう

In [None]:
# Summary statistics of the training features, used to judge each column's scale.
train_ds.describe()

In [None]:
import matplotlib.pyplot as plt

# Histogram of each numeric feature, one panel per column. Each panel is
# titled with its column name so the figure can be read on its own.
numeric_columns = ['age', 'sibsp', 'parch', 'fare']
fig, axes = plt.subplots(ncols=len(numeric_columns), figsize=(20, 5))
for ax, column in zip(axes, numeric_columns):
    ax.hist(train_ds[column])
    ax.set_title(column)

In [None]:
from scipy import stats
import numpy as np
# Bin the non-missing ages and run a chi-square test on the bin counts.
# With no expected frequencies given, chisquare tests against equal counts,
# i.e. it checks whether the histogram is uniform.
bins, bin_edges = np.histogram(train_ds.age.dropna(), bins="auto")
stat, p = stats.chisquare(bins)
f'χ二乗検定のp値: {p}' # p is not >= 0.05, so the distribution is not uniform

In [None]:
# Shapiro-Wilk test on `bins`, the age histogram counts from the previous cell.
# NOTE(review): the test is applied to the bin counts, not to the raw age
# values -- confirm this is the intended normality check.
stat, p = stats.shapiro(bins)
f'シャピロウィルク検定のp値: {p}' # p >= 0.05, so the distribution is treated as normal

In [None]:
# Same uniformity / normality checks for 'sibsp', run on its histogram counts.
bins, bin_edges = np.histogram(train_ds.sibsp.dropna(), bins="auto")
stat, p = stats.chisquare(bins)
print(f'χ二乗検定のp値: {p}') # p is not >= 0.05, so the distribution is not uniform

stat, p = stats.shapiro(bins)
print(f'シャピロウィルク検定のp値: {p}') # p is not >= 0.05, so the distribution is not normal

In [None]:
# Same uniformity / normality checks for 'parch', run on its histogram counts.
bins, bin_edges = np.histogram(train_ds.parch.dropna(), bins="auto")
stat, p = stats.chisquare(bins)
print(f'χ二乗検定のp値: {p}') # p is not >= 0.05, so the distribution is not uniform

stat, p = stats.shapiro(bins)
f'シャピロウィルク検定のp値: {p}' # p is not >= 0.05, so the distribution is not normal

In [None]:
# Same uniformity / normality checks for 'fare', run on its histogram counts.
bins, bin_edges = np.histogram(train_ds.fare.dropna(), bins="auto")
stat, p = stats.chisquare(bins)
print(f'χ二乗検定のp値: {p}') # p is not >= 0.05, so the distribution is not uniform

stat, p = stats.shapiro(bins)
f'シャピロウィルク検定のp値: {p}' # p is not >= 0.05, so the distribution is not normal

## ノック86: 分布に従ってスケーリングをやってみよう

In [None]:
from sklearn.preprocessing import RobustScaler, StandardScaler

# One scaler per numeric column: standardization for 'age' (judged normal by
# the test comments earlier in the notebook) and robust scaling for the other three.
age_scaler = StandardScaler()
sibsp_scaler = RobustScaler()
parch_scaler = RobustScaler()
fare_scaler = RobustScaler()

In [None]:
# Fit each scaler on its training column and overwrite the column with the
# scaled values. The scalers expect 2-D input, hence the reshape to one
# feature column.
scalers_by_column = (
    ('age', age_scaler),
    ('sibsp', sibsp_scaler),
    ('parch', parch_scaler),
    ('fare', fare_scaler),
)
for column_name, column_scaler in scalers_by_column:
    column_values = train_ds[column_name].values.reshape(-1, 1)
    train_ds[column_name] = column_scaler.fit_transform(column_values)

In [None]:
import matplotlib.pyplot as plt

# Histogram of each numeric feature after scaling, one panel per column.
# Each panel is titled with its column name so the figure can be read on its own.
numeric_columns = ['age', 'sibsp', 'parch', 'fare']
fig, axes = plt.subplots(ncols=len(numeric_columns), figsize=(20, 5))
for ax, column in zip(axes, numeric_columns):
    ax.hist(train_ds[column])
    ax.set_title(column)

## ノック87: スケーラーを保存しよう

In [None]:
import os
os.makedirs('data/scalers/', exist_ok=1)
!ls data

In [None]:
import pickle
with open('data/scalers/age_scaler.pkl', mode='wb') as f:
  pickle.dump(age_scaler, f)

!ls data/scalers/ 

In [None]:
# Persist the remaining fitted scalers, one pickle file per column.
scaler_files = (
    ('data/scalers/sibsp_scaler.pkl', sibsp_scaler),
    ('data/scalers/parch_scaler.pkl', parch_scaler),
    ('data/scalers/fare_scaler.pkl', fare_scaler),
)
for scaler_path, fitted_scaler in scaler_files:
    with open(scaler_path, mode='wb') as f:
        pickle.dump(fitted_scaler, f)

In [None]:
# Reload the saved age scaler and apply it to a copy of the test features to
# check that the persisted scaler works; `test_ds` itself is not modified.
# The pickle file was written by this notebook -- only unpickle trusted files.
with open('data/scalers/age_scaler.pkl', mode='rb') as f:
  age_scaler = pickle.load(f)

age_scaled = test_ds.copy()
age_scaled['age'] = age_scaler.transform(age_scaled['age'].values.reshape(-1, 1))
age_scaled.head() # look at the age column

## ノック88: 欠損値の処理をやってみよう


In [None]:
# Count missing values per column before imputation.
train_ds.isna().sum()

In [None]:
from sklearn.impute import SimpleImputer

# Fill missing ages with the column median. 'age' was already scaled in an
# earlier cell, so the median is taken over the scaled values.
age_imputer = SimpleImputer(strategy='median')
train_ds['age'] = age_imputer.fit_transform(train_ds['age'].values.reshape(-1, 1))
train_ds

In [None]:
# Re-count missing values per column to confirm the imputation.
train_ds.isna().sum()

In [None]:
os.makedirs('data/imputers/', exist_ok=1)
with open('data/imputers/age_imputer.pkl', mode='wb') as f:
  pickle.dump(age_imputer, f)
!ls data/imputers/

## ノック89: 学習時のサンプル比率を調整しよう

In [None]:
# Class balance of the training labels.
train_label.value_counts()

In [None]:
from imblearn.under_sampling import RandomUnderSampler
# Resample the training set with random under-sampling (fixed seed for
# reproducibility) and show the resulting shape.
under_sampler = RandomUnderSampler(random_state=2021)
under_sampled_train_ds, under_sampled_train_label = under_sampler.fit_resample(train_ds, train_label)
under_sampled_train_ds.shape

In [None]:
from imblearn.over_sampling import RandomOverSampler 
# Resample the training set with random over-sampling (fixed seed for
# reproducibility) and show the resulting shape.
over_sampler = RandomOverSampler(random_state=2021)
over_sampled_train_ds, over_sampled_train_label = over_sampler.fit_resample(train_ds, train_label)
over_sampled_train_ds.shape

## ノック90: テストデータの前処理をしよう

In [None]:
# Preview the test features before any preprocessing is applied to them.
test_ds.head()

In [None]:
# Remove the same columns that were dropped from the training features.
# Reassign instead of dropping in place: `test_ds` comes out of
# train_test_split, and mutating such a frame in place can raise
# SettingWithCopyWarning.
test_ds = test_ds.drop(columns=['embark_town', 'alive'])

In [None]:
# Apply the same encoding steps used for the training features: one-hot encode
# the categorical columns, then 'pclass', then map True/False to 1/0.
test_ds = (
    pd.get_dummies(test_ds)
    .pipe(pd.get_dummies, columns=['pclass'])
    .replace({True: 1, False: 0})
)

In [None]:
# Align the test features with the training feature set: same columns in the
# same order. A dummy column that exists in training but not in the test split
# is added and filled with 0 (category absent); columns that exist only in the
# test split are dropped.
# reindex is used instead of merging against `train_ds`: a merge joins on every
# shared column, resets the index (losing alignment with `test_label`), leaves
# missing dummy columns as NaN, and can duplicate rows on accidental matches.
test_ds = test_ds.reindex(columns=train_ds.columns, fill_value=0)
test_ds

In [None]:
# Reload the scalers that were fitted on the training data. These pickle files
# were written by this notebook -- only unpickle files from a trusted source.
with open('data/scalers/age_scaler.pkl', mode='rb') as f:
    age_scaler = pickle.load(f)
with open('data/scalers/sibsp_scaler.pkl', mode='rb') as f:
    sibsp_scaler = pickle.load(f)
with open('data/scalers/parch_scaler.pkl', mode='rb') as f:
    parch_scaler = pickle.load(f)
with open('data/scalers/fare_scaler.pkl', mode='rb') as f:
    fare_scaler = pickle.load(f)

# Transform each test column with the scaler fitted on that same column.
# Using the age scaler for every column would apply the age mean/std to
# sibsp, parch and fare, so the test features would no longer match training.
test_ds['age'] = age_scaler.transform(test_ds.age.values.reshape(-1, 1))
test_ds['sibsp'] = sibsp_scaler.transform(test_ds.sibsp.values.reshape(-1, 1))
test_ds['parch'] = parch_scaler.transform(test_ds.parch.values.reshape(-1, 1))
test_ds['fare'] = fare_scaler.transform(test_ds.fare.values.reshape(-1, 1))

In [None]:
# Reload the age imputer fitted on the training data and use it to fill the
# missing ages in the test features.
with open('data/imputers/age_imputer.pkl', mode='rb') as imputer_file:
    age_imputer = pickle.load(imputer_file)

test_age_values = test_ds.age.values.reshape(-1, 1)
test_ds['age'] = age_imputer.transform(test_age_values)

In [None]:
# Display the fully preprocessed test features.
test_ds