# kaggleのtitanicデータ（二値分類）

# ライブラリの読み込み

In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
import matplotlib.pyplot as plt

In [4]:
import japanize_matplotlib

In [None]:
import seaborn as sns
# japanize_matplotlib (imported above) selects the IPAexGothic font so that
# Japanese text renders in figures. A bare sns.set() resets the font family to
# seaborn's default and undoes that, so pass the font explicitly.
sns.set(font='IPAexGothic')

# データの読み込み

In [None]:
# Load the Titanic training and test CSV files from the working directory.
train_raw, test_raw = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

Survival：0 = No, 1 = Yes

In [None]:
# Preview the first rows of the raw training data.
train_raw.head()

In [None]:
# (rows, columns) of the raw training data.
train_raw.shape

In [None]:
# Preview the first rows of the raw test data.
test_raw.head()

In [None]:
# (rows, columns) of the raw test data.
test_raw.shape

In [None]:
train = train_raw.copy()
test = test_raw.copy()

In [None]:
train = train.drop('Survived', axis=1)
train['Survived'] = train_raw['Survived']
train

# 基礎集計

In [None]:
# Distribution of the target (0 = did not survive, 1 = survived).
plt.hist(train['Survived'])

In [None]:
# Column dtypes and non-null counts.
train.info()

In [None]:
# Number of missing values per column.
train.isnull().sum()

## s1生き残り・s0死亡で比較

In [None]:
train_s0 = train[train['Survived']==0]
train_s0.describe(include='all')

In [None]:
train_s1 = train[train['Survived']==1]
train_s1.describe(include='all')

### age（年齢）

In [None]:
plt.hist(train_s1['Age'])

In [None]:
plt.hist(train_s0['Age'])

### Pclass（Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd)）

In [None]:
# Passenger-class distribution of survivors.
plt.hist(train_s1['Pclass'])

In [None]:
# Passenger-class distribution of non-survivors.
plt.hist(train_s0['Pclass'])

### sex（male or female）

In [None]:
# Sex distribution of survivors. 'Sex' still holds the raw 'male'/'female'
# strings at this point (it is encoded to integers later in the notebook).
plt.hist(train_s1['Sex'])

In [None]:
# Sex distribution of non-survivors.
plt.hist(train_s0['Sex'])

### sibsp（Number of Siblings/Spouses Aboard）

Number of Siblings/Spouses Aboard　⇒　一緒に乗船した兄弟、配偶者の数

In [None]:
# Distribution of the number of siblings/spouses aboard, survivors.
plt.hist(train_s1['SibSp'])

In [None]:
# Distribution of the number of siblings/spouses aboard, non-survivors.
plt.hist(train_s0['SibSp'])

### parch（Number of Parents/Children Aboard）

Number of Parents/Children Aboard　⇒　一緒に乗船した親・子供の数

In [None]:
# Distribution of the number of parents/children aboard, survivors.
plt.hist(train_s1['Parch'])

In [None]:
# Distribution of the number of parents/children aboard, non-survivors.
plt.hist(train_s0['Parch'])

### fare（Passenger fare）

In [None]:
# Fare distribution of survivors.
plt.hist(train_s1['Fare'])

In [None]:
# Fare distribution of non-survivors.
plt.hist(train_s0['Fare'])

### embarked（Port of Embarkation）

In [None]:
print('生き残り：C = Cherbourg→',len(train_s1[train_s1['Embarked']=='C']))
print('生き残り：Q = Queenstown→',len(train_s1[train_s1['Embarked']=='Q']))
print('生き残り：S = Southampton→',len(train_s1[train_s1['Embarked']=='S']))

In [None]:
print('死亡：C = Cherbourg→',len(train_s0[train_s0['Embarked']=='C']))
print('死亡：Q = Queenstown→',len(train_s0[train_s0['Embarked']=='Q']))
print('死亡：S = Southampton→',len(train_s0[train_s0['Embarked']=='S']))

# 前処理

## 欠損値処理

In [None]:
# Missing values per column in the training data.
train.isnull().sum()

In [None]:
# Missing values per column in the test data.
test.isnull().sum()

### ageの欠損値処理

In [None]:
train['Age'] = train['Age'].fillna(train['Age'].median())

In [None]:
test['Age'] = test['Age'].fillna(test['Age'].median())

In [None]:
# Re-check the per-column missing-value counts, training data first, then test.
for frame in (train, test):
    print(frame.isnull().sum())

## 文字列データを数値に変換

In [None]:
# male=0,female=1に変換
train['Sex'][train['Sex'] == 'male'] = 0
train['Sex'][train['Sex'] == 'female'] = 1
test['Sex'][test['Sex'] == 'male'] = 0
test['Sex'][test['Sex'] == 'female'] = 1

In [None]:
train['Sex']

In [None]:
# Encode the port of embarkation: S (Southampton)=0, C (Cherbourg)=1, Q (Queenstown)=2.
# Series.map replaces the previous chained assignment (df[col][mask] = value),
# which is unreliable (SettingWithCopyWarning) and becomes a no-op under pandas
# Copy-on-Write. Missing ports stay NaN here and are filled in a later cell.
embarked_codes = {'S': 0, 'C': 1, 'Q': 2}
train['Embarked'] = train['Embarked'].map(embarked_codes)
test['Embarked'] = test['Embarked'].map(embarked_codes)

In [None]:
train = train.fillna({'Embarked': 0})
train.isnull().sum()

In [None]:
train['Sex'] = train['Sex'].astype(int)
train['Embarked'] = train['Embarked'].astype(int)

In [None]:
train.head()

In [None]:
features_col = ['Pclass', 'Age', 'Sex', 'Fare', 'SibSp', 'Parch', 'Embarked']

train_y = train['Survived'].values
train_X = train[features_col].values

In [None]:
print(train_y.shape)
print(train_X.shape)

## 変数間の相関を可視化

### 相関係数が-0.7以下、または0.7以上のものは相関高い？

In [None]:
# Plot the pairwise correlation matrix of the feature columns (DataFrame.corr(),
# not a variance-covariance matrix) as a heatmap annotated to two decimals.
sns.heatmap(train[features_col].corr(), annot=True, fmt='.2f')

# xgboost

## GridSearchCVによるパラメータチューニング

In [None]:
from xgboost import XGBClassifier

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyperparameter grid for the XGBClassifier grid search.
parameters = dict(
    n_estimators=list(range(10, 100, 10)),
    learning_rate=[10, 1, 0.1, 0.01],
    max_depth=list(range(1, 5)),
    random_state=[3],
)

In [None]:
# parameters ={'max_depth':[3,4,5,6,7,8],
#             'min_child_weight':[1,2,3,4,5],
#             'gamma':[i/10.0 for i in range(0,6)],
#             'subsample':[i/10.0 for i in range(6,11)],
#             'colsample_bytree':[i/10.0 for i in range(6,11)],
#             'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100],
#             'n_estimators':[1000,2000],
#             'reg_lambda':[1e-5, 1e-2, 0.1, 1],
#             'learning_rate':[0.1,0.2,0.3]
#             }

In [None]:
# 5-fold cross-validated grid search over `parameters` for an XGBoost classifier.
clf = GridSearchCV(estimator=XGBClassifier(), param_grid=parameters, cv=5)

In [None]:
# clf.fit(train_X, train_y)

In [None]:
# clf.best_params_

In [None]:
# clf.best_score_

In [None]:
from xgboost import XGBClassifier

In [None]:
# XGBoost classifier with fixed hyperparameters.
# NOTE(review): these presumably come from an earlier grid-search run (the
# search cells above are commented out) — confirm.
model_XGB = XGBClassifier(max_depth=2, learning_rate=1, n_estimators=60, random_state=3)

In [None]:
# Fit on the full labelled training set.
model_XGB.fit(train_X, train_y)

In [None]:
# Score on the same data the model was fitted on, i.e. this is training
# accuracy, not an estimate of generalization performance.
model_XGB.score(train_X, train_y)

## 予測

In [None]:
test_X = test[features_col].values

In [None]:
prediction = model_XGB.predict(test_X)

In [None]:
prediction.shape

In [None]:
print(prediction)

## 学習モデルの評価

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
#訓練・テストデータに分割
X_train, X_test, y_train, y_test = train_test_split(train_X, train_y, test_size=0.2, shuffle=True)

In [None]:
y_pred_train = model_XGB.predict(X_train)
y_pred_test = model_XGB.predict(X_test)

# Accuracy の計算
acc_train = accuracy_score(y_train, y_pred_train)
acc_test = accuracy_score(y_test, y_pred_test)
print('Accuracy(train):', acc_train)
print('Accuracy(test):', acc_test)

In [None]:
sub = pd.read_csv('gender_submission.csv', index_col=None)
sub.head()

In [None]:
sub['Survived'] = prediction

In [None]:
sub.to_csv('titanic_xgboost.csv', index=False)

## 特徴量の重要度可視化

### 可視化①

In [None]:
# Horizontal bar chart of the XGBoost feature importances, one bar per feature column.
plt.barh(features_col, model_XGB.feature_importances_)

### 可視化②

In [None]:
# Split-count ('weight') importance per feature, taken from the underlying booster.
feature_important = model_XGB.get_booster().get_score(importance_type='weight')
keys = [*feature_important]
values = [*feature_important.values()]

# One-column frame of scores, highest first; plot up to the top 40 features.
data = pd.DataFrame({"score": values}, index=keys)
data = data.sort_values(by="score", ascending=False)
data.nlargest(40, columns="score").plot(kind='barh', figsize=(20, 10))

### 可視化③

In [None]:
from xgboost import plot_importance

In [None]:
# Plot feature importances with xgboost's built-in helper, using its default importance type.
plot_importance(model_XGB)
plt.show()

### 可視化④

In [None]:
feature_imp = pd.Series(model_XGB.feature_importances_, index=train[features_col].columns.values).sort_values(ascending=False)

In [None]:
sns.barplot(x=feature_imp, y=feature_imp.index)

plt.xlabel('Feature Importance Score')
plt.ylabel('Features')
plt.title("Visualizing Important Features")
plt.show()

### 可視化⑤

In [None]:
# Plot gain-based importance on a 12x12 inch figure.
_, ax = plt.subplots(figsize=(12, 12))
xgb_booster = model_XGB.get_booster()
plot_importance(xgb_booster, importance_type='gain', ax=ax)

### 可視化⑥

In [None]:
# Importances sorted ascending so the largest bar is drawn at the top.
importances = pd.Series(
    model_XGB.feature_importances_, index=features_col
).sort_values()
importances.plot.barh()
plt.title('XGBoost Feature Importance')
plt.show()

## 決定木の可視化

### 可視化①

In [None]:
from dtreeviz.trees import *

In [None]:
import graphviz

In [None]:
# Check the type of the feature frame that is passed to dtreeviz below.
type(train[features_col])

In [None]:
# Build a dtreeviz visualization of the first tree (tree_index=0) of the XGBoost model.
# NOTE(review): this call style depends on the installed dtreeviz version —
# confirm it matches the API of the version in use.
viz = dtreeviz(model_XGB,
               x_data=train[features_col],
               y_data=train['Survived'],
               target_name='y',
               feature_names=train[features_col].columns.tolist(),
               tree_index=0)

In [None]:
# Render the visualization in the notebook output.
display(viz)

### 可視化②

In [None]:
from xgboost import plot_tree

In [None]:
# xgboost.plot_tree has no `figsize` parameter; extra keyword arguments are
# forwarded to graphviz rather than matplotlib, so the size was never applied.
# Create the 20x20 inch axes explicitly and draw the default (first) tree on it.
fig, ax = plt.subplots(figsize=(20, 20))
plot_tree(model_XGB, ax=ax)
plt.show()

### 可視化③

In [None]:
import xgboost as xgb

In [None]:
# Draw the tree selected by num_trees=10 on an explicitly sized 30x30 inch axes.
fig, ax = plt.subplots(figsize=(30, 30))
xgb.plot_tree(model_XGB, num_trees=10, ax=ax)
plt.show()

### 可視化④

In [None]:
# Convert the first tree (num_trees=0) to a graphviz object; as the last
# expression of the cell it is rendered inline by the notebook.
xgb.to_graphviz(model_XGB, num_trees=0)

# LightGBM

処理速度 → LightGBM > XGBoost > catboost

LightGBMはカテゴリー変数をそのまま扱える。XGBoostは従来カテゴリー変数を直接扱えなかったため数値への変換が必要（新しいバージョンでは enable_categorical=True による対応が追加されている）

## GridSearchCVによるパラメータチューニング

In [None]:
import lightgbm as lgbm

In [None]:
from lightgbm import LGBMClassifier

In [None]:
# Display the preprocessed training frame.
train

In [None]:
# Confirm dtypes and non-null counts of the feature columns.
train[features_col].info()

In [None]:
# Rebuild the target vector and feature matrix as NumPy arrays.
train_y = train['Survived'].values
train_X = train[features_col].values

In [None]:
# Shape of the target vector.
train_y.shape

In [None]:
# Shape of the feature matrix.
train_X.shape

In [None]:
# Hyperparameter grid for the LGBMClassifier grid search.
parameters = dict(
    max_depth=[10, 25, 50, 75],
    learning_rate=[0.001, 0.01, 0.05, 0.1],
    num_leaves=[100, 300, 900, 1200],
    n_estimators=[100, 200, 500],
)

In [None]:
# 5-fold cross-validated grid search over `parameters` for a LightGBM classifier.
clf = GridSearchCV(estimator=LGBMClassifier(), param_grid=parameters, cv=5)

In [None]:
# clf.fit(train_X, train_y)

In [None]:
# clf.best_params_

In [None]:
# clf.best_score_

In [None]:
from lightgbm import LGBMClassifier

In [None]:
# LightGBM classifier with fixed hyperparameters.
# NOTE(review): these presumably come from an earlier grid-search run (the
# search cells above are commented out) — confirm.
model_LGBM = LGBMClassifier(max_depth=25, learning_rate=0.01, n_estimators=500, num_leaves=100)

In [None]:
# Fit on the full labelled training set.
model_LGBM.fit(train_X, train_y)

In [None]:
# Score on the same data the model was fitted on, i.e. training accuracy only.
model_LGBM.score(train_X, train_y)

In [None]:
# Dtype and non-null count of the target column.
train['Survived'].info()

In [None]:
# Dtypes and non-null counts of the feature columns.
train[features_col].info()

## 予測

In [None]:
# Predict survival labels for the test set with the LightGBM model.
# test_X is the test feature matrix built earlier in the XGBoost section.
prediction = model_LGBM.predict(test_X)

In [None]:
# One prediction per test row.
prediction.shape

In [None]:
print(prediction)

## 学習モデルの評価

In [None]:
#訓練・テストデータに分割
X_train, X_test, y_train, y_test = train_test_split(train_X, train_y, test_size=0.2, shuffle=True)

In [None]:
y_pred_train = model_LGBM.predict(X_train)
y_pred_test = model_LGBM.predict(X_test)

# Accuracy の計算
acc_train = accuracy_score(y_train, y_pred_train)
acc_test = accuracy_score(y_test, y_pred_test)
print('Accuracy(train):', acc_train)
print('Accuracy(test):', acc_test)

In [None]:
sub['Survived'] = prediction

In [None]:
sub.to_csv('titanic_lightgbm.csv', index=False)

## 特徴量の重要度可視化

### 可視化①

In [None]:
# Horizontal bar chart of the LightGBM feature importances, one bar per feature column.
plt.barh(features_col, model_LGBM.feature_importances_)

### 可視化②

In [None]:
feature_imp = pd.Series(model_LGBM.feature_importances_, index=train[features_col].columns.values).sort_values(ascending=False)

In [None]:
sns.barplot(x=feature_imp, y=feature_imp.index)

plt.xlabel('Feature Importance Score')
plt.ylabel('Features')
plt.title("Visualizing Important Features")
plt.show()

In [None]:
importances = pd.Series(model_LGBM.feature_importances_, index=train[features_col].columns.values)
importances = importances.sort_values()
importances.plot(kind = 'barh')
plt.title('LightGBM Feature Importance')
plt.show()

## 決定木の可視化

### 可視化①

In [None]:
# Draw the first tree (tree_index=0) with matplotlib, annotating each split with its gain.
ax = lgbm.plot_tree(model_LGBM, tree_index=0, figsize=(20, 20), show_info=['split_gain'])
plt.show()
# Build a graphviz digraph of the same tree, configured for PNG output.
graph = lgbm.create_tree_digraph(model_LGBM, tree_index=0, format='png', name='Tree')

In [None]:
# Render the digraph in the notebook output.
display(graph)

## 生存確率を出力

In [None]:
import lightgbm as lgb

In [None]:
train_y = train['Survived']
train_X = train[features_col]

In [None]:
train_X.info()

In [None]:
#訓練・テストデータに分割
X_train, X_test, y_train, y_test = train_test_split(train_X, train_y, test_size=0.2, shuffle=True)

In [None]:
train_set = lgb.Dataset(X_train, y_train)
valid_set = lgb.Dataset(X_test, y_test)

In [None]:
# LightGBM training parameters: binary classification evaluated with log loss.
params = dict(
    objective="binary",
    metric="binary_logloss",
)

In [None]:
# Train a LightGBM booster with early stopping on the validation set.
# The early_stopping_rounds / verbose_eval / evals_result keyword arguments
# were removed from lgb.train() in LightGBM 4.0; the equivalent callbacks
# (available since LightGBM 3.3) are used instead:
#   - early_stopping(5): stop when no validation metric improves for 5 rounds
#   - log_evaluation(50): print the metrics every 50 rounds
#   - record_evaluation(result_data): store the metric history in result_data
result_data = {}
model = lgb.train(
    params=params,
    train_set=train_set,
    valid_sets=[train_set, valid_set],
    num_boost_round=100,
    callbacks=[
        lgb.early_stopping(stopping_rounds=5),
        lgb.log_evaluation(period=50),
        lgb.record_evaluation(result_data),
    ],
)

In [None]:
pred = model.predict(test_X)
print(pred)

In [None]:
test_ = test.copy()
test_['Survived'] = pred
test_.sort_values(by="Survived",ascending=False) 