# Binary Classification with a Bank Churn Dataset 2-4
## (現在のスコアの確認)
## 1. ライブラリ

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

## 2. データの整理
### 2.1. データ取入れ

In [2]:
# Input files of the "Binary Classification with a Bank Churn Dataset" competition.
dir_path = '/kaggle/input/playground-series-s4e1/'
# Read order: test data, training data, sample submission.
test, train, samp_sub = (
    pd.read_csv(dir_path + csv_name)
    for csv_name in ('test.csv', 'train.csv', 'sample_submission.csv')
)

### 2.2. 型変換

In [3]:
# Integer code assigned to each country name.
Geography_cat = dict(France=0, Spain=1, Germany=2)
# Integer code assigned to each gender label.
Gender_cat = dict(Female=0, Male=1)

# Replace the string columns with their integer codes in both frames.
# Series.map leaves NaN for any label that is missing from the dictionary.
for data in (train, test):
    data['Geography'] = data['Geography'].map(Geography_cat)
    data['Gender'] = data['Gender'].map(Gender_cat)

### 2.3. 作成した特徴量

In [4]:
# Overall exit rate in train. It serves both as the smoothing prior in the
# per-key rates below and as the value given to keys that never occur in train.
rate_t = train.Exited.sum()/len(train)


def _usage_counts(column):
    """Return {value: number of rows holding that value in train and test combined}.

    A single value_counts pass replaces one full scan of both frames per
    unique value.
    """
    combined = pd.concat([train[column], test[column]], ignore_index=True)
    return combined.value_counts().to_dict()


def _smoothed_exit_rates(column):
    """Return {value: (exited rows + rate_t) / (rows + 1)} computed on train.

    Values of `column` that appear only in test are mapped to rate_t.
    """
    exited = train.Exited.eq(1).astype(int)
    per_key = exited.groupby(train[column]).agg(['sum', 'count'])
    rates = ((per_key['sum'] + rate_t) / (per_key['count'] + 1)).to_dict()
    for unseen in set(test[column].unique()) - set(rates):
        rates[unseen] = rate_t
    return rates


# NOTE(review): value_counts and groupby skip missing keys, so this assumes
# CustomerId and Surname contain no NaN -- TODO confirm against the data.
# NOTE(review): for train rows the *_rate features are derived from labels
# that include the row's own Exited value (target leakage); consider
# out-of-fold encoding if these features are used for training.

# Number of times each CustomerId is used
CI_used_cat = _usage_counts('CustomerId')
# Exit rate of each CustomerId
CI_rate_cat = _smoothed_exit_rates('CustomerId')
# Number of times each Surname appears
sur_used_cat = _usage_counts('Surname')
# Exit rate of each Surname
sur_rate_cat = _smoothed_exit_rates('Surname')

for data in [train, test]:
    data['CI_used'] = data.CustomerId.map(CI_used_cat)
    data['CI_rate'] = data.CustomerId.map(CI_rate_cat)
    data['Sur_used'] = data.Surname.map(sur_used_cat)
    data['Sur_rate'] = data.Surname.map(sur_rate_cat)

### 2.4. スケーリング

In [5]:
# Min-max bounds are taken from train only, and captured before either frame
# is modified, so that train and test go through the same transformation.
# (Scaling each frame with its own min/max maps equal raw values to
# different scaled values.)  Test values outside the train range therefore
# fall outside [0, 1].
_minmax_bounds = {
    column: (train[column].min(), train[column].max())
    for column in ('CreditScore', 'Balance', 'EstimatedSalary')
}

# Columns divided by a fixed constant.  Geography codes run from 0 to 2;
# the other divisors are presumably rough upper bounds of each column --
# TODO confirm.
_fixed_divisors = {
    'Geography': 2,
    'Age': 100,
    'Tenure': 10,
    'NumOfProducts': 4,
    'CI_used': 200,
    'Sur_used': 5000,
}

for data in [train, test]:
    for column, (low, high) in _minmax_bounds.items():
        data[column] = (data[column] - low) / (high - low)
    for column, divisor in _fixed_divisors.items():
        data[column] = data[column] / divisor

## 3. testデータの予測
### 3.1. 予測・出力関数

In [6]:
def predictor(model, features, train=train, test=test, *,
              use_proba=False, path='submission.csv'):
    """Fit `model` on train, predict test, and write the result as a CSV file.

    Parameters
    ----------
    model : estimator providing fit/predict (and predict_proba/classes_ when
        use_proba is True).
    features : list of column names used as model inputs.
    train : frame holding the feature columns and the `Exited` target.
    test : frame holding the feature columns and an `id` column.
    use_proba : when True, write the predicted probability of class 1
        instead of the hard class label.  Defaults to False, which keeps the
        original behaviour.
    path : output file name; defaults to 'submission.csv'.

    Returns
    -------
    None.  The file at `path` is overwritten with columns `id` and `Exited`.
    """
    model.fit(train[features], train.Exited)
    if use_proba:
        # Look the column up through classes_ rather than assuming that the
        # positive class is stored last.
        positive = list(model.classes_).index(1)
        prediction = model.predict_proba(test[features])[:, positive]
    else:
        prediction = model.predict(test[features])
    # NOTE(review): if the leaderboard metric is ROC AUC (presumably -- TODO
    # confirm), probabilities usually score better than hard labels.
    output = pd.DataFrame({'id':test.id, 'Exited':prediction})
    output.to_csv(path, index=False)

### 3.2. 予測

In [7]:
# ランダムフォレスト1
# features = ['CreditScore', 'Geography', 'Age', 'Tenure', 'Balance', 
#             'NumOfProducts', 'IsActiveMember', 'EstimatedSalary',
#             'CI_used', 'CI_rate', 'Sur_used', 'Sur_rate']
# rfc1 = RandomForestClassifier(criterion='log_loss', max_depth=15, max_features=2, n_estimators=4000, random_state = 0, n_jobs=-1)
# predictor(rfc1, features)
# Private Score:0.71188
# Public Score: 0.70750

# ランダムフォレスト2
# features = ['CreditScore', 'Geography', 'Age', 'Tenure', 'Balance', 
#             'NumOfProducts', 'IsActiveMember', 'EstimatedSalary',
#             'CI_used', 'CI_rate', 'Sur_used']
# rfc2 = RandomForestClassifier(criterion='log_loss', max_depth=15, max_features=2, n_estimators=4000, random_state = 0, n_jobs=-1)
# predictor(rfc2, features)
# Private Score:0.71000
# Public Score: 0.71011

# ランダムフォレスト3
# features = ['CreditScore', 'Geography', 'Age', 'Tenure', 'Balance', 
#             'NumOfProducts', 'IsActiveMember', 'EstimatedSalary',
#             'CI_used', 'CI_rate']
# rfc3 = RandomForestClassifier(criterion='log_loss', max_depth=15, max_features=2, n_estimators=4000, random_state = 0, n_jobs=-1)
# predictor(rfc3, features)
# Private Score:0.73760
# Public Score: 0.73091

# ランダムフォレスト4
# features = ['CreditScore', 'Geography', 'Age', 'Tenure', 'Balance', 
#             'NumOfProducts', 'IsActiveMember', 'EstimatedSalary',
#             'CI_used']
# rfc4 = RandomForestClassifier(criterion='log_loss', max_depth=15, max_features=2, n_estimators=4000, random_state = 0, n_jobs=-1)
# predictor(rfc4, features)
# Private Score:0.73972
# Public Score: 0.73193

# ランダムフォレスト5
# features = ['CreditScore', 'Geography', 'Age', 'Tenure', 'Balance', 
#             'NumOfProducts', 'IsActiveMember', 'EstimatedSalary']
# rfc5 = RandomForestClassifier(criterion='log_loss', max_depth=15, max_features=2, n_estimators=4000, random_state = 0, n_jobs=-1)
# predictor(rfc5, features)
# Private Score:0.74212
# Public Score: 0.73364

# ランダムフォレスト6
# features = ['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance', 
#             'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']
# rfc6 = RandomForestClassifier(criterion='log_loss', max_depth=15, max_features=2, n_estimators=4000, random_state = 0, n_jobs=-1)
# predictor(rfc6, features)
# Private Score:0.74328
# Public Score: 0.73824

# ニューラルネットワーク1
# features = ['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance', 
#              'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']
# mlp1 = MLPClassifier((100), activation='logistic', solver='lbfgs', max_iter=400)
# predictor(mlp1, features)
# Private Score:0.74576
# Public Score: 0.73979

# ニューラルネットワーク2
# features = ['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance', 
#              'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']
# mlp2 = MLPClassifier((100, 100), activation='logistic', solver='lbfgs', max_iter=400)
# predictor(mlp2, features)
# Private Score:0.74817
# Public Score: 0.74073

# ニューラルネットワーク3
# features = ['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance', 
#             'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']
# mlp3 = MLPClassifier((100, 100, 100), activation='logistic', solver='lbfgs', max_iter=400)
# predictor(mlp3, features)
# Private Score:0.50000
# Public Score: 0.50000

# ニューラルネットワーク4
# features = ['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance', 
#             'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']
# mlp4 = MLPClassifier((200, 200, 200), activation='logistic', solver='lbfgs', max_iter=400)
# predictor(mlp4, features)
# Private Score:0.50000
# Public Score: 0.50000

# ニューラルネットワーク5
# features = ['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance', 
#             'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']
# mlp5 = MLPClassifier((200, 200), activation='logistic', solver='lbfgs', max_iter=400)
# predictor(mlp5, features)
# Private Score:0.73846
# Public Score: 0.73244

# ニューラルネットワーク6
# features = ['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance', 
#             'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']
# mlp6 = MLPClassifier((300, 300), activation='logistic', solver='lbfgs', max_iter=400)
# predictor(mlp6, features)
# Private Score:0.73271
# Public Score: 0.72550

# ニューラルネットワーク7
# features = ['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance', 
#             'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']
# mlp7 = MLPClassifier((100, 100), activation='logistic', solver='adam', learning_rate='adaptive', learning_rate_init=0.1)
# predictor(mlp7, features)
# Private Score:0.69875
# Public Score: 0.68688

# ニューラルネットワーク8
# features = ['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance', 
#             'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']
# mlp8 = MLPClassifier((200, 200), activation='logistic', solver='adam', learning_rate='adaptive', learning_rate_init=0.1)
# predictor(mlp8, features)
# Private Score:0.79426
# Public Score: 0.79138

# ニューラルネットワーク9
# features = ['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance', 
#             'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']
# mlp9 = MLPClassifier((100, 100, 100), activation='logistic', solver='adam', learning_rate='adaptive', learning_rate_init=0.1)
# predictor(mlp9, features)
# Private Score:0.50000
# Public Score: 0.50000

# ニューラルネットワーク10
# features = ['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance', 
#             'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']
# mlp10 = MLPClassifier((100, 100), activation='relu', solver='adam', learning_rate='adaptive', learning_rate_init=0.1)
# predictor(mlp10, features)
# Private Score:0.76654
# Public Score: 0.75907

# ニューラルネットワーク11
# features = ['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance', 
#             'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']
# mlp11 = MLPClassifier((200, 200), activation='relu', solver='adam', learning_rate='adaptive', learning_rate_init=0.1)
# predictor(mlp11, features)
# Private Score:0.71848
# Public Score: 0.71095

# ニューラルネットワーク12
# features = ['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance', 
#             'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']
# mlp12 = MLPClassifier((100, 100, 100), activation='relu', solver='adam', learning_rate='adaptive', learning_rate_init=0.1)
# predictor(mlp12, features)
# Private Score:0.71214
# Public Score: 0.70377