In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the mounted input directory and print the full path of each file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s4e1/sample_submission.csv
/kaggle/input/playground-series-s4e1/train.csv
/kaggle/input/playground-series-s4e1/test.csv


# **銀行解約データセットを使用した二項分類**
[Binary Classification with a Bank Churn Dataset](https://www.kaggle.com/competitions/playground-series-s4e1/code?competitionId=65711&sortBy=voteCount)

# 参考資料
**書籍**
- Python実践データ分析100本ノック


**Kaggle**
- [[PlayGround S4E1] 📊 EDA + 🤖 Modeling [XGBoost]](https://www.kaggle.com/code/akhiljethwa/playground-s4e1-eda-modeling-xgboost)
- [🏛️Binary Classification💸 | Bank Churn💰| EDA📈](https://www.kaggle.com/code/tarundirector/binary-classification-bank-churn-eda)


**自分で作成したファイル**
- 肝硬変の転帰の多クラス予測

**その他**


In [2]:
# Import libraries and load the dataset
import numpy as np
import pandas as pd

import warnings
# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Show floats with 5 decimals and lift pandas' column/row display limits.
pd.options.display.float_format = '{:.5f}'.format
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

from sklearn.preprocessing import LabelEncoder, MinMaxScaler

## Used to measure execution time
import datetime
import time
import math

# Wall-clock reference taken when this cell starts running.
start_time = time.time()

def changeHMS(s):
    """Format a duration in seconds as a compact string such as ``'1h2m3s'``.

    Parameters
    ----------
    s : int or float
        Duration in seconds.

    Returns
    -------
    str
        Hours and minutes are included only when they are greater than
        zero; the seconds part (floored to an integer) is always present.
    """
    hours = math.floor(s / 3600)
    hour_part = ''
    if hours > 0:
        # Remove the whole hours so the minute count below starts from the remainder.
        s -= hours * 3600
        hour_part = f'{hours}h'

    minutes = math.floor(s / 60)
    minute_part = f'{minutes}m' if minutes > 0 else ''

    seconds = math.floor(s % 60)
    return hour_part + minute_part + str(seconds) + 's'

# Kaggle input directory for this competition and the writable output directory.
FILE_PATH = '/kaggle/input/playground-series-s4e1/'
OUTPUT_DIR = '/kaggle/working/'

train = pd.read_csv(FILE_PATH + 'train.csv')
test = pd.read_csv(FILE_PATH + 'test.csv')

# Keep the id columns aside, then remove them from the frames in place.
train_id = train['id']
train.drop('id', axis=1, inplace=True)

test_id = test['id']
test.drop('id', axis=1, inplace=True)

target = train['Exited']
target_col = 'Exited'

# String form of a one-element list holding the name of train's last column.
# NOTE(review): presumably meant to be the bare column name - confirm how target_name is used.
target_name = str(train.iloc[:, [-1]].columns.tolist()) # the last column of train is taken as the target

# Stack the train rows on top of the test rows in a single frame.
df = pd.concat([train, test], axis=0)
# reset_index() without drop=True also inserts the previous row labels as a leading 'index' column.
df.reset_index(inplace=True)

# Split the explanatory variables by data type
# NOTE(review): _get_numeric_data is a private pandas method, and the numeric columns found here
# include the inserted 'index' column and presumably the target as well - confirm before scaling.
numerical_features = df._get_numeric_data().columns
categorical_features = df.drop(numerical_features, axis=1).columns

# Preprocessing
# Missing-value imputation
def missing_values(df):
    """Placeholder for missing-value imputation; currently returns ``df`` unchanged."""
    return df

# Outlier removal
def outlier(df):
    """Placeholder for outlier removal; currently returns ``df`` unchanged."""
    return df

# MinMaxScaler (normalisation)
def scaling_MinMaxScaler(df):
    """Min-max scale the columns listed in ``numerical_features`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in the module-level
        ``numerical_features`` index.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with those columns replaced by their scaled values.

    Notes
    -----
    The scaled values are assigned positionally as an array. Wrapping them in a
    new DataFrame first would give them a fresh 0..n-1 index, and the assignment
    would then align on labels and write NaN or misplaced rows whenever ``df``
    does not carry a default RangeIndex.

    NOTE(review): ``numerical_features`` is computed once at module level from
    the combined frame, so it presumably also contains the target column -
    confirm before enabling this step.
    """
    sc = MinMaxScaler()
    df[numerical_features] = sc.fit_transform(df[numerical_features])

    return df

# Feature engineering
# Feature creation
def create_new_features(df):
    """Add seven derived feature columns to ``df`` in place and return it.

    Reads ``EstimatedSalary``, ``Balance``, ``HasCrCard``, ``IsActiveMember``,
    ``Age``, ``NumOfProducts``, ``Geography``, ``Gender`` and ``Tenure``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the new columns appended in this order:
        ``SalaryToBalanceRatio``, ``CreditCard_ActiveMember``, ``ABPInteraction``,
        ``Geo_Gender``, ``TotalProductUsage``, ``AgeAdjustedProductUsage``,
        ``Customer_Status``.
    """
    age = df['Age']
    balance = df['Balance']
    tenure = df['Tenure']
    n_products = df['NumOfProducts']

    # Salary relative to balance; the +1 keeps a zero balance from dividing by zero.
    df['SalaryToBalanceRatio'] = df['EstimatedSalary'] / (balance + 1)

    # Product of the two flags: non-zero only when both are set.
    df['CreditCard_ActiveMember'] = df['HasCrCard'] * df['IsActiveMember']

    # Age x balance, spread over the number of products held.
    df['ABPInteraction'] = age * balance / n_products

    # Combined geography/gender label, e.g. 'France_Male'.
    df['Geo_Gender'] = df['Geography'] + '_' + df['Gender']

    df['TotalProductUsage'] = tenure * n_products

    # tenure + 1 keeps a zero tenure from producing a zero denominator.
    df['AgeAdjustedProductUsage'] = n_products / ((tenure + 1) * age)

    # Tenure below 2 is labelled 'New', everything else 'Long-term'.
    df['Customer_Status'] = np.where(tenure < 2, 'New', 'Long-term')

    return df

# Binning
def binning(df):
    """Add integer bin-index columns for age, balance, credit score and tenure.

    Every bin is left-closed / right-open (``right=False``) and the new columns
    hold the zero-based bin position (``labels=False``). Values outside the
    listed edges get NaN.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``AgeGroup``, ``BalanceCategory``,
        ``CreditScoreCategory`` and ``TenureGroup`` appended in that order.
    """
    # new column -> (source column, bin edges)
    bin_specs = {
        'AgeGroup': ('Age', [0, 18, 30, 40, 50, 60, 100]),
        'BalanceCategory': ('Balance', [-np.inf, 0, 10000, 50000, 100000, np.inf]),
        'CreditScoreCategory': ('CreditScore', [0, 500, 600, 700, 800, 900]),
        'TenureGroup': ('Tenure', [0, 2, 5, 10, np.inf]),
    }
    for new_col, (source_col, edges) in bin_specs.items():
        df[new_col] = pd.cut(df[source_col], bins=edges, labels=False, right=False)

    return df

# Encoding of categorical variables
# One-Hot Encoding
def one_hot_encoding(df, cat_cols):
    """Return a new frame in which each column in ``cat_cols`` is replaced by indicator columns.

    Thin wrapper around ``pd.get_dummies(df, columns=cat_cols)``; the input
    frame is not modified.
    """
    encoded = pd.get_dummies(df, columns=cat_cols)
    return encoded

# Label encoding
def label_encoder(df):
    """Replace every object-dtype column of ``df`` with integer labels, in place.

    Each column is encoded independently with its own ``LabelEncoder``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with its object columns overwritten.
    """
    object_columns = df.select_dtypes(include=['object']).columns

    for name in object_columns:
        # Missing values become the empty string so the encoder sees only strings.
        as_text = df[name].fillna('').astype('str')
        df[name] = LabelEncoder().fit_transform(as_text)

    return df

# pd.get_dummies
def get_dummies(df, cat_cols):
    """Return a new frame with the columns in ``cat_cols`` expanded into indicator columns.

    Delegates directly to ``pd.get_dummies``; the input frame is left untouched.
    """
    return pd.get_dummies(df, columns=cat_cols)

# Feature selection
# Feature-importance evaluation
def feature_importance_evaluation(df):
    """Fit an ``LGBMClassifier`` and return its feature importances, ascending.

    The explanatory variables are the float/int columns of ``df`` minus the
    module-level ``target_col``; the labels are the module-level ``target``.

    Returns
    -------
    pandas.DataFrame
        One ``importance`` column indexed by feature name, sorted ascending.

    Notes
    -----
    NOTE(review): ``LGBMClassifier`` is not imported in the visible part of the
    notebook - confirm it is imported before this function is called.
    NOTE(review): ``target`` is taken from the train frame only, so ``df``
    presumably has to contain just the train rows - confirm.
    """
    # (A log transform of the data was planned here but is not implemented.)

    # Split into X (explanatory variables) and y (target variable)
    numeric_part = df.select_dtypes(include=['float', 'int'])
    X = numeric_part.drop(columns=[target_col])
    y = target

    # Evaluate feature importance
    model = LGBMClassifier(random_state=42)
    model.fit(X, y)

    ranking = pd.DataFrame({'importance': model.feature_importances_}, index=X.columns)
    return ranking.sort_values(by='importance', ascending=True)

# Feature removal
def drop_columns(df):
    """Return a new frame without the ``CustomerId`` and ``Surname`` columns.

    Raises ``KeyError`` if either column is absent (default ``DataFrame.drop``
    behaviour). The input frame is not modified.
    """
    return df.drop(columns=['CustomerId', 'Surname'])

# Update the dataset
# Preprocessing
df = missing_values(df)
df = outlier(df)
# df = scaling_MinMaxScaler(df) # run only when scaling is wanted

# Feature engineering
df = create_new_features(df)
df = missing_values(df)
df = drop_columns(df)

ohe_cols = [] # low-cardinality columns to encode with one_hot_encoding
dummies_cols = ['Geography','Gender','IsActiveMember','HasCrCard','NumOfProducts', 'Geo_Gender', 'Customer_Status'] # low-cardinality columns to encode with get_dummies
df = one_hot_encoding(df, ohe_cols)
df = get_dummies(df, dummies_cols)
# df = label_encoder(df)

# df holds the train rows followed by the test rows, so the split point is the
# number of train rows captured at load time rather than a hard-coded count.
n_train = len(train_id)
train = df[:n_train]
test = df[n_train:]

train_x = train.drop(columns=[target_col])
train_y = target
test_x = test.drop(columns=[target_col])

X = train_x.values
y = train_y.values
# y = y.astype(int)

# Persist the engineered frame (train + test rows) without the pandas row index.
df.to_csv(OUTPUT_DIR + 'data.csv', index=False)

# Check (data_import.py)
def file_to_xy(filename, n_train=165034, target=None):
    """Load the engineered CSV and split it back into train/test and X/y.

    Parameters
    ----------
    filename : str
        Path of the CSV to read. Its first column is consumed as the row index
        (``index_col=0``) and therefore does not appear among the features.
    n_train : int, default 165034
        Number of leading rows that belong to the training set; the remaining
        rows are treated as the test set.
    target : str, optional
        Name of the target column. Defaults to the module-level ``target_col``.

    Returns
    -------
    tuple
        ``(data, test, train, X, y)``: the full frame, the test rows without
        the target column, the train rows, the train features, and the train
        target as a NumPy array.
    """
    label = target_col if target is None else target

    data = pd.read_csv(filename, index_col=0)
    print(f'読み込み完了 {filename}')

    train = data[:n_train].reset_index(drop=True)
    test = data[n_train:].reset_index(drop=True).drop(label, axis=1)

    # Split the train rows into explanatory variables and target
    X = train.drop([label], axis=1)
    y = train[label].values

    return data,test,train,X,y

# Reload the CSV written above and rebind data/test/train/X/y from it.
filename = 'data.csv'
data,test,train,X,y = file_to_xy(OUTPUT_DIR + filename)

読み込み完了 /kaggle/working/data.csv


In [3]:
# Rebind X and y to copies of themselves.
X = X.copy()
y = y.copy()

# 機械学習モデルの構築・学習・予測

## ライブラリのインポート

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

## データの分割

In [5]:
# Hold out 20% of the rows for validation; random_state fixes the shuffle so the split is reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

## モデルの学習

In [6]:
# Fit a decision tree with default hyperparameters; random_state=42 makes the fitted tree reproducible.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

### 分割データの予測

In [7]:
# Predict on the validation split.
# NOTE(review): the variable is named "test" but the inputs are X_valid - consider renaming.
y_test_pred = dt.predict(X_valid)
print(y_test_pred)

[0. 0. 1. ... 0. 0. 0.]


In [8]:
# Side-by-side table of validation labels and predictions, cast to int for display.
results_test = pd.DataFrame({"y_valid": y_valid, "y_pred": y_test_pred}).astype(int)
results_test.head()

Unnamed: 0,y_valid,y_pred
0,0,0
1,0,0
2,1,1
3,0,1
4,0,0


## モデルの予測