In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s3e26/sample_submission.csv
/kaggle/input/playground-series-s3e26/train.csv
/kaggle/input/playground-series-s3e26/test.csv
/kaggle/input/cirrhosis-patient-survival-prediction/cirrhosis.csv


In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler,RobustScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import log_loss,make_scorer

In [3]:
# ライブラリ・データセットのインポート
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder

## 実行時間を調べるために使う
import datetime
import time
import math

start_time = time.time()

def changeHMS(s):
    h = math.floor(s / 3600)
    if h > 0:
        s = s - h * 3600
        indi_h = str(h) + 'h'
    else:
        indi_h = ''
    m = math.floor(s / 60)
    if m > 0:
        indi_m = str(m) + 'm'
    else:
        indi_m = ''
    s = math.floor(s % 60)
    time = indi_h + indi_m + str(s) + 's'
    return time

FILE_PATH = '/kaggle/input/playground-series-s3e26/'
OUTPUT_DIR = '/kaggle/working/'

train = pd.read_csv(FILE_PATH + 'train.csv')
test = pd.read_csv(FILE_PATH + 'test.csv')

test_id = test['id']

target = train['Status']

target_name = str(train.iloc[:, [18]].columns.tolist()) # カラム数-2の値が目的変数

df = pd.concat([train, test], axis=0)
df.reset_index(inplace=True)

# 説明変数をデータ型ごとに代入する
numerical_features = df._get_numeric_data().columns
categorical_features = df.drop(numerical_features, axis=1).columns
numerical_features = numerical_features.drop('id')

# 前処理
# 欠損値の補完
def missing_values(df):

    return df

# 外れ値の削除
def outlier(df):

    return df

# MinMaxScaler(正規化)
def scaling(df):
    df_scale = df[numerical_features]
    sc = MinMaxScaler()
    df[numerical_features] = pd.DataFrame(sc.fit_transform(df_scale), columns = df_scale.columns)

    return df

# 特徴量エンジニアリング
# 特徴量の作成
def create_new_features(df):
    # 血小板減少症インジケーター
    threshold_platelets = 150 # 閾値
    df['thrombocytopenia'] = np.where(df['Platelets'] < threshold_platelets, 1, 0)

    # アルカリ性リン酸塩ホスファターゼの上昇指標：
    threshold_alk_phos_upper = 147
    threshold_alk_phos_lower = 44
    df['elevated_alk_phos'] = np.where((df['Alk_Phos'] > threshold_alk_phos_upper) | (df['Alk_Phos'] < threshold_alk_phos_lower), 1, 0)

    # 正常な銅レベル
    normal_copper_range = (62, 140)
    df['normal_copper'] = np.where((df['Copper'] >= normal_copper_range[0]) & (df['Copper'] <= normal_copper_range[1]), 1, 0)

    # アルブミンの正常範囲
    normal_albumin_range = (3.4, 5.4)
    df['normal_albumin'] = np.where((df['Albumin'] >= normal_albumin_range[1]), 1, 0)

    # 正常なビリルビンレベル
    normal_bilirubin_range = (0.2, 1.2)
    df['normal_bilirubin'] = np.where((df['Bilirubin'] >= normal_bilirubin_range[0]) & (df['Bilirubin'] <= normal_bilirubin_range[1]), 1, 0)

    # 診断日
    df['DiagnosisDays'] = df['Age'] - df['N_Days']

    # ビリルビン * アルブミン
    df['Bilirubin_Albumin'] = df['Bilirubin'] * df['Albumin']

    # 症状のスコア
    symptom_columns = ['Ascites', 'Hepatomegaly', 'Spiders']
    df['Symptom_Score'] = df[symptom_columns].sum(axis=1)

    # 肝臓機能
    liver_columns = ['Bilirubin', 'Albumin', 'Alk_Phos', 'SGOT']
    df['Liver_Function_Index'] = df[liver_columns].mean(axis=1)

    # リスクスコア
    df['Risk_Score'] = df['Bilirubin'] + df['Albumin'] - df['Alk_Phos']

    # 時間特徴量
    df['Diag_Year'] = (df['N_Days'] / 365).astype(int)
    df['Diag_Month'] = ((df['N_Days'] % 365) / 30).astype(int)

    return df

# 年齢に関する特徴量の追加
def convert_days_to_years(age_in_days):
    days_in_year = 365.25
    age_in_years = age_in_days / days_in_year

    return age_in_years

def add_cols(df):
    age = list(df.Age)
    age_in_year = []
    for i in age:
        age_in_year.append(int(convert_days_to_years(i)))
    df['Age_in_year'] = pd.Series(age_in_year)

    return df

# カテゴリ変数のエンコーディング
# One-Hot Encoding
def one_hot_encoding(df, cat_cols):
    df = pd.get_dummies(df, columns=cat_cols)
    # 目的変数を数値化(int64)する
    df['Status'] = df['Status'].map({"D": 0, "CL": 1, "C": 2})
    # 一緒にラベルエンコーディングすると、1,2,3になってしまいモデル学習できないため分割する

    return df

# LabelEncoding
def label_encoder(df):
    categorical_columns = df.select_dtypes(include=['object']).columns
    for column in categorical_columns:
        df[column] = df[column].fillna('').astype('str') # 欠損値の補完をする
        label_encoder = LabelEncoder()
        df[column] = label_encoder.fit_transform(df[column])

    return df

# 特徴量の選択
# 特徴量の重要度評価
def feature_importance_evaluation(df):
    # データを対数変換する

    # 訓練データをX(説明変数)とy（目的変数）に分割する
    X = df.select_dtypes(include=['float', 'int'])
    X = X.drop(['Status'], axis=1) # 目的変数を指定する
    y = target # 目的変数を指定する

    for column in X.columns.tolist():
        X[column] = X[column].apply(lambda x: np.log(x + 1))

    # 特徴量の重要度評価
    lgb = LGBMClassifier(
        random_state=42,
    )

    lgb.fit(X, y)
    importance = lgb.feature_importances_

    feature_importance = pd.DataFrame(data=importance, index=X.columns, columns=['importance']) \
        .sort_values(ascending=True, by='importance')

    return feature_importance

# 特徴量の削除
def drop_columns(df):
    drop_list = [
        'index'
    ]
    dropped_df = df.drop(columns=drop_list)

    return dropped_df

# データセットの更新
# 前処理
df = add_cols(df)
df = missing_values(df)
df = outlier(df)
# df = scaling(df) # 標準化したい時のみ実行する

# 特徴量エンジニアリング
df = create_new_features(df)
df = drop_columns(df)

cat_cols = ['Edema', 'Stage'] # One-Hot Encodingしたい水準数の少ないカラムを指定する
df = one_hot_encoding(df, cat_cols)
df = label_encoder(df)

train = df[df.loc[:, 'id'] < 7905]
test = df[df.loc[:, 'id'] >= 7905]

train_x = train.drop(columns=['Status', 'id'])
train_y = target
test_x = test.drop(columns=['Status', 'id'])

X = train_x.values
y = train_y.values
# y = y.astype(int)

df.head()

# ID削除
df.drop("id", axis=1, inplace=True)

df.to_csv(OUTPUT_DIR + 'data.csv', index=False)

# 確認 (data_import.py)
def file_to_xy(filename):
    data = pd.read_csv(filename, index_col=0)
    print(f'読み込み完了 {filename}')
    train = data[:7905].reset_index(drop=True)
    test = data[7905:].reset_index(drop=True).drop('Status', axis=1)
    # 目的変数と説明変数に分割
    X = train.drop('Status', axis=1)
    y = train['Status'].values
    return data,test,train,X,y

filename = OUTPUT_DIR + 'data.csv'
data,test,train,X,y = file_to_xy(filename)

読み込み完了 /kaggle/working/data.csv


In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42)

In [5]:
categorical_columns = df.select_dtypes(include=['object']).columns.tolist()
numerical_columns = df.select_dtypes(exclude=['object']).columns.tolist()

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

cat_feature_indices = [df.columns.get_loc(col) for col in categorical_columns]
cat_columns= categorical_columns[:-1]
models = [
    ('CatBoost', CatBoostClassifier(iterations=300, random_seed=42, logging_level='Silent', cat_features=cat_columns)),
    ('Gradient_Boosting', GradientBoostingClassifier(n_estimators=150, random_state=42)),
    ('LightGBM',   LGBMClassifier(n_estimators=150, random_state=42, categorical_feature=cat_feature_indices))
]

sum_prediction=0
# Train and evaluate each model
for model_name, model in models:
    model.fit(X_train, y_train)
    predictions = model.predict_proba(X_test)
    sum_prediction= predictions + sum_prediction
    loss = log_loss(y_test, predictions)
    print(f'Log Loss - {model_name}: {loss}')
    
Ensemble_pred= sum_prediction/5
Ensemble_logloss= log_loss(y_test, Ensemble_pred)
print(f'Log Loss - Ensemble: {Ensemble_logloss}')

Log Loss - CatBoost: 0.4592786069304633
Log Loss - Gradient_Boosting: 0.4417445827289941
Log Loss - LightGBM: 0.49196989587726075
Log Loss - Ensemble: 0.4402829831313499


In [7]:
test_df= test.copy()
# test_df['Age'] = test_df['Age'] // 365.25
# test_df['N_Days'] = test_df['N_Days'] // 365.25
# test_df.rename(columns={'N_Days': 'N_years'}, inplace=True)
for colum in cat_columns:
    test_df[colum]=le.fit_transform(test_df[colum])

In [8]:
X_train = X.copy()
y_train = y.copy()
X_test = test_df  

sum_predictions=0
for model_name, model in models:
    model.fit(X_train, y_train)
    predictions = model.predict_proba(X_test)
    sum_predictions=sum_predictions+predictions

ensemble_predictions = sum_predictions/3

In [9]:
ensemble_predictions

array([[0.39756985, 0.01914351, 0.58328664],
       [0.18717854, 0.14815842, 0.66466304],
       [0.96105437, 0.01347337, 0.02547226],
       ...,
       [0.04392351, 0.00387841, 0.95219809],
       [0.00695053, 0.00391933, 0.98913014],
       [0.6902713 , 0.01205439, 0.29767431]])

In [10]:
predictions_df = pd.DataFrame({
    'id': test_id,
    'Status_C': ensemble_predictions[:, 0],
    'Status_CL': ensemble_predictions[:, 1],
    'Status_D': ensemble_predictions[:, 2]
})
predictions_df

Unnamed: 0,id,Status_C,Status_CL,Status_D
0,7905,0.397570,0.019144,0.583287
1,7906,0.187179,0.148158,0.664663
2,7907,0.961054,0.013473,0.025472
3,7908,0.044338,0.002263,0.953399
4,7909,0.107533,0.019644,0.872823
...,...,...,...,...
5266,13171,0.034039,0.032474,0.933486
5267,13172,0.019761,0.001898,0.978341
5268,13173,0.043924,0.003878,0.952198
5269,13174,0.006951,0.003919,0.989130


In [11]:
predictions_df.to_csv('submission.csv', index=False)