In [43]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/datascience-github/LICENSE
/kaggle/input/datascience-github/GithubwithKaggle/processor-e22-horses-1221.ipynb
/kaggle/input/datascience-github/GithubwithKaggle/eda-e22-horses-1221.ipynb
/kaggle/input/datascience-github/GithubwithKaggle/.gitkeep
/kaggle/input/datascience-github/EDA/EDA_Horses.ipynb
/kaggle/input/datascience-github/EDA/EDA_Smoker.ipynb
/kaggle/input/datascience-github/EDA/EDA_titanic.ipynb
/kaggle/input/datascience-github/EDA/EDA-2_Cirrhosis.ipynb
/kaggle/input/datascience-github/EDA/EDA_Credit.ipynb
/kaggle/input/datascience-github/EDA/EDA-2_Mohs-Hardness.ipynb
/kaggle/input/datascience-github/EDA/.gitkeep
/kaggle/input/datascience-github/EDA/EDA_Software.ipynb
/kaggle/input/datascience-github/Slide/Multi-Class Prediction of Cirrhosis Outcomes.pdf
/kaggle/input/datascience-github/Slide/Binary Classification with a Software Defects Dataset.pdf
/kaggle/input/datascience-github/Slide/Binary Prediction of Smoker Status using Bio-Signals.pdf
/kaggle/input/datasc

# 処理

In [44]:
# ライブラリ・データセットのインポート
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder

## 実行時間を調べるために使う
import datetime
import time
import math

start_time = time.time()

def changeHMS(s):
    h = math.floor(s / 3600)
    if h > 0:
        s = s - h * 3600
        indi_h = str(h) + 'h'
    else:
        indi_h = ''
    m = math.floor(s / 60)
    if m > 0:
        indi_m = str(m) + 'm'
    else:
        indi_m = ''
    s = math.floor(s % 60)
    time = indi_h + indi_m + str(s) + 's'
    return time

FILE_PATH = '/kaggle/input/playground-series-s3e26/'
OUTPUT_DIR = '/kaggle/working/'

train = pd.read_csv(FILE_PATH + 'train.csv')
test = pd.read_csv(FILE_PATH + 'test.csv')

test_id = test['id']

target = train['Status']

target_name = str(train.iloc[:, [18]].columns.tolist()) # カラム数-2の値が目的変数

df = pd.concat([train, test], axis=0)
df.reset_index(inplace=True)

# 説明変数をデータ型ごとに代入する
numerical_features = df._get_numeric_data().columns
categorical_features = df.drop(numerical_features, axis=1).columns
numerical_features = numerical_features.drop('id')

# 前処理
# 欠損値の補完
def missing_values(df):

    return df

# 外れ値の削除
def outlier(df):

    return df

# MinMaxScaler(正規化)
def scaling(df):
    df_scale = df[numerical_features]
    sc = MinMaxScaler()
    df[numerical_features] = pd.DataFrame(sc.fit_transform(df_scale), columns = df_scale.columns)

    return df

# 特徴量エンジニアリング
# 特徴量の作成
def create_new_features(df):
    # 血小板減少症インジケーター
    threshold_platelets = 150 # 閾値
    df['thrombocytopenia'] = np.where(df['Platelets'] < threshold_platelets, 1, 0)

    # アルカリ性リン酸塩ホスファターゼの上昇指標：
    threshold_alk_phos_upper = 147
    threshold_alk_phos_lower = 44
    df['elevated_alk_phos'] = np.where((df['Alk_Phos'] > threshold_alk_phos_upper) | (df['Alk_Phos'] < threshold_alk_phos_lower), 1, 0)

    # 正常な銅レベル
    normal_copper_range = (62, 140)
    df['normal_copper'] = np.where((df['Copper'] >= normal_copper_range[0]) & (df['Copper'] <= normal_copper_range[1]), 1, 0)

    # アルブミンの正常範囲
    normal_albumin_range = (3.4, 5.4)
    df['normal_albumin'] = np.where((df['Albumin'] >= normal_albumin_range[1]), 1, 0)

    # 正常なビリルビンレベル
    normal_bilirubin_range = (0.2, 1.2)
    df['normal_bilirubin'] = np.where((df['Bilirubin'] >= normal_bilirubin_range[0]) & (df['Bilirubin'] <= normal_bilirubin_range[1]), 1, 0)

    # 診断日
    df['DiagnosisDays'] = df['Age'] - df['N_Days']

    # ビリルビン * アルブミン
    df['Bilirubin_Albumin'] = df['Bilirubin'] * df['Albumin']

    # 症状のスコア
    symptom_columns = ['Ascites', 'Hepatomegaly', 'Spiders']
    df['Symptom_Score'] = df[symptom_columns].sum(axis=1)

    # 肝臓機能
    liver_columns = ['Bilirubin', 'Albumin', 'Alk_Phos', 'SGOT']
    df['Liver_Function_Index'] = df[liver_columns].mean(axis=1)

    # リスクスコア
    df['Risk_Score'] = df['Bilirubin'] + df['Albumin'] - df['Alk_Phos']

    # 時間特徴量
    df['Diag_Year'] = (df['N_Days'] / 365).astype(int)
    df['Diag_Month'] = ((df['N_Days'] % 365) / 30).astype(int)

    return df

# 年齢に関する特徴量の追加
def convert_days_to_years(age_in_days):
    days_in_year = 365.25
    age_in_years = age_in_days / days_in_year

    return age_in_years

def add_cols(df):
    age = list(df.Age)
    age_in_year = []
    for i in age:
        age_in_year.append(int(convert_days_to_years(i)))
    df['Age_in_year'] = pd.Series(age_in_year)

    return df

# カテゴリ変数のエンコーディング
# One-Hot Encoding
def one_hot_encoding(df, cat_cols):
    df = pd.get_dummies(df, columns=cat_cols)
    # 目的変数を数値化(int64)する
    df['Status'] = df['Status'].map({"D": 0, "CL": 1, "C": 2})
    # 一緒にラベルエンコーディングすると、1,2,3になってしまいモデル学習できないため分割する

    return df

# LabelEncoding
def label_encoder(df):
    categorical_columns = df.select_dtypes(include=['object']).columns
    for column in categorical_columns:
        df[column] = df[column].fillna('').astype('str') # 欠損値の補完をする
        label_encoder = LabelEncoder()
        df[column] = label_encoder.fit_transform(df[column])

    return df

# 特徴量の選択
# 特徴量の重要度評価
def feature_importance_evaluation(df):
    # データを対数変換する

    # 訓練データをX(説明変数)とy（目的変数）に分割する
    X = df.select_dtypes(include=['float', 'int'])
    X = X.drop(['Status'], axis=1) # 目的変数を指定する
    y = target # 目的変数を指定する

    for column in X.columns.tolist():
        X[column] = X[column].apply(lambda x: np.log(x + 1))

    # 特徴量の重要度評価
    lgb = LGBMClassifier(
        random_state=42,
    )

    lgb.fit(X, y)
    importance = lgb.feature_importances_

    feature_importance = pd.DataFrame(data=importance, index=X.columns, columns=['importance']) \
        .sort_values(ascending=True, by='importance')

    return feature_importance

# 特徴量の削除
def drop_columns(df):
    drop_list = [
        'index'
    ]
    dropped_df = df.drop(columns=drop_list)

    return dropped_df

# データセットの更新
# 前処理
df = add_cols(df)
df = missing_values(df)
df = outlier(df)
# df = scaling(df) # 標準化したい時のみ実行する

# 特徴量エンジニアリング
df = create_new_features(df)
df = drop_columns(df)

cat_cols = ['Edema', 'Stage'] # One-Hot Encodingしたい水準数の少ないカラムを指定する
df = one_hot_encoding(df, cat_cols)
df = label_encoder(df)

train = df[df.loc[:, 'id'] < 7905]
test = df[df.loc[:, 'id'] >= 7905]

train_x = train.drop(columns=['Status', 'id'])
train_y = target
test_x = test.drop(columns=['Status', 'id'])

X = train_x.values
y = train_y.values
# y = y.astype(int)

df.head()

# ID削除
df.drop("id", axis=1, inplace=True)

df.to_csv(OUTPUT_DIR + 'data.csv', index=False)

# 確認 (data_import.py)
def file_to_xy(filename):
    data = pd.read_csv(filename, index_col=0)
    print(f'読み込み完了 {filename}')
    train = data[:7905].reset_index(drop=True)
    test = data[7905:].reset_index(drop=True).drop('Status', axis=1)
    # 目的変数と説明変数に分割
    X = train.drop('Status', axis=1)
    y = train['Status'].values
    return data,test,train,X,y

filename = OUTPUT_DIR + 'data.csv'
data,test,train,X,y = file_to_xy(filename)

読み込み完了 /kaggle/working/data.csv


# モデルの構築・学習

# 参考資料
**書籍**


**Kaggle**
- [For Beginners by a Beginner](https://www.kaggle.com/code/juniorbertrand/for-beginners-by-a-beginner)


**自分で作成したファイル**

**その他**


In [45]:
train = train.copy()
train_x = X.copy()
train_y = y.copy()
test_x = test.copy()

In [46]:
# import xgboost as xgb

### XGBoost

In [47]:
# class Model:
#     def __init__(self, params=None):
#         self.model = None
#         if params is None:
#             self.params = {}
#         else:
#             self.params = params
            
#     # 学習
#     def fit(self, tr_x, tr_y):
#         params = {'objective': 'multi:softmax', 'num_class':3,
#                   'random_state': 42
#                  }
#         params.update(self.params)
#         num_round = 10
#         dtrain = xgb.DMatrix(tr_x, label=tr_y)
#         self.model = xgb.train(params, dtrain, num_round)
        
#     # 予測値を出力する
#     def predict(self, x):
#         data = xgb.DMatrix(x)
#         pred = self.model.predict(data)
        
#         return pred

In [48]:
# # モデルの学習と予測

# # モデルのパラメータを指定する
# params = {'param1': 10, 'param2': 100}

# # モデルを定義する
# model = Model(params)

# # 学習データに対してモデルを学習させる
# model.fit(train_x, train_y)

# # テストデータに対して予測結果を出力する
# pred = model.predict(test_x)

In [49]:
from xgboost import XGBClassifier

In [50]:
xgboost = XGBClassifier(random_state=42, max_depth=5)
xgboost.fit(train_x, train_y)

xgb_proba = xgboost.predict_proba(test_x)

probs = xgb_proba

submission = pd.DataFrame({
    'id': test_id,
    'Status_C': np.round(probs[:, 0], 4),
    'Status_CL': np.round(probs[:, 1], 4),
    'Status_D': np.round(probs[:, 2], 4)
})

In [51]:
submission

Unnamed: 0,id,Status_C,Status_CL,Status_D
0,7905,0.7070,0.0072,0.2857
1,7906,0.5586,0.1322,0.3092
2,7907,0.9956,0.0009,0.0034
3,7908,0.0205,0.0004,0.9791
4,7909,0.0715,0.0294,0.8991
...,...,...,...,...
5266,13171,0.0180,0.0117,0.9703
5267,13172,0.0050,0.0005,0.9945
5268,13173,0.0414,0.0011,0.9576
5269,13174,0.0027,0.0020,0.9953


In [52]:
submission.to_csv('submission.csv', index=False)