In [22]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.model_selection import StratifiedKFold,GridSearchCV
import lightgbm as lgb

In [23]:
# Read the train and test CSVs, then stack them into a single frame so the
# preprocessing below is applied to both splits at once.
train_df, test_df = (
    pd.read_csv(csv_path, encoding="utf-8")
    for csv_path in ("./data/mushroom_train.csv", "./data/mushroom_test.csv")
)

df = pd.concat([train_df, test_df], ignore_index=True)

In [None]:
# Number of missing values in each column of the combined frame.
df.isna().sum()

In [24]:
# Columns removed before modelling:
#   * 'veil-type'  - dropped because it holds only a single value.
#   * the others   - dropped because they were missing from
#                    plot_importance(model) in mushroom_2_trial, or had an
#                    importance of 10 or less there.
df = df.drop(columns=[
    'veil-type',
    'cap-shape', 'gill-attachment', 'veil-color', 'ring-type',
    'stalk-surface-above-ring', 'stalk-color-above-ring',
    'bruises', 'stalk-color-below-ring', 'ring-number',
    'gill-color', 'cap-color', 'stalk-shape',
])

In [25]:
# Label-encode every categorical column that is kept.
# Author's note on the resulting 'class' codes: {0: e, 1: p, 2: nan}
# (the nan code presumably comes from rows that have no label - TODO confirm).
#
# One encoder is fitted per column and stored in `label_encoders`, so each
# mapping stays available afterwards (e.g. label_encoders['class'].classes_ or
# .inverse_transform(...)). Re-fitting a single shared encoder, as before,
# silently discarded every mapping except the last column's.

name_columns = ['class', 'cap-surface', 'odor',
       'gill-spacing', 'gill-size',
       'stalk-root',
       'stalk-surface-below-ring',
       'spore-print-color', 'population', 'habitat']

label_encoders = {}

for name in name_columns:
    # `label_encoder` is still bound after the loop (to the encoder of the
    # last column), exactly as it was with the shared instance.
    label_encoder = LabelEncoder()
    df[name] = label_encoder.fit_transform(df[name])
    label_encoders[name] = label_encoder

In [26]:
# Undo the earlier concat: the first len(train_df) rows of `df` are the
# training rows and everything after them belongs to the test set.
# The boundary is derived from the training frame instead of the hard-coded
# 6500, so the split stays correct if the CSV sizes change.
n_train = len(train_df)
train_df = df.iloc[:n_train]
test_df = df.iloc[n_train:]

In [27]:
# Remove the 'class' column from the test rows; it is filled in again later
# with the model's predictions.
test_df = test_df.drop(columns="class")

In [28]:
# Target and feature matrices; 'mushroom_id' is kept out of the features.
y_train = train_df['class']
x_train = train_df.drop(columns=['mushroom_id', 'class'])
x_test = test_df.drop(columns=['mushroom_id'])

In [37]:
# Base estimator with library defaults; the grid below varies
# n_estimators and max_depth.
model = lgb.LGBMClassifier()

# 5-fold stratified cross-validation, shuffled with a fixed seed.
stratified_kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1234)

param_grid = dict(
    n_estimators=np.arange(1, 100, 10),   # 1, 11, ..., 91
    max_depth=np.arange(1, 10),           # 1 .. 9
)

grid_search = GridSearchCV(model, param_grid, cv=stratified_kf)

In [30]:
# Run the grid search on the training data; `result` is what the following
# cells read best_params_ / best_score_ from.
result = grid_search.fit(X=x_train, y=y_train)

[LightGBM] [Info] Number of positive: 2239, number of negative: 2961
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000175 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 45
[LightGBM] [Info] Number of data points in the train set: 5200, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.430577 -> initscore=-0.279498
[LightGBM] [Info] Start training from score -0.279498
[LightGBM] [Info] Number of positive: 2239, number of negative: 2961
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000259 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 46
[LightGBM] [Info] Number of data points in the train set: 5200, number of used features: 9
[LightGBM] [Info] [binary:Boos

In [38]:
# Best hyper-parameter combination found by the grid search.
result.best_params_

{'max_depth': 4, 'n_estimators': 91}

In [39]:
# Cross-validated score achieved by the best hyper-parameter combination.
result.best_score_

0.9998461538461537

In [40]:
# Build the final classifier from the grid-search winner instead of copying
# the numbers by hand, so this cell cannot drift out of sync with the search.
# In the recorded run best_params_ was {'max_depth': 4, 'n_estimators': 91}.
model = lgb.LGBMClassifier(**result.best_params_)

In [41]:
# Train the final classifier on the complete training set.
model.fit(X=x_train, y=y_train)

[LightGBM] [Info] Number of positive: 2799, number of negative: 3701
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000290 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 46
[LightGBM] [Info] Number of data points in the train set: 6500, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.430615 -> initscore=-0.279341
[LightGBM] [Info] Start training from score -0.279341


In [42]:
# Score of the final model on the very data it was trained on.
# NOTE(review): this is a training-set score, not a held-out estimate; the
# cross-validated score from the grid search is the better performance guide.
model.score(x_train,y_train)

1.0

In [None]:
from matplotlib import pyplot as plt

# sklearn.tree.plot_tree only accepts scikit-learn decision-tree estimators,
# so it cannot draw an LGBMClassifier. LightGBM provides its own plot_tree,
# which renders one tree of the ensemble at a time.
# NOTE: lgb.plot_tree requires the `graphviz` package to be installed.
fig, ax = plt.subplots(figsize=(15, 5))    # figure size
_ = lgb.plot_tree(
    model,
    tree_index=0,   # draw the first tree of the boosted ensemble
    ax=ax
)

In [43]:
# Predict the encoded class for the test rows, store it on the frame and
# translate the codes back to letters for the submission (1 -> "p", 0 -> "e").
y_test_pred = model.predict(x_test)
test_df["class"] = y_test_pred
test_df["class"] = test_df["class"].replace({1: "p", 0: "e"})

In [44]:
# Write the submission file: only the id and the predicted class, no index.
test_df.to_csv(
    './data/mushroom_submission_5.csv',
    columns=['mushroom_id', 'class'],
    index=False,
)