In [66]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.model_selection import StratifiedKFold,GridSearchCV
import lightgbm as lgb
from sklearn.model_selection import cross_val_score,KFold
from sklearn.tree import plot_tree
from matplotlib import pyplot as plt

In [67]:
train_df = pd.read_csv("./data/mushroom_train.csv", encoding="utf-8")
test_df = pd.read_csv("./data/mushroom_test.csv", encoding="utf-8")

df = pd.concat([train_df, test_df]).reset_index(drop=True)

1. 데이터 컬럼 및 개수 파악

In [None]:
df.shape

2. 데이터 컬림 이름 파악

In [None]:
df.info()

3. 데이터 null 갑 파악

In [None]:
df.isnull().sum()

4. 데이터 컬럼 내용 파악  -  불필요한 데이터 삭제

In [None]:
df.select_dtypes('object').nunique()

In [None]:
df.nunique()

5. 데이터 삭제 :  veil-type : 값 1개여서 지움

In [68]:
# 값이 1개여서 지움
df = df.drop(['veil-type'],axis=1)

# mushroom_2_trial 에서 plot_importance(model) 에 없어서 지움
# importance에서 10 이하 싹 지움

df = df.drop(['cap-shape','gill-attachment','veil-color','ring-type',
              'stalk-surface-above-ring','stalk-color-above-ring',
              'bruises','stalk-color-below-ring','ring-number',
              'gill-color','cap-color','stalk-shape',],axis=1)

6. one hot encoder사용

In [69]:
# one hot encoding

name_columns = ['cap-surface', 'odor',
       'gill-spacing', 'gill-size',
       'stalk-root',
       'stalk-surface-below-ring',
       'spore-print-color', 'population', 'habitat']

for name in name_columns:
    dummies_df = pd.get_dummies(df[name], prefix='dummies_'+name)
    dummies_df = dummies_df.astype(np.uint8)
    df = pd.concat([df,dummies_df], axis=1)
    df = df.drop([name],axis=1)
df


Unnamed: 0,mushroom_id,class,dummies_cap-surface_f,dummies_cap-surface_g,dummies_cap-surface_s,dummies_cap-surface_y,dummies_odor_a,dummies_odor_c,dummies_odor_f,dummies_odor_l,...,dummies_population_s,dummies_population_v,dummies_population_y,dummies_habitat_d,dummies_habitat_g,dummies_habitat_l,dummies_habitat_m,dummies_habitat_p,dummies_habitat_u,dummies_habitat_w
0,0,p,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
1,1,e,0,0,1,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,2,e,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
3,3,p,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
4,4,e,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,8119,,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
8120,8120,,0,0,1,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
8121,8121,,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
8122,8122,,0,0,0,1,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0


In [70]:
label_encoder = LabelEncoder()

df['class'] = label_encoder.fit_transform(df['class'])
df

Unnamed: 0,mushroom_id,class,dummies_cap-surface_f,dummies_cap-surface_g,dummies_cap-surface_s,dummies_cap-surface_y,dummies_odor_a,dummies_odor_c,dummies_odor_f,dummies_odor_l,...,dummies_population_s,dummies_population_v,dummies_population_y,dummies_habitat_d,dummies_habitat_g,dummies_habitat_l,dummies_habitat_m,dummies_habitat_p,dummies_habitat_u,dummies_habitat_w
0,0,1,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
1,1,0,0,0,1,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,2,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
3,3,1,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
4,4,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,8119,2,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
8120,8120,2,0,0,1,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
8121,8121,2,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
8122,8122,2,0,0,0,1,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0


7. train, test set 분리

In [71]:
train_df = df[:6500]
test_df = df[6500:]

8. 종속변수, 독립변수 분리

In [72]:
test_df = test_df.drop(["class"],axis=1)

In [73]:
x_train = train_df.drop(['mushroom_id','class'],axis=1)
y_train = train_df['class']
x_test = test_df.drop(['mushroom_id'],axis=1)

In [74]:
model = RandomForestClassifier()

stratified_kf = StratifiedKFold(n_splits=5,shuffle=True,random_state=1234)

param_grid = {
    'n_estimators' : np.arange(1,100,10),
    "max_depth":np.arange(1,10,1)
}

grid_search = GridSearchCV(
    estimator = model,
    param_grid = param_grid,
    cv = stratified_kf
)

In [76]:
result = grid_search.fit(x_train,y_train)

In [77]:
result.best_params_

{'max_depth': 7, 'n_estimators': 21}

In [78]:
result.best_score_

0.9998461538461537

9. Deicision Tree 학습  //  RandomForest 학습

In [79]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(
    n_estimators=7,
    criterion='entropy',
    max_depth=21,
    random_state=1234
)

In [80]:
kf = StratifiedKFold(n_splits=5,shuffle=True,random_state=1234)

In [81]:
scores = cross_val_score(model,x_train,y_train,cv=kf)

In [82]:
scores

array([1.        , 1.        , 0.99923077, 0.99923077, 1.        ])

In [83]:
scores.mean()

0.9996923076923077

In [84]:
model.fit(x_train,y_train)

In [85]:
y_test_pred = model.predict(x_test)
test_df['class'] = y_test_pred
test_df["class"] = test_df["class"].replace([1,0],["p","e"])

In [86]:
test_df[['mushroom_id','class']].to_csv('./data/mushroom_submission_7.csv',index=False)

LightLGB 학습

In [None]:
y_test_pred = model.predict(x_test)
test_df['class'] = y_test_pred
test_df["class"] = test_df["class"].replace([1,0],["p","e"])

In [None]:
test_df[['mushroom_id','class']].to_csv('./data/mushroom_submission_4.csv',index=False)