In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.preprocessing import LabelEncoder,StandardScaler,MinMaxScaler
from sklearn.model_selection import StratifiedKFold,GridSearchCV
import lightgbm as lgb
from sklearn.model_selection import cross_val_score,KFold
from sklearn.tree import plot_tree
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier

In [2]:
train_df = pd.read_csv("./data/mushroom_train.csv", encoding="utf-8")
test_df = pd.read_csv("./data/mushroom_test.csv", encoding="utf-8")

df = pd.concat([train_df, test_df]).reset_index(drop=True)

1. 데이터 컬럼 및 개수 파악

In [None]:
df.shape

2. 데이터 컬림 이름 파악

In [None]:
df.info()

3. 데이터 null 갑 파악

In [None]:
df.isnull().sum()

4. 데이터 컬럼 내용 파악  -  불필요한 데이터 삭제

In [None]:
df.select_dtypes('object').nunique()

In [None]:
df.nunique()

5. 데이터 삭제 :  veil-type : 값 1개여서 지움

In [68]:
# 값이 1개여서 지움
df = df.drop(['veil-type'],axis=1)

# mushroom_2_trial 에서 plot_importance(model) 에 없어서 지움
# importance에서 10 이하 싹 지움

df = df.drop(['cap-shape','gill-attachment','veil-color','ring-type',
              'stalk-surface-above-ring','stalk-color-above-ring',
              'bruises','stalk-color-below-ring','ring-number',
              'gill-color','cap-color','stalk-shape',],axis=1)

6. one hot encoder사용

In [3]:
# label encoder
# class : { 0 : e , 1 : p , 2 : nan}

label_encoder = LabelEncoder()

name_columns = ['class','cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat']

for name in name_columns:
    df[name] = label_encoder.fit_transform(df[name])
df


Unnamed: 0,mushroom_id,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,0,1,5,2,4,1,6,1,0,1,...,2,7,7,0,2,1,4,2,3,5
1,1,0,5,2,9,1,0,1,0,0,...,2,7,7,0,2,1,4,3,2,1
2,2,0,0,2,8,1,3,1,0,0,...,2,7,7,0,2,1,4,3,2,3
3,3,1,5,3,8,1,6,1,0,1,...,2,7,7,0,2,1,4,2,3,5
4,4,0,5,2,3,0,5,1,1,0,...,2,7,7,0,2,1,0,3,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,8119,2,3,2,4,0,5,0,0,0,...,2,5,5,0,1,1,4,0,1,2
8120,8120,2,5,2,4,0,5,0,0,0,...,2,5,5,0,0,1,4,0,4,2
8121,8121,2,2,2,4,0,5,0,0,0,...,2,5,5,0,1,1,4,0,1,2
8122,8122,2,3,3,4,0,8,1,0,1,...,1,7,7,0,2,1,0,7,4,2


In [8]:
minmax_scaler = MinMaxScaler()

name_columns_2 = ['cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat']


df[['cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat']] = minmax_scaler.fit_transform(df[['cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat']])
df

Unnamed: 0,mushroom_id,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,0,1,1.0,0.666667,0.444444,1.0,0.750,1.0,0.0,1.0,...,0.666667,0.875,0.875,0.0,0.666667,0.5,1.0,0.250,0.6,0.833333
1,1,0,1.0,0.666667,1.000000,1.0,0.000,1.0,0.0,0.0,...,0.666667,0.875,0.875,0.0,0.666667,0.5,1.0,0.375,0.4,0.166667
2,2,0,0.0,0.666667,0.888889,1.0,0.375,1.0,0.0,0.0,...,0.666667,0.875,0.875,0.0,0.666667,0.5,1.0,0.375,0.4,0.500000
3,3,1,1.0,1.000000,0.888889,1.0,0.750,1.0,0.0,1.0,...,0.666667,0.875,0.875,0.0,0.666667,0.5,1.0,0.250,0.6,0.833333
4,4,0,1.0,0.666667,0.333333,0.0,0.625,1.0,1.0,0.0,...,0.666667,0.875,0.875,0.0,0.666667,0.5,0.0,0.375,0.0,0.166667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,8119,2,0.6,0.666667,0.444444,0.0,0.625,0.0,0.0,0.0,...,0.666667,0.625,0.625,0.0,0.333333,0.5,1.0,0.000,0.2,0.333333
8120,8120,2,1.0,0.666667,0.444444,0.0,0.625,0.0,0.0,0.0,...,0.666667,0.625,0.625,0.0,0.000000,0.5,1.0,0.000,0.8,0.333333
8121,8121,2,0.4,0.666667,0.444444,0.0,0.625,0.0,0.0,0.0,...,0.666667,0.625,0.625,0.0,0.333333,0.5,1.0,0.000,0.2,0.333333
8122,8122,2,0.6,1.000000,0.444444,0.0,1.000,1.0,0.0,1.0,...,0.333333,0.875,0.875,0.0,0.666667,0.5,0.0,0.875,0.8,0.333333


In [7]:
standard_scaler = StandardScaler()

name_columns_2 = ['cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat']


df[name_columns_2] = standard_scaler.fit_transform(df[name_columns_2])
df

Unnamed: 0,mushroom_id,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,0,1,1.029712,0.140128,-0.198250,1.185917,0.881938,0.162896,-0.438864,1.494683,...,0.586385,0.622441,0.631991,0.0,0.142037,-0.256132,0.948081,-0.670195,-0.514389,2.030028
1,1,0,1.029712,0.140128,1.765874,1.185917,-1.970316,0.162896,-0.438864,-0.669038,...,0.586385,0.622441,0.631991,0.0,0.142037,-0.256132,0.948081,-0.250471,-1.313108,-0.295730
2,2,0,-2.087047,0.140128,1.373049,1.185917,-0.544189,0.162896,-0.438864,-0.669038,...,0.586385,0.622441,0.631991,0.0,0.142037,-0.256132,0.948081,-0.250471,-1.313108,0.867149
3,3,1,1.029712,0.953270,1.373049,1.185917,0.881938,0.162896,-0.438864,1.494683,...,0.586385,0.622441,0.631991,0.0,0.142037,-0.256132,0.948081,-0.670195,-0.514389,2.030028
4,4,0,1.029712,0.140128,-0.591075,-0.843230,0.406562,0.162896,2.278612,-0.669038,...,0.586385,0.622441,0.631991,0.0,0.142037,-0.256132,-1.272216,-0.250471,-2.910546,-0.295730
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,8119,2,-0.216992,0.140128,-0.198250,-0.843230,0.406562,-6.138869,-0.438864,-0.669038,...,0.586385,-0.429288,-0.416681,0.0,-3.979055,-0.256132,0.948081,-1.509643,-2.111827,0.285710
8120,8120,2,1.029712,0.140128,-0.198250,-0.843230,0.406562,-6.138869,-0.438864,-0.669038,...,0.586385,-0.429288,-0.416681,0.0,-8.100146,-0.256132,0.948081,-1.509643,0.284330,0.285710
8121,8121,2,-0.840343,0.140128,-0.198250,-0.843230,0.406562,-6.138869,-0.438864,-0.669038,...,0.586385,-0.429288,-0.416681,0.0,-3.979055,-0.256132,0.948081,-1.509643,-2.111827,0.285710
8122,8122,2,-0.216992,0.953270,-0.198250,-0.843230,1.832689,0.162896,-0.438864,1.494683,...,-0.893053,0.622441,0.631991,0.0,0.142037,-0.256132,-1.272216,1.428426,0.284330,0.285710


7. train, test set 분리

In [9]:
train_df = df[:6500]
test_df = df[6500:]

8. 종속변수, 독립변수 분리

In [10]:
test_df = test_df.drop(["class"],axis=1)

In [11]:
x_train = train_df.drop(['mushroom_id','class'],axis=1)
y_train = train_df['class']
x_test = test_df.drop(['mushroom_id'],axis=1)

In [12]:
model = RandomForestClassifier()

stratified_kf = StratifiedKFold(n_splits=5,shuffle=True,random_state=1234)

param_grid = {
    'n_estimators' : np.arange(1,100,10),
    "max_depth":np.arange(1,10,1)
}

grid_search = GridSearchCV(
    estimator = model,
    param_grid = param_grid,
    cv = stratified_kf
)

In [13]:
result = grid_search.fit(x_train,y_train)

In [14]:
result.best_params_

{'max_depth': 6, 'n_estimators': 71}

In [15]:
result.best_score_

0.9998461538461537

9. Deicision Tree 학습  //  RandomForest 학습

In [16]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(
    n_estimators=6,
    criterion='entropy',
    max_depth=71,
    random_state=1234
)

In [17]:
kf = StratifiedKFold(n_splits=5,shuffle=True,random_state=1234)

In [18]:
scores = cross_val_score(model,x_train,y_train,cv=kf)

In [19]:
scores

array([1.        , 1.        , 0.99923077, 0.99923077, 1.        ])

In [20]:
scores.mean()

0.9996923076923077

In [21]:
model.fit(x_train,y_train)

In [22]:
y_test_pred = model.predict(x_test)
test_df['class'] = y_test_pred
test_df["class"] = test_df["class"].replace([1,0],["p","e"])

In [23]:
test_df[['mushroom_id','class']].to_csv('./data/mushroom_submission_9.csv',index=False)

LightLGB 학습

In [None]:
y_test_pred = model.predict(x_test)
test_df['class'] = y_test_pred
test_df["class"] = test_df["class"].replace([1,0],["p","e"])

In [None]:
test_df[['mushroom_id','class']].to_csv('./data/mushroom_submission_4.csv',index=False)