In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score,KFold
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from lightgbm import plot_importance
from matplotlib import pyplot as plt
import seaborn as sns

In [None]:
# Read the training and the test CSV files (UTF-8 encoded) into DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path, encoding="utf-8")
    for csv_path in ("./data/mushroom_train.csv", "./data/mushroom_test.csv")
)

In [None]:
# Stack the two splits (training rows first, test rows after) under a fresh
# 0..n-1 row index so both can be encoded together.
df = pd.concat([train_df, test_df], ignore_index=True)
df

In [None]:
# Number of distinct values in every object-dtype column of the combined frame.
df_bar = df.select_dtypes('object').nunique()

# Draw on an explicit Axes and style the tick labels *after* the bars exist,
# instead of relying on the implicit pyplot "current figure" state.
fig, ax = plt.subplots(figsize=(15, 8))
ax.bar(df_bar.index, df_bar.values)
ax.tick_params(axis="x", labelrotation=45, labelsize=11)
ax.set_xlabel("Column")
ax.set_ylabel("Number of distinct values")
ax.set_title("Distinct values per object-dtype column")
plt.show()

In [None]:
# One-hot encode every listed categorical column, including the target `class`.
# drop_first=True keeps k-1 indicator columns per feature; for the target this
# leaves the single `class_p` column that later cells use as the label.
df = pd.get_dummies(
    df,
    columns=[
        'class',
        'cap-shape',
        'cap-surface',
        'cap-color',
        'bruises',
        'odor',
        'gill-attachment',
        'gill-spacing',
        'gill-size',
        'gill-color',
        'stalk-shape',
        'stalk-root',
        'stalk-surface-above-ring',
        'stalk-surface-below-ring',
        'stalk-color-above-ring',
        'stalk-color-below-ring',
        'veil-type',
        'veil-color',
        'ring-number',
        'ring-type',
        'spore-print-color',
        'population',
        'habitat',
    ],
    drop_first=True,
).mul(1)  # multiply the whole frame by 1 so boolean indicator columns (if any) become 0/1 numbers
df

In [None]:
# Undo the earlier concat. `df` was built as [train rows, test rows], so the
# boundary is the number of training rows. Derive it from the data instead of
# hard-coding 6500, which silently mis-splits if the training file changes size.
n_train_rows = len(train_df)
train_df = df.iloc[:n_train_rows]
test_df = df.iloc[n_train_rows:]

In [None]:
# Build the model inputs.
# errors="ignore" keeps this cell re-runnable: on a second run `class_p` has
# already been removed from test_df and a plain drop would raise KeyError.
test_df = test_df.drop(columns=["class_p"], errors="ignore")

# Features: everything except the row identifier and the encoded target.
x_train = train_df.drop(columns=["mushroom_id", "class_p"])
# Target: the `class_p` indicator column.
y_train = train_df["class_p"]

# Select exactly the training feature columns, in the same order. This also
# keeps the prediction column that a later cell adds to test_df out of the
# model input when this cell is re-executed.
x_test = test_df[x_train.columns]

In [None]:
# 10-fold stratified splitter; rows are shuffled with a fixed seed so the fold
# assignment is reproducible across runs.
stratified_kf = StratifiedKFold(
    random_state=1234,
    shuffle=True,
    n_splits=10,
)

In [None]:
# Base LightGBM classifier whose hyper-parameters are tuned by the grid search.
model = lgb.LGBMClassifier()

# Candidate hyper-parameters (only max_depth actually varies).
# LightGBM applies row subsampling only when the bagging frequency is non-zero;
# with the default subsample_freq=0 the subsample=0.8 setting is silently
# ignored, so the frequency is set explicitly to make it take effect.
param_grid = {
    "n_estimators": [400],
    "max_depth": [3, 5],
    "num_leaves": [10],
    "subsample": [0.8],
    "subsample_freq": [1],
}

# Exhaustive search over param_grid, evaluated with the stratified K-fold
# splitter. No `scoring` is passed, so the estimator's default scorer is used.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=stratified_kf,
)

In [None]:
# Run the cross-validated search over every candidate in the grid.
grid_search.fit(X=x_train, y=y_train)

In [None]:
# Display the hyper-parameter combination selected by the grid search.
grid_search.best_params_

In [None]:
# Display the mean cross-validated score of the selected combination
# (no `scoring` was passed to GridSearchCV, so the estimator's default scorer applies).
grid_search.best_score_

In [None]:
# Re-create the classifier using the hyper-parameters chosen by the grid search.
best_params = grid_search.best_params_
model = lgb.LGBMClassifier(**best_params)

In [None]:
# Train the tuned classifier on the full training split.
model.fit(X=x_train, y=y_train)

In [None]:
# Predict the encoded label for every test row.
y_test_pred = model.predict(x_test)

# Store the predictions on test_df, translating the 0/1 encoding back to the
# original letters (1 -> "p", 0 -> "e").
test_df["class"] = pd.Series(y_test_pred, index=test_df.index).replace({1: "p", 0: "e"})

# Write the id/label pairs as the submission file (no index column).
test_df.loc[:, ["mushroom_id", "class"]].to_csv(
    "./data/submission_mushroom_13.csv",
    index=False,
)

In [None]:
# Show the 20 most important features of the fitted model.
# plot_importance opens its own figure when no `ax` is supplied, so a preceding
# plt.figure(figsize=...) only produces an empty extra figure; the size has to
# be passed through the `figsize` argument instead.
plot_importance(model, max_num_features=20, figsize=(15, 10))

In [None]:
# Percentage of test rows whose predicted class is "p".
int(test_df["class"].eq("p").sum()) * 100 / len(test_df.index)

In [None]:
# Number of distinct labels present in the predicted `class` column.
test_df['class'].nunique()

In [None]:
# Share of each predicted label, expressed as a percentage of all test rows.
total = len(test_df.index)
counts = test_df['class'].value_counts()
percent = counts.div(total).mul(100)

In [None]:
# Bar chart of the predicted-label percentages computed in `percent`.
sns.set(style='whitegrid')
class_share_ax = sns.barplot(
    x=percent.index,
    y=percent.values
)
# Label the chart so it can be read without the surrounding code.
class_share_ax.set_xlabel("Predicted class")
class_share_ax.set_ylabel("Share of test rows (%)")
class_share_ax.set_title("Predicted class distribution on the test set")
plt.show()

In [None]:
# End