In [1]:
from IPython.display import display, HTML
import warnings
# Silences every warning for the whole session, including scikit-learn deprecation notices.
warnings.filterwarnings(action='ignore')

# Sample-dataset loaders
# NOTE(review): load_boston (here) and plot_confusion_matrix (below) are reported as removed
# in scikit-learn 1.2 - confirm against the installed version. Neither is used in the visible cells.
from sklearn.datasets import load_breast_cancer, load_boston

# Data preprocessing
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Machine-learning models and evaluation
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import accuracy_score, recall_score, precision_score, mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import plot_confusion_matrix
from tqdm import tqdm
from sklearn.model_selection import KFold

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
from matplotlib.colors import ListedColormap
# Render the minus sign with a plain hyphen so it does not break under a non-default font.
matplotlib.rcParams['axes.unicode_minus'] = False

# Korean font setting
# NOTE(review): 'Malgun Gothic' is presumably only available on Windows - confirm on other platforms.
plt.rc('font', family='Malgun Gothic')

In [2]:
# Load the cereal dataset from the current working directory and preview the first rows.
# NOTE(review): the recorded run of this cell failed with FileNotFoundError, so
# 'Cereals.csv' has to sit next to the notebook before any later cell can execute.
data = pd.read_csv('Cereals.csv')
data.head()

FileNotFoundError: [Errno 2] No such file or directory: 'Cereals.csv'

In [None]:
# Summarize the columns of the loaded frame (dtypes and non-null counts).
data.info()

In [None]:
# Number of missing values in each column
data.isna().sum()

In [None]:
# Discard every row that has at least one missing value
data = data.dropna(axis='index', how='any')

In [None]:
# Features: every column except the target. `drop` already returns a new DataFrame,
# so no extra pd.DataFrame(...) wrapper is needed.
X = data.drop(columns='rating')

# Target: keep it as a 1-D Series. scikit-learn regressors expect a 1-D `y`; a
# one-column DataFrame only works because the resulting DataConversionWarning is
# silenced by the global warnings filter.
y = data['rating']

In [None]:
# Remove the 'name' column from the feature frame, then display the result
X = X.drop(columns='name')
X

In [None]:
# Expand the remaining categorical columns into indicator (dummy) columns
X = pd.get_dummies(data=X)
X.info()

In [None]:
# Hold out 25% of the rows as the test set (fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=2021,
)

# Column-wise standardization: statistics are learned from the training split only
# and then applied unchanged to both splits
normalizer = StandardScaler().fit(X_train)
X_train = normalizer.transform(X_train)
X_test = normalizer.transform(X_test)

In [None]:
# Baseline model: a 10-tree random forest with a fixed seed
rfr = RandomForestRegressor(random_state=2021, n_estimators=10)
rfr.fit(X=X_train, y=y_train)

In [None]:
# Score the baseline forest on the held-out test split
y_pred = rfr.predict(X_test)

r2score = r2_score(y_test, y_pred)            # R-squared
mse = mean_squared_error(y_test, y_pred)      # mean squared error
mae = mean_absolute_error(y_test, y_pred)     # mean absolute error

print('Random Forest Regressor')
print('  |  '.join([f"R2 score:{r2score:0.4f}", f"MSE:{mse:0.4f}", f"MAE:{mae:0.4f}"]))

In [None]:
# Random forest, round 1: coarse grid, 10-fold CV scored by R^2.
# NOTE(review): 'mae'/'mse' are the legacy criterion spellings; newer scikit-learn
# releases reportedly expect 'absolute_error'/'squared_error' - confirm against the installed version.
rfr = RandomForestRegressor(random_state=2021)
param_grid = dict(
    max_depth=[10, 50, 100, 200],
    max_leaf_nodes=[50, 100, 200],
    criterion=['mae', 'mse'],
)

CV_rfr = GridSearchCV(
    estimator=rfr,
    param_grid=param_grid,
    scoring='r2',
    cv=10,
    n_jobs=12,
    verbose=1,
)
CV_rfr.fit(X_train, y_train)

CV_rfr.best_params_

In [None]:
# Random forest, round 2: narrower grid, same 10-fold CV / R^2 setup.
# NOTE(review): 'mae'/'mse' are the legacy criterion spellings; newer scikit-learn
# releases reportedly expect 'absolute_error'/'squared_error' - confirm against the installed version.
rfr = RandomForestRegressor(random_state=2021)
param_grid = dict(
    max_depth=[45, 50, 55],
    max_leaf_nodes=[30, 40, 50],
    criterion=['mae', 'mse'],
)

CV_rfr = GridSearchCV(
    estimator=rfr,
    param_grid=param_grid,
    scoring='r2',
    cv=10,
    n_jobs=12,
    verbose=1,
)
CV_rfr.fit(X_train, y_train)

CV_rfr.best_params_

In [None]:
# Random forest, round 3: final narrow grid, same 10-fold CV / R^2 setup.
# NOTE(review): 'mae'/'mse' are the legacy criterion spellings; newer scikit-learn
# releases reportedly expect 'absolute_error'/'squared_error' - confirm against the installed version.
rfr = RandomForestRegressor(random_state=2021)
param_grid = dict(
    max_depth=[41, 43, 45],
    max_leaf_nodes=[35, 40, 45],
    criterion=['mae', 'mse'],
)

CV_rfr = GridSearchCV(
    estimator=rfr,
    param_grid=param_grid,
    scoring='r2',
    cv=10,
    n_jobs=12,
    verbose=1,
)
CV_rfr.fit(X_train, y_train)

CV_rfr.best_params_

In [None]:
# Rank every evaluated parameter combination by its mean cross-validated score
result_table = (
    pd.DataFrame(CV_rfr.cv_results_)
    .sort_values(by='mean_test_score', ascending=False)
)
print(result_table.loc[:, ['params', 'mean_test_score']])

In [None]:
# Rank the features by their importance in the tuned forest and plot them.
# `best_rfr` was never assigned anywhere in the notebook; GridSearchCV refits the
# best parameter combination on the whole training set (refit=True by default)
# and exposes that fitted model as `best_estimator_`.
best_rfr = CV_rfr.best_estimator_

# Column names come from X because X_train was overwritten by the scaler's output
# in the split cell; the column order is the same.
feats = dict(zip(X.columns.tolist(), best_rfr.feature_importances_))
importances = (
    pd.DataFrame({'Features': list(feats.keys()), 'Gini-Importance': list(feats.values())})
    .sort_values(by='Gini-Importance', ascending=False)
    .reset_index(drop=True)
)

# A single sns.set call: the earlier font_scale=5 call was immediately overridden by this one.
sns.set(style="whitegrid", color_codes=True, font_scale=1.7)
fig, ax = plt.subplots(figsize=(30, 15))
sns.barplot(data=importances, x='Gini-Importance', y='Features', color='skyblue', ax=ax)
ax.set_xlabel('Importance', fontsize=25, weight='bold')
ax.set_ylabel('Features', fontsize=25, weight='bold')
ax.set_title('Feature Importance', fontsize=25, weight='bold')
# plt.show() returns None, so wrapping it in display() only printed a stray "None".
plt.show()

display(importances)

In [None]:
# Data preprocessing
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Machine-learning model creation, training and evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, AdaBoostRegressor
from lightgbm import LGBMClassifier # install the package first: !pip install lightgbm
from catboost import CatBoostClassifier # install the package first: !pip install catboost
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
import torch
from sklearn.metrics import accuracy_score
# Boolean flag: True when torch can see a CUDA device. Not referenced by the visible cells.
device = torch.cuda.is_available() 
# CatBoost and LightGBM can run on a GPU; if a GPU and CUDA are installed, using them shortens training time.
# Training on the CPU is perfectly feasible as well.


# Visualization & convenience
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
from tqdm import tqdm
import pickle
import warnings
# Silences every warning for the rest of the session.
warnings.filterwarnings('ignore')
matplotlib.rcParams['axes.unicode_minus'] = False

# Korean font setting
plt.rc('font', family='Malgun Gothic')

In [None]:
## Hyperparameter search space, keyed by model name ##
model_params = {
    "AdaBoost": {
        'model': AdaBoostRegressor(random_state=2021),  # AdaBoost
        'params': {
            # weak learner (usually a decision tree): depths 1, 3, 5, 7, 9
            'base_estimator': [DecisionTreeRegressor(max_depth=depth) for depth in (1, 3, 5, 7, 9)],
            'n_estimators': [50, 100, 150],
            'learning_rate': [0.2 * step for step in (1, 2, 3)],
        },
    },
}

In [None]:
# Grid-search every entry of `model_params` with 5-fold CV (scored by R^2), record the
# held-out test R^2 of each best model, and save all CV results to a CSV file.
#
# The original cell relied on `pbar`, `output`, `seed`, `X_train_scaled` and
# `X_test_scaled`, none of which are defined in the notebook, so it failed on a fresh
# kernel. They are created here; X_train / X_test are already standardized by the
# StandardScaler in the split cell, so they are used as the scaled inputs.
seed = 2021  # same value as the random_state used for the split and the models
results = []  # one cv_results_ frame per model; concatenated once after the loop

with tqdm(total=len(model_params)) as pbar:
    for model_name, v in model_params.items():
        # Show the name of the model currently being tuned in the progress bar.
        pbar.set_description(desc=f"{model_name}")

        model, params = v['model'], v['params']
        gcv = GridSearchCV(
            estimator=model,
            param_grid=params,
            n_jobs=2 if model_name == 'Catboost' else 5,
            cv=5,
            scoring='r2',
        )
        gcv.fit(X_train, y_train)

        # cv_results_ holds the per-fold scores of every parameter combination.
        result = pd.DataFrame.from_dict(gcv.cv_results_)
        # gcv.predict() uses the refitted model with the best hyperparameters.
        result['test_r2_with_best_hyp'] = r2_score(y_true=y_test, y_pred=gcv.predict(X_test))
        result['model_name'] = model_name
        result['seed'] = seed
        results.append(result)
        pbar.update(1)

# ignore_index gives a clean 0..n-1 index (the original reset_index result was discarded).
# The guard covers an empty `model_params`, for which pd.concat would raise.
output = pd.concat(results, ignore_index=True) if results else pd.DataFrame()
output.to_csv('./classification_result_cereal.csv')

In [None]:
## Test-set performance of the models tuned with their best hyperparameters: reload the saved results

# index_col=0 consumes the index column written by to_csv; the index is then rebuilt
# as 0..n-1 so that label-based lookups on it are unambiguous.
output = pd.read_csv('./classification_result_cereal.csv', index_col=0).reset_index(drop=True)
# No figure is opened here: pandas' .plot creates its own figure, so a figure created in
# this cell would only be rendered as an empty canvas and never receive the chart.

In [None]:
# For each model, take the grid-search row with the highest mean cross-validated score
# (R^2, the metric stored in mean_test_score) and plot its held-out test R^2.
idx = output.groupby(['model_name'])['mean_test_score'].idxmax()
best_rows = output.loc[idx, ['model_name', 'test_r2_with_best_hyp']].set_index('model_name')

# Create the axes explicitly so the 10x10 size actually applies to this chart;
# pandas opens a new default-sized figure when no `ax` is passed.
fig, ax = plt.subplots(figsize=(10, 10))
best_rows.plot.bar(ax=ax, legend=False)
ax.set_title('Predictive performance of models with optimal hyperparameters', fontsize=20)
ax.set_ylabel('Test R2', fontsize=20)
ax.tick_params(axis='y', labelsize=20)
plt.show()