<a href="https://colab.research.google.com/github/DaniyolKim/dp2/blob/main/dp2_RandomForest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#!sudo apt-get install -y fonts-nanum
#!sudo fc-cache -fv
#!rm ~/.cache/matplotlib -rf

In [None]:
import datetime
import io
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Data preparation: load the dataset and derive calendar features from the
# timestamp column (which the CSV stores under the name 'Unnamed: 0').
# NOTE(review): the parse format below has no time-of-day component although
# an `hour` column is derived from the result — confirm the `time` values
# parse as intended with the installed pandas version.
df = (
    pd.read_csv('https://raw.githubusercontent.com/DaniyolKim/dp2/main/result.csv')
    .rename(columns={'Unnamed: 0': 'time'})
    .assign(time=lambda frame: pd.to_datetime(frame['time'], format='%Y-%m-%d'))
    .assign(
        year=lambda frame: frame['time'].dt.year,
        month=lambda frame: frame['time'].dt.month,
        day=lambda frame: frame['time'].dt.day,
        hour=lambda frame: frame['time'].dt.hour,
        dayofweek=lambda frame: frame['time'].dt.dayofweek,
    )
)

# Non-working days used to build the `holiday` feature, grouped by year.
# NOTE(review): these look like South Korean public holidays — confirm the
# dates against an official calendar.
# Only year/month/day are compared by IsHoliday, so the midnight time
# component of each entry is irrelevant.
notWorkingDays = [
    # 2019
    datetime.datetime(2019, 1, 1),
    datetime.datetime(2019, 2, 4),
    datetime.datetime(2019, 2, 5),
    datetime.datetime(2019, 2, 6),
    datetime.datetime(2019, 3, 1),
    datetime.datetime(2019, 5, 5),
    datetime.datetime(2019, 5, 12),
    datetime.datetime(2019, 6, 6),
    datetime.datetime(2019, 8, 15),
    datetime.datetime(2019, 9, 12),
    datetime.datetime(2019, 9, 13),
    datetime.datetime(2019, 9, 14),
    datetime.datetime(2019, 10, 3),
    datetime.datetime(2019, 10, 9),
    datetime.datetime(2019, 12, 25),

    # 2020
    datetime.datetime(2020, 1, 1),
    datetime.datetime(2020, 1, 24),
    datetime.datetime(2020, 1, 25),
    datetime.datetime(2020, 1, 26),
    datetime.datetime(2020, 3, 1),
    datetime.datetime(2020, 4, 30),
    datetime.datetime(2020, 5, 5),
    datetime.datetime(2020, 6, 6),
    datetime.datetime(2020, 8, 15),
    datetime.datetime(2020, 8, 17),
    datetime.datetime(2020, 9, 30),
    datetime.datetime(2020, 10, 1),
    datetime.datetime(2020, 10, 2),
    datetime.datetime(2020, 10, 3),
    datetime.datetime(2020, 10, 9),
    datetime.datetime(2020, 12, 25),

    # 2021 (these entries previously carried the year 2020 by mistake, which
    # both mislabelled 2020 dates and left 2021 without holidays)
    datetime.datetime(2021, 1, 1),
    datetime.datetime(2021, 2, 11),
    datetime.datetime(2021, 2, 12),
    datetime.datetime(2021, 2, 13),
    datetime.datetime(2021, 3, 1),
    datetime.datetime(2021, 5, 5),
    datetime.datetime(2021, 5, 19),
    datetime.datetime(2021, 6, 6),
    datetime.datetime(2021, 8, 15),
    datetime.datetime(2021, 9, 20),
    datetime.datetime(2021, 9, 21),
    datetime.datetime(2021, 9, 22),
    datetime.datetime(2021, 10, 3),
    datetime.datetime(2021, 10, 9),
    datetime.datetime(2021, 12, 25),
]

def IsHoliday(d):
  """Return 1 if ``d`` matches an entry of ``notWorkingDays``, otherwise 0.

  Only the ``year``, ``month`` and ``day`` attributes of ``d`` are compared
  with each listed day, so any time-of-day component is ignored.
  """
  return int(any(
      (listed.year, listed.month, listed.day) == (d.year, d.month, d.day)
      for listed in notWorkingDays
  ))

# Flag every row with 1/0 depending on whether its timestamp falls on a
# listed non-working day.
df['holiday'] = df['time'].apply(IsHoliday)

# One-hot encode `rain_type`. The 'None' indicator column is left out of the
# join, and the source columns that are no longer needed are removed.
rainTypeDatas = pd.get_dummies(df['rain_type'])
df = df.join(rainTypeDatas.drop(columns='None')).drop(columns=['rain_type', 'time'])
df


In [None]:
# Standardize the numeric columns listed below, in place.
# NOTE(review): the scaler is fitted on the whole dataset before any
# train/test split happens further down — confirm this is acceptable.
from sklearn.preprocessing import StandardScaler

scaled_columns = ['hum', 'rain_fall', 'temp', 'wind_dir', 'wind_pwr']
scaler = StandardScaler()
df[scaled_columns] = scaler.fit_transform(df[scaled_columns])

# Pairwise correlations of all columns, rounded for display.
df.corr().round(3)

In [None]:
# First experiment: fit a random forest on every available feature.

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

features = ['hum', 'rain_fall', 'temp', 'wind_dir', 'wind_pwr', 'year',
            'month', 'day', 'hour', 'dayofweek', 'holiday', 'Rain', 'Rain/Snow',
            'Snow']

X = df[features].values
y = df['elec'].values

# A fixed seed makes the split (and therefore the reported score, feature
# importances and plots) reproducible between runs.
# NOTE(review): train_test_split shuffles rows by default — if the rows are
# time-ordered, consider whether a chronological split is more appropriate.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Seed the forest as well; bootstrapping and feature sampling are random.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
model.score(X_test, y_test)  # R2 on the held-out split

In [None]:
# Extract and plot the most important feature variables.
def feature_importances_fig(features, importances):
    """Show a horizontal bar chart of feature importances.

    Args:
        features: Feature names, one per importance value.
        importances: Importance scores aligned with ``features``.

    The pairs are sorted by importance in descending order before plotting,
    and the x-axis is fixed to the range 0..1.
    """
    ranking = (
        pd.DataFrame({'feature': features, 'importance': importances})
        .sort_values('importance', ascending=False)
    )
    labels = ranking.feature
    tick_positions = np.arange(len(labels))

    plt.figure(figsize=(6, 4))
    plt.barh(labels, ranking.importance)
    plt.yticks(tick_positions, labels)
    plt.xlabel('Importance')
    plt.ylabel('Variable')
    plt.xlim(0, 1)
    # Leave one unit of padding below the first and above the last bar.
    plt.ylim(-1, len(labels))
    plt.show()

# Plot the importances of the currently fitted `model`, labelled with the
# `features` list it was trained on.
feature_importances_fig(features, model.feature_importances_)

In [None]:
#시각화 해보자
from sklearn.metrics import r2_score

def plot_y_pred(y_test, y_pred):
    """Plot actual vs. predicted values and print summary error metrics.

    Args:
        y_test: Actual target values (drawn in red).
        y_pred: Predicted values aligned with ``y_test`` (drawn in blue).

    Prints R2, the mean absolute error, the root mean squared error and the
    largest absolute error, each rounded to three decimals.
    """
    plt.figure(figsize=(8, 5))
    plt.plot(y_test, c='r')
    plt.plot(y_pred, c='b')
    plt.show()

    # Built-in round() works for both NumPy scalars and plain Python floats;
    # calling .round() on the r2_score result fails when it is a plain float.
    print("R2=", round(r2_score(y_test, y_pred), 3))
    error = abs(y_test - y_pred)
    print("MAE=", round(error.mean(), 3))
    print("rmse=", round(np.sqrt((error**2).mean()), 3))
    # Use the array's own reduction instead of iterating element by element.
    print("max=", round(error.max(), 3))

def scatter_errors(y_test, y_pred):
    """Scatter-plot the absolute prediction error against the actual value.

    Args:
        y_test: Actual target values (x-axis).
        y_pred: Predicted values aligned with ``y_test``.
    """
    abs_error = abs(y_test - y_pred)
    plt.scatter(x=y_test, y=abs_error, s=2)

# Predict on the held-out split and compare only the first 100 samples so the
# line plot stays readable.
y_pred = model.predict(X_test)
plot_y_pred(y_test[:100], y_pred[:100])

In [None]:
# Scenario without a weather forecast: train on the calendar features only.

features = ['year', 'month', 'day', 'hour', 'dayofweek', 'holiday']

X = df[features].values
y = df['elec'].values

# A fixed seed makes the split (and therefore the reported score, feature
# importances and plots) reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Seed the forest as well; bootstrapping and feature sampling are random.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
model.score(X_test, y_test)  # R2 on the held-out split

In [None]:
# Key features: plot the importances of the currently fitted `model`.
feature_importances_fig(features, model.feature_importances_)

In [None]:
# Visualization: predict on the held-out split and compare the first 100
# samples against the actual values.
y_pred = model.predict(X_test)
plot_y_pred(y_test[:100], y_pred[:100])