# **실제 회귀 데이터를 활용한 SVR 모델 학습**

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler,normalize
from sklearn.metrics import confusion_matrix, mean_squared_error

import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV, train_test_split

import warnings
warnings.filterwarnings("ignore")

**mortgage - regression 예제**

실제 회귀 예제 데이터(mortgage.csv)를 사용하여 SVR(Support Vector Regression) 모델을 학습해보겠습니다.

데이터를 정규화한 뒤, 학습과 모델 평가를 위해 6:4의 비율로 학습데이터와 평가데이터로 나누어 보겠습니다.

또한, 예측 성능 평가를 위해 MAPE 성능지표 산출 함수를 정의하였습니다.

In [2]:
# Read mortgage.csv into a DataFrame.
# NOTE(review): the file is assumed to sit next to the notebook and to contain
# exactly the 16 data columns (no saved index column) - adjust the path or pass
# index_col=0 if the printed shape is not (1049, 16).
mor = pd.read_csv("mortgage.csv")
print(mor.shape)

(1049, 16)


In [3]:
# Preview only the first rows instead of dumping the whole 1049-row frame.
mor.head(10)

Unnamed: 0,OneMonthCDRate,OneY.CMaturityRate,ThreeM.Rate.AuctionAverage,ThreeM.Rate.SecondaryMarket,ThreeY.CMaturityRate,FiveY.CMaturityRate,BankCredit,Currency,DemandDeposits,FederalFunds,MoneyStock,CheckableDeposits,LoansLeases,SavingsDeposits,TradeCurrencies,ThirtyY.CMortgageRate
0,8.72,90.729,9.69,7.62,7.60,7.72,7.69,2605.8,223.4,279.6,8.52,794.4,564.8,2020.2,894.7,7.66
1,13.85,109.392,17.19,12.06,12.47,13.94,13.82,1347.4,124.4,230.8,14.35,443.0,314.8,1033.8,343.9,13.73
2,6.59,87.979,9.94,5.74,5.67,7.42,7.73,2280.2,198.9,287.2,6.77,755.9,551.0,1743.9,936.5,6.65
3,17.43,96.064,15.07,15.20,15.01,13.13,12.89,1237.4,115.5,241.9,18.12,410.3,291.3,933.1,377.5,14.24
4,3.16,85.121,8.21,2.97,2.94,4.93,5.85,2937.9,288.7,336.4,2.96,1012.5,715.6,2110.2,1179.5,3.50
5,7.57,86.024,10.61,5.73,5.73,8.08,8.38,2255.3,197.9,287.5,6.81,750.3,546.7,1718.2,935.2,7.15
6,6.30,87.478,9.52,5.86,5.82,7.36,7.79,2771.1,256.4,278.0,6.10,843.8,580.3,2115.2,949.5,6.34
7,5.56,100.143,6.92,4.91,4.92,5.29,5.32,4336.6,445.7,373.0,5.59,1070.9,616.4,3183.2,1527.3,5.24
8,7.81,94.470,11.58,6.96,6.98,9.43,9.73,2229.8,193.1,293.9,7.59,754.8,555.1,1704.0,950.7,8.33
9,11.10,92.872,13.43,10.46,10.45,12.06,12.01,1181.3,112.8,263.9,10.85,405.7,288.8,891.0,193.2,12.07


In [4]:
# Split the frame into target and features.
# The last column (ThirtyY.CMortgageRate in the preview) is taken as the
# regression target; the remaining 15 columns are the features. This matches the
# printed shapes X: (1049, 15) / y: (1049,).
mor_y = mor.iloc[:, -1]
mor_x = mor.iloc[:, :-1]
print('X:',mor_x.shape)
print('y:',mor_y.shape)

X: (1049, 15)
y: (1049,)


- 데이터 정규화

In [5]:
# Standardising scaler (zero mean, unit variance per feature) with default settings.
scaler = StandardScaler()
scaler

StandardScaler(copy=True, with_mean=True, with_std=True)

In [6]:
# Learn the per-feature mean and standard deviation; fit() returns the scaler itself.
# NOTE(review): the scaler is fitted on all rows, before the train/test split that
# happens later in the notebook, so test-set statistics influence the scaling.
# Fitting on the training split only would avoid that leakage.
transformer = scaler.fit(mor_x)
transformer

StandardScaler(copy=True, with_mean=True, with_std=True)

In [7]:
# Replace the feature frame with its standardised NumPy array.
# Caution: mor_x is overwritten, so re-running this cell alone scales the data twice.
mor_x = transformer.transform(mor_x)
mor_x

array([[ 0.35491563, -0.45799122, -0.2403576 , ...,  0.09625994,
         0.0848087 , -0.16115823],
       [ 1.8746428 ,  0.83226746,  2.29560079, ..., -1.49612074,
        -1.28483455, -1.64134569],
       [-0.2760822 , -0.64811132, -0.15582566, ...,  0.00836053,
        -0.29884137, -0.04882737],
       ...,
       [-0.53381371, -0.31709495, -0.75093056, ...,  0.24212201,
         2.18481727,  2.10453902],
       [ 0.19790678, -0.42605105, -0.09496266, ...,  0.11409461,
         0.14937522, -0.10714267],
       [ 0.79631787,  2.46405097,  1.25078593, ..., -1.0642671 ,
        -0.91132073, -0.7187815 ]])

In [8]:
# Hold out 40% of the rows for evaluation (6:4 train/test); fixed seed for a repeatable split.
(mor_train_x,
 mor_test_x,
 mor_train_y,
 mor_test_y) = train_test_split(
    mor_x,
    mor_y,
    test_size=0.4,
    random_state=0,
)

---

- 실제값 대비 예측 오차의 비율(MAPE)로 예측 성능을 확인

$$MAPE = \frac{100\%}{n} \sum^{n}_{t=1}|\frac{A_t-F_t}{A_t}|$$

$A_t$: actual value

$F_t$: forecast value

In [9]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    Computes ``100 * mean(|(y_true - y_pred) / y_true|)``.

    Parameters
    ----------
    y_true : array-like
        Actual values. Entries equal to zero make the ratio undefined; NumPy
        then produces ``inf``/``nan`` in the result (no exception is raised here).
    y_pred : array-like
        Forecast values, element-wise aligned with ``y_true``.

    Returns
    -------
    numpy.float64
        The MAPE as a percentage (e.g. ``10.0`` means 10 %).
    """
    # Cast to float so unsigned-integer inputs cannot wrap around on subtraction.
    actual = np.asarray(y_true, dtype=float)
    forecast = np.asarray(y_pred, dtype=float)

    if actual.shape != forecast.shape:
        # An (n, 1) column against an (n,) vector would otherwise broadcast to an
        # (n, n) matrix and silently yield a wrong score; drop singleton axes first.
        actual, forecast = np.squeeze(actual), np.squeeze(forecast)

    return np.mean(np.abs((actual - forecast) / actual)) * 100

---

### 단순 Multiple Linear Regression 학습 모델과의 비교

SVM regression과 단순한 MLR(Multiple Linear Regression)의 성능 비교를 위해 MLR학습 모델을 정의하였습니다.

In [10]:
# Baseline: ordinary multiple linear regression.
from sklearn.linear_model import LinearRegression

# Fit on the training split.
mor_regression = LinearRegression()
mor_regression.fit(mor_train_x, mor_train_y)

# Predict on the held-out test split.
mor_regression_pred = mor_regression.predict(mor_test_x)

# Evaluate; RMSE is kept for the final comparison table.
mor_regression_mape = mean_absolute_percentage_error(mor_test_y,mor_regression_pred)
mor_regression_rmse = np.sqrt(mean_squared_error(y_true=mor_test_y, y_pred=mor_regression_pred))

print('==== mortgage - regression ================')
print('MAPE:{}'.format(mor_regression_mape))
print('===========================================')

MAPE:1.153271361250734


---

### Neural Network Regression 학습 모델과의 비교

SVM regression과 ANN regression의 성능 비교를 위해 모델을 정의하였습니다. 

기본적인 학습 파라미터인 'hidden node'와 'max_iteration'을 cross-validation을 통해서 최적화하였습니다.

In [11]:
### Neural Network model training
from sklearn.neural_network import MLPRegressor
def NN_CV_regresser(train_x, train_y, max_nodes, CV_N, plot=False, random_state=None):
    """Tune an MLPRegressor with a cross-validated grid search and refit it.

    The grid covers the hidden-layer width (1, 11, 21, ... below ``max_nodes``)
    and the iteration cap ``max_iter``. The best combination is printed and a
    fresh model with those settings is fitted on the full training data.

    Parameters
    ----------
    train_x : array-like
        Training features.
    train_y : array-like
        Training targets.
    max_nodes : int
        Exclusive upper bound for the hidden-layer widths that are tried.
    CV_N : int
        Number of cross-validation folds passed to ``GridSearchCV``.
    plot : bool, optional
        Currently unused; kept so existing callers keep working.
    random_state : int or None, optional
        Seed forwarded to ``MLPRegressor`` for repeatable results. The default
        ``None`` leaves the weight initialisation unseeded.

    Returns
    -------
    MLPRegressor
        Model fitted on ``train_x``/``train_y`` with the selected settings.
    """
    # Search space: hidden-layer width and maximum number of training iterations.
    parameters = {'hidden_layer_sizes': np.arange(start=1, stop=max_nodes, step=10).tolist(),
                  'max_iter': [50, 100, 200, 300, 400, 500]}

    # Settings shared by the search and the final model, so the refit model is
    # trained the same way as the candidates that were cross-validated.
    fixed_params = {'learning_rate_init': 0.01, 'random_state': random_state}

    clf = MLPRegressor(**fixed_params)

    grid_clf = GridSearchCV(clf, parameters, cv=CV_N)

    # Run the cross-validated search.
    grid_clf.fit(train_x, train_y)

    print(grid_clf.best_params_)

    # Read the winners by key rather than by position in the dict.
    best_nodes = grid_clf.best_params_['hidden_layer_sizes']
    best_iter = grid_clf.best_params_['max_iter']

    print("The optimal number of hidden nodes : {}\n& max iteration : {}".format(best_nodes, best_iter))

    opt_NN = MLPRegressor(hidden_layer_sizes=best_nodes,
                          max_iter=best_iter,
                          **fixed_params)
    opt_NN.fit(X=train_x, y=train_y)

    return opt_NN

In [12]:
# Tune and fit the neural-network regressor (5-fold CV, hidden widths below 100),
# then predict on the held-out test split.
mor_optNNR = NN_CV_regresser(train_x=mor_train_x, train_y=mor_train_y, max_nodes=100, CV_N=5)
mor_optNNR_pred = mor_optNNR.predict(mor_test_x)

{'hidden_layer_sizes': 91, 'max_iter': 500}
The optimal number of hidden nodes : 91
  & max iteration : 500


In [13]:
# Score the neural-network predictions on the test split.
mor_nnr_mape = mean_absolute_percentage_error(mor_test_y, mor_optNNR_pred)
mor_nnr_rmse = np.sqrt(
    mean_squared_error(y_true=mor_test_y, y_pred=mor_optNNR_pred)
)

# Report the MAPE between two banner lines.
print(
    '==== mortgage - NN regression ================',
    'MAPE:{}'.format(mor_nnr_mape),
    '==============================================',
    sep='\n',
)


MAPE:2.6285673841415296


---

### Kernel SVR

In [14]:
def CV_SVR_classifier(train_x, train_y, CV_n):
    """Tune an RBF-kernel SVR with a cross-validated grid search and refit it.

    Despite the name, this builds a regressor (``SVR``), not a classifier.
    The grid covers ``C``, ``gamma`` and ``epsilon``; the grid and the best
    combination are printed, and a fresh SVR with the selected values is fitted
    on the full training data.

    Parameters
    ----------
    train_x : array-like
        Training features.
    train_y : array-like
        Training targets.
    CV_n : int
        Number of cross-validation folds passed to ``GridSearchCV``.

    Returns
    -------
    SVR
        RBF-kernel model fitted on ``train_x``/``train_y`` with the best parameters.
    """
    parameters = {'C': [0.01, 0.05, 0.1, 1, 5, 10],
                  'gamma': [0.01, 0.02, 0.06, 0.08, 0.1, 1, 2, 10],
                  'epsilon': [0.01, 0.1, 1]}
    print("매개변수 그리드:\n{}".format(parameters))

    # Base RBF-kernel SVR whose C / gamma / epsilon are searched over.
    clf = SVR(kernel='rbf')

    # Grid search wrapper using CV_n-fold cross-validation.
    grid_clf = GridSearchCV(clf, parameters, cv=CV_n)

    # Fit every parameter combination with cross-validation.
    grid_clf.fit(train_x, train_y)

    print(grid_clf.best_params_)

    best_C = grid_clf.best_params_['C']
    best_epsilon = grid_clf.best_params_['epsilon']
    best_gamma = grid_clf.best_params_['gamma']

    print("The optimal 'C' : {}\nThe optimal 'epsilon' : {}\nThe optimal 'Gamma' : {}".format(best_C,
                                                                                              best_epsilon,
                                                                                              best_gamma))
    # Build the final model from the best parameters.
    opt_SVR = SVR(kernel='rbf', C=best_C, epsilon=best_epsilon, gamma=best_gamma)

    # Fit on the complete training data.
    opt_SVR.fit(train_x, train_y)

    return opt_SVR

In [15]:
# Tune the RBF SVR with 10-fold cross-validation, then display the fitted model.
mor_opt_SVR = CV_SVR_classifier(
    train_x=mor_train_x,
    train_y=mor_train_y,
    CV_n=10,
)
mor_opt_SVR

매개변수 그리드:
{'C': [0.01, 0.05, 0.1, 1, 5, 10], 'gamma': [0.01, 0.02, 0.06, 0.08, 0.1, 1, 2, 10], 'epsilon': [0.01, 0.1, 1]}
{'C': 10, 'epsilon': 0.01, 'gamma': 0.1}
The optimal 'C' : 10
The optimal 'epsilon' : 0.01
The optimal 'Gamma' : 0.1


SVR(C=10, cache_size=200, coef0=0.0, degree=3, epsilon=0.01, gamma=0.1,
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [16]:
# Predict on the held-out test split with the tuned SVR.
mor_opt_SVR_pred = mor_opt_SVR.predict(mor_test_x)

In [17]:
# Score the SVR predictions on the test split.
mor_svr_mape = mean_absolute_percentage_error(mor_test_y, mor_opt_SVR_pred)
mor_svr_rmse = np.sqrt(
    mean_squared_error(y_true=mor_test_y, y_pred=mor_opt_SVR_pred)
)

# Report the MAPE (rounded to 4 decimals) between two banner lines.
print(
    '==== mortgage - SVM regression ================',
    'MAPE:{}'.format(round(mor_svr_mape, 4)),
    '===============================================',
    sep='\n',
)


MAPE:0.6545


---


- **세가지 모델('MLR', 'ANN', 'SVR')의 예측 성능 비교**



In [18]:
# Side-by-side test-set metrics for the three models (one row per model).
pd.DataFrame(
    [[mor_regression_rmse, mor_regression_mape],
     [mor_nnr_rmse, mor_nnr_mape],
     [mor_svr_rmse, mor_svr_mape]],
    index=['L_reg', 'NN', 'SVR'],
    columns=['RMSE', 'MAPE'],
)

Unnamed: 0,RMSE,MAPE
L_reg,0.114446,1.153271
NN,0.239643,2.628567
SVR,0.073614,0.65449
