# 8. UCI-SECOM(결측값 50% 제거/오버샘플링/선형 회귀)

### 📌 데이터 분석 과정
- 데이터 전처리 : Null 값의 비율이 50%가 넘는 칼럼은 삭제하고 남은 결측값은 앞 또는 뒤 값을 가져옴체
- 스케일링: StandardScaler, PCA
- 모델링 : Ridge, Lasso, ElasticNet
- 교차 검증 : cross_val_score
- 평가 : RMSE

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
import warnings
warnings.simplefilter("ignore")  #파이썬 경고 억제

In [2]:
dataset = pd.read_csv("uci-secom.csv") # 데이터 불러오기

In [3]:
dataset.head()  #데이터 확인

Unnamed: 0,Time,0,1,2,3,4,5,6,7,8,...,581,582,583,584,585,586,587,588,589,Pass/Fail
0,2008-07-19 11:55:00,3030.93,2564.0,2187.7333,1411.1265,1.3602,100.0,97.6133,0.1242,1.5005,...,,0.5005,0.0118,0.0035,2.363,,,,,-1
1,2008-07-19 12:32:00,3095.78,2465.14,2230.4222,1463.6606,0.8294,100.0,102.3433,0.1247,1.4966,...,208.2045,0.5019,0.0223,0.0055,4.4447,0.0096,0.0201,0.006,208.2045,-1
2,2008-07-19 13:17:00,2932.61,2559.94,2186.4111,1698.0172,1.5102,100.0,95.4878,0.1241,1.4436,...,82.8602,0.4958,0.0157,0.0039,3.1745,0.0584,0.0484,0.0148,82.8602,1
3,2008-07-19 14:43:00,2988.72,2479.9,2199.0333,909.7926,1.3204,100.0,104.2367,0.1217,1.4882,...,73.8432,0.499,0.0103,0.0025,2.0544,0.0202,0.0149,0.0044,73.8432,-1
4,2008-07-19 15:22:00,3032.24,2502.87,2233.3667,1326.52,1.5334,100.0,100.3967,0.1235,1.5031,...,,0.48,0.4766,0.1045,99.3032,0.0202,0.0149,0.0044,73.8432,-1


In [4]:
dataset.isnull().any().any()  #Null 값 존재 여부 확인

True

## 1. Data Processing

### 1.1 결측값 처리

In [5]:
def null_values(dataset):
    nv=pd.concat([dataset.isnull().sum(), 100 * dataset.isnull().sum()/dataset.shape[0]],axis=1).rename(columns={0:'Missing_Records', 1:'Percentage (%)'})
    return nv[nv.Missing_Records>0].sort_values('Missing_Records', ascending=False)

In [6]:
dataset_na = null_values(dataset)
dataset_na

Unnamed: 0,Missing_Records,Percentage (%)
292,1429,91.193363
293,1429,91.193363
157,1429,91.193363
158,1429,91.193363
358,1341,85.577537
...,...,...
456,1,0.063816
218,1,0.063816
356,1,0.063816
457,1,0.063816


In [7]:
dataset_na = dataset_na[dataset_na["Percentage (%)"] > 50]
dataset_na

Unnamed: 0,Missing_Records,Percentage (%)
292,1429,91.193363
293,1429,91.193363
157,1429,91.193363
158,1429,91.193363
358,1341,85.577537
85,1341,85.577537
492,1341,85.577537
220,1341,85.577537
518,1018,64.964901
246,1018,64.964901


In [8]:
dataset_na.shape

(28, 2)

In [9]:
dataset_na.index

Index(['292', '293', '157', '158', '358', '85', '492', '220', '518', '246',
       '245', '516', '517', '110', '384', '382', '383', '109', '244', '111',
       '580', '578', '581', '579', '73', '72', '345', '346'],
      dtype='object')

In [10]:
dataset = dataset.drop(axis=1, columns=dataset_na.index)
dataset.shape

(1567, 564)

In [11]:
dataset.head()

Unnamed: 0,Time,0,1,2,3,4,5,6,7,8,...,577,582,583,584,585,586,587,588,589,Pass/Fail
0,2008-07-19 11:55:00,3030.93,2564.0,2187.7333,1411.1265,1.3602,100.0,97.6133,0.1242,1.5005,...,14.9509,0.5005,0.0118,0.0035,2.363,,,,,-1
1,2008-07-19 12:32:00,3095.78,2465.14,2230.4222,1463.6606,0.8294,100.0,102.3433,0.1247,1.4966,...,10.9003,0.5019,0.0223,0.0055,4.4447,0.0096,0.0201,0.006,208.2045,-1
2,2008-07-19 13:17:00,2932.61,2559.94,2186.4111,1698.0172,1.5102,100.0,95.4878,0.1241,1.4436,...,9.2721,0.4958,0.0157,0.0039,3.1745,0.0584,0.0484,0.0148,82.8602,1
3,2008-07-19 14:43:00,2988.72,2479.9,2199.0333,909.7926,1.3204,100.0,104.2367,0.1217,1.4882,...,8.5831,0.499,0.0103,0.0025,2.0544,0.0202,0.0149,0.0044,73.8432,-1
4,2008-07-19 15:22:00,3032.24,2502.87,2233.3667,1326.52,1.5334,100.0,100.3967,0.1235,1.5031,...,10.9698,0.48,0.4766,0.1045,99.3032,0.0202,0.0149,0.0044,73.8432,-1


In [12]:
dataset.fillna(method='ffill', inplace=True)
dataset.fillna(method='bfill', inplace=True)
dataset

Unnamed: 0,Time,0,1,2,3,4,5,6,7,8,...,577,582,583,584,585,586,587,588,589,Pass/Fail
0,2008-07-19 11:55:00,3030.93,2564.00,2187.7333,1411.1265,1.3602,100.0,97.6133,0.1242,1.5005,...,14.9509,0.5005,0.0118,0.0035,2.3630,0.0096,0.0201,0.0060,208.2045,-1
1,2008-07-19 12:32:00,3095.78,2465.14,2230.4222,1463.6606,0.8294,100.0,102.3433,0.1247,1.4966,...,10.9003,0.5019,0.0223,0.0055,4.4447,0.0096,0.0201,0.0060,208.2045,-1
2,2008-07-19 13:17:00,2932.61,2559.94,2186.4111,1698.0172,1.5102,100.0,95.4878,0.1241,1.4436,...,9.2721,0.4958,0.0157,0.0039,3.1745,0.0584,0.0484,0.0148,82.8602,1
3,2008-07-19 14:43:00,2988.72,2479.90,2199.0333,909.7926,1.3204,100.0,104.2367,0.1217,1.4882,...,8.5831,0.4990,0.0103,0.0025,2.0544,0.0202,0.0149,0.0044,73.8432,-1
4,2008-07-19 15:22:00,3032.24,2502.87,2233.3667,1326.5200,1.5334,100.0,100.3967,0.1235,1.5031,...,10.9698,0.4800,0.4766,0.1045,99.3032,0.0202,0.0149,0.0044,73.8432,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1562,2008-10-16 15:13:00,2899.41,2464.36,2179.7333,3085.3781,1.4843,100.0,82.2467,0.1248,1.3424,...,11.7256,0.4988,0.0143,0.0039,2.8669,0.0068,0.0138,0.0047,203.1720,-1
1563,2008-10-16 20:49:00,3052.31,2522.55,2198.5667,1124.6595,0.8763,100.0,98.4689,0.1205,1.4333,...,17.8379,0.4975,0.0131,0.0036,2.6238,0.0068,0.0138,0.0047,203.1720,-1
1564,2008-10-17 05:26:00,2978.81,2379.78,2206.3000,1110.4967,0.8236,100.0,99.4122,0.1208,1.4333,...,17.7267,0.4987,0.0153,0.0041,3.0590,0.0197,0.0086,0.0025,43.5231,-1
1565,2008-10-17 06:01:00,2894.92,2532.01,2177.0333,1183.7287,1.5726,100.0,98.7978,0.1213,1.4622,...,19.2104,0.5004,0.0178,0.0038,3.5662,0.0262,0.0245,0.0075,93.4941,-1


In [13]:
null_values(dataset)

Unnamed: 0,Missing_Records,Percentage (%)


In [14]:
dataset.shape

(1567, 564)

In [15]:
# nunique() 함수를 써서 5열의 고유치 개수를 확인
dataset["5"].nunique()

1

In [16]:
# Column '5'와 같이 고유값이 1개인 열을 저장하는 unique_columns함수 생성
def unique_columns(df):
    uni_col_list = []
    for column in df.columns:
        if df[column].nunique() == 1:
            uni_col_list.append(column)
    return uni_col_list

In [17]:
# 고유치 개수가 1개인 열의 개수 확인
len(unique_columns(dataset))

116

In [18]:
dataset.shape

(1567, 564)

In [19]:
dataset = dataset.drop(axis=1, columns=unique_columns(dataset))
dataset.shape

(1567, 448)

### 1.2 데이터 분&오버샘플링

In [20]:
X = dataset.drop(['Pass/Fail','Time'],axis=1)  #Time과 Pass/Fail 칼럼 삭제한 값 X에 저장
y = dataset['Pass/Fail']  #Pass/Fail 칼럼만 y에 저장

print("shape of x:", X.shape)
print("shape of y:", y.shape)

shape of x: (1567, 446)
shape of y: (1567,)


In [21]:
X.head()

Unnamed: 0,0,1,2,3,4,6,7,8,9,10,...,576,577,582,583,584,585,586,587,588,589
0,3030.93,2564.0,2187.7333,1411.1265,1.3602,97.6133,0.1242,1.5005,0.0162,-0.0034,...,1.6765,14.9509,0.5005,0.0118,0.0035,2.363,0.0096,0.0201,0.006,208.2045
1,3095.78,2465.14,2230.4222,1463.6606,0.8294,102.3433,0.1247,1.4966,-0.0005,-0.0148,...,1.1065,10.9003,0.5019,0.0223,0.0055,4.4447,0.0096,0.0201,0.006,208.2045
2,2932.61,2559.94,2186.4111,1698.0172,1.5102,95.4878,0.1241,1.4436,0.0041,0.0013,...,2.0952,9.2721,0.4958,0.0157,0.0039,3.1745,0.0584,0.0484,0.0148,82.8602
3,2988.72,2479.9,2199.0333,909.7926,1.3204,104.2367,0.1217,1.4882,-0.0124,-0.0033,...,1.7585,8.5831,0.499,0.0103,0.0025,2.0544,0.0202,0.0149,0.0044,73.8432
4,3032.24,2502.87,2233.3667,1326.52,1.5334,100.3967,0.1235,1.5031,-0.0031,-0.0072,...,1.6597,10.9698,0.48,0.4766,0.1045,99.3032,0.0202,0.0149,0.0044,73.8432


In [22]:
from sklearn.model_selection import train_test_split

In [23]:
X_resample, y_resample  = SMOTE(random_state=1).fit_sample(X, y.values.ravel())

print(X_resample.shape)
print(y_resample.shape)

(2926, 446)
(2926,)


In [24]:
X_train, X_test, y_train, y_test = train_test_split(X_resample, y_resample, test_size = 0.3, random_state = 1)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(2048, 446)
(2048,)
(878, 446)
(878,)


### 1.3 데이터 스케일링

In [25]:
print('X_train의 평균값 ')
print(X_train.mean())

print('\nX_train의 분산값')
print(X_train.var())

X_train의 평균값 
0      3011.144012
1      2493.032969
2      2200.443854
3      1379.807452
4         3.497539
          ...     
585       3.063362
586       0.021704
587       0.017219
588       0.005502
589     100.078616
Length: 446, dtype: float64

X_train의 분산값
0        5425.488405
1        5185.374302
2         714.053345
3      137536.018966
4        2409.480186
           ...      
585         5.511276
586         0.000129
587         0.000068
588         0.000007
589      6659.807276
Length: 446, dtype: float64


In [26]:
from sklearn.preprocessing import StandardScaler

In [27]:
#StandardScaler 객체 생성
sc = StandardScaler()

#StandScaler로 데이터 세트 변환
X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform(X_test)

X_sc = sc.fit_transform(X)

In [28]:
X_sc_pd = pd.DataFrame(X_sc)
X_sc_pd.head()  #표준화된 데이터 확인

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,436,437,438,439,440,441,442,443,444,445
0,0.225661,0.848549,-0.438248,0.030352,-0.050141,-0.553885,0.265085,0.510062,1.1288,-0.380965,...,-0.229797,-0.13552,0.117015,-0.204837,-0.093217,-0.197053,-0.95928,0.411568,0.24996,1.155458
1,1.106653,-0.382768,1.010742,0.149398,-0.059605,0.200866,0.321051,0.457237,0.02307,-1.607223,...,-0.263493,-0.460054,0.527813,0.406731,0.444697,0.385118,-0.95928,0.411568,0.24996,1.155458
2,-1.110023,0.797981,-0.483128,0.680468,-0.047466,-0.893044,0.253892,-0.260632,0.327643,0.124597,...,-0.205046,-0.590505,-1.262092,0.022317,0.014366,0.029892,2.990884,3.626604,3.321193,-0.17981
3,-0.347765,-0.19893,-0.054692,-1.105709,-0.050851,0.502989,-0.014743,0.343462,-0.764845,-0.370208,...,-0.22495,-0.645708,-0.323126,-0.292204,-0.362173,-0.283356,-0.101252,-0.179181,-0.308446,-0.275866
4,0.243458,0.087165,1.110687,-0.161373,-0.047053,-0.109747,0.186733,0.545278,-0.149079,-0.789718,...,-0.230791,-0.454486,-5.898239,26.867225,27.071414,26.913331,-0.101252,-0.179181,-0.308446,-0.275866


In [29]:
#DataFrame 형태로 변경
X_train_sc_pd = pd.DataFrame(X_train_sc)
X_test_sc_pd = pd.DataFrame(X_test_sc)

In [30]:
print('X_train_sc의 평균값 ')
print(X_train_sc_pd.mean())

print('\nX_train_sc의 분산값')
print(X_train_sc_pd.var())

X_train_sc의 평균값 
0      2.306315e-15
1     -4.322714e-15
2     -1.307298e-14
3     -1.962406e-16
4      4.977166e-18
           ...     
441   -1.667638e-17
442    4.453360e-17
443    3.729655e-17
444    3.361027e-18
445   -9.107298e-18
Length: 446, dtype: float64

X_train_sc의 분산값
0      1.000489
1      1.000489
2      1.000489
3      1.000489
4      1.000489
         ...   
441    1.000489
442    1.000489
443    1.000489
444    1.000489
445    1.000489
Length: 446, dtype: float64


In [31]:
from sklearn.decomposition import PCA

In [32]:
pca = PCA(0.85)

X_sc_pca = pca.fit_transform(X_sc) # StandardScaler에서 학습용 데이터에 적용한 것처럼 fit과 transform 둘 다를 적용해줘야 하네~~??
X_sc_pca_pd = pd.DataFrame(data = X_sc_pca)

In [33]:
# Transform train and test datasets
X_train_pca = pca.transform(X_train_sc)
X_test_pca = pca.transform(X_test_sc)

print('PCA차원 축소 이전 X_train:', X_train_sc.shape)
print('PCA차원 축소 이전 X_test:', X_test_sc.shape)
print('PCA차원 축소 이후 X_train:', X_train_pca.shape)
print('PCA차원 축소 이후 X_test:', X_test_pca.shape)

PCA차원 축소 이전 X_train: (2048, 446)
PCA차원 축소 이전 X_test: (878, 446)
PCA차원 축소 이후 X_train: (2048, 105)
PCA차원 축소 이후 X_test: (878, 105)


In [34]:
X_sc_pca_pd

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,95,96,97,98,99,100,101,102,103,104
0,-1.782791,3.310516,3.932738,-2.499307,-1.523197,-1.325046,1.879771,0.348805,2.077824,1.688163,...,-0.879597,-1.597523,0.103828,-0.775828,-0.188024,0.984490,-0.122749,-0.026944,-0.444697,0.163044
1,-2.308054,1.127098,2.994238,-2.072917,-2.104424,-1.650421,1.854886,-1.661375,3.127837,0.957623,...,-0.874492,-1.750679,-0.992883,1.980152,-0.227913,-0.299410,-0.026762,0.457744,1.120423,0.704828
2,0.084345,1.016602,1.395086,-0.416968,1.894827,-1.969401,1.264174,-0.475057,2.324017,0.500682,...,-0.277269,-0.677653,1.681806,0.869927,-0.293659,1.111161,-0.771199,1.257127,-2.804366,0.638406
3,1.142221,5.022797,4.462427,-3.882784,1.539173,-7.398277,-5.624882,9.407509,6.522849,4.848480,...,0.370177,-0.625577,0.469737,-0.075763,0.832901,0.824972,1.223726,-1.220631,1.805943,0.452387
4,0.824743,2.331620,2.492553,-0.096323,3.662749,-4.893456,-1.780137,3.173356,-1.354895,2.461963,...,0.426510,0.579608,1.989791,-1.076240,-2.225584,0.512941,0.188578,-0.917982,0.716724,2.342091
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1562,-1.076417,-3.981591,-2.990087,5.915839,1.957389,-1.115185,-0.472458,0.502001,1.632937,-0.679923,...,-1.123765,-1.406492,1.385098,-0.660136,0.131356,-0.369535,0.458754,-0.622539,0.618022,-2.195900
1563,-0.421181,0.492293,2.299712,-0.093414,1.140234,-1.832707,5.108660,-2.873185,1.805468,1.061318,...,-1.905081,-1.045418,0.353778,-1.757976,0.724189,0.637349,1.433791,-0.479098,1.108009,-0.208709
1564,-1.271202,-1.425527,-0.821447,2.135336,-2.205088,-0.031404,-0.475627,-1.028669,0.295908,0.323200,...,-0.106197,-0.751618,1.364894,-0.855894,-0.293999,0.443955,-0.042150,-1.223821,1.079353,0.434340
1565,-1.143090,-3.449855,-3.811150,3.200253,2.870841,-2.575377,0.872113,1.060514,0.951481,-2.345769,...,-0.346007,0.257588,1.723857,-0.286277,-0.166879,1.788604,-0.555770,-0.530852,-0.850115,-0.712494


In [35]:
X_sc_pca_pd.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,95,96,97,98,99,100,101,102,103,104
count,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,...,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0
mean,4.602431e-16,1.643725e-15,-9.777331e-17,2.491094e-16,2.196357e-16,-6.489881e-16,-8.785428e-18,-6.659921000000001e-17,-2.078037e-16,-7.396763e-17,...,1.2823890000000002e-17,3.9676130000000005e-17,-3.9073900000000006e-17,-1.643725e-17,-2.713564e-17,-7.627027e-17,1.689778e-17,-1.55162e-17,-2.8906890000000003e-17,5.590083e-17
std,5.055478,4.150362,3.660669,3.472044,3.130901,3.049505,2.933979,2.906331,2.767981,2.621447,...,1.113742,1.102922,1.097014,1.080287,1.077829,1.070343,1.063605,1.054437,1.052615,1.03517
min,-4.58203,-8.400412,-17.54912,-10.79304,-46.00558,-11.658,-26.67949,-37.73539,-14.08771,-16.05459,...,-6.273765,-3.683675,-4.793863,-3.638962,-4.557697,-3.677055,-4.667348,-4.046119,-5.381531,-3.570671
25%,-1.183271,-2.160019,-1.694276,-2.042725,-1.7966,-1.738253,-1.305998,-1.562594,-1.793273,-1.262513,...,-0.6809744,-0.7051724,-0.7234107,-0.7669103,-0.6884115,-0.6964983,-0.6574561,-0.7387902,-0.6610164,-0.6712793
50%,-0.4641254,-0.6394576,-0.1433646,-0.1350749,0.02325896,0.03396328,0.1817918,-0.1519961,-0.05636912,-0.1106314,...,-0.02034984,-0.01000699,-0.01044508,0.004221591,-0.006412952,-0.01667646,0.01082921,-0.01501405,-0.0128011,0.06122087
75%,0.4726549,1.152886,1.814965,1.856308,1.757138,1.575311,1.522023,1.439886,1.80153,1.072877,...,0.6744316,0.7098217,0.6990741,0.710377,0.6540122,0.7011472,0.653063,0.7231406,0.6544552,0.6762651
max,104.1173,32.20186,99.73046,91.8572,58.77679,72.6851,50.86949,50.06345,18.88549,43.54671,...,8.305547,5.643938,5.829439,4.508657,5.871356,4.999072,4.747863,5.075104,5.585031,3.735398


## 2. Data Modeling

### 2.1 Lasso

In [36]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings(action='ignore')

In [37]:
lasso = Lasso(alpha=0.1, random_state=50)
lasso.fit(X_train_pca,y_train)

# 회귀 계수 출력
print ("Lasso model:", (lasso.coef_))

Lasso model: [ 0.00750286  0.04133769  0.         -0.01460349 -0.00617559  0.00016598
 -0.         -0.01664508  0.01294108  0.03727213  0.04432673 -0.04173814
  0.02393123 -0.00178926  0.          0.00290404 -0.          0.
 -0.01122505 -0.          0.         -0.00646822  0.00955722  0.02660342
  0.         -0.         -0.00810494  0.         -0.01431085  0.
 -0.          0.         -0.         -0.01611877  0.         -0.
 -0.01022952 -0.         -0.          0.          0.02080161  0.
  0.          0.          0.          0.          0.00875623 -0.
  0.00062821  0.         -0.          0.         -0.          0.
 -0.          0.00830005 -0.          0.00566823  0.          0.
 -0.          0.00115102  0.         -0.02847935 -0.          0.
  0.08327833 -0.         -0.          0.          0.          0.
 -0.         -0.         -0.          0.          0.         -0.02105081
 -0.          0.          0.          0.         -0.         -0.
 -0.         -0.         -0.         -0.     

In [38]:
# 회귀 계수 큰 값 순으로 정렬
coef = pd.Series(data=np.round(lasso.coef_,1), index=X_sc_pca_pd.columns)
coef.sort_values(ascending=False)

99     0.1
66     0.1
104   -0.0
37    -0.0
27     0.0
      ... 
72    -0.0
73    -0.0
74    -0.0
75     0.0
0      0.0
Length: 105, dtype: float64

In [39]:
y_pred = lasso.predict(X_test_pca)

#Convert the sign of the predicted values as the classifier
y_pred2 = np.sign(y_pred)

In [40]:
print("Accuracy : ", lasso.score(X_test_pca,y_test)*100)

Accuracy :  24.554454697174542


In [41]:
mse = mean_squared_error(y_test, y_pred2)
rmse = np.sqrt(mse)

print("RMSE : ", rmse)

RMSE :  1.0056786373589186


In [42]:
neg_mse_scores = cross_val_score(lasso, X_test_pca, y_test, scoring='neg_mean_squared_error', cv=5)
rmse_scores = np.sqrt(-1*neg_mse_scores)
avg_rmse = np.mean(rmse_scores)

In [43]:
print('5 folds의 개별 Negative MSE scores : ', np.round(neg_mse_scores,2))
print('5 folds의 개별 RMSE scores : ', np.round(rmse_scores,2))
print('5 folds의 평균 RMSE : {0:.3f}'.format(avg_rmse))

5 folds의 개별 Negative MSE scores :  [-0.7  -0.77 -0.74 -0.71 -0.71]
5 folds의 개별 RMSE scores :  [0.84 0.88 0.86 0.84 0.84]
5 folds의 평균 RMSE : 0.853


### 2.2 Ridge

In [44]:
ridge = Ridge(alpha=0.1, random_state=50)
ridge.fit(X_train_pca,y_train)

# 회귀 계수 출력
print ("Lasso model:", (ridge.coef_))

Lasso model: [ 0.01559843  0.04410227  0.00288186 -0.01842367 -0.01245348  0.00767265
 -0.01006345 -0.02927989  0.03086141  0.04408766  0.04374105 -0.04440268
  0.02977546 -0.01237194  0.01255561  0.01894592 -0.01662693  0.01532007
 -0.031981   -0.01415214  0.0110172  -0.02169364  0.02516795  0.04159496
  0.00798852 -0.02097268 -0.02605096 -0.00841919 -0.02640434 -0.00320913
 -0.01623091  0.01240559 -0.01041686 -0.04480415  0.0201461  -0.00605821
 -0.03794889 -0.00934558 -0.00776357  0.00175164  0.03902932  0.02636676
 -0.008662   -0.00435234  0.01499112  0.00187625  0.03737406 -0.0302147
  0.03395106  0.00207982 -0.02603167  0.01218023 -0.00690064  0.00763602
 -0.01173171  0.05645146 -0.03267239  0.02728822  0.00649655  0.01894146
 -0.02353747  0.05100521  0.01225857 -0.05959704 -0.01179295 -0.0011567
  0.11003491 -0.0021752  -0.02976485  0.01201844  0.04766413  0.01756997
 -0.01723514 -0.02253258 -0.03403908  0.02743145  0.0395916  -0.06449557
 -0.04612569  0.00909084  0.01321052  0.

In [45]:
# 회귀 계수 큰 값 순으로 정렬
coef = pd.Series(data=np.round(ridge.coef_,1), index=X_sc_pca_pd.columns)
coef.sort_values(ascending=False)

55     0.1
61     0.1
101    0.1
99     0.1
66     0.1
      ... 
63    -0.1
98    -0.1
77    -0.1
102   -0.1
88    -0.1
Length: 105, dtype: float64

In [46]:
y_pred = ridge.predict(X_test_pca)

#Convert the sign of the predicted values as the classifier
y_pred2 = np.sign(y_pred)

In [47]:
print("Accuracy : ", ridge.score(X_test_pca,y_test)*100)

Accuracy :  35.89718743164381


In [48]:
mse = mean_squared_error(y_test, y_pred2)
rmse = np.sqrt(mse)

print("RMSE : ", rmse)

RMSE :  0.9005187065100244


In [49]:
neg_mse_scores = cross_val_score(ridge, X_test_pca, y_test, scoring='neg_mean_squared_error', cv=5)
rmse_scores = np.sqrt(-1*neg_mse_scores)
avg_rmse = np.mean(rmse_scores)

In [50]:
print('5 folds의 개별 Negative MSE scores : ', np.round(neg_mse_scores,2))
print('5 folds의 개별 RMSE scores : ', np.round(rmse_scores,2))
print('5 folds의 평균 RMSE : {0:.3f}'.format(avg_rmse))

5 folds의 개별 Negative MSE scores :  [-0.79 -0.82 -0.69 -0.64 -0.68]
5 folds의 개별 RMSE scores :  [0.89 0.91 0.83 0.8  0.83]
5 folds의 평균 RMSE : 0.851


### 2.3 ElasticNet

In [51]:
elasticnet = ElasticNet(alpha=0.1, random_state=50)
elasticnet.fit(X_train_pca,y_train)

ElasticNet(alpha=0.1, random_state=50)

In [52]:
y_pred = elasticnet.predict(X_test_pca)

#Convert the sign of the predicted values as the classifier
y_pred2 = np.sign(y_pred)

In [53]:
mse = mean_squared_error(y_test, y_pred2)
rmse = np.sqrt(mse)

print("RMSE : ", rmse)

RMSE :  0.9473617604902469


In [54]:
neg_mse_scores = cross_val_score(elasticnet, X_test_pca, y_test, scoring='neg_mean_squared_error', cv=5)
rmse_scores = np.sqrt(-1*neg_mse_scores)
avg_rmse = np.mean(rmse_scores)

In [55]:
print('5 folds의 개별 Negative MSE scores : ', np.round(neg_mse_scores,2))
print('5 folds의 개별 RMSE scores : ', np.round(rmse_scores,2))
print('5 folds의 평균 RMSE : {0:.3f}'.format(avg_rmse))

5 folds의 개별 Negative MSE scores :  [-0.65 -0.73 -0.7  -0.65 -0.62]
5 folds의 개별 RMSE scores :  [0.8  0.86 0.84 0.81 0.79]
5 folds의 평균 RMSE : 0.818
