# 2. UCI-SECOM(결측값 900개 제거/오버샘플링 X/선형 회귀)

### 📌 데이터 분석 과정
- 데이터 전처리 : Null 값이 900개 넘는 칼럼 삭제하고 남는 NaN값은 0으로 대체
- 스케일링: StandardScaler, PCA
- 모델링 : Ridge, Lasso, ElasticNet
- 교차 검증 : cross_val_score
- 평가 : RMSE

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
import warnings
warnings.simplefilter("ignore")  #파이썬 경고 억제

In [2]:
dataset = pd.read_csv("uci-secom.csv") # 데이터 불러오기

In [3]:
dataset.head()  #데이터 확인

Unnamed: 0,Time,0,1,2,3,4,5,6,7,8,...,581,582,583,584,585,586,587,588,589,Pass/Fail
0,2008-07-19 11:55:00,3030.93,2564.0,2187.7333,1411.1265,1.3602,100.0,97.6133,0.1242,1.5005,...,,0.5005,0.0118,0.0035,2.363,,,,,-1
1,2008-07-19 12:32:00,3095.78,2465.14,2230.4222,1463.6606,0.8294,100.0,102.3433,0.1247,1.4966,...,208.2045,0.5019,0.0223,0.0055,4.4447,0.0096,0.0201,0.006,208.2045,-1
2,2008-07-19 13:17:00,2932.61,2559.94,2186.4111,1698.0172,1.5102,100.0,95.4878,0.1241,1.4436,...,82.8602,0.4958,0.0157,0.0039,3.1745,0.0584,0.0484,0.0148,82.8602,1
3,2008-07-19 14:43:00,2988.72,2479.9,2199.0333,909.7926,1.3204,100.0,104.2367,0.1217,1.4882,...,73.8432,0.499,0.0103,0.0025,2.0544,0.0202,0.0149,0.0044,73.8432,-1
4,2008-07-19 15:22:00,3032.24,2502.87,2233.3667,1326.52,1.5334,100.0,100.3967,0.1235,1.5031,...,,0.48,0.4766,0.1045,99.3032,0.0202,0.0149,0.0044,73.8432,-1


In [4]:
dataset.isnull().any().any()  #Null 값 존재 여부 확인

True

## 1. Data Processing

### 1.1 결측값 처리

In [5]:
d = dataset.isnull().sum()

In [6]:
j = []

In [7]:
for i in d.keys():
    if(d[i]>900):
        print(i,d[i])
        j.append(i)

85 1341
109 1018
110 1018
111 1018
157 1429
158 1429
220 1341
244 1018
245 1018
246 1018
292 1429
293 1429
358 1341
382 1018
383 1018
384 1018
492 1341
516 1018
517 1018
518 1018
578 949
579 949
580 949
581 949


In [8]:
dataset.drop(j, axis=1, inplace=True)
dataset.head()

Unnamed: 0,Time,0,1,2,3,4,5,6,7,8,...,577,582,583,584,585,586,587,588,589,Pass/Fail
0,2008-07-19 11:55:00,3030.93,2564.0,2187.7333,1411.1265,1.3602,100.0,97.6133,0.1242,1.5005,...,14.9509,0.5005,0.0118,0.0035,2.363,,,,,-1
1,2008-07-19 12:32:00,3095.78,2465.14,2230.4222,1463.6606,0.8294,100.0,102.3433,0.1247,1.4966,...,10.9003,0.5019,0.0223,0.0055,4.4447,0.0096,0.0201,0.006,208.2045,-1
2,2008-07-19 13:17:00,2932.61,2559.94,2186.4111,1698.0172,1.5102,100.0,95.4878,0.1241,1.4436,...,9.2721,0.4958,0.0157,0.0039,3.1745,0.0584,0.0484,0.0148,82.8602,1
3,2008-07-19 14:43:00,2988.72,2479.9,2199.0333,909.7926,1.3204,100.0,104.2367,0.1217,1.4882,...,8.5831,0.499,0.0103,0.0025,2.0544,0.0202,0.0149,0.0044,73.8432,-1
4,2008-07-19 15:22:00,3032.24,2502.87,2233.3667,1326.52,1.5334,100.0,100.3967,0.1235,1.5031,...,10.9698,0.48,0.4766,0.1045,99.3032,0.0202,0.0149,0.0044,73.8432,-1


In [9]:
dataset.replace(np.nan, 0, inplace=True)
dataset.head()

Unnamed: 0,Time,0,1,2,3,4,5,6,7,8,...,577,582,583,584,585,586,587,588,589,Pass/Fail
0,2008-07-19 11:55:00,3030.93,2564.0,2187.7333,1411.1265,1.3602,100.0,97.6133,0.1242,1.5005,...,14.9509,0.5005,0.0118,0.0035,2.363,0.0,0.0,0.0,0.0,-1
1,2008-07-19 12:32:00,3095.78,2465.14,2230.4222,1463.6606,0.8294,100.0,102.3433,0.1247,1.4966,...,10.9003,0.5019,0.0223,0.0055,4.4447,0.0096,0.0201,0.006,208.2045,-1
2,2008-07-19 13:17:00,2932.61,2559.94,2186.4111,1698.0172,1.5102,100.0,95.4878,0.1241,1.4436,...,9.2721,0.4958,0.0157,0.0039,3.1745,0.0584,0.0484,0.0148,82.8602,1
3,2008-07-19 14:43:00,2988.72,2479.9,2199.0333,909.7926,1.3204,100.0,104.2367,0.1217,1.4882,...,8.5831,0.499,0.0103,0.0025,2.0544,0.0202,0.0149,0.0044,73.8432,-1
4,2008-07-19 15:22:00,3032.24,2502.87,2233.3667,1326.52,1.5334,100.0,100.3967,0.1235,1.5031,...,10.9698,0.48,0.4766,0.1045,99.3032,0.0202,0.0149,0.0044,73.8432,-1


In [10]:
dataset.isnull().any().any()  #Null 값 존재 여부 확인

False

### 1.2 데이터 분&오버샘플링

In [11]:
X = dataset.drop(['Pass/Fail','Time'],axis=1)  #Time과 Pass/Fail 칼럼 삭제한 값 X에 저장
y = dataset['Pass/Fail']  #Pass/Fail 칼럼만 y에 저장

print("shape of x:", X.shape)
print("shape of y:", y.shape)

shape of x: (1567, 566)
shape of y: (1567,)


In [12]:
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,576,577,582,583,584,585,586,587,588,589
0,3030.93,2564.0,2187.7333,1411.1265,1.3602,100.0,97.6133,0.1242,1.5005,0.0162,...,1.6765,14.9509,0.5005,0.0118,0.0035,2.363,0.0,0.0,0.0,0.0
1,3095.78,2465.14,2230.4222,1463.6606,0.8294,100.0,102.3433,0.1247,1.4966,-0.0005,...,1.1065,10.9003,0.5019,0.0223,0.0055,4.4447,0.0096,0.0201,0.006,208.2045
2,2932.61,2559.94,2186.4111,1698.0172,1.5102,100.0,95.4878,0.1241,1.4436,0.0041,...,2.0952,9.2721,0.4958,0.0157,0.0039,3.1745,0.0584,0.0484,0.0148,82.8602
3,2988.72,2479.9,2199.0333,909.7926,1.3204,100.0,104.2367,0.1217,1.4882,-0.0124,...,1.7585,8.5831,0.499,0.0103,0.0025,2.0544,0.0202,0.0149,0.0044,73.8432
4,3032.24,2502.87,2233.3667,1326.52,1.5334,100.0,100.3967,0.1235,1.5031,-0.0031,...,1.6597,10.9698,0.48,0.4766,0.1045,99.3032,0.0202,0.0149,0.0044,73.8432


In [13]:
from sklearn.model_selection import train_test_split

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(1096, 566)
(1096,)
(471, 566)
(471,)


### 1.3 데이터 스케일링

In [15]:
print('X_train의 평균값 ')
print(X_train.mean())

print('\nX_train의 분산값')
print(X_train.var())

X_train의 평균값 
0      3004.093650
1      2484.738485
2      2184.538425
3      1385.454239
4         4.378115
          ...     
585       3.030747
586       0.021582
587       0.016570
588       0.005329
589     101.229596
Length: 566, dtype: float64

X_train의 분산값
0       38312.046531
1       34589.793602
2       35978.862976
3      203997.344117
4        3376.212713
           ...      
585         9.518938
586         0.000157
587         0.000086
588         0.000009
589      9383.171176
Length: 566, dtype: float64


In [16]:
from sklearn.preprocessing import StandardScaler

In [17]:
#StandardScaler 객체 생성
sc = StandardScaler()

#StandScaler로 데이터 세트 변환
X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform(X_test)

X_sc = sc.fit_transform(X)

In [18]:
X_sc_pd = pd.DataFrame(X_sc)
X_sc_pd.head()  #표준화된 데이터 확인

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,556,557,558,559,560,561,562,563,564,565
0,0.139998,0.429208,0.032735,0.059342,-0.049911,0.094946,-0.228536,0.239971,0.43685,1.128343,...,-0.229797,-0.13552,0.055275,-0.204269,-0.092516,-0.196519,-1.734706,-1.868461,-1.840769,-1.061159
1,0.46402,-0.105874,0.236852,0.173847,-0.059375,0.094946,0.187826,0.278951,0.393723,0.022511,...,-0.263493,-0.460054,0.162312,0.407145,0.445214,0.385516,-0.958144,0.412587,0.251029,1.156951
2,-0.351256,0.407233,0.026413,0.684661,-0.047236,0.094946,-0.415634,0.232175,-0.192349,0.327111,...,-0.205046,-0.590505,-0.304064,0.022827,0.01503,0.030373,2.989383,3.624211,3.318999,-0.178407
3,-0.070903,-0.025985,0.086766,-1.033387,-0.05062,0.094946,0.354494,0.045074,0.300837,-0.765478,...,-0.22495,-0.645708,-0.059408,-0.291614,-0.361381,-0.282803,-0.100689,-0.177535,-0.306784,-0.274469
4,0.146544,0.09834,0.250931,-0.12507,-0.046823,0.094946,0.016475,0.1854,0.4656,-0.149655,...,-0.230791,-0.454486,-1.512057,26.860983,27.06285,26.907579,-0.100689,-0.177535,-0.306784,-0.274469


In [19]:
#DataFrame 형태로 변경
X_train_sc_pd = pd.DataFrame(X_train_sc)
X_test_sc_pd = pd.DataFrame(X_test_sc)

In [20]:
print('X_train_sc의 평균값 ')
print(X_train_sc_pd.mean())

print('\nX_train_sc의 분산값')
print(X_train_sc_pd.var())

X_train_sc의 평균값 
0     -1.825790e-15
1     -2.762186e-15
2     -9.245696e-16
3     -1.418168e-17
4     -4.460898e-17
           ...     
561    9.990488e-18
562    1.319656e-16
563   -7.658108e-17
564   -3.916170e-16
565    1.175054e-16
Length: 566, dtype: float64

X_train_sc의 분산값
0      1.000913
1      1.000913
2      1.000913
3      1.000913
4      1.000913
         ...   
561    1.000913
562    1.000913
563    1.000913
564    1.000913
565    1.000913
Length: 566, dtype: float64


In [21]:
from sklearn.decomposition import PCA

In [22]:
pca = PCA(0.85)

X_sc_pca = pca.fit_transform(X_sc) # StandardScaler에서 학습용 데이터에 적용한 것처럼 fit과 transform 둘 다를 적용해줘야 하네~~??
X_sc_pca_pd = pd.DataFrame(data = X_sc_pca)

In [23]:
# Transform train and test datasets
X_train_pca = pca.transform(X_train_sc)
X_test_pca = pca.transform(X_test_sc)

print('PCA차원 축소 이전 X_train:', X_train_sc.shape)
print('PCA차원 축소 이전 X_test:', X_test_sc.shape)
print('PCA차원 축소 이후 X_train:', X_train_pca.shape)
print('PCA차원 축소 이후 X_test:', X_test_pca.shape)

PCA차원 축소 이전 X_train: (1096, 566)
PCA차원 축소 이전 X_test: (471, 566)
PCA차원 축소 이후 X_train: (1096, 99)
PCA차원 축소 이후 X_test: (471, 99)


In [24]:
X_sc_pca_pd

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,89,90,91,92,93,94,95,96,97,98
0,-1.480892,3.443087,4.556093,-2.232184,0.885531,-1.750156,1.411821,2.036742,0.768743,-0.998271,...,-1.978389,-1.640996,-0.631491,-0.069233,0.430680,0.122708,-0.782225,-2.126332,0.699609,0.674416
1,-1.989067,1.429700,3.487361,-1.856899,1.680818,-2.511428,0.874279,3.376972,-0.995468,-0.482618,...,-0.968337,-0.464505,1.746610,-0.112332,0.651566,1.513998,1.104444,0.510013,0.187142,0.617557
2,0.264950,0.942759,0.643228,-0.361364,-0.015905,0.323023,-0.804382,2.185448,-0.887683,1.684722,...,0.868783,3.323481,-0.823437,-0.373998,-1.230204,4.274918,-0.787184,-0.104358,1.879640,0.984710
3,0.423280,1.710080,-2.255846,5.844296,-0.866750,0.289288,-0.026192,25.742741,18.438800,16.554750,...,1.631955,-0.601355,0.502127,-3.025899,3.316678,-4.649706,-0.980783,-3.603541,1.912542,-0.821435
4,0.302437,3.314989,3.017150,-0.204792,0.107710,0.579850,-0.736321,2.274743,5.860552,1.713185,...,1.510240,-0.489043,1.842977,-2.015589,1.419746,-1.405011,-1.480750,3.123900,2.284525,-0.352683
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1562,-1.279091,-4.394633,-3.994319,4.977084,1.495699,0.472086,-1.972459,0.375644,1.462248,-2.144301,...,1.412541,-0.648601,0.436758,0.667039,0.198648,-0.219622,-0.434466,-0.248356,1.567079,1.024875
1563,-0.381968,0.069666,1.243167,0.564084,1.101920,-1.235559,0.295034,4.483768,-1.984628,-2.560960,...,-1.186694,-2.281451,0.573762,0.045348,-0.410490,-1.237799,-0.145078,-1.167611,-0.961543,0.742751
1564,-1.135744,-0.977762,-17.543619,-5.067267,55.976869,-2.741146,-0.260357,-2.810107,2.951807,-2.642046,...,-0.955336,0.914713,1.102924,0.121679,1.278490,-0.755959,-0.445384,-0.036693,-1.553303,-0.452818
1565,-1.145397,-4.149672,-3.887361,1.943701,1.282016,0.101448,-2.176326,-0.293376,1.848950,-5.863599,...,2.488764,1.264039,-2.167528,0.789507,0.972609,0.290541,-1.006687,0.713894,0.182323,-0.166799


In [25]:
X_sc_pca_pd.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,89,90,91,92,93,94,95,96,97,98
count,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,...,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0
mean,1.147774e-17,-3.592106e-16,6.416196e-16,-5.441297e-16,8.615387000000001e-17,-6.574901e-17,-3.7408920000000004e-17,2.692309e-16,-2.040486e-16,3.83158e-16,...,-2.8977740000000005e-17,4.534414e-17,-2.1892720000000003e-17,-3.7692320000000003e-17,-1.101721e-16,6.309213e-17,2.125507e-18,4.7469650000000005e-17,7.325913e-17,-1.7145750000000003e-17
std,5.092896,4.149319,3.646348,3.450775,3.259465,3.16487,2.938732,2.863489,2.837659,2.704716,...,1.171237,1.170212,1.145915,1.13781,1.127782,1.124712,1.106876,1.102975,1.088925,1.086226
min,-9.304416,-9.054036,-17.8468,-10.83546,-5.662818,-20.30334,-46.15106,-21.51964,-16.7817,-11.30524,...,-4.286098,-4.746583,-4.464508,-6.045724,-5.583018,-4.649706,-4.139484,-3.733346,-4.036008,-3.622699
25%,-1.09753,-2.254008,-1.898392,-1.692172,-0.7693717,-1.356117,-0.8069957,-1.417572,-1.857939,-1.644506,...,-0.7566617,-0.7066462,-0.7587375,-0.7151263,-0.6491579,-0.720626,-0.7456147,-0.6813539,-0.6765627,-0.7122765
50%,-0.4201242,-0.6689511,-0.1544717,-0.06078513,-0.1953381,-0.2582915,-0.01555921,0.2922021,-0.02145469,-0.0913066,...,-0.005034191,-0.02003333,-0.001955805,0.01754763,0.01303057,0.01436333,0.02155433,0.007630727,0.00329165,0.01557076
75%,0.3983487,1.343064,2.053309,1.525247,0.3389696,1.118596,0.819283,1.80619,1.67817,1.544008,...,0.8070431,0.7130696,0.7079437,0.7035629,0.6435116,0.7161274,0.7364748,0.6794236,0.6356659,0.7114206
max,105.4699,32.05599,81.59886,103.9124,56.17039,94.48666,85.83749,25.74274,25.33041,29.64458,...,5.400102,6.136647,6.545766,10.1698,6.694362,4.87686,4.465981,5.809036,7.183777,5.858476


## 2. Data Modeling

### 2.1 Lasso

In [26]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings(action='ignore')

In [27]:
lasso = Lasso(alpha=0.1, random_state=50)
lasso.fit(X_train_pca,y_train)

# 회귀 계수 출력
print ("Lasso model:", (lasso.coef_))

Lasso model: [ 0.         0.0085509 -0.        -0.        -0.         0.
  0.         0.        -0.         0.         0.        -0.
 -0.         0.         0.        -0.         0.        -0.
  0.         0.         0.         0.         0.        -0.
  0.        -0.         0.        -0.        -0.         0.
  0.         0.         0.         0.        -0.        -0.
  0.        -0.        -0.         0.        -0.        -0.
  0.        -0.         0.        -0.         0.        -0.
  0.         0.        -0.        -0.        -0.         0.
 -0.         0.         0.        -0.        -0.         0.
  0.        -0.         0.        -0.         0.        -0.
 -0.        -0.         0.         0.         0.        -0.
  0.         0.         0.        -0.        -0.        -0.
  0.         0.         0.        -0.         0.         0.
  0.         0.         0.        -0.        -0.         0.
 -0.         0.        -0.        -0.        -0.         0.
 -0.        -0.         0. 

In [28]:
# 회귀 계수 큰 값 순으로 정렬
coef = pd.Series(data=np.round(lasso.coef_,1), index=X_sc_pca_pd.columns)
coef.sort_values(ascending=False)

98    0.0
24    0.0
26    0.0
27   -0.0
28   -0.0
     ... 
68    0.0
69    0.0
70    0.0
71   -0.0
0     0.0
Length: 99, dtype: float64

In [29]:
y_pred = lasso.predict(X_test_pca)

#Convert the sign of the predicted values as the classifier
y_pred2 = np.sign(y_pred)

In [30]:
print("Accuracy : ", lasso.score(X_test_pca,y_test)*100)

Accuracy :  1.6165464229399684


In [31]:
mse = mean_squared_error(y_test, y_pred2)
rmse = np.sqrt(mse)

print("RMSE : ", rmse)

RMSE :  0.5213081699608597


In [32]:
neg_mse_scores = cross_val_score(lasso, X_test_pca, y_test, scoring='neg_mean_squared_error', cv=5)
rmse_scores = np.sqrt(-1*neg_mse_scores)
avg_rmse = np.mean(rmse_scores)

In [33]:
print('5 folds의 개별 Negative MSE scores : ', np.round(neg_mse_scores,2))
print('5 folds의 개별 RMSE scores : ', np.round(rmse_scores,2))
print('5 folds의 평균 RMSE : {0:.3f}'.format(avg_rmse))

5 folds의 개별 Negative MSE scores :  [-0.28 -0.28 -0.24 -0.27 -0.23]
5 folds의 개별 RMSE scores :  [0.53 0.53 0.49 0.52 0.48]
5 folds의 평균 RMSE : 0.508


### 2.2 Ridge

In [34]:
ridge = Ridge(alpha=0.1, random_state=50)
ridge.fit(X_train_pca,y_train)

# 회귀 계수 출력
print ("Lasso model:", (ridge.coef_))

Lasso model: [ 0.00195363  0.0142041  -0.0012221  -0.00112671 -0.0063186   0.00139699
  0.0005313   0.00821429 -0.00660063  0.00642829  0.01107145  0.00206351
 -0.00263874  0.01363439  0.00279727 -0.00922669  0.00616973 -0.00040982
  0.01161065  0.00015386  0.00700773  0.0053361   0.00874739 -0.00790072
  0.00399897 -0.01850793  0.00543184 -0.00469302 -0.00592482  0.02059968
  0.00477085  0.01226035 -0.00046995  0.00601929 -0.01050514 -0.01618528
  0.01056514 -0.00207146 -0.01261526  0.0028     -0.00681904 -0.00770396
  0.0104818  -0.00361731  0.00888975 -0.01101013  0.00825613 -0.02533997
  0.01751528  0.00697875 -0.00220714 -0.01608126 -0.00430665  0.00913724
 -0.0024619   0.01339332  0.01229147 -0.01137314 -0.01130505 -0.001924
  0.01312334 -0.02857707  0.00839877 -0.01756486 -0.00125951 -0.01629578
 -0.01183104 -0.02948904  0.00322531  0.02334835  0.02647136 -0.00904114
  0.00385588  0.00961402  0.00332448 -0.02567831 -0.01239102 -0.00965548
  0.00641053  0.00684999  0.00779016 -0.

In [35]:
# 회귀 계수 큰 값 순으로 정렬
coef = pd.Series(data=np.round(ridge.coef_,1), index=X_sc_pca_pd.columns)
coef.sort_values(ascending=False)

98    0.0
24    0.0
26    0.0
27   -0.0
28   -0.0
     ... 
68    0.0
69    0.0
70    0.0
71   -0.0
0     0.0
Length: 99, dtype: float64

In [36]:
y_pred = ridge.predict(X_test_pca)

#Convert the sign of the predicted values as the classifier
y_pred2 = np.sign(y_pred)

In [37]:
print("Accuracy : ", ridge.score(X_test_pca,y_test)*100)

Accuracy :  -1118.0700473008353


In [38]:
mse = mean_squared_error(y_test, y_pred2)
rmse = np.sqrt(mse)

print("RMSE : ", rmse)

RMSE :  0.545197134139854


In [39]:
neg_mse_scores = cross_val_score(ridge, X_test_pca, y_test, scoring='neg_mean_squared_error', cv=5)
rmse_scores = np.sqrt(-1*neg_mse_scores)
avg_rmse = np.mean(rmse_scores)

In [40]:
print('5 folds의 개별 Negative MSE scores : ', np.round(neg_mse_scores,2))
print('5 folds의 개별 RMSE scores : ', np.round(rmse_scores,2))
print('5 folds의 평균 RMSE : {0:.3f}'.format(avg_rmse))

5 folds의 개별 Negative MSE scores :  [-11.43  -0.33  -0.5   -0.35  -0.3 ]
5 folds의 개별 RMSE scores :  [3.38 0.58 0.71 0.59 0.54]
5 folds의 평균 RMSE : 1.160


### 2.3 ElasticNet

In [41]:
elasticnet = ElasticNet(alpha=0.1, random_state=50)
elasticnet.fit(X_train_pca,y_train)

ElasticNet(alpha=0.1, random_state=50)

In [42]:
y_pred = elasticnet.predict(X_test_pca)

#Convert the sign of the predicted values as the classifier
y_pred2 = np.sign(y_pred)

In [43]:
mse = mean_squared_error(y_test, y_pred2)
rmse = np.sqrt(mse)

print("RMSE : ", rmse)

RMSE :  0.5213081699608597


In [44]:
neg_mse_scores = cross_val_score(elasticnet, X_test_pca, y_test, scoring='neg_mean_squared_error', cv=5)
rmse_scores = np.sqrt(-1*neg_mse_scores)
avg_rmse = np.mean(rmse_scores)

In [45]:
print('5 folds의 개별 Negative MSE scores : ', np.round(neg_mse_scores,2))
print('5 folds의 개별 RMSE scores : ', np.round(rmse_scores,2))
print('5 folds의 평균 RMSE : {0:.3f}'.format(avg_rmse))

5 folds의 개별 Negative MSE scores :  [-0.35 -0.3  -0.24 -0.26 -0.23]
5 folds의 개별 RMSE scores :  [0.6  0.55 0.49 0.51 0.48]
5 folds의 평균 RMSE : 0.526
