# 7. UCI-SECOM(결측값 50% 제거/오버샘플링/XGboost)

### 📌 데이터 분석 과정
- 데이터 전처리 : Null 값의 비율이 50%가 넘는 칼럼은 삭제하고 남은 결측값은 앞 또는 뒤 값을 가져온 뒤 고유치가 1인 열 제거 
- 스케일링: OverSampling, StandardScaler, PCA
- 모델링 : XGBoost
- 교차 검증 : GridsearchCV
- 평가 : Confusion matrix, Accuracy score, Recall score, F1_score, Precision score

In [64]:
#Import Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings(action='ignore')

In [65]:
# Read the data
data = pd.read_csv('uci-secom.csv')

In [66]:
# Data shape 및 상위 5개 열 보기
print(data.shape)
data.head()

(1567, 592)


Unnamed: 0,Time,0,1,2,3,4,5,6,7,8,...,581,582,583,584,585,586,587,588,589,Pass/Fail
0,2008-07-19 11:55:00,3030.93,2564.0,2187.7333,1411.1265,1.3602,100.0,97.6133,0.1242,1.5005,...,,0.5005,0.0118,0.0035,2.363,,,,,-1
1,2008-07-19 12:32:00,3095.78,2465.14,2230.4222,1463.6606,0.8294,100.0,102.3433,0.1247,1.4966,...,208.2045,0.5019,0.0223,0.0055,4.4447,0.0096,0.0201,0.006,208.2045,-1
2,2008-07-19 13:17:00,2932.61,2559.94,2186.4111,1698.0172,1.5102,100.0,95.4878,0.1241,1.4436,...,82.8602,0.4958,0.0157,0.0039,3.1745,0.0584,0.0484,0.0148,82.8602,1
3,2008-07-19 14:43:00,2988.72,2479.9,2199.0333,909.7926,1.3204,100.0,104.2367,0.1217,1.4882,...,73.8432,0.499,0.0103,0.0025,2.0544,0.0202,0.0149,0.0044,73.8432,-1
4,2008-07-19 15:22:00,3032.24,2502.87,2233.3667,1326.52,1.5334,100.0,100.3967,0.1235,1.5031,...,,0.48,0.4766,0.1045,99.3032,0.0202,0.0149,0.0044,73.8432,-1


In [67]:
# include='all'하면 입력의 모든 열이 데이터 출력으로 포함, 기본값인 'None'은 수치데이터가 있는 열만 결과에 포함
data.describe(include='all')

Unnamed: 0,Time,0,1,2,3,4,5,6,7,8,...,581,582,583,584,585,586,587,588,589,Pass/Fail
count,1567,1561.0,1560.0,1553.0,1553.0,1553.0,1553.0,1553.0,1558.0,1565.0,...,618.0,1566.0,1566.0,1566.0,1566.0,1566.0,1566.0,1566.0,1566.0,1567.0
unique,1534,,,,,,,,,,...,,,,,,,,,,
top,2008-10-15 01:52:00,,,,,,,,,,...,,,,,,,,,,
freq,3,,,,,,,,,,...,,,,,,,,,,
mean,,3014.452896,2495.850231,2200.547318,1396.376627,4.197013,100.0,101.112908,0.121822,1.462862,...,97.934373,0.500096,0.015318,0.003847,3.067826,0.021458,0.016475,0.005283,99.670066,-0.867262
std,,73.621787,80.407705,29.513152,441.69164,56.35554,0.0,6.237214,0.008961,0.073897,...,87.520966,0.003404,0.01718,0.00372,3.578033,0.012358,0.008808,0.002867,93.891919,0.49801
min,,2743.24,2158.75,2060.66,0.0,0.6815,100.0,82.1311,0.0,1.191,...,0.0,0.4778,0.006,0.0017,1.1975,-0.0169,0.0032,0.001,0.0,-1.0
25%,,2966.26,2452.2475,2181.0444,1081.8758,1.0177,100.0,97.92,0.1211,1.4112,...,46.1849,0.4979,0.0116,0.0031,2.3065,0.013425,0.0106,0.0033,44.3686,-1.0
50%,,3011.49,2499.405,2201.0667,1285.2144,1.3168,100.0,101.5122,0.1224,1.4616,...,72.2889,0.5002,0.0138,0.0036,2.75765,0.0205,0.0148,0.0046,71.9005,-1.0
75%,,3056.65,2538.8225,2218.0555,1591.2235,1.5257,100.0,104.5867,0.1238,1.5169,...,116.53915,0.502375,0.0165,0.0041,3.295175,0.0276,0.0203,0.0064,114.7497,-1.0


## 1. Data Preprocessing

In [68]:
# Check data's Null values
data.isnull().any().any()

True

위의 결과를 통해 데이터에 결측값이 있다는 것을 확인했고, 결측값이 50% 이상인 열은 삭제하고, 나머지는 fillna의 method='ffill'을 통해 결측값을 앞 방향으로 채운 뒤 추가로 고유값이 1개뿐인 열을 삭제하는 전처리 진행

In [69]:
# Column에서 결측값의 percentage를 나타내는 함수 정의
# pd.concat->데이터 프레임 결합
# df.shape[0]은 데이터 전체 열의 개수
def null_values(df):
    nv=pd.concat([df.isnull().sum(), 100 * df.isnull().sum()/df.shape[0]],axis=1).rename(columns={0:'Missing_Records', 1:'Percentage (%)'})
    return nv[nv.Missing_Records>0].sort_values('Missing_Records', ascending=False)

In [70]:
df_na = null_values(data)
df_na

Unnamed: 0,Missing_Records,Percentage (%)
292,1429,91.193363
293,1429,91.193363
157,1429,91.193363
158,1429,91.193363
358,1341,85.577537
...,...,...
456,1,0.063816
218,1,0.063816
356,1,0.063816
457,1,0.063816


In [71]:
# 결측값의 비율이 50% 넘는 열 개수 확인
df_na = df_na[df_na["Percentage (%)"] > 50]
df_na
df_na.shape

(28, 2)

In [72]:
# 결측값의 비율이 50% 넘는 열 확인
df_na.index

Index(['292', '293', '157', '158', '358', '85', '492', '220', '518', '246',
       '245', '516', '517', '110', '384', '382', '383', '109', '244', '111',
       '580', '578', '581', '579', '73', '72', '345', '346'],
      dtype='object')

In [73]:
# 결측값의 비율이 50% 넘는 열 삭제
data = data.drop(axis=1, columns=df_na.index)
data.shape

(1567, 564)

본래 592개의 열에서 50%가 넘는 28개의 열이 삭제됐음을 확인할 수 있음

In [74]:
# 나머지 열의 결측값은  나머지는 fillna의 method='ffill'과 'bfill'을 통해 결측값을 앞뒤 방향으로 채운다.
data.fillna(method='ffill', inplace=True)
data.fillna(method='bfill', inplace=True)
data

Unnamed: 0,Time,0,1,2,3,4,5,6,7,8,...,577,582,583,584,585,586,587,588,589,Pass/Fail
0,2008-07-19 11:55:00,3030.93,2564.00,2187.7333,1411.1265,1.3602,100.0,97.6133,0.1242,1.5005,...,14.9509,0.5005,0.0118,0.0035,2.3630,0.0096,0.0201,0.0060,208.2045,-1
1,2008-07-19 12:32:00,3095.78,2465.14,2230.4222,1463.6606,0.8294,100.0,102.3433,0.1247,1.4966,...,10.9003,0.5019,0.0223,0.0055,4.4447,0.0096,0.0201,0.0060,208.2045,-1
2,2008-07-19 13:17:00,2932.61,2559.94,2186.4111,1698.0172,1.5102,100.0,95.4878,0.1241,1.4436,...,9.2721,0.4958,0.0157,0.0039,3.1745,0.0584,0.0484,0.0148,82.8602,1
3,2008-07-19 14:43:00,2988.72,2479.90,2199.0333,909.7926,1.3204,100.0,104.2367,0.1217,1.4882,...,8.5831,0.4990,0.0103,0.0025,2.0544,0.0202,0.0149,0.0044,73.8432,-1
4,2008-07-19 15:22:00,3032.24,2502.87,2233.3667,1326.5200,1.5334,100.0,100.3967,0.1235,1.5031,...,10.9698,0.4800,0.4766,0.1045,99.3032,0.0202,0.0149,0.0044,73.8432,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1562,2008-10-16 15:13:00,2899.41,2464.36,2179.7333,3085.3781,1.4843,100.0,82.2467,0.1248,1.3424,...,11.7256,0.4988,0.0143,0.0039,2.8669,0.0068,0.0138,0.0047,203.1720,-1
1563,2008-10-16 20:49:00,3052.31,2522.55,2198.5667,1124.6595,0.8763,100.0,98.4689,0.1205,1.4333,...,17.8379,0.4975,0.0131,0.0036,2.6238,0.0068,0.0138,0.0047,203.1720,-1
1564,2008-10-17 05:26:00,2978.81,2379.78,2206.3000,1110.4967,0.8236,100.0,99.4122,0.1208,1.4333,...,17.7267,0.4987,0.0153,0.0041,3.0590,0.0197,0.0086,0.0025,43.5231,-1
1565,2008-10-17 06:01:00,2894.92,2532.01,2177.0333,1183.7287,1.5726,100.0,98.7978,0.1213,1.4622,...,19.2104,0.5004,0.0178,0.0038,3.5662,0.0262,0.0245,0.0075,93.4941,-1


In [75]:
data.isnull().any().any()

False

결측값이 더이상 없음을 확인할 수 있다.

#### 추가로 고유값이 1개뿐인 열을 삭제하는 전처리 진행

In [76]:
# nunique() 함수를 써서 5열의 고유치 개수를 확인
data["5"].nunique()

1

In [77]:
# Column '5'와 같이 고유값이 1개인 열을 저장하는 unique_columns함수 생성
def unique_columns(df):
    uni_col_list = []
    for column in df.columns:
        if df[column].nunique() == 1:
            uni_col_list.append(column)
    return uni_col_list

In [78]:
# 고유치 개수가 1개인 열의 개수 확인
len(unique_columns(data))

116

In [79]:
data.shape

(1567, 564)

In [80]:
data = data.drop(axis=1, columns=unique_columns(data))
data.shape

(1567, 448)

고유치 개수가 1개인 열 116개가 지워진 것을 확인할 수 있다.

### Data Scaling
- 1. OverSampling - SMOTE
- 2. StandardScaler 
- 3. PCA 차원축소

In [81]:
# Train/Test Dataset split을 위해 Dataset을 Data/Target값으로 분리
X = data.iloc[:, 1:-1]
y = data.iloc[:, -1]

(1) SMOTE를 활용해 OverSampling하여 적은 Fail의 데이터를 Pass데이터 비율에 맞게 늘려준다.

In [82]:
from imblearn.over_sampling import SMOTE
X_resample, y_resample  = SMOTE(random_state=1).fit_sample(X, y.values.ravel())

(2) StandardScaler: 각 feature의 평균을 0, 분산을 1로 변경, 모든 feature들이 같은 Scale을 갖게된다.

In [83]:
# 먼저 데이터를 학습용과 테스트용으로 분할
# test_test_split의 random_state를 제외한 다른 하이퍼파라미터는 default로 두기
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_resample, y_resample, random_state=50)

#### ※ Scaler를 사용하기 전에 주의할 점
scaler는 fit과 transform 매서드를 지니고 있다. fit 매서드로 데이터 변환을 학습하고, transform 매서드로 실제 데이터의 스케일을 조정한다.
이때! fit 매서드는 학습용 데이터에만 적용해야한다. 그 후 transform 매서드를 학습용 데이터와 테스트 데이터에 적용한다.

In [84]:
from sklearn.preprocessing import StandardScaler 
sc= StandardScaler()
X_sc = sc.fit_transform(X)
X_train_sc = sc.fit_transform(X_train)
print('Features Min value before Scale : \n {}'. format(X_train.min(axis=0)))
print('Features Max value before Scale : \n {}'. format(X_train.max(axis=0)))
print('Features Min value after Scale : \n {}'. format(X_train_sc.min(axis=0)))
print('Features Max value after Scale : \n {}'. format(X_train_sc.max(axis=0)))
X_test_sc = sc.transform(X_test)

Features Min value before Scale : 
 0      2770.4000
1      2158.7500
2      2060.6600
3         0.0000
4         0.6815
         ...    
585       1.2845
586      -0.0060
587       0.0032
588       0.0010
589       0.0000
Length: 446, dtype: float64
Features Max value before Scale : 
 0      3356.3500
1      2846.4400
2      2315.2667
3      3715.0417
4      1112.4728
         ...    
585      99.3032
586       0.1028
587       0.0799
588       0.0286
589     737.3048
Length: 446, dtype: float64
Features Min value after Scale : 
 [-3.30731663e+00 -4.59571275e+00 -5.21938515e+00 -3.65398101e+00
 -4.94427401e-02 -3.46324375e+00 -2.00457529e+01 -4.38977089e+00
 -3.87362655e+00 -4.05128807e+00 -2.76524610e+01 -6.22678393e+00
 -2.49560932e+00 -2.01573721e+00 -9.08130583e-01 -3.73845933e+01
 -8.42145819e+00 -1.56319748e+01 -1.50675439e+01 -2.09745911e+00
 -7.36148101e+00 -4.56610732e+00 -5.62185877e+00 -5.64327483e+00
 -7.25048781e+00 -4.62743147e+00 -2.97351791e+00 -4.32672173e+00
 -4.7565

(3) PCA차원축소

- 많은 feature로 구성된 다차원 데이터 세트의 차원을 축소해 새로운 차원의 데이터 세트를 생성하는 것
-  일반적으로 차원이 증가할수록, 즉 feature가 많아질수록 예측 신뢰도가 떨어지고, 과적합(overfitting)이 발생하고, 개별 feature간의 상관관계가 높을 가능성이 있음

In [85]:
from sklearn.decomposition import PCA
# Initializing PCA and fitting
# Choose minimum number of PCA features that 85% variance is retained to avoid overfitting
pca = PCA(0.85)
X_sc_pca = pca.fit_transform(X_sc) # StandardScaler에서 학습용 데이터에 적용한 것처럼 fit과 transform 둘 다를 적용해줘야 하네~~??

In [86]:
# Transform train and test datasets
X_train_pca = pca.transform(X_train_sc)
X_test_pca = pca.transform(X_test_sc)

print('PCA차원 축소 이전 X_train:', X_train_sc.shape)
print('PCA차원 축소 이전 X_test:', X_test_sc.shape)
print('PCA차원 축소 이후 X_train:', X_train_pca.shape)
print('PCA차원 축소 이후 X_test:', X_test_pca.shape)

PCA차원 축소 이전 X_train: (2194, 446)
PCA차원 축소 이전 X_test: (732, 446)
PCA차원 축소 이후 X_train: (2194, 105)
PCA차원 축소 이후 X_test: (732, 105)


PCA차원 축소로 Feature가 565개에서 99개로 줄어 차원이 감소 됐음을 확인 할 수 있다.

## 2. Data Modeling

In [87]:
#inport libraries
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

### 2.1 교차검증
- 보통은 train set 으로 모델을 훈련, test set으로 모델을 검증한다.고정된 test set을 통해 모델의 성능을 검증하고 수정하는 과정을 반복하면, 결국 내가 만든 모델은 test set 에만 잘 동작하는 모델이 된다.즉, test set에 과적합(overfitting)하게 되므로, 다른 실제 데이터를 가져와 예측을 수행하면 엉망인 결과가 나와버리게 된다.
- 교차 검증은 train set을 train set + validation set으로 분리한 뒤, validation set을 사용해 검증하는 방식이다.

In [93]:
xgb= XGBClassifier(eval_metric='error')
params = {'max_depth':[2,4,6,8,10,12], 'random_state':[1]}

In [94]:
grid_clf = GridSearchCV(xgb, param_grid=params, scoring='accuracy', cv=10)
grid_clf.fit(X_train_pca, y_train)

print('최적의 하이퍼 파라미터:{0}, 최적의 평균 정확도:{1:.3f}'.format(grid_clf.best_params_, grid_clf.best_score_))

최적의 하이퍼 파라미터:{'max_depth': 8, 'random_state': 1}, 최적의 평균 정확도:0.978


### 2.2 모델 학습
#### XGBoost(eXtra Gradient Boost) 개요 및 장점
- 트리 기반의 앙상블 학습에서 가장 각광받고 있는 알고리즘 중 하나
- 분류와 회귀 영역에서 뛰어난 예측 성능 발휘
- GBM대비 빠른 수행시간(다른 머신러닝 알고리즘에 비하면 느림)
- 과적합 규제(Regularization)
- Tree pruning(나무 가지치기)으로 더 이상 긍정 이득이 없는 분할을 가지치기해서 분할 수를 줄임
- 자체 내장된 교차 검증으로 최적화된 반복 수행 횟수를 가질 수 있음
- 결손값 자체 처리 기능

In [95]:
em = grid_clf.best_estimator_
pred = em.predict(X_test_pca)

## 3. Data Evaluation
- TP(true positive): 1로 예측했을 때 정답도 1인 경우
- FP(false positive): 1로 예측했지만 정답은 0인 경우
- FN(false negative): 0으로 예측했지만 정답은 1인 경우
- TN(true negative): 0으로 예측했을 때 정답도 0인 경우
- recall(재현율)=TP/(TP+FN): 
  양성 positive 케이스들에서 진짜 true 양성 positive로 예측에 성공한 확률
- precision(정밀도)=TP/(TP+FP): 
  양성 positive라고 판단한 경우 중에 진짜 true 양성 positive인 확률
- F1_score=2∗(precision∗recall)/precision+recall

In [96]:
# Import Libraries
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, recall_score, precision_score

In [97]:
print('confusion_matrix\n', confusion_matrix(pred, y_test))
print('accuracy score : ', accuracy_score(pred, y_test))
print('f1_score : ', f1_score(y_test,pred))
print('recall_score : ', recall_score(pred, y_test))
print('precision_score : ',precision_score(y_test,pred))

confusion_matrix
 [[354   2]
 [ 13 363]]
accuracy score :  0.9795081967213115
f1_score :  0.979757085020243
recall_score :  0.9654255319148937
precision_score :  0.9654255319148937
