In [1]:
import mglearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn import cluster
from sklearn import metrics
from sklearn import ensemble

In [2]:
# 데이터 불러오기
data = pd.read_csv('fer2013.csv')
data.head()

Unnamed: 0,emotion,pixels,Usage
0,0,70 80 82 72 58 58 60 63 54 58 60 48 89 115 121...,Training
1,0,151 150 147 155 148 133 111 140 170 174 182 15...,Training
2,2,231 212 156 164 174 138 161 173 182 200 106 38...,Training
3,4,24 32 36 30 32 23 19 20 30 41 21 22 32 34 21 1...,Training
4,6,4 0 0 0 0 0 0 0 0 0 0 0 3 15 23 28 48 50 58 84...,Training


In [3]:
pixels_data = data.copy()

In [4]:
# train / test 두 세트로 분류하기 위해 Usage 열 삭제
del pixels_data['Usage']

In [5]:
pixels_data.head()

Unnamed: 0,emotion,pixels
0,0,70 80 82 72 58 58 60 63 54 58 60 48 89 115 121...
1,0,151 150 147 155 148 133 111 140 170 174 182 15...
2,2,231 212 156 164 174 138 161 173 182 200 106 38...
3,4,24 32 36 30 32 23 19 20 30 41 21 22 32 34 21 1...
4,6,4 0 0 0 0 0 0 0 0 0 0 0 3 15 23 28 48 50 58 84...


In [6]:
# label 열 추출
emotion = pixels_data[['emotion']].copy()
emotion.head()

Unnamed: 0,emotion
0,0
1,0
2,2
3,4
4,6


In [7]:
# emotion 열 삭제
del pixels_data['emotion']
pixels_data.head()

Unnamed: 0,pixels
0,70 80 82 72 58 58 60 63 54 58 60 48 89 115 121...
1,151 150 147 155 148 133 111 140 170 174 182 15...
2,231 212 156 164 174 138 161 173 182 200 106 38...
3,24 32 36 30 32 23 19 20 30 41 21 22 32 34 21 1...
4,4 0 0 0 0 0 0 0 0 0 0 0 3 15 23 28 48 50 58 84...


#### test size = 0.3으로 train data, test data 분류

In [8]:
from sklearn import model_selection

In [9]:
train_data, test_data, train_label, test_label = model_selection.train_test_split(pixels_data, emotion, test_size=0.3, random_state=0)

In [10]:
#string > float로 데이터 변경
train_data = np.array(list(map(str.split, train_data['pixels'])), np.float32) 
test_data = np.array(list(map(str.split, test_data['pixels'])), np.float32) 

In [11]:
# 픽셀의 가장 큰 값인 255로 train_data, test data 정규화
train_data = train_data / 255.0
test_data = test_data / 255.0

In [12]:
train_data

array([[0.01176471, 0.00784314, 0.00784314, ..., 0.36862746, 0.4       ,
        0.42745098],
       [0.7921569 , 0.7764706 , 0.7411765 , ..., 0.6901961 , 0.68235296,
        0.69803923],
       [1.        , 1.        , 1.        , ..., 0.49411765, 0.45490196,
        0.4509804 ],
       ...,
       [0.23529412, 0.23921569, 0.2509804 , ..., 0.23921569, 0.22745098,
        0.25490198],
       [0.43137255, 0.49019608, 0.52156866, ..., 0.5921569 , 0.61960787,
        0.654902  ],
       [0.10588235, 0.11764706, 0.12941177, ..., 0.9647059 , 0.29803923,
        0.        ]], dtype=float32)

In [13]:
X_train = np.array(train_data)
X_test = np.array(test_data)

# continuous로 나타나는 label 데이터 array로 타입변경
Y_train = np.array((train_label), np.float32)
Y_test = np.array((test_label), np.float32)

### PCA 사용하여 열을 줄이기

In [14]:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

from sklearn import decomposition
from sklearn import datasets

In [28]:
# PCA를 사용해서 열 개수 조정 2304 > 100개로 열 축소
model = decomposition.PCA(n_components=100) 

In [29]:
#X_train 데이터 PCA 적용으로 열 줄이기
model.fit(X_train)

PCA(copy=True, iterated_power='auto', n_components=100, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [30]:
train_data_1 = model.transform(X_train)

In [31]:
train_data_1

array([[ 1.4445480e+01, -1.8245298e+00,  4.0581608e+00, ...,
        -2.8163877e-01, -4.7258541e-02, -3.7810236e-02],
       [-3.0321217e-01,  2.1969481e+00,  5.6065130e+00, ...,
         2.9554334e-01, -7.5621760e-01,  3.2679531e-01],
       [-9.9274712e+00,  6.7252669e+00, -1.9324429e+00, ...,
         2.7343768e-01, -1.3570668e-01, -3.7528673e-01],
       ...,
       [-5.0642223e+00, -6.8186955e+00, -4.4002381e-01, ...,
         4.5948297e-02, -9.7779952e-02,  9.9467009e-02],
       [-2.1612229e+00,  7.2566682e-01,  1.8125300e+00, ...,
         1.2542270e-02, -1.0496268e-01, -1.8091053e-01],
       [ 3.8804790e-01,  7.2551930e-01,  8.1798115e+00, ...,
        -1.4680423e-01,  3.3063257e-01,  6.1432111e-01]], dtype=float32)

In [32]:
train_data_1.shape

(25120, 100)

In [33]:
#X_test 데이터 PCA 적용으로 열 줄이기
model.fit(X_test)

PCA(copy=True, iterated_power='auto', n_components=100, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [34]:
test_data_1 = model.transform(X_test)

In [35]:
test_data_1

array([[-12.463509  ,   1.0060757 ,   0.37455624, ...,  -0.3619038 ,
         -0.31174183,  -0.16339847],
       [  4.241225  ,   1.4804138 ,  -1.6832688 , ...,   0.10753611,
          0.38410807,   0.0619607 ],
       [ -9.024975  ,   2.3973377 ,   4.5554724 , ...,  -0.39793956,
          0.6988648 ,   0.18324307],
       ...,
       [  1.1437162 ,  -4.0588884 ,  -3.8486247 , ...,   0.1263438 ,
         -0.2591908 ,  -0.37800872],
       [  5.5951204 ,   3.6003907 ,  -2.908931  , ...,  -0.16359861,
         -0.06880213,  -0.30506015],
       [  3.9563665 ,  -0.05226219,  -0.1387003 , ...,   0.11242965,
          0.46677393,   0.5055139 ]], dtype=float32)

In [36]:
test_data_1.shape

(10767, 100)

#### SVM 머신러닝 적용

In [37]:
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn import preprocessing
from sklearn import utils

#### SVC 디폴트 값으로 머신러닝 진행시 오버피팅 발생

In [38]:
svc = SVC()
print(svc)
svc.fit(train_data_1, Y_train)
print('디폴트 값')
print("Accuracy on Training set: {:.3f}".format(svc.score(train_data_1, Y_train)))
print("Accuracy on Test set: {:.3f}".format(svc.score(test_data_1, Y_test)))

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)
디폴트 값
Accuracy on Training set: 0.617
Accuracy on Test set: 0.212


#### Parameter 값 변경( kernel = 'rbf', C=10, gamma = 0.01 )
**Overfitting 발생**

In [39]:
svc_g = SVC(kernel='rbf', C=10, gamma=0.01)
svc_g.fit(train_data_1, Y_train)

print('kernel ="rbf", C=10, gamma=0.01 적용 결과')
print("Accuracy on Training set: {:.3f}".format(svc_g.score(train_data_1, Y_train)))
print("Accuracy on Test set: {:.3f}".format(svc_g.score(test_data_1, Y_test)))

kernel ="rbf", C=10, gamma=0.01 적용 결과
Accuracy on Training set: 0.955
Accuracy on Test set: 0.193


### GridSearchCV for SVM

In [None]:
from sklearn.model_selection import GridSearchCV
param_grid = {'C' : [0.1, 1, 10, 100, 1000, 10000], 
             'gamma' : [1, 0.1, 0.01, 0.001, 0.0001, 0.00001]}

grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=1)
# refit : 찾아진 가장 좋은 params로 estimator를 setting할 지 여부 (setting해줘야 곧바로 predict가 가능)
# verbose : 설명의 자세한 정도 (verbose를 3과 같이 바꿔보시면 더 자세하게 매 param set 마다의 결과를 확인할 수 있습니다.)
grid.fit(train_data_1, Y_train)
print('The best parameters are ', grid.best_params_)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


GridSearch는 컴퓨터가 안 돌아감