# load data

In [1]:
import pandas as pd
import random
import os
import numpy as np
import warnings
warnings.filterwarnings('ignore')

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(37) # Seed 고정

import pandas as pd
import numpy as np


train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test.csv')

# preprocessing

### drop

In [4]:
train_x = train_df.drop(columns=['PRODUCT_ID', 'Y_Quality', 'Y_Class', 'TIMESTAMP'])
train_y = train_df['Y_Class']
test_x = test_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP'])

### LabelEncoder

In [5]:
from sklearn.preprocessing import LabelEncoder

# qualitative to quantitative
qual_col = ['LINE', 'PRODUCT_CODE']

for i in qual_col:
    le = LabelEncoder()
    le = le.fit(train_x[i])
    train_x[i] = le.transform(train_x[i])
    
    for label in np.unique(test_x[i]): 
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    test_x[i] = le.transform(test_x[i]) 
print('Done.')

Done.


### fillna(0)

In [7]:
train_x = train_x.fillna(0)
test_x = test_x.fillna(0)

### PCA

#### basic PCA

In [22]:
from sklearn.decomposition import PCA

In [101]:
pca = PCA()
pca.fit(train_x)
cumsum = np.cumsum(pca.explained_variance_ratio_)
d = np.argmax(cumsum >= 0.95) + 1



pca = PCA(n_components=d)
train_x_pca = pca.fit_transform(train_x)
test_x_pca = pca.transform(test_x)



print(train_x_pca.shape)
print(test_x_pca.shape)



col = []
for i in range(int(d)):
    col.append('pca{}'.format(i))

train_x_pca = pd.DataFrame(data=train_x_pca, columns=col)
test_x_pca = pd.DataFrame(data=test_x_pca, columns=col)

(598, 2)
(310, 2)


#### 랜덤 PCA

In [106]:
rnd_pca = PCA(svd_solver='randomized')
rnd_pca.fit(train_x)
cumsum = np.cumsum(rnd_pca.explained_variance_ratio_)
d = np.argmax(cumsum >= 0.95) + 1



rnd_pca = PCA(n_components=d, svd_solver='randomized')
train_x_random_pca = rnd_pca.fit_transform(train_x)
test_x_random_pca = rnd_pca.fit_transform(test_x)



print(train_x_random_pca.shape)
print(test_x_random_pca.shape)



col = []
for i in range(int(d)):
    col.append('pca{}'.format(i))

train_x_random_pca = pd.DataFrame(data=train_x_random_pca, columns=col)
test_x_random_pca = pd.DataFrame(data=test_x_random_pca, columns=col)

(598, 2)
(310, 2)


#### 점진적 PCA : IncrementalPCA

In [111]:
from sklearn.decomposition import IncrementalPCA
inc_pca = IncrementalPCA()
inc_pca.fit(train_x)
cumsum = np.cumsum(inc_pca.explained_variance_ratio_)
d = np.argmax(cumsum >= 0.95) + 1



n_batch = 100
inc_pca = IncrementalPCA(n_components=d)
for X_batch in np.array_split(train_x, n_batch):
    inc_pca.partial_fit(X_batch)

train_x_reduced_pca = inc_pca.transform(train_x)
test_x_reduced_pca = inc_pca.transform(test_x)



print(train_x_reduced_pca.shape)
print(test_x_reduced_pca.shape)



col = []
for i in range(int(d)):
    col.append('pca{}'.format(i))

train_x_reduced_pca = pd.DataFrame(data=train_x_reduced_pca, columns=col)
test_x_reduced_pca = pd.DataFrame(data=test_x_reduced_pca, columns=col)

(598, 2)
(310, 2)


#### 커널 PCA

In [125]:
from sklearn.decomposition import KernelPCA
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

clf = Pipeline([('kpca', KernelPCA(n_components=200)),
                 ('rfc', RandomForestClassifier())
                 ])


param_grid = [{
    'kpca__gamma' : np.linspace(0.03, 0.05, 10),
    'kpca__kernel' : ['rbf', 'softmax']
               }]


grid_search = GridSearchCV(clf, param_grid, cv=3)
grid_search.fit(train_x, train_y)

print('best param :',grid_search.best_params_)
print('best score :', grid_search.best_score_)

best param : {'kpca__gamma': 0.03, 'kpca__kernel': 'rbf'}
best score : 0.7006197654941374


In [129]:
for n_component in [2, 5, 10, 25, 50, 100, 200]:
    print('### n_component :{} ###'.format(n_component))
    clf = Pipeline([('kpca', KernelPCA(n_components=n_component)),
                    ('rfc', RandomForestClassifier())
                    ])


    param_grid = [{
        'kpca__gamma' : np.linspace(0.03, 0.05, 10),
        'kpca__kernel' : ['rbf', 'softmax']
                }]


    grid_search = GridSearchCV(clf, param_grid, cv=3)
    grid_search.fit(train_x, train_y)

    print('best param :',grid_search.best_params_)
    print('best score :', grid_search.best_score_, '\n')

### n_component :2 ###
best param : {'kpca__gamma': 0.03888888888888889, 'kpca__kernel': 'rbf'}
best score : 0.6872194304857621 

### n_component :5 ###
best param : {'kpca__gamma': 0.04777777777777778, 'kpca__kernel': 'rbf'}
best score : 0.6955778894472361 

### n_component :10 ###
best param : {'kpca__gamma': 0.043333333333333335, 'kpca__kernel': 'rbf'}
best score : 0.6939112227805695 

### n_component :25 ###
best param : {'kpca__gamma': 0.03888888888888889, 'kpca__kernel': 'rbf'}
best score : 0.7006281407035176 

### n_component :50 ###
best param : {'kpca__gamma': 0.04777777777777778, 'kpca__kernel': 'rbf'}
best score : 0.6989530988274707 

### n_component :100 ###
best param : {'kpca__gamma': 0.03222222222222222, 'kpca__kernel': 'rbf'}
best score : 0.7006197654941374 

### n_component :200 ###
best param : {'kpca__gamma': 0.03888888888888889, 'kpca__kernel': 'rbf'}
best score : 0.7039698492462311 



##### best

In [156]:
best_param = {'kpca__gamma': 0.04777777777777778, 'kpca__kernel': 'rbf'}

n_components = 5

rbf_pca = KernelPCA(n_components=n_components,
                    gamma=0.04777777777777778,
                    kernel='rbf',
                    fit_inverse_transform=True)

train_x_rbf_pca = rbf_pca.fit_transform(train_x)
test_x_rbf_pca = rbf_pca.transform(test_x)


col = []
for i in range(int(n_components)):
    col.append('pca{}'.format(i))

train_x_rbf_pca = pd.DataFrame(data=train_x_rbf_pca, columns=col)
test_x_rbf_pca = pd.DataFrame(data=test_x_rbf_pca, columns=col)

#### LLE : locally linear embedding

In [149]:
from sklearn.manifold import LocallyLinearEmbedding

lle = LocallyLinearEmbedding(n_components=2, n_neighbors=10)
train_x_lle = lle.fit_transform(train_x)
test_x_lle = lle.fit_transform(test_x)



print(train_x_lle.shape)
print(test_x_lle.shape)



col = []
for i in range(int(d)):
    col.append('pca{}'.format(i))

train_x_lle = pd.DataFrame(data=train_x_lle, columns=col)
test_x_lle = pd.DataFrame(data=test_x_lle, columns=col)

(598, 2)
(310, 2)


# split train / test

In [205]:
from sklearn.model_selection import train_test_split

train_x_set = [train_x_pca,
           train_x_random_pca,
           train_x_reduced_pca,
           train_x_rbf_pca,
           train_x_lle]

In [206]:
basic = train_test_split(train_x_set[0], train_y, test_size=0.2)

random = train_test_split(train_x_set[1], train_y, test_size=0.2)

incremental = train_test_split(train_x_set[2], train_y, test_size=0.2)

kernal = train_test_split(train_x_set[3], train_y, test_size=0.2)

lle = train_test_split(train_x_set[4], train_y, test_size=0.2)

# build model

In [189]:
# model
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn import svm

# metric
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

rfc = RandomForestClassifier(random_state=42)
xgbc = XGBClassifier(eval_metric='mlogloss')
svmc = svm.SVC()
gbc = GradientBoostingClassifier(random_state=42)

models = [rfc, xgbc, svmc, gbc]
model_names = ['RandomForestClassifier','XGBClassifier','SVC','GradientBoostingClassifier']

In [207]:
pcas = [basic, random, incremental, kernal, lle]
pcas_n = ['basic', 'random', 'incremental', 'kernal', 'lle']

for idx, pca in enumerate(pcas):
    print('\n\n### {} ###'.format(pcas_n[idx]))
    for idx, i in enumerate(models):
        warnings.filterwarnings('ignore')
        model = i
        model.fit(pca[0], pca[2])
        pred = model.predict(pca[1])
        print('{} :'.format(model_names[idx]), accuracy_score(pred, pca[3]))



### basic ###
RandomForestClassifier : 0.6666666666666666
XGBClassifier : 0.6666666666666666
SVC : 0.6666666666666666
GradientBoostingClassifier : 0.6666666666666666


### random ###
RandomForestClassifier : 0.6666666666666666
XGBClassifier : 0.6666666666666666
SVC : 0.6666666666666666
GradientBoostingClassifier : 0.6666666666666666


### incremental ###
RandomForestClassifier : 0.7166666666666667
XGBClassifier : 0.7166666666666667
SVC : 0.7166666666666667
GradientBoostingClassifier : 0.7166666666666667


### kernal ###
RandomForestClassifier : 0.7083333333333334
XGBClassifier : 0.7166666666666667
SVC : 0.75
GradientBoostingClassifier : 0.7166666666666667


### lle ###
RandomForestClassifier : 0.7333333333333333
XGBClassifier : 0.675
SVC : 0.7333333333333333
GradientBoostingClassifier : 0.7333333333333333


In [196]:
print('### model : 1 ###\n')
for idx, i in enumerate(models):
    warnings.filterwarnings('ignore')
    model = i
    model.fit(basic[0], basic[2])
    pred = model.predict(basic[1])
    print('{} :'.format(model_names[idx]), accuracy_score(pred, basic[3]))

### model : 1 ###

RandomForestClassifier : 0.7083333333333334
XGBClassifier : 0.7083333333333334
SVC : 0.7083333333333334
GradientBoostingClassifier : 0.7083333333333334
