# Libraries

In [1]:
import os
import random
import numpy as np
import pandas as pd
import pickle
import gc

import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, confusion_matrix

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
def seed_everything(seed=0):
    """Seed the RNG sources used in this notebook.

    Sets the ``PYTHONHASHSEED`` environment variable and seeds both Python's
    ``random`` module and NumPy's global generator with ``seed``.

    NOTE(review): ``PYTHONHASHSEED`` is read at interpreter start-up, so
    setting it here presumably only affects processes launched afterwards.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


# Single seed shared by every stochastic step in the notebook.
SEED = 2019
seed_everything(SEED)

# Preprocess

## create dataframe

In [4]:
DATASET_PATH = './dataset/'
actions = ['cClockwise', 'clockwise', 'left2right', 'right2left', 'up2down']
N_FEATURES = 102  # flattened length every example must have (CSV columns 0..101)

# For each gesture: load every pickled example in its folder, flatten it to a
# single row, and write the resulting table to '<DATASET_PATH><action>.csv'.
for action in actions:
    action_dir = os.path.join(DATASET_PATH, action)

    # os.listdir returns names in arbitrary order, so sort for a reproducible
    # row order. Hidden entries (e.g. macOS '.DS_Store') are not pickles.
    example_names = sorted(
        name for name in os.listdir(action_dir) if not name.startswith('.')
    )

    rows = []
    for example in example_names:
        # e.g. action == 'cClockwise', example == '1.txt'
        with open(os.path.join(action_dir, example), 'rb') as f:
            # NOTE: pickle.load can execute arbitrary code -- only load trusted files.
            data = pickle.load(f)

        row = np.asarray(data).ravel()
        if row.size != N_FEATURES:
            raise ValueError(
                '{}/{}: expected {} values after flattening, got {}'.format(
                    action, example, N_FEATURES, row.size
                )
            )
        rows.append(row)

    # Build the frame in one shot instead of assigning row by row into an
    # empty object-dtype frame. An empty folder still yields a header-only CSV.
    matrix = np.vstack(rows) if rows else np.empty((0, N_FEATURES))
    df = pd.DataFrame(matrix, columns=range(N_FEATURES))

    df.to_csv(os.path.join(DATASET_PATH, '{}.csv'.format(action)), index=False)

In [5]:
# List the contents of the dataset directory.
# NOTE(review): the saved output of this cell shows '<action>_example.csv'
# files, while the surrounding cells write and read '<action>.csv' -- the
# stored output looks stale; re-run to refresh it.
os.listdir('./dataset/')

['up2down',
 'cClockwise',
 'clockwise',
 '.DS_Store',
 'right2left_example.csv',
 'clockwise_example.csv',
 'up2down_example.csv',
 'right2left',
 'left2right_example.csv',
 'cClockwise_example.csv',
 'left2right']

In [6]:
# Read the five per-gesture CSV tables; df1..df5 follow the order of the names below.
df1, df2, df3, df4, df5 = (
    pd.read_csv('./dataset/{}.csv'.format(name))
    for name in ('cClockwise', 'clockwise', 'left2right', 'right2left', 'up2down')
)

In [7]:
# Attach the integer class label of each gesture as a 'target' column.
label_by_frame = (
    (df1, 4),  # counter-clockwise
    (df2, 3),  # clockwise
    (df3, 2),  # left to right
    (df4, 1),  # right to left
    (df5, 0),  # up to down
)
for frame, label in label_by_frame:
    frame['target'] = label

In [8]:
# (rows, columns) of each per-gesture frame, in df1..df5 order.
tuple(frame.shape for frame in (df1, df2, df3, df4, df5))

((1000, 103), (1007, 103), (1000, 103), (1006, 103), (1012, 103))

In [9]:
# Stack the five per-gesture frames row-wise in a single concat call
# (avoids re-copying the growing frame on every pairwise concat).
# Each frame keeps its own row index here, so index values repeat.
df = pd.concat([df1, df2, df3, df4, df5], axis=0)

In [10]:
# Number of rows per class label in the combined frame.
df['target'].value_counts()

0    1012
3    1007
1    1006
2    1000
4    1000
Name: target, dtype: int64

## train, test split

In [11]:
# Shuffle all rows before the positional train/test split.
# random_state is passed explicitly so the permutation does not depend on how
# much of the global NumPy RNG stream was consumed before this cell ran.
df = df.sample(frac=1, random_state=SEED)

In [12]:
# Replace the existing row index with a fresh 0..n-1 index; drop=True discards
# the old index instead of keeping it as a column.
df = df.reset_index(drop=True)

In [13]:
# Row count of the training partition: 80% of the rows (truncated to an int);
# the remaining 20% form the test set.
TRAIN_FRACTION = 0.8
train_num = int(TRAIN_FRACTION * len(df))

In [14]:
# Positional split: the first train_num rows train, the rest are held out.
train_df, test_df = df.iloc[:train_num], df.iloc[train_num:]

In [15]:
# (rows, columns) of the train and test partitions.
train_df.shape, test_df.shape

((4020, 103), (1005, 103))

In [16]:
# Rows per class label in the training partition.
train_df['target'].value_counts()

2    812
3    810
4    809
0    804
1    785
Name: target, dtype: int64

In [17]:
# Rows per class label in the test partition.
test_df['target'].value_counts()

1    221
0    208
3    197
4    191
2    188
Name: target, dtype: int64

- 대략 골고루 들어간 것을 확인

# Training

In [18]:
# example of randomforest
# X = df.iloc[:, :-1]
# y = df['target']

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=2019)

# model = RandomForestClassifier(n_estimators=200, random_state=2019)

# model.fit(X_train, y_train)

# result = model.predict(X_test)

# accuracy_score(result, y_test)

In [19]:
# Labels are the 'target' column; features are every column except the last
# one (the cell that adds 'target' appends it after the feature columns).
y = train_df['target']
X = train_df.iloc[:, :-1]
num_classes = y.nunique()

In [20]:
# LightGBM training parameters for the multiclass gesture classifier.
# NOTE(review): a tree of depth 5 has at most 2**5 = 32 leaves, so
# num_leaves = 256 looks non-binding next to max_depth = 5 -- confirm intent.
params = dict(
    objective="multiclass",
    num_class=num_classes,
    num_leaves=2 ** 8,
    max_depth=5,
    learning_rate=0.01,
    bagging_fraction=0.9,  # subsample
    feature_fraction=0.9,  # colsample_bytree
    bagging_freq=2,        # subsample_freq
    verbosity=-1,
)

In [21]:
NFOLDS = 5
NUM_BOOST_ROUND = 1000       # upper bound on boosting iterations per fold
EARLY_STOPPING_ROUNDS = 20   # stop when the validation metric stalls this long
LOG_PERIOD = 100             # print metrics every LOG_PERIOD iterations

# Stratified K-fold training: one booster per fold is kept in `models`.
folds = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)
models = []
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
    print('Fold:',fold_)
    tr_x, tr_y = X.iloc[trn_idx,:], y.iloc[trn_idx]
    vl_x, vl_y = X.iloc[val_idx,:], y.iloc[val_idx]

    lgb_train, lgb_valid = lgb.Dataset(tr_x, tr_y), lgb.Dataset(vl_x, vl_y)

    # Early stopping and periodic logging are passed as callbacks: the
    # `early_stopping_rounds` / `verbose_eval` keyword arguments of lgb.train
    # were removed in LightGBM 4.0. `lgb.log_evaluation` needs LightGBM >= 3.3.
    model = lgb.train(
        params,
        lgb_train,
        num_boost_round=NUM_BOOST_ROUND,
        valid_sets=[lgb_train, lgb_valid],
        callbacks=[
            lgb.early_stopping(stopping_rounds=EARLY_STOPPING_ROUNDS),
            lgb.log_evaluation(period=LOG_PERIOD),
        ],
    )
    models.append(model)
    print('\n')

    # Drop per-fold objects before the next fold; the booster itself stays
    # referenced from `models`.
    del model, tr_x, tr_y, vl_x, vl_y, lgb_train, lgb_valid
    gc.collect()

Fold: 0
Training until validation scores don't improve for 20 rounds.
[100]	training's multi_logloss: 0.532695	valid_1's multi_logloss: 0.573796
[200]	training's multi_logloss: 0.210907	valid_1's multi_logloss: 0.257586
[300]	training's multi_logloss: 0.0881561	valid_1's multi_logloss: 0.132501
[400]	training's multi_logloss: 0.0377901	valid_1's multi_logloss: 0.0794044
[500]	training's multi_logloss: 0.0159759	valid_1's multi_logloss: 0.0556582
[600]	training's multi_logloss: 0.00684311	valid_1's multi_logloss: 0.0445372
[700]	training's multi_logloss: 0.00296053	valid_1's multi_logloss: 0.0389563
[800]	training's multi_logloss: 0.0012947	valid_1's multi_logloss: 0.0354611
[900]	training's multi_logloss: 0.000569325	valid_1's multi_logloss: 0.0328597
[1000]	training's multi_logloss: 0.000253241	valid_1's multi_logloss: 0.0312174
Did not meet early stopping. Best iteration is:
[1000]	training's multi_logloss: 0.000253241	valid_1's multi_logloss: 0.0312174


Fold: 1
Training until valid

# Inference

In [22]:
# Held-out labels ('target') and features (every column except the last).
y_test = test_df['target']
X_test = test_df.iloc[:, :-1]

In [23]:
# One prediction array per fold model, each using that model's best iteration.
predictions = [
    fold_model.predict(X_test, num_iteration=fold_model.best_iteration)
    for fold_model in models
]

In [24]:
# Average the per-fold prediction arrays element-wise, then take the column
# index of the largest averaged value in each row as the predicted label.
predictions = np.mean(predictions, axis=0).argmax(axis=1)

In [25]:
# Test-set accuracy ('정확도' = "accuracy"). Arguments follow scikit-learn's
# (y_true, y_pred) order, consistent with the confusion_matrix call below.
print("정확도: {}".format(accuracy_score(y_test, predictions)))

정확도: 0.9900497512437811


In [26]:
# Table with the true label ('target') next to the model's 'prediction',
# re-indexed from 0 so rows line up positionally with `predictions`.
result = (
    pd.DataFrame(y_test)
    .reset_index(drop=True)
    .assign(prediction=predictions)
)

In [27]:
# Display the true-label / prediction table.
result

Unnamed: 0,target,prediction
0,0,0
1,1,1
2,4,4
3,0,0
4,4,4
5,1,1
6,3,3
7,0,0
8,0,0
9,2,2


In [28]:
# Rows where the predicted label differs from the true label.
result[result['target'].ne(result['prediction'])]

Unnamed: 0,target,prediction
112,1,2
136,3,1
445,3,2
472,2,4
788,3,2
805,3,2
853,3,0
892,4,0
897,3,0
918,4,1


In [29]:
# Confusion matrix on the test set: rows are true labels (y_test),
# columns are predicted labels.
confusion_matrix(y_test, predictions)

array([[208,   0,   0,   0,   0],
       [  0, 220,   1,   0,   0],
       [  0,   0, 187,   0,   1],
       [  2,   1,   3, 191,   0],
       [  1,   1,   0,   0, 189]])