In [1]:
import tensorflow as tf
from tensorflow import keras
import os
import numpy as np
import pandas as pd
from scipy import fftpack
from numpy.fft import *
from numpy.random import seed
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from transforms3d.axangles import axangle2mat
import warnings
warnings.filterwarnings(action='ignore')

2021-10-08 10:51:58.495082: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1


In [2]:
# data load
# path 변수를 적절히 변경
x_train_path = os.path.join('data/train_features.csv')
y_train_path = os.path.join('data/train_labels.csv')
x_test_path = os.path.join('data/test_features.csv')
sub_path = os.path.join('data/sample_submission.csv')

train = pd.read_csv(x_train_path)
train_label = pd.read_csv(y_train_path)
test = pd.read_csv(x_test_path)
sub = pd.read_csv(sub_path)

In [3]:
def jerk_signal(signal, dt=0.02): 
        return np.array([(signal[i+1]-signal[i])/dt for i in range(len(signal)-1)])
    
    
def fourier_transform_one_signal(t_signal):
    complex_f_signal= fftpack.fft(t_signal)
    amplitude_f_signal=np.abs(complex_f_signal)
    return amplitude_f_signal


def feature_engineering(data_):
    data = data_.copy()
    data['acc_Energy']=(data['acc_x']**2+data['acc_y']**2+data['acc_z']**2)**(1/3)

    data['gy_Energy']=(data['gy_x']**2+data['gy_y']**2+data['gy_z']**2)**(1/3)

    data['gy_acc_Energy']=((data['gy_x']-data['acc_x'])**2+(data['gy_y']-data['acc_y'])**2+(data['gy_z']-data['acc_z'])**2)**(1/3)

    data_dt=[]
    for i in tqdm(data['id'].unique()):
        temp=data.loc[data['id']==i]
        for v in data.columns[2:]:
            values=jerk_signal(temp[v].values)
            values=np.insert(values,0,0)
            temp.loc[:,v+'_dt']=values
        data_dt.append(temp)
    data = pd.concat(data_dt)
    
    fft=[]
    for i in tqdm(data['id'].unique()):
        temp=data.loc[data['id']==i]
        for i in data.columns[2:8]:
            temp[i]=fourier_transform_one_signal(temp[i].values)
        fft.append(temp)
    data=pd.concat(fft)
    
    return data

In [4]:
def scaling(data_, scaler=None):
    data = data_
    col = data.columns
    
    if scaler:
        data.iloc[:,2:]= scaler.transform(data.iloc[:,2:])
        data = pd.DataFrame(data = data,columns =col)
        
        return data
    
    else:
        scaler = StandardScaler()
    
        data.iloc[:,2:]= scaler.fit_transform(data.iloc[:,2:])
        data = pd.DataFrame(data = data,columns =col)
        
        return data, scaler

In [18]:
def rolling(data_):
    data = data_.copy()
    sampling = np.random.choice(data.shape[0], int(data.shape[0] * 2 / 3))
    for j in sampling:
        data[j] = np.roll(data[j], np.random.choice(data.shape[1]), axis=0)
    return data


def rotation(data_):
    data = data_.copy()
    axis = np.random.uniform(low=-1, high=1, size=data.shape[1])
    angle = np.random.uniform(low=-np.pi, high=np.pi)
    return np.matmul(data, axangle2mat(axis, angle))


def permutation(data_, nPerm=4, mSL=10):
    data = data_.copy()
    data_new = np.zeros(data.shape)
    idx = np.random.permutation(nPerm)
    bWhile = True
    while bWhile == True:
        segs = np.zeros(nPerm + 1, dtype=int)
        segs[1:-1] = np.sort(np.random.randint(mSL, data.shape[0] - mSL, nPerm - 1))
        segs[-1] = data.shape[0]
        if np.min(segs[1:] - segs[0:-1]) > mSL:
            bWhile = False
    pp = 0
    for ii in range(nPerm):
        data_temp = data[segs[idx[ii]]:segs[idx[ii] + 1], :]
        data_new[pp:pp + len(data_temp), :] = data_temp
        pp += len(data_temp)
    return data_new


# 2:5
# 5:
def augmentation(data_, labels):
    data = data_.copy()
    
    # rotation
    print('rotation...')
    ro_aug = []
    ro_label = []
    ro_sampling = np.random.choice(data.shape[0]//600, int((data.shape[0]//600) * 1 / 3))
    for j in tqdm(ro_sampling):
        columns = data.iloc[:, 2:].columns
        temp_df = data.iloc[j*600:(j+1)*600, :2]
        # acc
        acc = rotation(np.array(data.iloc[j*600:(j+1)*600, 2:5]))
        # gy
        gy = rotation(np.array(data.iloc[j*600:(j+1)*600, 5:]))
        
        temp = np.concatenate([acc, gy], axis=1)
        temp = pd.DataFrame(temp, columns=columns, index=temp_df.index)
        temp = pd.concat([temp_df, temp], axis=1)
        ro_label.append(labels[j])
        ro_aug.append(temp)
        
    ro_aug = pd.concat(ro_aug)
    ro_feature = feature_engineering(ro_aug)
    ro_sc, _ = scaling(ro_feature)
    ro_series = ro_sc.iloc[:, 2:].to_numpy().reshape(-1, 600, 18)
    ro_series = rolling(ro_series)
    
    # permutation
    print('permutation...')
    per_aug = []
    per_label = []
    per_sampling = np.random.choice(data.shape[0]//600, int((data.shape[0]//600) * 1 / 3))
    for j in tqdm(per_sampling):
        columns = data.iloc[:, 2:].columns
        temp_df = data.iloc[j*600:(j+1)*600, :2]
        # acc
        acc = permutation(np.array(data.iloc[j*600:(j+1)*600, 2:5]))
        # gy
        gy = permutation(np.array(data.iloc[j*600:(j+1)*600, 5:]))
        
        temp = np.concatenate([acc, gy], axis=1)
        temp = pd.DataFrame(temp, columns=columns, index=temp_df.index)
        temp = pd.concat([temp_df, temp], axis=1)
        per_label.append(labels[j])
        per_aug.append(temp)
    
    per_aug = pd.concat(per_aug)
    per_feature = feature_engineering(per_aug)
    per_sc, _ = scaling(per_feature)
    per_series = per_sc.iloc[:, 2:].to_numpy().reshape(-1, 600, 18)
    per_series = rolling(per_series)
    
    origin_feature = feature_engineering(data)
    origin_sc, _ = scaling(origin_feature)
    origin_series = origin_sc.iloc[:, 2:].to_numpy().reshape(-1, 600, 18)
    
    final = np.concatenate([origin_series, ro_series, per_series], axis=0)
    final_label = np.concatenate([labels, ro_label, per_label], axis=0)
    
    return final, final_label

In [10]:
def ids_26(label):
    mask = label['label'] == 26
    ids = label.loc[mask, 'id'].to_numpy()
    
    return ids


def make_split_dataset(train, ids, labels):
    final_list = []
    columns = train.columns
    train = train.to_numpy().reshape(-1, 600, 8)
    
    k_split = KFold(n_splits=15, shuffle=True, random_state=42)
    
    except_mask = np.setdiff1d(np.array(range(3125)), ids)
    except_train = train[except_mask]
    except_label = labels[except_mask]
    
    train_26 = train[ids]

    for _, fold in k_split.split(train_26):
        temp_train = train_26[fold]
        temp_label = np.array([26] * len(temp_train))
        
        temp_train = np.concatenate([temp_train, except_train], axis=0)
        temp_label = np.concatenate([temp_label, except_label], axis=0)
        
        temp_train = pd.DataFrame(temp_train.reshape(temp_train.shape[0] * 600, -1), columns=columns)
        print(temp_train.shape)
        final_list.append([temp_train, temp_label])
        
    return final_list

In [11]:
# dataset과 validation set을 만들어 주는 함수
# validation set은 shuffle 적용 x
def make_train(series_data, labels):
    cat_y = tf.keras.utils.to_categorical(labels)

    BATCH_SIZE = 64
    train_dataset = tf.data.Dataset.from_tensor_slices((series_data, cat_y))
    train_dataset = train_dataset.batch(BATCH_SIZE).shuffle(1000, seed=42)
    train_dataset = train_dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

    return train_dataset

def make_val(series_data, labels):
    cat_y = tf.keras.utils.to_categorical(labels)

    BATCH_SIZE = 64
    val_dataset = tf.data.Dataset.from_tensor_slices((series_data, cat_y))
    val_dataset = val_dataset.batch(BATCH_SIZE)
    val_dataset = val_dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

    return val_dataset

In [12]:
class Rolling(keras.layers.Layer):
    def __init__(self):
        super(Rolling, self).__init__()
    
    def call(self, inputs, training=None):
        if training:
            return tf.roll(inputs, shift=np.random.randint(0, 599), axis=1)
        else:
            return inputs        

In [13]:
# 모델을 만들어 주는 함수
# 기존 base에서 overfitting이 심해, dropout을 늘림(아직 제출은 안해봄)
def base():
    seed(2021)
    tf.random.set_seed(2021)
    model = keras.models.Sequential([
            keras.layers.Input([600, 18]),
            Rolling(),
            keras.layers.Conv1D(filters=128, kernel_size=9, padding='same'),
            keras.layers.BatchNormalization(),
            keras.layers.Activation('relu'),
            keras.layers.Dropout(0.3),
            keras.layers.Conv1D(filters=256, kernel_size=6, padding='same'),
            keras.layers.BatchNormalization(),
            keras.layers.Activation('relu'),
            keras.layers.Dropout(0.4),
            keras.layers.Conv1D(filters=128, kernel_size=3,padding='same'),
            keras.layers.BatchNormalization(),
            keras.layers.Activation('relu'),
            keras.layers.Dropout(0.5),
            keras.layers.GlobalAveragePooling1D(),
            keras.layers.Dense(61, activation='softmax')
    ])
    model.compile(optimizer='adam',
                loss='categorical_crossentropy', 
                metrics=['accuracy'])
    return model

In [14]:
ids = ids_26(train_label)
data_list = make_split_dataset(train, ids, train_label['label'])

(1025400, 8)
(1025400, 8)
(1025400, 8)
(1024800, 8)
(1024800, 8)
(1024800, 8)
(1024800, 8)
(1024800, 8)
(1024800, 8)
(1024800, 8)
(1024800, 8)
(1024800, 8)
(1024800, 8)
(1024800, 8)
(1024800, 8)


In [15]:
# checkpoint path
# 중간중간 모델의 weight를 저장할 경로 설정
ckpt_name = 'random_sampling.hdf5'
checkpoint_dir_path = os.path.join('checkpoint')
checkpoint_path = os.path.join('checkpoint', ckpt_name)

# check checkpoint paht
# 경로가 없으면 생성함
if not(os.path.exists(checkpoint_dir_path)):
    os.mkdir(checkpoint_dir_path)

# callback 함수 목록
callbacks_list = [
    # 매 epoch 마다 val_loss를 체크하여 가장 낮은 상태의 weight를 저장
    tf.keras.callbacks.ModelCheckpoint(
        filepath = checkpoint_path,
        monitor='val_loss',
        mode='min',
        save_weights_only=True,
        save_best_only=True
    ),
    # 8번 동안 val_loss의 향상이 없으면 훈련 종료
    tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        mode='min',
        verbose=1, 
        patience=8
    ),
    tf.keras.callbacks.ReduceLROnPlateau(patience = 4,verbose = 1,factor = 0.5)
]

In [21]:
models = []
k = 5
split = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
for data in data_list:
    series = data[0].to_numpy().reshape(-1, 600, 8)
    
    train, val, y_train, y_val = train_test_split(series, data[1], train_size=0.7, stratify=data[1], random_state=42)
    
    train_data = pd.DataFrame(train.reshape(train.shape[0]*600, -1), columns=data[0].columns)
    val_data = pd.DataFrame(val.reshape(val.shape[0]*600, -1), columns=data[0].columns)
    
    val_data = feature_engineering(val_data)
    val_data = val_data.iloc[:, 2:].to_numpy().reshape(-1, 600, 18)
    
    print('augmentation...')
    aug_data, aug_y = augmentation(train_data, y_train)
    y_train = np.concatenate([y_train, y_train, y_train], axis=0)
    
    #train_data = feature_engineering(train_data)
    #train_data = train_data.iloc[:, 2:].to_numpy().reshape(-1, 600, 18)
    
    train_dataset = make_train(aug_data, aug_y)
    #train_dataset = make_train(train_data, aug_y)
    val_dataset = make_val(val_data, y_val)
    model = base()

    model.fit(train_dataset, validation_data=val_dataset, callbacks=callbacks_list, epochs=1000)
    #model.fit(train_dataset, epochs=1000)
    #model.load_weights(checkpoint_path)
    model.evaluate(train_dataset)
    #model.evaluate(val_dataset)

    models.append(model)

100%|████████████████████████████████████████| 513/513 [00:03<00:00, 137.53it/s]
100%|████████████████████████████████████████| 513/513 [00:00<00:00, 951.39it/s]


augmentation...
rotation...


100%|███████████████████████████████████████| 398/398 [00:00<00:00, 1914.38it/s]
100%|████████████████████████████████████████| 333/333 [00:02<00:00, 131.44it/s]
100%|████████████████████████████████████████| 333/333 [00:00<00:00, 959.20it/s]


permutation...


100%|███████████████████████████████████████| 398/398 [00:00<00:00, 1724.15it/s]
100%|████████████████████████████████████████| 336/336 [00:02<00:00, 128.73it/s]
100%|███████████████████████████████████████| 336/336 [00:00<00:00, 1005.61it/s]
100%|██████████████████████████████████████| 1196/1196 [00:08<00:00, 133.74it/s]
100%|██████████████████████████████████████| 1196/1196 [00:01<00:00, 863.36it/s]


Epoch 1/1000


2021-10-08 10:57:30.995828: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcublas.so.10
2021-10-08 10:57:31.097656: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudnn.so.7


Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 00008: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 00012: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 00012: early stopping


 76%|██████████████████████████████▎         | 388/513 [00:02<00:00, 133.25it/s]


KeyboardInterrupt: 

In [89]:
# 결과 생성
pred_list = []    # 예측 결과를 담을 리스트
for model in models:
    pred = model.predict(series_test)
    pred_list.append(pred)

pred = np.mean(pred_list, axis=0)

In [104]:
# 제출물 생성
sub.iloc[:, 1:] = pred
sub.to_csv('overfit.csv', index=False)