In [1]:
import pandas as pd
import numpy as np
from scipy import signal
from tqdm import tqdm
from numpy.fft import fft, fftshift
import random
import warnings
warnings.filterwarnings(action='ignore')

In [22]:
import os
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, train_test_split
from numpy.random import seed

In [3]:
# Load data.
# Adjust the path strings below so they point at the CSV files.
x_train_path, y_train_path, x_test_path, sub_path = (
    os.path.join(csv_path)
    for csv_path in (
        'data/train_features.csv',
        'data/train_labels.csv',
        'data/test_features.csv',
        'data/sample_submission.csv',
    )
)

# Read the four tables in the same order as the paths above.
train, train_label, test, sub = (
    pd.read_csv(csv_path)
    for csv_path in (x_train_path, y_train_path, x_test_path, sub_path)
)

In [4]:
# Add three derived columns to both the train and test frames:
#   acc_Energy    - from the acc_x/acc_y/acc_z columns
#   gy_Energy     - from the gy_x/gy_y/gy_z columns
#   gy_acc_Energy - from the per-axis differences (gy - acc)
# Each is the sum of squares raised to the power 1/3.
# NOTE(review): the exponent is 1/3 (cube root), not 1/2 — confirm this is intended before
# changing it; any weights trained on these features depend on it.
for frame in (train, test):
    acc_sq_sum = frame['acc_x']**2 + frame['acc_y']**2 + frame['acc_z']**2
    gy_sq_sum = frame['gy_x']**2 + frame['gy_y']**2 + frame['gy_z']**2
    diff_sq_sum = (
        (frame['gy_x'] - frame['acc_x'])**2
        + (frame['gy_y'] - frame['acc_y'])**2
        + (frame['gy_z'] - frame['acc_z'])**2
    )

    frame['acc_Energy'] = acc_sq_sum**(1/3)
    frame['gy_Energy'] = gy_sq_sum**(1/3)
    frame['gy_acc_Energy'] = diff_sq_sum**(1/3)

In [5]:
dt = 0.02  # step between consecutive samples; denominator of the finite difference below


def jerk_signal(signal):
    """Return the forward finite difference of ``signal`` divided by ``dt``.

    Element ``i`` of the result is ``(signal[i+1] - signal[i]) / dt``.

    Parameters
    ----------
    signal : array-like
        Sequence of samples. The difference is taken along the first axis.

    Returns
    -------
    numpy.ndarray
        Array with one element fewer than ``signal`` along the first axis
        (empty when ``signal`` has fewer than two samples).
    """
    # np.diff does the element-wise subtraction in one vectorised pass instead of a
    # Python-level loop; axis=0 matches the original "signal[i+1] - signal[i]" indexing.
    return np.diff(np.asarray(signal), axis=0) / dt

In [6]:
# For every id in `train`, append one "<column>_dt" column per feature column holding the
# finite difference produced by jerk_signal.
# The feature columns are everything after the first two columns; take the list once,
# before the loop, so it cannot change while frames are being built.
train_feature_cols = list(train.columns[2:])

train_dt = []
# groupby(sort=False) yields the ids in order of first appearance (the same order as
# train['id'].unique()) without re-filtering the whole frame once per id.
for _, id_rows in tqdm(train.groupby('id', sort=False)):
    # Work on an explicit copy so the new columns are never written into a slice of `train`.
    temp = id_rows.copy()
    for v in train_feature_cols:
        # jerk_signal returns one value fewer than its input; prepend 0 so the
        # derivative column has the same length as the group's rows.
        temp[v + '_dt'] = np.insert(jerk_signal(temp[v].values), 0, 0)
    train_dt.append(temp)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3125/3125 [00:27<00:00, 111.75it/s]


In [7]:
# For every id in `test`, append one "<column>_dt" column per feature column holding the
# finite difference produced by jerk_signal.
# The column list is taken from `test` itself (not `train`), so this cell keeps working
# even after `train` has been rebound to a frame that already carries "_dt" columns.
test_feature_cols = list(test.columns[2:])

test_dt = []
# groupby(sort=False) yields the ids in order of first appearance (the same order as
# test['id'].unique()) without re-filtering the whole frame once per id.
for _, id_rows in tqdm(test.groupby('id', sort=False)):
    # Work on an explicit copy so the new columns are never written into a slice of `test`.
    temp = id_rows.copy()
    for v in test_feature_cols:
        # jerk_signal returns one value fewer than its input; prepend 0 so the
        # derivative column has the same length as the group's rows.
        temp[v + '_dt'] = np.insert(jerk_signal(temp[v].values), 0, 0)
    test_dt.append(temp)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 782/782 [00:06<00:00, 127.31it/s]


In [8]:
from scipy import fftpack
from numpy.fft import *

def fourier_transform_one_signal(t_signal):
    """Return the magnitude of the discrete Fourier transform of ``t_signal``.

    Parameters
    ----------
    t_signal : array-like
        Samples handed to ``scipy.fftpack.fft``.

    Returns
    -------
    numpy.ndarray
        Absolute value of every complex FFT coefficient; same length as the input.
    """
    return np.abs(fftpack.fft(t_signal))

In [9]:
# Stitch the per-id frames in train_dt (each carrying the added "_dt" columns) back into one frame.
# NOTE: this rebinds `train`; any cell that reads train.columns afterwards sees the "_dt" columns too.
train=pd.concat(train_dt)

In [10]:
# Replace columns 2..7 of `train` with the magnitude of their per-id Fourier transform.
# (presumably the six raw acc_*/gy_* axes — verify against the CSV column order)
# NOTE: this cell overwrites `train`; running it twice applies the transform twice.
train_signal_cols = list(train.columns[2:8])

# The accumulator is deliberately not called `fft`, so the `fft` function imported at the
# top of the notebook is not overwritten by a list.
train_fft_frames = []
# groupby(sort=False) yields the ids in order of first appearance (the same order as
# train['id'].unique()) without re-filtering the whole frame once per id.
for _, id_rows in tqdm(train.groupby('id', sort=False)):
    # Explicit copy: the columns are overwritten on an independent frame, not on a slice.
    temp = id_rows.copy()
    for column in train_signal_cols:
        temp[column] = fourier_transform_one_signal(temp[column].values)
    train_fft_frames.append(temp)
train = pd.concat(train_fft_frames)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3125/3125 [00:09<00:00, 323.44it/s]


In [11]:
# Stitch the per-id frames in test_dt (each carrying the added "_dt" columns) back into one frame.
# NOTE: this rebinds `test`; any cell that reads test.columns afterwards sees the "_dt" columns too.
test=pd.concat(test_dt)

In [12]:
# Replace columns 2..7 of `test` with the magnitude of their per-id Fourier transform.
# (presumably the six raw acc_*/gy_* axes — verify against the CSV column order)
# NOTE: this cell overwrites `test`; running it twice applies the transform twice.
test_signal_cols = list(test.columns[2:8])

fft_t = []
# groupby(sort=False) yields the ids in order of first appearance (the same order as
# test['id'].unique()) without re-filtering the whole frame once per id.
for _, id_rows in tqdm(test.groupby('id', sort=False)):
    # Explicit copy: the columns are overwritten on an independent frame, not on a slice.
    temp = id_rows.copy()
    # A distinct loop name keeps the column name from shadowing any outer loop variable.
    for column in test_signal_cols:
        temp[column] = fourier_transform_one_signal(temp[column].values)
    fft_t.append(temp)
test = pd.concat(fft_t)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 782/782 [00:01<00:00, 532.89it/s]


In [13]:
# Remember the column index and take copies, so later steps can modify train_s/test_s
# without touching `train` / `test`.
col = train.columns
train_s, test_s = train.copy(), test.copy()

In [14]:
# Standardise every column from position 2 onwards (the first two columns are left untouched).
scaler = StandardScaler()

# Fit the scaler on the training frame only and overwrite those columns with the scaled values.
train_s.iloc[:,2:]= scaler.fit_transform(train_s.iloc[:,2:])
# Re-wrap with the column index captured from `train`.
train_sc = pd.DataFrame(data = train_s,columns =col)

# Apply the statistics fitted on train to the test frame (transform only, no refit).
test_s.iloc[:,2:]= scaler.transform(test_s.iloc[:,2:])
# NOTE(review): `col` comes from train.columns — assumes test has the same columns; confirm.
test_sc = pd.DataFrame(data = test_s,columns =col)

In [19]:
# Build the dataset.
# Helper that turns the flat feature table into per-id time-series arrays.
def make_series(data):
    """Stack the rows of every id into one array entry per id.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing an ``id`` column, a ``time`` column and the feature columns.

    Returns
    -------
    numpy.ndarray
        ``np.array`` built from one 2-D array per id (rows of that id, with the
        ``id`` and ``time`` columns dropped), in order of first appearance of each id.
    """
    grouped = data.groupby('id')
    per_id_arrays = [
        grouped.get_group(key).drop(columns=['id', 'time']).to_numpy()
        for key in data['id'].unique()
    ]
    return np.array(per_id_arrays)


def ids_26(label):
    """Return, as a list, the ``id`` values of the rows whose ``label`` equals 26.

    Parameters
    ----------
    label : pandas.DataFrame
        Table with at least the columns ``id`` and ``label``.
    """
    is_26 = label['label'].eq(26)
    return label.loc[is_26, 'id'].tolist()


def make_split_dataset(series_train, ids, labels):
    """Build 15 datasets, each pairing one slice of the label-26 samples with all other samples.

    The samples selected by ``ids`` are partitioned with a shuffled 15-fold ``KFold``
    (``random_state=42``). For every fold, the held-out part of that partition is
    concatenated with every sample that is *not* in ``ids``.

    Parameters
    ----------
    series_train : numpy.ndarray
        Samples, indexed along the first axis.
    ids : sequence of int
        Indices into ``series_train`` of the samples whose label is 26.
    labels : indexable
        Labels, indexed with the same integers as ``series_train``.
        NOTE(review): for a pandas Series this is a label-based lookup, so it assumes
        the Series index matches the positions in ``series_train`` — confirm.

    Returns
    -------
    list
        One ``[samples, labels]`` pair per fold.
    """
    final_list = []

    k_split = KFold(n_splits=15, shuffle=True, random_state=42)

    # Derive the index range from the data instead of hard-coding the sample count,
    # so the split stays correct for any number of samples.
    all_positions = np.arange(len(series_train))
    except_train_mask = np.setdiff1d(all_positions, ids, assume_unique=True)
    except_train = series_train[except_train_mask]
    except_label = labels[except_train_mask]

    train_26 = series_train[ids]

    # Only the second element of each split (the held-out indices) is used, so every
    # label-26 sample lands in exactly one of the returned datasets.
    for _, fold in k_split.split(train_26):
        temp_train = train_26[fold]
        temp_label = np.array([26] * len(temp_train))

        temp_train = np.concatenate([temp_train, except_train], axis=0)
        temp_label = np.concatenate([temp_label, except_label], axis=0)
        print(temp_train.shape)
        final_list.append([temp_train, temp_label])

    return final_list

In [20]:
# Convert the scaled train/test tables into per-id arrays with make_series.
series_train = make_series(train_sc)
series_test = make_series(test_sc)

In [23]:
# Collect the ids whose label is 26, then build the list of per-fold datasets from them.
# make_split_dataset prints the shape of each dataset it creates.
ids = ids_26(train_label)
data_list = make_split_dataset(series_train, ids, train_label['label'])

(1709, 600, 18)
(1709, 600, 18)
(1709, 600, 18)
(1708, 600, 18)
(1708, 600, 18)
(1708, 600, 18)
(1708, 600, 18)
(1708, 600, 18)
(1708, 600, 18)
(1708, 600, 18)
(1708, 600, 18)
(1708, 600, 18)
(1708, 600, 18)
(1708, 600, 18)
(1708, 600, 18)


In [24]:
import tensorflow as tf

In [25]:
# Helpers that build the training dataset and the validation dataset.
# The validation set is not shuffled.
def make_train(series_data, labels):
    """Build a shuffled, batched ``tf.data.Dataset`` for training.

    Parameters
    ----------
    series_data : array-like
        Samples, sliced along the first axis.
    labels : array-like of int
        Class labels; one-hot encoded with ``tf.keras.utils.to_categorical``.
        NOTE(review): no ``num_classes`` is passed, so the one-hot width follows the
        largest label present — confirm it always matches the model's output size.

    Returns
    -------
    tf.data.Dataset
        Batches of 64 ``(sample, one_hot_label)`` pairs with prefetching enabled.
    """
    cat_y = tf.keras.utils.to_categorical(labels)

    BATCH_SIZE = 64
    train_dataset = tf.data.Dataset.from_tensor_slices((series_data, cat_y))
    # Shuffle individual samples *before* batching. Shuffling after batching only
    # permutes whole batches, so the same samples would always share a batch.
    # The buffer covers the whole dataset (at least 1, since the buffer size must be positive).
    shuffle_buffer = max(len(series_data), 1)
    train_dataset = train_dataset.shuffle(shuffle_buffer, seed=42).batch(BATCH_SIZE)
    train_dataset = train_dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

    return train_dataset

def make_val(series_data, labels):
    """Build a batched (unshuffled) ``tf.data.Dataset`` for validation.

    Parameters
    ----------
    series_data : array-like
        Samples, sliced along the first axis.
    labels : array-like of int
        Class labels; one-hot encoded with ``tf.keras.utils.to_categorical``.

    Returns
    -------
    tf.data.Dataset
        Batches of 64 ``(sample, one_hot_label)`` pairs, in input order, with prefetching.
    """
    BATCH_SIZE = 64
    one_hot = tf.keras.utils.to_categorical(labels)

    return (
        tf.data.Dataset.from_tensor_slices((series_data, one_hot))
        .batch(BATCH_SIZE)
        .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    )

In [26]:
from tensorflow import keras

In [27]:
# Function that builds the model.
# The previous base model overfit heavily, so dropout was increased (not submitted yet).
def base():
    """Build and compile the 1-D convolutional classifier.

    Seeds NumPy and TensorFlow with 2021, then stacks three
    Conv1D -> BatchNormalization -> ReLU -> Dropout stages, a global average
    pooling layer and a 61-unit softmax output.

    Returns
    -------
    keras.models.Sequential
        Model compiled with the Adam optimizer, categorical cross-entropy loss
        and the accuracy metric.
    """
    # Seed both random generators before any layer is created.
    seed(2021)
    tf.random.set_seed(2021)
    model = keras.models.Sequential([
            # Input: sequences of 600 steps with 18 features each.
            keras.layers.Conv1D(filters=128, kernel_size=9, padding='same', input_shape=[600, 18]),
            keras.layers.BatchNormalization(),
            keras.layers.Activation('relu'),
            keras.layers.Dropout(0.3),
            keras.layers.Conv1D(filters=256, kernel_size=6, padding='same'),
            keras.layers.BatchNormalization(),
            keras.layers.Activation('relu'),
            keras.layers.Dropout(0.4),
            keras.layers.Conv1D(filters=128, kernel_size=3,padding='same'),
            keras.layers.BatchNormalization(),
            keras.layers.Activation('relu'),
            keras.layers.Dropout(0.5),
            # Average over the time axis, then classify into 61 classes.
            keras.layers.GlobalAveragePooling1D(),
            keras.layers.Dense(61, activation='softmax')
    ])
    model.compile(optimizer='adam',
                loss='categorical_crossentropy', 
                metrics=['accuracy'])
    return model

In [28]:
# Rebuild the architecture, then restore the weights stored in the fold-0 checkpoint file.
# NOTE(review): load_weights needs a checkpoint saved from the same layer stack as base() — confirm.
test_model = base()
test_model.load_weights('./checkpoint/best/k_fold_cnn/0_fold_cnn_weighted_ckpt.hdf5')

In [29]:
# Evaluate the restored model on a stratified 70/30 split of every dataset in data_list.
for fold_samples, fold_labels in data_list:
    x_train, x_val, y_train, y_val = train_test_split(
        fold_samples,
        fold_labels,
        train_size=0.7,
        stratify=fold_labels,
        random_state=42,
    )

    train_dataset = make_train(x_train, y_train)
    val_dataset = make_val(x_val, y_val)

    # Report loss/accuracy on the 70% part first, then on the held-out 30%.
    test_model.evaluate(train_dataset)
    test_model.evaluate(val_dataset)

