# LSTM

### Processor
* VOC_duration : 1min
* Wash_Out : 8min

### Sensors
+ TGS : TGS2600, TGS2602, TGS2603, TGS2610, TGS2620
+ MQ : MQ2, MQ3, MQ4, MQ5, MQ6, MQ7, MQ8, MQ9, MQ135

### VOC
1. DPG (10%) + DPG (100%)
2. Acetic (10%) + DPG (99%)
3. Benzaldehyde (10%) + DPG (99%)
4. Keras (10%) + DPG (99%)

In [12]:
import pandas as pd
import numpy as np
import seaborn as sns
import pymysql
from pathlib import Path
from pprint import pprint

import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
from matplotlib import cm
from matplotlib.colors import ListedColormap

import os, sys
from pathlib import Path
from tqdm import tqdm
from datetime import datetime, timedelta
import datetime, time

from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, export_graphviz, _tree
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve
from sklearn.metrics import roc_auc_score, roc_curve, mean_squared_error, make_scorer
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import minmax_scale
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix

import statsmodels.api as sm
from scipy.stats import spearmanr

import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, LSTM
import tensorflow.keras.backend as K

from keras.models import Sequential
from keras.layers import LSTM, Dense
from keras import backend as K
import keras

import graphviz

In [13]:
import warnings

# NOTE(review): this silences *every* warning for the whole session,
# including pandas / matplotlib deprecation notices. Narrow the filter when
# debugging unexpected behaviour.
warnings.filterwarnings(action='ignore')

# Names of the style sheets known to the installed matplotlib.
plot_style = plt.style.available

# matplotlib 3.6 renamed the bundled seaborn sheets to 'seaborn-v0_8-*' and
# newer releases no longer accept the old names. Pick whichever spelling is
# available (falling back to the default style) instead of hard-coding one
# that may raise on import.
_paper_style = next(
    (name for name in ('seaborn-v0_8-paper', 'seaborn-paper') if name in plot_style),
    'default',
)
plt.style.use(_paper_style)

# Show up to 500 rows and keep wide frames on a single line when printing.
pd.set_option('display.max_rows', 500)
pd.set_option('display.expand_frame_repr', False)

# Fix NumPy's global seed so random operations are repeatable between runs.
np.random.seed(42)

In [14]:
# Folder layout used by the notebook; every path is relative to PROJECT_DIR.
PROJECT_DIR = Path('.')
DATA_DIR = PROJECT_DIR / 'data'
CSV_DIR = PROJECT_DIR / 'csv'
RESULT_DIR = PROJECT_DIR / 'results'
# Binary artefacts live in sub-folders of the data directory.
HDF_DIR = DATA_DIR / 'hdf'
NPY_DIR = DATA_DIR / 'npy'

## LSTM

### Check of CPU/GPU status

In [15]:
# Report whether TensorFlow can see a GPU and, if so, ask it to allocate GPU
# memory on demand instead of reserving it all up front.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
if gpu_devices:
    print('Using GPU')
    try:
        # The setting is applied to every visible GPU, not just the first:
        # TensorFlow requires memory growth to be identical across devices.
        for gpu_device in gpu_devices:
            tf.config.experimental.set_memory_growth(gpu_device, True)
    except RuntimeError as err:
        # Raised when the GPUs were already initialised (e.g. the cell is
        # re-run after a model was built); the existing setting stays active.
        print(f'Could not enable GPU memory growth: {err}')
else:
    print('Using CPU')

Using GPU


### Data read (1-sec)

In [16]:
# Load the 1-second sensor table from the project's HDF5 store.
# The store is opened read-only: the default append mode would create an
# empty file when the path is wrong and takes a write handle that a pure
# read does not need.
with pd.HDFStore(PROJECT_DIR/'drug_data_1sec.h5', mode='r') as store:
    print(store.info())
    # The key is spelled 'tgs_sensers' in the file itself (see the store
    # listing printed above), so it must not be "corrected" here.
    df_01 = store['/drug/tgs_sensers/1sec_mean']

<class 'pandas.io.pytables.HDFStore'>
File path: drug_data_1sec.h5
/drug/tgs_sensers/1sec_mean            frame        (shape->[394550,6])


In [17]:
# Print a summary of the loaded frame: index range, columns, non-null counts and dtypes.
df_01.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 394550 entries, ('Acetic_001', Timestamp('2023-05-07 03:44:03')) to ('Keras_725', Timestamp('2023-05-11 17:57:43'))
Data columns (total 6 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   TGS2600  390472 non-null  float64
 1   TGS2602  390472 non-null  float64
 2   TGS2603  390472 non-null  float64
 3   TGS2620  390472 non-null  float64
 4   TGS2610  390472 non-null  float64
 5   TGS822   390472 non-null  float64
dtypes: float64(6)
memory usage: 31.4+ MB


In [18]:
# Work on a copy so the frame loaded from disk (df_01) is left unmodified.
df = df_01.copy()
# Print the summary of the working copy.
df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 394550 entries, ('Acetic_001', Timestamp('2023-05-07 03:44:03')) to ('Keras_725', Timestamp('2023-05-11 17:57:43'))
Data columns (total 6 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   TGS2600  390472 non-null  float64
 1   TGS2602  390472 non-null  float64
 2   TGS2603  390472 non-null  float64
 3   TGS2620  390472 non-null  float64
 4   TGS2610  390472 non-null  float64
 5   TGS822   390472 non-null  float64
dtypes: float64(6)
memory usage: 31.4+ MB


In [19]:
# Sensor channels selected for the analysis. The variable name keeps its
# original (misspelled) form because other cells refer to it.
available_senosors = [
    'TGS2600', 'TGS2602', 'TGS2603',
    'TGS2620', 'TGS2610', 'TGS822',
]
# Full column list: the sensor channels followed by three bookkeeping fields.
columns = available_senosors + ['CYCLE', 'VOC_CYCLE', 'UNIXSTAMP']
# No sensor is excluded at the moment.
non_available_sensors = []

In [20]:
# Integer class code for each VOC name; the code is the name's position in
# this tuple.
_voc_names = ('DPG', 'Acetic', 'Benzaldehyde', 'Keras')
labels = {voc_name: code for code, voc_name in enumerate(_voc_names)}

In [21]:
# Shorthand for building MultiIndex slices in later .loc look-ups.
idx = pd.IndexSlice
# Number of sensor channels kept for the analysis.
available_sensors_no = len(available_senosors)
# Keep only the selected sensor columns.
df = df.loc[:, available_senosors]
# Distinct values of the 'VOC_CYCLE' index level, in order of first appearance.
_voc_cycle_values = df.index.get_level_values('VOC_CYCLE')
samples = _voc_cycle_values.unique().tolist()

In [26]:
# Lengths (in seconds / rows of the 1-second data) of the leading window that
# is cut from every sample.
analysis_seconds = [30, 60*1, 60*2, 60*3, 60*4, 60*5, 60*6, 60*7, 60*8, 60*9]

# Every window length is written to the same HDF5 file under its own key
# prefix. Create the target folder first so the first write cannot fail on a
# fresh checkout.
Path(HDF_DIR).mkdir(parents=True, exist_ok=True)
hdf_path = str(Path(HDF_DIR) / '1_sec_data.h5')

for sec in tqdm(analysis_seconds):
    analysis_time = int(sec)  # seconds

    # Collect the per-sample windows in a list and concatenate once after the
    # loop; growing a DataFrame with pd.concat per sample copies all
    # previously collected rows on every iteration.
    frames = []
    voc_labels = []
    for sample in samples:
        # The VOC name is the part of the sample id before the first '_'.
        voc_label = sample.split('_')[0]
        df_temp = df.loc[idx[str(sample), :], :]
        # Gaps are filled forward, then backward, over the whole sample
        # before the leading `analysis_time` rows are taken.
        df_temp = df_temp.ffill().bfill()[:analysis_time]
        # Samples shorter than the requested window are skipped.
        if len(df_temp) != analysis_time:
            continue
        frames.append(df_temp)
        voc_labels.append(voc_label)

    # No qualifying sample yields an empty frame, as before.
    df_min = pd.concat(frames) if frames else pd.DataFrame()

    y_voc = pd.DataFrame({'voc': voc_labels})
    y_voc_onehot = pd.get_dummies(y_voc['voc']).astype(int)
    # LabelEncoder expects a 1-D array of labels, so pass the column itself.
    y_voc_encoded = LabelEncoder().fit_transform(y_voc['voc'])

    # Data Saving (y_voc_encoded is kept in memory only and is not persisted).
    df_min.to_hdf(hdf_path, key=f'{analysis_time}sec/X_sensor')
    y_voc.to_hdf(hdf_path, key=f'{analysis_time}sec/y_voc')
    y_voc_onehot.to_hdf(hdf_path, key=f'{analysis_time}sec/y_voc_onehot')

100% 10/10 [01:07<00:00,  6.79s/it]
