# Microbe VOC Identification with Decision Trees

### Processor
* VOC_duration : 3sec
* Wash_Out : 8min

### Sensors
+ TGS : TGS2600, TGS2602, TGS2603, TGS2610, TGS2620, TGS822
+ MQ : MQ2, MQ3, MQ4, MQ5, MQ6, MQ7, MQ8, MQ9, MQ135 (MQ3, MQ5, MQ6 and MQ9 contain no readings in this dataset)

### VOC
1. C.albicans
2. C.glabrata
3. E.coli
4. K.pneumoniae
5. S.aureus
6. S.epidermidis

In [34]:
import pandas as pd
import numpy as np
import seaborn as sns
import pymysql
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
from matplotlib import cm
import seaborn as sns

import datetime, time
import os, sys
from pathlib import Path
from tqdm import tqdm
from datetime import datetime, timedelta

from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, export_graphviz, _tree
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve
from sklearn.metrics import roc_auc_score, roc_curve, mean_squared_error, make_scorer
import graphviz

import statsmodels.api as sm
from scipy.stats import spearmanr

In [1]:
import warnings
warnings.filterwarnings(action='ignore')

In [4]:
# Names of the matplotlib style sheets available in this environment.
plot_style = plt.style.available

# Let pandas print up to 500 rows before it truncates a frame.
pd.options.display.max_rows = 500

In [5]:
# The seaborn style sheets were renamed to 'seaborn-v0_8-*' in matplotlib 3.6
# and the old names were removed in 3.8, so prefer the new name when it is
# installed. If neither name is found, fall through to the legacy name so
# matplotlib raises its usual "not a valid style" error.
# Other styles tried previously: 'ggplot', 'default', 'fivethirtyeight',
# 'seaborn-muted'.
_paper_style = next(
    (name for name in ('seaborn-v0_8-paper', 'seaborn-paper')
     if name in plt.style.available),
    'seaborn-paper',
)
plt.style.use(_paper_style)

In [6]:
# Keep wide frames on one line instead of wrapping their columns.
pd.options.display.expand_frame_repr = False
# Fix NumPy's global RNG so stochastic steps are repeatable.
np.random.seed(seed=42)

In [7]:
# Notebook-relative locations: the project root and its data sub-folder.
PROJECT_DIR = Path('.')
DATA_DIR = PROJECT_DIR / 'data'

In [8]:
# Show the configured locations as a quick sanity check.
print(f'{PROJECT_DIR} {DATA_DIR}')

. data


## Load Data (1-second)

In [14]:
# Open the store read-only: the default mode ('a') would create an empty file
# when the path is wrong and leaves the dataset open for accidental writes.
# NOTE: the key really is spelled 'sensers' in this file (see store.info()).
with pd.HDFStore(PROJECT_DIR/'microbe_data_1s.h5', mode='r') as store:
    print(store.info())
    df_01 = store['/microbe/sensers/1sec_mean']

<class 'pandas.io.pytables.HDFStore'>
File path: microbe_data_1s.h5
/microbe/sensers/1sec_mean            frame        (shape->[1313754,17])


In [30]:
# Work on a deep copy so the frame loaded from disk stays untouched.
df = df_01.copy(deep=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1313754 entries, ('C.albicans_sensor_1_000_1', Timestamp('2023-09-11 11:00:37')) to ('S.epidermidis_sensor_3_120_1', Timestamp('2023-10-06 01:11:58'))
Data columns (total 17 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   TGS2600        1302568 non-null  float64
 1   TGS2602        1300634 non-null  float64
 2   TGS2603        1301748 non-null  float64
 3   TGS2620        1302197 non-null  float64
 4   TGS2610        1301220 non-null  float64
 5   TGS822         1300517 non-null  float64
 6   MQ2            1301531 non-null  float64
 7   MQ3            0 non-null        float64
 8   MQ4            1302506 non-null  float64
 9   MQ5            0 non-null        float64
 10  MQ6            0 non-null        float64
 11  MQ7            1302414 non-null  float64
 12  MQ8            1301943 non-null  float64
 13  MQ9            0 non-null        float64
 14  MQ135          1301908 no

In [31]:
# Column names in the order used by this notebook; the last two entries are
# not sensor channels.
columns = [
    'TGS2600', 'TGS2602', 'TGS2603', 'TGS2620', 'TGS2610', 'TGS822',
    'MQ2', 'MQ3', 'MQ4', 'MQ5', 'MQ6', 'MQ7', 'MQ8', 'MQ9', 'MQ135',
    'CYCLE', 'TIME_RELATIVE',
]

# Sensor channels with no usable readings (0 non-null values in df.info()).
non_available_senosrs = ['MQ3', 'MQ5', 'MQ6', 'MQ9']

# Every remaining sensor channel, kept in the same order as `columns`.
# (The "senosrs" spelling is kept because other cells refer to these names.)
available_senosrs = [
    name for name in columns[:-2] if name not in non_available_senosrs
]

In [33]:
# Keep only the sensors that carry data, then back-fill gaps with the next
# valid reading. DataFrame.bfill() replaces fillna(method='bfill'), whose
# `method` keyword is deprecated since pandas 2.1.
# NOTE(review): the fill runs over the whole frame, so a gap at the end of one
# SAMPLE_CYCLE_VOC group takes its value from the start of the next group —
# confirm this is acceptable or fill per group instead.
df = df[available_senosrs].bfill()
df

Unnamed: 0_level_0,Unnamed: 1_level_0,TGS2600,TGS2602,TGS2603,TGS2620,TGS2610,TGS822,MQ2,MQ4,MQ7,MQ8,MQ135
SAMPLE_CYCLE_VOC,TIME,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
C.albicans_sensor_1_000_1,2023-09-11 11:00:37,0.987000,1.237746,1.127573,1.096367,1.074478,0.417827,0.994416,0.919962,0.970140,1.025450,0.990485
C.albicans_sensor_1_000_1,2023-09-11 11:00:38,0.987000,1.237746,1.127573,1.096367,1.074478,0.417827,0.994416,0.919962,0.970140,1.025450,0.990485
C.albicans_sensor_1_000_1,2023-09-11 11:00:39,0.988125,1.237746,1.129157,1.099863,1.067678,0.417827,0.994416,0.912147,0.970140,1.025450,0.990485
C.albicans_sensor_1_000_1,2023-09-11 11:00:40,0.990000,1.241156,1.140243,1.101961,1.065411,0.417827,0.994416,0.911031,0.970140,1.025450,0.990485
C.albicans_sensor_1_000_1,2023-09-11 11:00:41,0.994500,1.247975,1.138659,1.107205,1.062010,0.417827,0.994416,0.918846,0.968049,1.025450,0.990485
...,...,...,...,...,...,...,...,...,...,...,...,...
S.epidermidis_sensor_3_120_1,2023-10-06 01:11:54,1.112317,0.961582,0.589669,1.223003,1.104757,0.455466,1.077199,0.988885,0.955273,1.088651,1.045023
S.epidermidis_sensor_3_120_1,2023-10-06 01:11:55,1.098308,0.961582,0.581364,1.221167,1.104757,0.455466,1.071872,0.981219,0.955273,1.088651,1.043781
S.epidermidis_sensor_3_120_1,2023-10-06 01:11:56,1.098308,0.961582,0.582748,1.223615,1.104757,0.455466,1.071872,0.981219,0.955273,1.088651,1.052479
S.epidermidis_sensor_3_120_1,2023-10-06 01:11:57,1.098308,0.961582,0.591053,1.225452,1.104757,0.478239,1.071872,0.988885,0.955273,1.088651,1.052479
