## Import libraries

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

## Import data

In [None]:
# Load each CSV file into its own DataFrame: na, cf and mo (in that order)
na, cf, mo = (
    pd.read_csv(csv_path)
    for csv_path in ('./normal_air_v1.csv', './caffe.csv', './medicated_oil.csv')
)

### Data overview

In [None]:
# Preview the first five rows
na.head(n=5)

In [None]:
# Preview the last five rows
na.tail(n=5)

In [None]:
# Dimensions of the frame as a (row count, column count) tuple
na.shape

In [None]:
# Print a concise summary of the frame: index range, column names, non-null counts, dtypes and memory usage
na.info()

In [None]:
# Descriptive statistics, cast to integers for display (fractional parts are dropped)
na.describe().astype(dtype=int)

In [None]:
# List the column labels of the frame
na.columns

In [None]:
# Number of missing entries in each column
na.isna().sum()

In [None]:
# Check duplicate values: count how often each distinct row occurs (a count above 1 means the row is repeated)
na.value_counts()

In [None]:
# Check unique values: the distinct values found in the 'temperature' column
na.loc[:, 'temperature'].unique()

### Add classes

In [None]:
# Add a constant 'class' column to each frame so the rows stay identifiable once the frames are combined
na['class'], cf['class'], mo['class'] = (
    'normal air',
    'caffee',
    'medicated oil',
)

### Remove the duplicated data

In [203]:
# Flag rows whose four listed columns repeat an earlier row; the first occurrence is not flagged
duplicate_rows = na.duplicated(
    subset=['temperature', 'humidity', 'pressure', 'gas_resistance'],
    keep='first',
)
# NOTE(review): the duplicates are only counted here; `na` itself is left unchanged.
# Dropping them would be: na = na[~duplicate_rows]
duplicate_rows.sum()

124

### Concatenate DataFrames along a 'row' axis

In [None]:
# Stack the three frames on top of each other (rows are appended, columns are aligned by name)
dataset = pd.concat(objs=[na, cf, mo], axis='index')
# NOTE(review): ignore_index is not set, so each frame keeps its own row labels and they repeat in `dataset`
dataset

### [Optional] Convert Unix time to Date time

In [None]:
# Parse the 'unix_time' values as seconds since the Unix epoch and overwrite the column with datetimes
na['unix_time'] = pd.to_datetime(na['unix_time'], unit='s')
# Display the converted frame
na

In [None]:
# Parse the 'unix_time' values as seconds since the Unix epoch and overwrite the column with datetimes
cf['unix_time'] = pd.to_datetime(cf['unix_time'], unit='s')
# Display the converted frame
cf

In [None]:
# Parse the 'unix_time' values as seconds since the Unix epoch and overwrite the column with datetimes
mo['unix_time'] = pd.to_datetime(mo['unix_time'], unit='s')
# Display the converted frame
mo

## Pre-process data

### Create: Independent variable (X) and Dependent variable (Y)

In [None]:
# Independent variables: every column of `dataset` except the last one, as a NumPy array
X = dataset.iloc[:,:-1].values
# Display-only integer copy; the result is not assigned, so X itself keeps its original dtype
X.astype(int)

In [None]:
# Dependent variable: the 'class' label of every row.
# The column is selected by name rather than by the hard-coded position 5, so the
# label is still picked correctly if the number or order of columns changes.
# copy=True returns an independent, writable array, so later edits to Y can never
# write through to `dataset`.
Y = dataset['class'].to_numpy(copy=True)
Y

### Encoder

#### Label Encoder: encode text data as numerical values

In [None]:
# Display the combined dataset
dataset

In [None]:
# Encode the text labels in Y as integer class ids.
lblencoder = LabelEncoder()
# Rebind Y to the encoder output instead of writing into the old array in place:
# an in-place write keeps the old array's dtype (object, for string labels) and
# goes through to `dataset` when Y is a view of it, or fails when that view is read-only.
Y = lblencoder.fit_transform(Y)
# lblencoder.classes_ lists the original labels in id order;
# lblencoder.inverse_transform([0, 1, 2]) maps ids back to labels.
# [0,1,2] ~ ['caffee', 'medicated oil', 'normal air']
Y

#### One Hot Encoder: encode numerical data as binary [Not finished]

In [None]:
# TODO: unfinished draft, kept disabled.
# NOTE(review): `categories` is not defined in the visible cells, and fitting on the
# whole `dataset` would encode every column rather than only the class label.
# ohencoder = OneHotEncoder(categories=categories[5])
# dataset = ohencoder.fit_transform(dataset).toarray()
# dataset