In [12]:
import pandas as pd
import numpy as np
from datetime import datetime

In [13]:
def parser(x):
    """Convert a 'YYYY-MM-DD HH:MM:SS' timestamp string into a datetime.

    Raises ValueError (from ``datetime.strptime``) when ``x`` does not
    match that layout.
    """
    timestamp_layout = '%Y-%m-%d %H:%M:%S'
    return datetime.strptime(x, timestamp_layout)

# Location of the refined Air Quality (UCI) CSV, relative to the notebook.
input_file = './data/AirQualityUCI_refined.csv'

# Load the file with its first column as the index, then convert that index
# to datetimes in a single vectorized call with an explicit format.
# This replaces `parse_dates=[0], date_parser=parser`: the `date_parser`
# keyword is deprecated in pandas 2.x and parses one value at a time.
# `pd.to_datetime` with an explicit `format` yields the same datetime index
# (the index name is preserved).
df = pd.read_csv(input_file, index_col=[0])
df.index = pd.to_datetime(df.index, format='%Y-%m-%d %H:%M:%S')

In [14]:
# Print the loaded DataFrame as plain text to check the datetime index and the columns.
print(df)

                     CO(GT)  PT08.S1(CO)  PT08.S2(NMHC)  NOx(GT)  \
Datetime                                                           
2004-03-10 18:00:00     2.6       1360.0         1046.0    166.0   
2004-03-10 19:00:00     2.0       1292.0          955.0    103.0   
2004-03-10 20:00:00     2.2       1402.0          939.0    131.0   
2004-03-10 21:00:00     2.2       1376.0          948.0    172.0   
2004-03-10 22:00:00     1.6       1272.0          836.0    131.0   
...                     ...          ...            ...      ...   
2005-04-04 10:00:00     3.1       1314.0         1101.0    472.0   
2005-04-04 11:00:00     2.4       1163.0         1027.0    353.0   
2005-04-04 12:00:00     2.4       1142.0         1063.0    293.0   
2005-04-04 13:00:00     2.1       1003.0          961.0    235.0   
2005-04-04 14:00:00     2.2       1071.0         1047.0    265.0   

                     PT08.S3(NOx)  NO2(GT)  PT08.S4(NO2)  PT08.S5(O3)    RH  \
Datetime                            

In [15]:
# Visualization setup
# Bare `%matplotlib` activates a GUI backend; the cell output reports which one was chosen.
%matplotlib
from matplotlib import pyplot as plt
import seaborn; seaborn.set()  # set plot styles
# NOTE(review): this option configures the inline backend only; it presumably has
# no effect while a GUI backend is active — confirm which backend is intended.
%config InlineBackend.figure_format = 'svg'
plt.rcParams['figure.figsize'] = [10, 5]  # default figure size as [width, height]
plt.ion() # enable the interactive mode

Using matplotlib backend: Qt5Agg


In [17]:
# Working copy of the CO(GT) column with its missing values filled by
# interpolation (pandas' default method); `df` itself is left untouched.
co = df['CO(GT)'].interpolate()

In [19]:
# Visualize the original and the imputed data.
# The original series gets the higher zorder, so it is drawn above the
# interpolated line wherever both have values.
plt.plot(df['CO(GT)'], zorder=2, label='original')
plt.plot(co, zorder=1, label='linear interpolation')
plt.legend(loc='best')
plt.show()

In [21]:
# Box plot of the interpolated CO(GT) series as a first visual check for outliers
plt.boxplot(co)
plt.xlabel('CO(GT)')
plt.title('Detecting outliers using boxplot')
plt.show()

In [22]:
# Pairwise Pearson correlation between all columns of the frame
corr_matrix = df.corr(method='pearson')
print(corr_matrix)

                 CO(GT)  PT08.S1(CO)  PT08.S2(NMHC)   NOx(GT)  PT08.S3(NOx)  \
CO(GT)         1.000000     0.877203       0.914973  0.792557     -0.701703   
PT08.S1(CO)    0.877203     1.000000       0.892964  0.713654     -0.771938   
PT08.S2(NMHC)  0.914973     0.892964       1.000000  0.704435     -0.796703   
NOx(GT)        0.792557     0.713654       0.704435  1.000000     -0.655707   
PT08.S3(NOx)  -0.701703    -0.771938      -0.796703 -0.655707      1.000000   
NO2(GT)        0.679262     0.641529       0.646245  0.763111     -0.652083   
PT08.S4(NO2)   0.639470     0.682881       0.777254  0.233731     -0.538468   
PT08.S5(O3)    0.851403     0.899324       0.880578  0.787046     -0.796569   
RH             0.040218     0.114606      -0.090380  0.221032     -0.056740   
AH             0.065809     0.135324       0.186933 -0.149323     -0.232017   
C6H6(GT)       0.845144     0.883795       0.981950  0.626638     -0.735744   

                NO2(GT)  PT08.S4(NO2)  PT08.S5(O3) 

In [23]:
# Choose the variable least correlated with CO(GT): RH (relative humidity).
# The series is taken with its missing values already filled by interpolation.
rh = df['RH'].interpolate()
rh.plot()

<matplotlib.axes._subplots.AxesSubplot at 0x20c48ffd548>

In [30]:
# Visualize a scatter plot of CO(GT) against RH
plt.scatter(co, rh, c='black', s=8)
plt.ylabel('RH')
plt.xlabel('CO(GT)')
plt.show()

In [31]:
# Choose the sensor variable most correlated with CO(GT):
# PT08.S2(NMHC), where NMHC stands for non-methane hydrocarbon.
# Missing values are filled by interpolation.
nmhc = df['PT08.S2(NMHC)'].interpolate()

In [32]:
# Visualize a scatter plot of CO(GT) against PT08.S2(NMHC)
plt.scatter(co, nmhc, c='black', s=12)
plt.ylabel('PT08.S2(NMHC)')
plt.xlabel('CO(GT)')
plt.show()

In [34]:
"""
IQR-based Outlier Detection
"""

#Q1, Q2(median), Q3
q1 = co.quantile(0.25) # 25% value
median = co.quantile(0.5) # median
q3 = co.quantile(0.75) # 75% value
print(q1, median, q3)

1.1 1.8 2.9


In [37]:
# Interquartile range and the 1.5 * IQR fences on either side of the quartiles
iqr = q3 - q1
lower_fence = q1 - 1.5 * iqr
upper_fence = q3 + 1.5 * iqr
# CO readings cannot be negative, so a negative lower fence is effectively 0.
print(iqr, upper_fence, lower_fence)

1.7999999999999998 5.6 -1.5999999999999996


In [41]:
# Keep only the readings that fall outside the fences.
# All readings are non-negative, so none can be below the lower fence;
# in practice only the upper-fence condition selects anything.
outliers = co[(co < lower_fence) | (co > upper_fence)]
print(outliers)

Datetime
2004-03-11 19:00:00    6.9
2004-03-11 20:00:00    6.1
2004-03-12 20:00:00    6.6
2004-03-14 20:00:00    5.9
2004-03-15 09:00:00    8.1
                      ... 
2005-03-23 19:00:00    6.2
2005-03-23 20:00:00    7.2
2005-03-24 19:00:00    5.9
2005-03-24 20:00:00    7.5
2005-03-25 19:00:00    5.7
Name: CO(GT), Length: 224, dtype: float64


In [46]:
# Mask for outliers
# Boolean array aligned with `co`: True where the index label also appears
# in the outliers' index.
mask = co.index.isin(outliers.index)

In [48]:
# Visualize the normal data and the outliers as two marker-only series
plt.plot(co[~mask], linestyle='None', marker='o', markersize=3,
         color='blue', label='normal')
plt.plot(outliers, linestyle='None', marker='x', markersize=3,
         color='red', label='outliers')
plt.legend(loc='best')
plt.show()

In [52]:
# Removing the outliers: keep the readings where the mask is False and
# put NaN everywhere else, leaving `co` itself unchanged.
co_refined = co.where(~mask)

Datetime
2004-03-10 18:00:00    2.6
2004-03-10 19:00:00    2.0
2004-03-10 20:00:00    2.2
2004-03-10 21:00:00    2.2
2004-03-10 22:00:00    1.6
                      ... 
2005-04-04 10:00:00    3.1
2005-04-04 11:00:00    2.4
2005-04-04 12:00:00    2.4
2005-04-04 13:00:00    2.1
2005-04-04 14:00:00    2.2
Name: CO(GT), Length: 9357, dtype: float64


In [53]:
# Fill the gaps left by the removed outliers using linear interpolation
# (the default method), then plot the result.
co_refined = co_refined.interpolate()
co_refined.plot()

<matplotlib.axes._subplots.AxesSubplot at 0x20c4ac71548>

  c = np.array(c)


In [54]:
"""
Detecting Outliers with Z - scores
"""
#Visualize the distribution of the 'CO(GT)' variable
import seaborn as sns
sns.distplot(co)# distribution plot

<matplotlib.axes._subplots.AxesSubplot at 0x20c462c2048>

In [56]:
# Mean and population standard deviation (ddof=0, matching np.std's default)
mean = co.mean()
std = co.std(ddof=0)
print(mean, std)

2.126146200705357 1.4369815748596482


In [58]:
# Calculate the Z-score of every data point; print the score and collect
# the reading whenever its absolute Z-score exceeds the threshold.
thres = 3  # Z-score threshold
outliers = []

for reading in co:
    z_score = (reading - mean) / std
    if abs(z_score) > thres:
        print(z_score)
        outliers.append(reading)

3.3221398818289734
3.113368937755246
4.157223658123882
4.087633343432639
3.0437786230640036
3.113368937755246
3.8092720846676698
3.182959252446488
3.113368937755246
3.600501140593942
3.5309108259027
3.0437786230640036
3.0437786230640036
3.182959252446488
3.0437786230640036
3.600501140593942
3.182959252446488
3.7396817699764275
3.0437786230640036
3.182959252446488
3.3917301965202156
4.087633343432639
3.878862399358912
3.2525495671377302
4.36599460219761
3.2525495671377302
4.018043028741397
4.36599460219761
4.087633343432639
4.36599460219761
5.131488063801275
3.0437786230640036
3.113368937755246
3.7396817699764275
3.2525495671377302
4.018043028741397
4.087633343432639
4.922717119727547
3.3917301965202156
3.786075313103922
4.180420429687629
4.574765546271336
4.157223658123882
4.087633343432639
3.113368937755246
3.2525495671377302
3.9484527140501537
5.618620266639971
3.113368937755246
3.7396817699764275
3.5309108259027
4.296404287506367
4.505175231580093
3.3221398818289734
3.04377862306400

In [63]:
# Vectorized version of the outlier filter: select, as an independent copy,
# the readings whose absolute Z-score exceeds the threshold.
outliers = co[((co - mean) / std).abs() > thres].copy()

In [65]:
# Mask for outliers
# Boolean array aligned with `co`: True where the index label also appears
# in the outliers' index.
mask = co.index.isin(outliers.index)
# Preview the first 50 flags
mask[:50]

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False,  True, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False])

In [69]:
# Comparison of the CO(GT) distribution before and after outlier removal
sns.distplot(co, axlabel = 'CO(GT)', label = 'original')
sns.distplot(co[~mask], label = 'outliers removed')  # ~mask keeps the non-outlier readings
plt.legend(loc = 'best')
plt.show()
# [exercise] Adjust `thres` and re-run to see how the filtered distribution changes

In [71]:
# Flooring and capping: pull every reading below the 10th percentile up to
# it and every reading above the 90th percentile down to it.
floor = co.quantile(0.1)
cap = co.quantile(0.9)
# `co` is modified in place, exactly as with element-wise assignment.
co.clip(lower=floor, upper=cap, inplace=True)
# Visualize the result
co.plot()

<matplotlib.axes._subplots.AxesSubplot at 0x20c4f0fddc8>