In [None]:
##Data Reduction##
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
# Load the South Korean air-pollution measurements and print the raw frame.
csv_path = './data/south-korean-pollution-data.csv'
df = pd.read_csv(csv_path)
print(df)

       Unnamed: 0       date  pm25  pm10  o3  no2  so2  co      Lat      Long   
0               0   2022/2/1   112    31  35    2    1   4  38.2089  127.9495  \
1               1   2022/2/2    92    21  35    2    1   0  38.2089  127.9495   
2               2   2022/2/3    60    20  35    1    1   4  38.2089  127.9495   
3               3   2022/2/4    51    27  33    1    1   4  38.2089  127.9495   
4               4   2022/2/5    57    24  27    2    1   5  38.2089  127.9495   
...           ...        ...   ...   ...  ..  ...  ...  ..      ...       ...   
34525       34525  2022/2/11    49    43  46    3    1   4  37.5082  130.8217   
34526       34526  2022/2/12   102    24  39    2    0   3  37.5082  130.8217   
34527       34527  2022/2/13    59    15  40    2    0   0  37.5082  130.8217   
34528       34528  2022/2/14    41     0   0    0    0   0  37.5082  130.8217   
34529       34529  2022/1/31     0    19  40    2    1   3  37.5082  130.8217   

                City   Dist

In [None]:
#### 1. Feature selection/Reduction ####
# Latitude/longitude are not needed and Country is always South Korea, so
# drop them up front to reduce compute time. 'Unnamed: 0' is dropped too;
# in the printed frame it simply mirrors the row index.
unused_columns = ['Unnamed: 0', 'Lat', 'Long', 'Country']
df = df.drop(columns=unused_columns)
print(df.head())

       date  pm25  pm10  o3  no2  so2  co           City District
0  2022/2/1   112    31  35    2    1   4  Bangsan-Myeon  Gangwon
1  2022/2/2    92    21  35    2    1   0  Bangsan-Myeon  Gangwon
2  2022/2/3    60    20  35    1    1   4  Bangsan-Myeon  Gangwon
3  2022/2/4    51    27  33    1    1   4  Bangsan-Myeon  Gangwon
4  2022/2/5    57    24  27    2    1   5  Bangsan-Myeon  Gangwon


In [None]:
#### 2. Data Filtering (outlier-based) ####  -- may overlap with "Data value changes"
def _drop_iqr_outliers(city_rows, columns):
    """Return `city_rows` without rows outside the 1.5 * IQR fences.

    The columns are processed one after another, and each column's quartiles
    are computed on the rows that survived the previous columns' filters, so
    the result depends on the order of `columns`.
    """
    for name in columns:
        values = city_rows[name]
        lower_quartile = values.quantile(0.25)
        upper_quartile = values.quantile(0.75)
        spread = upper_quartile - lower_quartile
        # Open question from the author: should high-valued outliers really
        # be removed? It may be better to keep them.
        is_outlier = (values < (lower_quartile - 1.5 * spread)) | (values > (upper_quartile + 1.5 * spread))
        # Negating the outlier mask (rather than testing "inside the fences")
        # keeps rows whose comparison is False on both sides, e.g. NaN values.
        city_rows = city_rows.loc[~is_outlier]
    return city_rows


pollutions = ['pm25', 'pm10', 'o3', 'no2', 'so2', 'co']

# Filter every city separately so the fences reflect that city's own distribution.
frames = [_drop_iqr_outliers(city_rows, pollutions) for _city, city_rows in df.groupby('City')]

# Stitch the per-city frames back together.
df_clean = pd.concat(frames)

print(df_clean)


           date  pm25  pm10  o3  no2  so2  co           City   District
1      2022/2/2    92    21  35    2    1   0  Bangsan-Myeon    Gangwon
2      2022/2/3    60    20  35    1    1   4  Bangsan-Myeon    Gangwon
3      2022/2/4    51    27  33    1    1   4  Bangsan-Myeon    Gangwon
4      2022/2/5    57    24  27    2    1   5  Bangsan-Myeon    Gangwon
5      2022/2/6    51    23  33    3    1   5  Bangsan-Myeon    Gangwon
...         ...   ...   ...  ..  ...  ...  ..            ...        ...
34516  2022/2/2    68    16  39    1    1   2       Taeha-Ri  Gyeongbuk
34517  2022/2/3    32    17  38    1    1   2       Taeha-Ri  Gyeongbuk
34518  2022/2/4    34    19  37    1    1   2       Taeha-Ri  Gyeongbuk
34519  2022/2/5    41    21  38    1    1   2       Taeha-Ri  Gyeongbuk
34521  2022/2/7    39    16  38    1    1   2       Taeha-Ri  Gyeongbuk

[29720 rows x 9 columns]


In [None]:
#### 3. Discretization (Binning) ####
# Each pollutant reading can be expressed as one of 5 levels instead of an
# exact value. (Open question from the author: is this step really needed?)
for pollutant in ('pm25', 'pm10', 'o3', 'no2', 'so2', 'co'):
    # bins=5 asks pd.cut for five bins over the column; labels=False stores
    # the integer bin index instead of an interval label.
    df_clean[pollutant + '_Level'] = pd.cut(df_clean[pollutant], bins=5, labels=False)

print(df_clean)

           date  pm25  pm10  o3  no2  so2  co           City   District   
1      2022/2/2    92    21  35    2    1   0  Bangsan-Myeon    Gangwon  \
2      2022/2/3    60    20  35    1    1   4  Bangsan-Myeon    Gangwon   
3      2022/2/4    51    27  33    1    1   4  Bangsan-Myeon    Gangwon   
4      2022/2/5    57    24  27    2    1   5  Bangsan-Myeon    Gangwon   
5      2022/2/6    51    23  33    3    1   5  Bangsan-Myeon    Gangwon   
...         ...   ...   ...  ..  ...  ...  ..            ...        ...   
34516  2022/2/2    68    16  39    1    1   2       Taeha-Ri  Gyeongbuk   
34517  2022/2/3    32    17  38    1    1   2       Taeha-Ri  Gyeongbuk   
34518  2022/2/4    34    19  37    1    1   2       Taeha-Ri  Gyeongbuk   
34519  2022/2/5    41    21  38    1    1   2       Taeha-Ri  Gyeongbuk   
34521  2022/2/7    39    16  38    1    1   2       Taeha-Ri  Gyeongbuk   

       pm25_discretized  pm25_Level  pm10_Level  o3_Level  no2_Level   
1                     1    

In [None]:
#### 4. Sampling ####
# Probability-proportional-to-size random sampling.

# Fixed seed so a fresh "Restart & Run All" reproduces the same sample.
SAMPLING_SEED = 42

city_counts = df_clean['City'].value_counts().reset_index()
city_counts.columns = ['City', 'Counts']

# Share of the cleaned rows held by each city. The counts come from df_clean,
# so the denominator must be df_clean's row count as well; dividing by the
# unfiltered df made the shares sum to less than 1.
city_counts['Sampling_Probability'] = city_counts['Counts'] / df_clean.shape[0]

# Draw int(share * rows_in_city) rows from every city, i.e. roughly
# Counts**2 / total rows, so larger cities contribute more than proportionally.
# NOTE(review): the original note described a flat ratio (a city with 100 rows
# yields 10, one with 50 rows yields 5); confirm which scheme is intended.
city_samples = []
city_shares = zip(city_counts['City'], city_counts['Sampling_Probability'])
for offset, (city, probability) in enumerate(city_shares):
    temp = df_clean[df_clean['City'] == city]
    # `probability` is a plain scalar here, which avoids calling int() on a
    # one-element Series (deprecated in recent pandas).
    sample_size = int(probability * temp.shape[0])
    # Offset the seed per city so equally sized cities do not all pick the
    # same row positions.
    city_samples.append(temp.sample(n=sample_size, random_state=SAMPLING_SEED + offset))

# Concatenate once at the end instead of growing a DataFrame inside the loop.
if city_samples:
    sampled_df = pd.concat(city_samples)
else:
    sampled_df = pd.DataFrame()


print(sampled_df)

             date  pm25  pm10  o3  no2  so2  co         City  District   
8205    2017/5/23    73    31  42   24    4   3    Gwanak-Gu     Seoul  \
6729     2021/8/1    69    13  42   10    3   3    Gwanak-Gu     Seoul   
9251     2014/4/1     0    56  38   53    7   6    Gwanak-Gu     Seoul   
8477    2016/8/22    74    33  60   30    9   5    Gwanak-Gu     Seoul   
6794     2021/4/5    39    35  40   37    5   5    Gwanak-Gu     Seoul   
...           ...   ...   ...  ..  ...  ...  ..          ...       ...   
28018  2020/11/18    72    13  23    5    2   6  Saesol-Dong  Gyeonggi   
27640   2021/11/4    54    45  20   30    1   6  Saesol-Dong  Gyeonggi   
27769    2021/9/8    25    22  56   15    4   6  Saesol-Dong  Gyeonggi   
28152   2020/9/28    51    30  30   18    7   5  Saesol-Dong  Gyeonggi   
28154   2020/9/30    68    21  25    5    5   5  Saesol-Dong  Gyeonggi   

       pm25_discretized  pm25_Level  pm10_Level  o3_Level  no2_Level   
8205                  1           1    

  sample_size = int(city_counts.loc[city_counts['City'] == city, 'Sampling_Probability'] * temp.shape[0])
