In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [8]:
# Read the 2022 Seoul weather and fine-dust CSV files from the kma data folders.
weather_csv = "../collect_data/data/kma/weather/kma_2022_seoul.csv"
finedust_csv = "../collect_data/data/kma/finedust/kma_2022_seoul.csv"

df_weather = pd.read_csv(weather_csv)
df_finedust = pd.read_csv(finedust_csv)

< Description >
* -9: sentinel value that marks a missing observation

< Goal >
* Extract meaningful variables

# Check missing value rate

In [9]:
# Convert the -9 sentinel to NaN so pandas counts it as missing.
# NOTE(review): replace() matches -9 in every column, so a genuine reading of
# exactly -9 (e.g. in a temperature-like column) would be blanked as well —
# confirm that -9 can never be a real value in the columns that are kept.
df_replaced = df_weather.replace(-9, np.nan)

# Share of missing cells per column, expressed as a percentage.
missing_ratio = df_replaced.isna().mean().mul(100)

print("Missing ratio (%)")
print(missing_ratio)

Missing ratio (%)
TM          0.000000
STN         0.000000
WD          0.000000
WS          0.000000
GST_WD     94.165504
GST_WS     94.165504
GST_TM     94.165504
PA          0.000000
PS          0.000000
PT         66.573687
PR         66.573687
TA          0.081358
TD          0.162715
HM          0.000000
PV          0.000000
RN         89.318921
RN_DAY     71.431892
RN_JUN     71.431892
RN_INT    100.000000
SD_HR3     99.628080
SD_DAY     98.523942
SD_TOT     94.304974
WC         82.647606
WP         80.892608
WW          0.000000
CA_TOT      0.000000
CA_MID      0.000000
CH_MIN     48.221757
CT          0.000000
CT_TOP     57.484891
CT_MID     45.211530
CT_LOW     44.060902
VS          0.000000
SS         45.385867
SI         45.385867
ST_GD     100.000000
TS          0.034868
TE_005      0.000000
TE_01       0.000000
TE_02       0.000000
TE_03       0.000000
ST_SEA    100.000000
WH        100.000000
BF        100.000000
IR          0.000000
IX         27.324500
dtype: float64


In [10]:
# Drop every column whose missing ratio exceeds 20%.
columns_to_keep = missing_ratio[missing_ratio <= 20].index
df_filtered = df_replaced[columns_to_keep]
df_filtered.shape

(8604, 21)

In [11]:
# Recompute the per-column missing percentage, this time on the filtered frame,
# to confirm that only columns at or below the threshold remain.
missing_ratio = df_filtered.isna().mean().mul(100)

print("Missing ratio (%)")
print(missing_ratio)

Missing ratio (%)
TM        0.000000
STN       0.000000
WD        0.000000
WS        0.000000
PA        0.000000
PS        0.000000
TA        0.081358
TD        0.162715
HM        0.000000
PV        0.000000
WW        0.000000
CA_TOT    0.000000
CA_MID    0.000000
CT        0.000000
VS        0.000000
TS        0.034868
TE_005    0.000000
TE_01     0.000000
TE_02     0.000000
TE_03     0.000000
IR        0.000000
dtype: float64


In [19]:
### Merge the two dataframes
# The fine-dust file names its station column STN_ID; align it with the weather frame.
df_finedust = df_finedust.rename(columns={'STN_ID': 'STN'})

# Outer-join on timestamp and station so rows present in only one source are kept.
# Known issue: data for TM=20220101 was not extracted, so after casting TM to a
# string every row whose timestamp starts with that date is dropped.
df_merged = (
    df_filtered
    .merge(df_finedust, on=["TM", "STN"], how='outer')
    .assign(TM=lambda frame: frame["TM"].astype(str))
    .loc[lambda frame: ~frame["TM"].str.startswith("20220101")]
    .reset_index(drop=True)
)
df_merged

Unnamed: 0,TM,STN,WD,WS,PA,PS,TA,TD,HM,PV,...,CA_MID,CT,VS,TS,TE_005,TE_01,TE_02,TE_03,IR,PM10
0,202201020000,108.0,5.0,2.3,1016.3,1027.3,-2.8,-10.6,55.0,2.7,...,7.0,Ac,2000.0,-3.3,-0.4,-0.7,-0.3,0.8,3.0,51.0
1,202201020100,108.0,5.0,2.1,1015.8,1026.8,-2.9,-10.2,57.0,2.8,...,8.0,-,2000.0,-3.4,-0.4,-0.7,-0.3,0.8,3.0,62.0
2,202201020200,108.0,5.0,1.9,1015.3,1026.3,-2.6,-9.9,57.0,2.9,...,8.0,-,2000.0,-2.5,-0.4,-0.7,-0.3,0.8,3.0,44.0
3,202201020300,108.0,5.0,2.5,1015.1,1026.1,-2.1,-9.7,56.0,2.9,...,9.0,Sc,2000.0,-2.1,-0.4,-0.7,-0.3,0.8,3.0,39.0
4,202201020400,108.0,5.0,2.0,1014.3,1025.3,-1.9,-9.7,55.0,2.9,...,8.0,Sc,2000.0,-2.1,-0.3,-0.7,-0.3,0.8,4.0,44.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8753,202212312000,108.0,2.0,1.4,1018.1,1029.0,-0.8,-4.6,75.0,4.3,...,2.0,-,1171.0,-1.8,0.1,0.5,0.5,0.6,3.0,46.0
8754,202212312100,108.0,34.0,0.9,1018.2,1029.2,-1.0,-4.5,77.0,4.4,...,7.0,Sc,1134.0,-2.8,0.2,0.5,0.5,0.6,3.0,52.0
8755,202212312200,108.0,29.0,0.5,1018.3,1029.3,-0.8,-4.1,78.0,4.5,...,8.0,-,1094.0,-2.2,0.3,0.5,0.5,0.6,3.0,68.0
8756,202212312300,108.0,23.0,1.3,1018.5,1029.4,-0.2,-3.7,77.0,4.6,...,8.0,-,1314.0,-2.1,0.3,0.5,0.5,0.6,3.0,72.0


In [20]:
### Save the merged Seoul dataframe (the row index is not written)
seoul_output_csv = "../collect_data/filtered/kma/kma_2022_seoul.csv"
df_merged.to_csv(seoul_output_csv, index=False)

# Preprocess all datasets

In [None]:
locations = ["seoul", "andong", "daegu", "gwangju", "jeonju"]


def merge_weather_finedust(weather, finedust, missing_value=-9,
                           max_missing_pct=20, skip_tm_prefix="20220101"):
    """Filter sparse weather columns and merge the result with fine-dust data.

    Steps, in order:
      1. Replace ``missing_value`` with NaN in every column of ``weather``.
      2. Keep only the columns whose share of NaN is at most
         ``max_missing_pct`` percent.
      3. Rename the ``STN_ID`` column of ``finedust`` to ``STN`` and outer-merge
         both frames on ``["TM", "STN"]``.
      4. Cast ``TM`` to ``str`` and drop rows whose ``TM`` starts with
         ``skip_tm_prefix``.

    Neither input frame is modified.

    Args:
        weather: DataFrame read from a weather CSV.
        finedust: DataFrame read from a fine-dust CSV.
        missing_value: Sentinel treated as missing (default -9).
        max_missing_pct: Highest per-column missing percentage that is kept.
        skip_tm_prefix: Rows whose ``TM`` string starts with this are removed.

    Returns:
        The merged DataFrame with a fresh 0..n-1 index.
    """
    replaced = weather.replace(missing_value, np.nan)
    missing_pct = replaced.isnull().mean() * 100
    filtered = replaced.loc[:, missing_pct <= max_missing_pct]

    # rename() without inplace returns a new frame, so the caller's data is untouched.
    finedust_renamed = finedust.rename(columns={'STN_ID': 'STN'})
    merged = pd.merge(filtered, finedust_renamed, on=["TM", "STN"], how='outer')

    # Known issue: data for TM=20220101 was not extracted, so that date is dropped.
    merged["TM"] = merged["TM"].astype(str)
    return merged[~merged["TM"].str.startswith(skip_tm_prefix)].reset_index(drop=True)


# NOTE(review): the column filter runs per location, so the saved files can end
# up with different column sets (a later duplicate check prints an extra IX
# column for one location) — confirm whether a shared schema is required.
for lc in locations:
    df_weather = pd.read_csv(f"../collect_data/data/kma/weather/kma_2022_{lc}.csv")
    df_finedust = pd.read_csv(f"../collect_data/data/kma/finedust/kma_2022_{lc}.csv")

    df_merged = merge_weather_finedust(df_weather, df_finedust)

    df_merged.to_csv(f"../collect_data/filtered/kma/kma_2022_{lc}.csv", index=False)

In [8]:
# Drop every row that has a NULL in at least one column, then save a _noNULL copy.

locations = ["seoul", "andong", "daegu", "gwangju", "jeonju"]

for lc in locations:
    source_csv = f"../collect_data/filtered/kma/kma_2022_{lc}.csv"
    target_csv = f"../collect_data/filtered/kma/kma_2022_{lc}_noNULL.csv"

    df = pd.read_csv(source_csv)
    # dropna() with default arguments removes a row if any of its cells is NaN.
    df_clean = df.dropna()
    df_clean.to_csv(target_csv, index=False)

In [33]:
# Remove rows that repeat a value of the first column (first occurrence wins), then save.

locations = ["seoul", "andong", "daegu", "gwangju", "jeonju"]

for lc in locations:
    df = pd.read_csv(f"../collect_data/filtered/kma/kma_2022_{lc}_noNULL.csv")

    # The first column is used as the uniqueness key (TM in the recorded outputs).
    # NOTE(review): this assumes one station per file; if a file ever held several
    # stations, rows sharing a timestamp would be discarded — confirm.
    first_column_name = df.columns[0]

    df_duplicated_erase = df.drop_duplicates(subset=[first_column_name])

    df_duplicated_erase.to_csv(f"../collect_data/filtered/kma/kma_2022_{lc}_noNULL_noDuplicated.csv", index=False)

In [34]:
# Sanity check: print any rows that still repeat a first-column value.
# An empty DataFrame for every location means the de-duplication worked.
locations = ["seoul", "andong", "daegu", "gwangju", "jeonju"]

for lc in locations:
    df = pd.read_csv(f"../collect_data/filtered/kma/kma_2022_{lc}_noNULL_noDuplicated.csv")

    first_column_name = df.columns[0]
    # True for every row whose key value already appeared earlier in the file.
    duplicated = df.duplicated(subset=[first_column_name])

    print(df.loc[duplicated])

Empty DataFrame
Columns: [TM, STN, WD, WS, PA, PS, TA, TD, HM, PV, WW, CA_TOT, CA_MID, CT, VS, TS, TE_005, TE_01, TE_02, TE_03, IR, PM10]
Index: []

[0 rows x 22 columns]
Empty DataFrame
Columns: [TM, STN, WD, WS, PA, PS, TA, TD, HM, PV, WW, CA_TOT, CA_MID, CT, VS, TS, TE_005, TE_01, TE_02, TE_03, IR, PM10]
Index: []

[0 rows x 22 columns]
Empty DataFrame
Columns: [TM, STN, WD, WS, PA, PS, TA, TD, HM, PV, WW, CA_TOT, CA_MID, CT, VS, TS, TE_005, TE_01, TE_02, TE_03, IR, PM10]
Index: []

[0 rows x 22 columns]
Empty DataFrame
Columns: [TM, STN, WD, WS, PA, PS, TA, TD, HM, PV, WW, CA_TOT, CA_MID, CT, VS, TS, TE_005, TE_01, TE_02, TE_03, IR, PM10]
Index: []

[0 rows x 22 columns]
Empty DataFrame
Columns: [TM, STN, WD, WS, PA, PS, TA, TD, HM, PV, WW, CA_TOT, CA_MID, CT, VS, TS, TE_005, TE_01, TE_02, TE_03, IR, IX, PM10]
Index: []

[0 rows x 23 columns]
