In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [2]:
# 중국 Top-5 지역
locations = ["qingdao", "dalian", "tongliao", "yanan", "chifeng"]

In [3]:
FINEDUST_DIR = "../collect_data/data/cma/finedust"
WEATHER_DIR = "../collect_data/data/cma/weather"

finedust_files = os.listdir(FINEDUST_DIR)
weather_files = os.listdir(WEATHER_DIR)

In [4]:
df_dust = pd.read_csv(os.path.join(FINEDUST_DIR, finedust_files[0]))
df_weather = pd.read_csv(os.path.join(WEATHER_DIR, weather_files[0]))

dust_columns = df_dust.columns
weather_columns = df_weather.columns

In [5]:
def convert_columns_to_float(df, columns):
    """
    Converts specified columns of a DataFrame to float.
    """
    # Apply pd.to_numeric with errors='coerce' to handle non-numeric values
    df[columns] = df[columns].apply(pd.to_numeric, errors='coerce')
    
    # Explicitly cast columns to float to ensure the correct dtype
    for col in columns:
        df[col] = df[col].astype(float)
    
    return df

In [6]:
def clean_tm_column(df, column_name):
    """
    Cleans the specified column by removing "." and ":" characters.
    """
    df[column_name] = df[column_name].apply(lambda x: x.replace('.', '').replace(':', ''))
    return df

In [7]:
def clean_missing_values(df, missing_values, threshold=0.05):
    """
    Replaces specified missing values with NaN and drops columns with
    missing value ratios above the threshold.
    """
    # Replace specified missing values with NaN
    df = df.replace(missing_values, np.nan)
    
    # Calculate the missing value ratio for each column
    missing_ratios = df.isnull().mean()
    
    # Drop columns with missing value ratio above the threshold
    columns_to_drop = missing_ratios[missing_ratios > threshold].index
    df = df.drop(columns=columns_to_drop)
    
    return df

In [19]:
def unify_and_merge(df1, df2, column1, column2, on, how='inner'):
    """
    Unifies the format of the specified columns in two DataFrames and merges them.

    Parameters:
    - df1 (pd.DataFrame): The first DataFrame.
    - df2 (pd.DataFrame): The second DataFrame.
    - column1 (str): The column in df1 to be unified (e.g., 'YYMMDDHHMI').
    - column2 (str): The column in df2 to be unified (e.g., 'TM').
    - on (list): The list of column names to merge on.
    - how (str): The merge method (default: 'inner').

    Returns:
    - pd.DataFrame: The merged DataFrame.
    """
    # Rename columns for consistency
    df1 = df1.rename(columns={column1: column2})

    # Merge the DataFrames
    merged_df = pd.merge(df1, df2, on=on, how=how)
    merged_df = merged_df.drop_duplicates().reset_index(drop=True)
    
    return merged_df

In [9]:
for dust_file, weather_file in zip(finedust_files, weather_files):
    print(dust_file)
    print(weather_file)
    print()

cma_2018_2022_tongliao.csv
cma_2018_2022_tongliao.csv

cma_2018_2022_qingdao.csv
cma_2018_2022_qingdao.csv

cma_2018_2022_dalian.csv
cma_2018_2022_dalian.csv

cma_2018_2022_yanan.csv
cma_2018_2022_yanan.csv

cma_2018_2022_chifeng.csv
cma_2018_2022_chifeng.csv



In [25]:
dust_file.split("_")[-1].split(".")[0]

'chifeng'

In [27]:
# Merge 파일 저장

FILTER_URL = "../collect_data/filtered/cma/filtered"
os.makedirs(FILTER_URL, exist_ok=True)

for dust_file, weather_file in zip(finedust_files, weather_files):
    loc = dust_file.split("_")[-1].split(".")[0]

    df_dust = pd.read_csv(os.path.join(FINEDUST_DIR, dust_file))
    df_weather = pd.read_csv(os.path.join(WEATHER_DIR, weather_file))

    df_dust = convert_columns_to_float(df_dust, ["STN", "PM10"])
    df_dust = clean_tm_column(df_dust, "TM")

    df_weather = convert_columns_to_float(df_weather, weather_columns[1:])
    df_weather["YYMMDDHHMI"] = df_weather["YYMMDDHHMI"].astype(str)
    df_weather = clean_missing_values(df_weather, [-9, -99, -999])

    df_merged = unify_and_merge(df_weather, df_dust, "YYMMDDHHMI", "TM", ["TM", "STN"], how='inner')
    df_merged.to_csv(f"{FILTER_URL}/{loc}.csv", index=False)

In [23]:
df_merged.isnull().mean() * 100

TM      0.000000
STN     0.000000
IW      0.000000
IR      0.000000
IX      0.000000
VV      0.058213
WD      0.019404
WS      0.067915
TA      0.116426
TD      0.261958
HM      0.174639
PS      0.029106
PT      0.000000
PR      0.019404
RH      0.000000
ORG     0.000000
PM10    0.000000
dtype: float64