In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [5]:
# 중국 Top-5 지역
locations = ["qingdao", "dalian", "tongliao", "yanan", "chifeng"]

In [6]:
FINEDUST_DIR = "../collect_data/data/cma/finedust"
WEATHER_DIR = "../collect_data/data/cma/weather"

finedust_files = os.listdir(FINEDUST_DIR)
weather_files = os.listdir(WEATHER_DIR)

In [7]:
df_dust = pd.read_csv(os.path.join(FINEDUST_DIR, finedust_files[0]))
df_weather = pd.read_csv(os.path.join(WEATHER_DIR, weather_files[0]))

dust_columns = df_dust.columns
weather_columns = df_weather.columns

In [8]:
def convert_columns_to_float(df, columns):
    """
    Converts specified columns of a DataFrame to float.
    """
    # Apply pd.to_numeric with errors='coerce' to handle non-numeric values
    df[columns] = df[columns].apply(pd.to_numeric, errors='coerce')
    
    # Explicitly cast columns to float to ensure the correct dtype
    for col in columns:
        df[col] = df[col].astype(float)
    
    return df

In [9]:
def clean_tm_column(df, column_name):
    """
    Cleans the specified column by removing "." and ":" characters.
    """
    df[column_name] = df[column_name].apply(lambda x: x.replace('.', '').replace(':', ''))
    return df

In [10]:
def clean_missing_values(df, missing_values, threshold=0.05):
    """
    Replaces specified missing values with NaN and drops columns with
    missing value ratios above the threshold.
    """
    # Replace specified missing values with NaN
    df = df.replace(missing_values, np.nan)
    
    # Calculate the missing value ratio for each column
    missing_ratios = df.isnull().mean()
    
    # Drop columns with missing value ratio above the threshold
    columns_to_drop = missing_ratios[missing_ratios > threshold].index
    df = df.drop(columns=columns_to_drop)
    
    return df

In [None]:
def unify_and_merge(df1, df2, column1, column2, on, how='inner'):
    """
    Unifies the format of the specified columns in two DataFrames and merges them.

    Parameters:
    - df1 (pd.DataFrame): The first DataFrame.
    - df2 (pd.DataFrame): The second DataFrame.
    - column1 (str): The column in df1 to be unified (e.g., 'YYMMDDHHMI').
    - column2 (str): The column in df2 to be unified (e.g., 'TM').
    - on (list): The list of column names to merge on.
    - how (str): The merge method (default: 'inner').

    Returns:
    - pd.DataFrame: The merged DataFrame.
    """
    # Rename columns for consistency
    df1 = df1.rename(columns={column1: column2})

    # Merge the DataFrames
    merged_df = pd.merge(df1, df2, on=on, how=how)
    merged_df = merged_df.drop_duplicates().reset_index(drop=True)
    
    return merged_df

In [17]:
for dust_file, weather_file in zip(finedust_files, weather_files):
    print(dust_file)
    print(weather_file)
    print()

cma_2018_2022_qingdao.csv
cma_2018_2022_qingdao.csv

cma_2018_2022_chifeng.csv
cma_2018_2022_chifeng.csv

cma_2018_2022_tongliao.csv
cma_2018_2022_tongliao.csv

cma_2018_2022_yanan.csv
cma_2018_2022_yanan.csv

cma_2018_2022_dalian.csv
cma_2018_2022_dalian.csv



In [18]:
dust_file.split("_")[-1].split(".")[0]

'dalian'

In [20]:
# Merge 파일 저장

FILTER_URL = "../collect_data/filtered/cma/filtered"
os.makedirs(FILTER_URL, exist_ok=True)

for dust_file, weather_file in zip(finedust_files, weather_files):
    loc = dust_file.split("_")[-1].split(".")[0]

    df_dust = pd.read_csv(os.path.join(FINEDUST_DIR, dust_file))
    df_weather = pd.read_csv(os.path.join(WEATHER_DIR, weather_file))

    df_dust = convert_columns_to_float(df_dust, ["STN", "PM10"])
    df_dust = clean_tm_column(df_dust, "TM")

    df_weather = convert_columns_to_float(df_weather, weather_columns[1:])
    df_weather["YYMMDDHHMI"] = df_weather["YYMMDDHHMI"].astype(str)
    df_weather = clean_missing_values(df_weather, [-9, -99, -999])

    df_merged = unify_and_merge(df_weather, df_dust, "YYMMDDHHMI", "TM", ["TM", "TM"], how='inner')
    df_merged.to_csv(f"{FILTER_URL}/{loc}.csv", index=False)

['TM']
['TM', 'ORG', 'STN', 'PM10']
['TM']
['TM', 'ORG', 'STN', 'PM10']
['TM']
['TM', 'ORG', 'STN', 'PM10']
['TM']
['TM', 'ORG', 'STN', 'PM10']
['TM']
['TM', 'ORG', 'STN', 'PM10']


In [21]:
df_merged.isnull().mean() * 100

TM     NaN
ORG    NaN
STN    NaN
PM10   NaN
dtype: float64

### china_loc.txt to csv

In [16]:
import pandas as pd

# 파일 경로 설정
input_file = '../collect_data/china_loc.txt'
output_file = '../collect_data/china_loc.csv'

# 데이터 읽기
with open(input_file, 'r', encoding='utf-8') as infile:
    lines = infile.readlines()

# 데이터 정리
data = []
for line in lines:
    # 주석(#)이나 빈 줄은 건너뜀
    if line.startswith('#') or line.strip() == '':
        continue
    # 공백 기준으로 데이터 나누기
    row = line.split()
    data.append(row)

# DataFrame 생성
columns = ["#", "STN", "TM_ED", "TM_ST", "STN_KO", "STN_EN", "STN_SP", "LON", "LAT", "HT", "STN_2", "FCT_ID"]
df = pd.DataFrame(data, columns=columns)
df = df.drop(columns=["#"])
df = df.drop(columns=["FCT_ID"])
df = df.drop(columns=["STN_2"])

df = df.iloc[2:].reset_index(drop=True)
df = df.iloc[:-1]

# CSV 파일로 저장
df.to_csv(output_file, index=False, encoding='utf-8-sig')

print(f"CSV 파일이 '{output_file}'에 저장되었습니다.")

print(df)

CSV 파일이 '../collect_data/china_loc.csv'에 저장되었습니다.
      STN             TM_ED             TM_ST STN_KO         STN_EN STN_SP  \
0   52203  2100.12.31.00:00  2006.12.31.23:00     하미           Hami      2   
1   52418  2100.12.31.00:00  2006.12.31.23:00     둔황       Dunhuang      2   
2   53068  2100.12.31.00:00  2007.03.28.23:00  얼렌하오터        Erenhot      3   
3   53276  2100.12.31.00:00  2005.03.08.00:00    쥐르허         Jurihe      1   
4   53336  2100.12.31.00:00  2006.12.31.23:00  우라터중치  Wulatezhongqi      2   
5   53543  2100.12.31.00:00  2006.12.31.23:00     둥성      Dongsheng      2   
6   53787  2100.12.31.00:00  2005.03.08.00:00     위서          Yushe      1   
7   53845  2100.12.31.00:00  2006.12.31.22:00     야난          Yanan      2   
8   54135  2100.12.31.00:00  2005.03.08.00:00    통랴오       Tongliao      1   
9   54157  2100.12.31.00:00  2007.03.28.23:00     쓰핑         Siping      3   
10  54218  2100.12.31.00:00  2007.03.28.23:00     츠펑        Chifeng      3   
11  54497  210

In [19]:
meta_path = '../collect_data/china_loc.csv'
df_meta = pd.read_csv(meta_path)

locations = ["qingdao", "dalian", "tongliao", "yanan", "chifeng"]
BASE_PATH = "../collect_data/filtered/cma"

for loc in locations:
    df_loc = pd.read_csv(f"{BASE_PATH}/{loc}.csv")
    meta_subset = df_meta[["STN", "LON", "LAT"]]

    df_loc = pd.merge(df_loc, meta_subset, on="STN", how="left")
    df_loc.to_csv(f"{BASE_PATH}/{loc}_meta.csv", index=False)