In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
def calculate_daily_s_dot_average(target_dir):
    '''
    Compute per-day, per-sensor means of the "평균" (average) columns of the S_dot CSV files.

    Every file in ``target_dir`` whose name ends with ``.csv`` is read as
    cp949-encoded text. The '측정시간' timestamp column is parsed with the
    format ``%Y-%m-%d_%H:%M:%S`` and reduced to a calendar date ('측정일').
    Only the columns whose name contains "평균", plus '측정일' and '시리얼',
    are kept. Any cell in a "평균" column that contains an ASCII letter is
    treated as missing before the column is cast to float.

    Parameters
    ----------
    target_dir : str or path-like
        Directory holding the raw CSV files (not searched recursively).

    Returns
    -------
    pandas.DataFrame
        One row per ('측정일', '시리얼') pair, with the mean of every "평균"
        column for that day and sensor. Missing values are ignored by the mean.

    Raises
    ------
    ValueError
        If ``target_dir`` contains no ``.csv`` file.
    '''

    # os.listdir returns entries in arbitrary order; sorting makes the read
    # order (and therefore the concatenated frame) reproducible between runs.
    list_csv = sorted(f for f in os.listdir(target_dir) if f.endswith(".csv"))
    if not list_csv:
        # pd.concat would fail with "No objects to concatenate"; say what is wrong instead.
        raise ValueError(f"No CSV files found in {target_dir!r}")

    list_df = []
    for f in list_csv:
        # read the CSV file into a DataFrame
        df_tmp = pd.read_csv(os.path.join(target_dir, f), encoding="cp949", low_memory=False)
        df_tmp['측정시간'] = pd.to_datetime(df_tmp['측정시간'], format='%Y-%m-%d_%H:%M:%S')
        # drop the time-of-day so that readings can be grouped per calendar day
        df_tmp['측정일'] = df_tmp['측정시간'].dt.date
        target_columns = [c for c in df_tmp.columns if "평균" in c]
        target_columns.append("측정일")
        target_columns.append("시리얼")
        list_df.append(df_tmp.loc[:, target_columns].copy())

    df = pd.concat(list_df, ignore_index=True)
    target_columns = [c for c in df.columns if "평균" in c]
    # cells containing any ASCII letter cannot be parsed as numbers: blank them, then cast
    df[target_columns] = df[target_columns].replace(to_replace=r'.*[A-Za-z].*', value=np.nan, regex=True).astype(float)
    # group by '측정일' and '시리얼' and calculate the mean of the target columns
    df = df.groupby(['측정일', '시리얼'], as_index=False).mean()
    return df

In [3]:
target_dir = "raw_data/s_dot_nature_2023"
# calculate the per-day, per-sensor averages of the S_dot "평균" columns
df = calculate_daily_s_dot_average(target_dir)

In [4]:
# Keep only the rows that have a temperature value, narrow the frame to the
# key and climate columns, and give those columns English names.
df_temperature = (
    df.loc[
        df['온도 평균(℃)'].notna(),
        ['측정일', '시리얼', '온도 평균(℃)', '습도 평균(%)'],
    ]
    .rename(columns={
        '측정일': 'date',
        '시리얼': 'serial_id',
        '온도 평균(℃)': 'temp_celsius',
        '습도 평균(%)': 'humidity',
    })
)

In [5]:
# Preview the per-day, per-sensor temperature/humidity table
# (pandas elides the middle rows when the frame is long).
df_temperature 

Unnamed: 0,date,serial_id,temp_celsius,humidity
0,2023-01-01,OC3CL200010,-1.017391,71.565217
2,2023-01-01,OC3CL200012,2.391304,46.652174
3,2023-01-01,OC3CL200013,0.178261,51.217391
4,2023-01-01,OC3CL200014,2.778261,56.391304
5,2023-01-01,OC3CL200016,-40.000000,48.347826
...,...,...,...,...
364738,2023-12-24,V02Q2300003,-2.095238,62.428571
364739,2023-12-24,V02Q2300004,-3.535000,65.750000
364740,2023-12-24,V02Q2300005,-2.855000,68.350000
364741,2023-12-24,V02Q2300006,-2.895000,68.150000


In [15]:
import geopandas as gpd

# Load the sensor coordinate table and attach a WGS84 point geometry per sensor.
gdf_sensors = gpd.read_file('metadata/sdot_coords.csv')
# GDAL's CSV driver returns every field as text unless type autodetection is
# enabled, so cast the coordinates explicitly. Without this the '경도'/'위도'
# columns stay strings through the later merge and end up as quoted strings in
# the exported JSON. The cast is a no-op if the columns are already numeric.
gdf_sensors[['경도', '위도']] = gdf_sensors[['경도', '위도']].astype(float)
points = gpd.points_from_xy(gdf_sensors['경도'], gdf_sensors['위도'], crs="EPSG:4326")
gdf_sensors = gpd.GeoDataFrame(gdf_sensors, geometry=points, crs="EPSG:4326")
# use the same key name as df_temperature so the two tables can be joined
gdf_sensors = gdf_sensors.rename(columns={'모델 시리얼(*)':'serial_id'})

In [18]:
# Attach the sensor metadata to every daily record; the left join keeps
# readings whose serial_id has no metadata row.
# NOTE: this rebinds df_temperature, so running the cell a second time merges
# the metadata again and pandas suffixes the overlapping columns with _x/_y.
df_temperature = pd.merge(df_temperature, gdf_sensors, how='left', on='serial_id')

In [22]:
# Take the records of a single day (2023-01-01) as the sample, keeping the
# readings and coordinates, and rename the coordinate columns to English.
df_sample = (
    df_temperature
    .loc[
        lambda frame: frame['date'].astype(str).eq('2023-01-01'),
        ['date', 'serial_id', 'temp_celsius', 'humidity', '위도', '경도'],
    ]
    .rename(columns={'위도': 'latitude', '경도': 'longitude'})
)

In [23]:
def clear_outliers(df, column, iqr_multiplier=1.5):
    '''
    Return the rows of ``df`` whose ``column`` value lies within the IQR fences.

    The fences are ``Q1 - iqr_multiplier * IQR`` and ``Q3 + iqr_multiplier * IQR``
    (both inclusive), where Q1/Q3 are the 25th/75th percentiles of ``column``
    and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    column : str
        Name of the numeric column to test.
    iqr_multiplier : float, default 1.5
        Width of the fences in units of the IQR. The default reproduces the
        conventional 1.5 * IQR rule.

    Returns
    -------
    pandas.DataFrame
        The rows inside the fences, with their original index labels.
        Rows where ``column`` is NaN are dropped as well, because a NaN never
        satisfies the range comparison.
    '''
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - iqr_multiplier * iqr
    upper_bound = q3 + iqr_multiplier * iqr
    # between() is inclusive on both ends by default, matching >= / <=
    return df[df[column].between(lower_bound, upper_bound)]


# Filter temperature outliers first, then humidity outliers; the humidity
# fences are computed on the rows that survive the temperature filter.
df_sample = (
    df_sample
    .pipe(clear_outliers, 'temp_celsius')
    .pipe(clear_outliers, 'humidity')
)

In [25]:
# Export the sample as a JSON array with one object per row (orient='records').
# The path is relative to the notebook's working directory.
df_sample.to_json('../src/utils/data/data_sample.json', orient='records')