In [None]:
import json
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd
import pandas as pd
from shapely.geometry import LineString
import os

# Make sure the output folder (mined_data_stay) and the input folder (raw_data)
# exist before any cell reads from or writes to them.
# makedirs(exist_ok=True) avoids the check-then-create race of exists()+mkdir().
for _required_dir in ('mined_data_stay', 'raw_data'):
    os.makedirs(_required_dir, exist_ok=True)

In [None]:
# Region code lookup table, indexed by the '법정동코드' column.
given_keys = pd.read_csv('geoshp/KIKmix_20230701.csv').set_index('법정동코드')

# Load every CSV found one level below raw_data/ into DFS, keyed by the file
# name without its extension. Sub-folders whose name starts with 'od' are
# skipped, as are entries that are not directories.
folder_path = 'raw_data'
DFS = {}
for folder in os.listdir(folder_path):
    temp_path = os.path.join(folder_path, folder)
    if not os.path.isdir(temp_path) or folder.startswith('od'):
        continue
    for filename in os.listdir(temp_path):
        if not filename.endswith(".csv"):
            continue
        df_name, _ = os.path.splitext(filename)
        filepath = os.path.join(temp_path, filename)
        DFS[df_name] = pd.read_csv(filepath)
        print(df_name)

In [None]:
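# Search function
# area: province_city/county_town/township_village names joined with "_";
#       trailing levels may be omitted.
# e.g. 서울특별시_종로구_청운효자동_청운동: OK
# e.g. 서울특별시: OK
# e.g. 서울특별시_종로구_청운동: not OK
# date: "MMDD" as a 4-digit string; always pass a list, even for a single date.
# keywords = ori_only, dest_only, start_time, end_time, gender: exact match
# age, modal, origin_purpose, dest_purpose, od_dist_avg, : range (list, range)
# od_duration_avg, od_cnts : range (list, range)
# NOTE(review): the keyword names listed above do not all match this notebook;
# the searcher defined below reads area, date, time, gender, age, purpose,
# stay_cnts and custom_query.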


def searcher(**kwargs):
    """Collect rows from the loaded daily tables and filter them.

    Reads two module-level objects: ``given_keys`` (the region code table) and
    ``DFS`` (dict mapping table name to DataFrame).

    Keyword arguments (all optional; a falsy value means "not given"):
        area: region names joined with ``_``. Each part must equal some cell
            of a ``given_keys`` row for that row to be kept. When the third
            part ends with '구' it is merged into the second part with a
            space. The '법정동코드' and '행정동코드' values of the surviving
            rows are matched against the ``hdong_cd`` column.
        date: iterable of 'MMDD' strings; each selects the table
            ``DFS['stay_2023<MMDD>_1']``. Without it every table in ``DFS``
            is used.
        time: string prefix matched against the ``time`` column.
        gender: value compared for equality with the ``gender`` column.
        age, purpose, stay_cnts: collections of accepted values for the
            column of the same name.
        custom_query: accepted but currently ignored.

    Returns:
        A DataFrame with the matching rows. When called with no keyword
        arguments at all, prints a message and returns an empty list (kept
        for backward compatibility).

    Raises:
        KeyError: if a requested date has no table in ``DFS``.
    """
    if not kwargs:
        print("No arguments provided")
        return []

    # None means "no area filter"; an empty set means "area matched nothing".
    target_codes = None
    area = kwargs.get('area')
    if area:
        parsed = area.split('_')
        # Only look at the third part when it exists, so short areas such as
        # a bare province name no longer raise IndexError.
        if len(parsed) > 2 and parsed[2].endswith('구'):
            parsed[1:3] = [' '.join(parsed[1:3])]
        tempdf = given_keys.reset_index()
        for item in parsed:
            tempdf = tempdf[tempdf.eq(item).any(axis=1)]
        target_codes = set(tempdf['법정동코드'].values.tolist())
        target_codes.update(tempdf['행정동코드'].values.tolist())

    dates = kwargs.get('date')
    if dates:
        target_dfs = [DFS['stay_2023' + date + '_1'] for date in dates]
    else:
        target_dfs = list(DFS.values())

    # Compare against None rather than truthiness: an area that matches no
    # region must yield no rows instead of silently returning every region.
    if target_codes is not None:
        frames = [df[df['hdong_cd'].isin(target_codes)] for df in target_dfs]
    else:
        frames = target_dfs
    # Concatenate once instead of growing a DataFrame inside the loop.
    rt_df = pd.concat(frames) if frames else pd.DataFrame()

    if kwargs.get('time'):
        rt_df = rt_df[rt_df['time'].str.startswith(kwargs['time'])]
    if kwargs.get('gender'):
        rt_df = rt_df[rt_df['gender'] == kwargs['gender']]
    if kwargs.get('age'):
        rt_df = rt_df[rt_df['age'].isin(kwargs['age'])]
    if kwargs.get('purpose'):
        rt_df = rt_df[rt_df['purpose'].isin(kwargs['purpose'])]
    if kwargs.get('stay_cnts'):
        # Filter on the same key/column that was checked (was 'od_cnts').
        rt_df = rt_df[rt_df['stay_cnts'].isin(kwargs['stay_cnts'])]
    # 'custom_query' is accepted but has no effect yet.
    return rt_df

In [None]:
# Every date of the full period as 'MMDD' strings: 0901-0930, then 1001-1015.
DATES = [
    f'{month}{day:02}'
    for month, last_day in (('09', 30), ('10', 15))
    for day in range(1, last_day + 1)
]

In [None]:
from datetime import datetime, timedelta
from typing import Callable
# 전체 od, 커스텀 쿼리
def analyse(area_dict, groupby_method: Callable, all_dates=False):
    """Run ``searcher`` per festival and date, and write one CSV per festival.

    Args:
        area_dict: mapping of area string (passed to ``searcher`` as ``area``)
            to a space-separated "<festival> <start MMDD> <end MMDD>" string.
        groupby_method: callable applied to each non-empty daily result
            before it is collected. A falsy value (e.g. ``None``) skips the
            aggregation and is written as 'None' in the file name.
        all_dates: when true, use the module-level ``DATES`` list instead of
            the festival's own start..end range.

    Side effects:
        Writes ``mined_data_stay/<festival>_<method name>_<interval>.csv`` for
        each entry, where interval is ``<start>_<end>`` or ``all``. An empty
        DataFrame is written when no date produced any rows.
    """
    analysis = groupby_method.__name__ if groupby_method else 'None'
    for area, spec in area_dict.items():
        fest, *fest_date = spec.split(' ')

        if all_dates:
            dates = DATES
        else:
            # Only build the festival range when it is actually used.
            start_date = datetime.strptime(fest_date[0], '%m%d')
            end_date = datetime.strptime(fest_date[1], '%m%d')
            span = (end_date - start_date).days
            # An end date before the start date gives an empty range.
            dates = [
                (start_date + timedelta(days=offset)).strftime('%m%d')
                for offset in range(span + 1)
            ]

        # Collect the daily frames and concatenate once, instead of growing a
        # DataFrame inside the loop.
        frames = []
        for date in dates:
            rt_df = searcher(area=area, date=[date])
            if rt_df.empty:
                continue
            if groupby_method:
                rt_df = groupby_method(rt_df)
            frames.append(rt_df)
        mined_data = pd.concat(frames) if frames else pd.DataFrame()

        ival = 'all' if all_dates else '_'.join(fest_date)
        mined_data.to_csv(f'mined_data_stay/{fest}_{analysis}_{ival}.csv')


In [None]:
def hourly_rate(rt_df):
    """Sum ``stay_cnts`` per ``time`` value.

    The ``hdong_cd`` column is dropped before grouping. The result has the
    columns ``time``, ``stay_cnts`` and ``date``, where ``date`` is the first
    unique value of the input's ``date`` column, repeated on every row.
    """
    without_region = rt_df.drop(columns='hdong_cd')
    per_time = without_region.groupby(['time'])[['stay_cnts']].sum()
    result = per_time.reset_index()
    result['date'] = rt_df['date'].unique()[0]
    return result

In [None]:
def gender(rt_df):
    """Sum ``stay_cnts`` per ``gender`` value.

    The ``hdong_cd`` column is dropped before grouping. The result has the
    columns ``gender``, ``stay_cnts`` and ``date``, where ``date`` is the
    first unique value of the input's ``date`` column, repeated on every row.
    """
    trimmed = rt_df.drop(columns='hdong_cd')
    per_gender = trimmed.groupby(['gender'])[['stay_cnts']].sum()
    per_gender = per_gender.reset_index()
    per_gender['date'] = rt_df['date'].unique()[0]
    return per_gender

In [None]:
def age(rt_df):
    """Sum ``stay_cnts`` per ``age`` value.

    The ``hdong_cd`` column is dropped before grouping. The result has the
    columns ``age``, ``stay_cnts`` and ``date``, where ``date`` is the first
    unique value of the input's ``date`` column, repeated on every row.
    """
    first_date_of = rt_df['date'].unique
    summed = (
        rt_df.drop(columns='hdong_cd')
        .groupby(['age'])[['stay_cnts']]
        .sum()
        .reset_index()
    )
    summed['date'] = first_date_of()[0]
    return summed

In [None]:
# Festivals to analyse.
# Maps the region where a festival takes place to
# "<festival name> <start date> <end date>" (space-separated, dates as MMDD).
# The region does not have to be perfectly exact; per the original author,
# something as rough as 강원-춘천시-강남동 still works.
# Entries that are commented out below are excluded from the run.
areas = {
    '강원특별자치도_춘천시_강남동_삼천동': '춘천술페스타 0922 0923',
    '강원특별자치도_원주시_명륜1동_명륜동': '원주댄싱카니발 0922 0924',
    #'강원특별자치도_원주시_중앙동_중앙동': '원주치맥축제 0904 0907',
    '강원특별자치도_평창군_용평면_장평리': '평창농악축제 0915 0918',
    '강원특별자치도_동해시_천곡동_천곡동': '동해무릉제 0922 0924',
    #'강원특별자치도_횡성군_횡성읍_북천리': '횡성한우축제 1006 1010',
    '강원특별자치도_횡성군_청일면_유동리': '횡성더덕축제 0915 0917',
    '강원특별자치도_정선군_정선읍_봉양리': '정선아리랑제 0914 0917',
    #'강원특별자치도_속초시_조양동_조양동': '설악문화제 1006 1008',
}


In [None]:
# Aggregate stay counts per time value for every configured festival, using
# the full DATES list rather than each festival's own date range.
analyse(area_dict=areas, groupby_method=hourly_rate, all_dates=True)
