In [99]:
import json
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd
import pandas as pd
from shapely.geometry import LineString
import os

# Make sure the output folder (mined_data) and the input folder (raw_data)
# are present before any later cell tries to read from or write to them.
for _required_dir in ('mined_data', 'raw_data'):
    if os.path.exists(_required_dir):
        continue
    os.mkdir(_required_dir)

In [4]:
# Region lookup table, indexed by the '법정동코드' column. It is used later to
# translate region names into the codes found in the OD tables.
given_keys = pd.read_csv('geoshp/KIKmix_20230701.csv').set_index('법정동코드')

# Read every *.csv that sits one directory level below raw_data/ into DFS,
# keyed by the file name without its extension (e.g. 'od_20230926_1').
folder_path = 'raw_data'
DFS = {}
for folder in os.listdir(folder_path):
    if os.path.isdir(os.path.join(folder_path, folder)):
        temp_path = os.path.join(folder_path, folder)
        for filename in os.listdir(temp_path):
            if not filename.endswith(".csv"):
                continue
            filepath = os.path.join(temp_path, filename)
            df_name = os.path.splitext(filename)[0]
            DFS[df_name] = pd.read_csv(filepath)
            print(df_name)

od_20230926_1
od_20230924_1
od_20230929_1
od_20230928_1
od_20230921_1
od_20230927_1
od_20230930_1
od_20230925_1
od_20230923_1
od_20230922_1
od_20230914_1
od_20230916_1
od_20230918_1
od_20230915_1
od_20230919_1
od_20230913_1
od_20230911_1
od_20230912_1
od_20230920_1
od_20230917_1


In [100]:
def searcher(**kwargs):
    """Filter the loaded OD tables by region, date and trip attributes.

    Reads two module-level objects: ``given_keys`` (region lookup table that
    contains the '법정동코드' and '행정동코드' columns) and ``DFS`` (dict of
    daily OD DataFrames keyed ``'od_2023' + MMDD + '_1'``).

    Keyword Args:
        area (str): Region path joined with underscores, in the order
            province_city/county_town_village. Trailing levels may be
            omitted, so both '서울특별시_종로구_청운효자동_청운동' and
            '서울특별시' are accepted. The original author's note marks
            '서울특별시_종로구_청운동' (a skipped level) as unsupported.
            When the third component ends with '구' it is merged with the
            second one using a space before the lookup.
        date (list[str] | str): Dates as four-digit 'MMDD' strings. A single
            string is also accepted. Dates with no loaded table are skipped
            with a warning. When omitted, every loaded table is used.
        ori_only (bool): Match the region against the origin code only.
        dest_only (bool): Match the region against the destination code
            only. Without either flag, origin OR destination may match.
        start_time (str), end_time (str): Keep rows whose column value
            starts with the given prefix.
        gender: Keep rows whose 'gender' equals this value.
        age, modal, origin_purpose, dest_purpose, od_dist_avg,
        od_duration_avg, od_cnts (list | range): Keep rows whose column
            value is contained in the given collection. This is a
            membership test, not an interval check.
        custom_query: Accepted for compatibility but currently ignored.

    Returns:
        pandas.DataFrame: The matching rows, concatenated in table order with
        their original index labels. It is empty when nothing matches.
        When called without any keyword argument the function prints a notice
        and returns an empty ``list`` (kept for backward compatibility).
    """
    import warnings  # local import keeps the notebook's import cell untouched

    if not kwargs:
        print("No arguments provided")
        return []

    # Resolve the region name into the set of codes it covers. None means
    # "no region filter"; an empty set means "region given but nothing matched".
    target_codes = None
    area = kwargs.get('area')
    if area:
        parsed = area.split('_')
        # Guard the index: short paths such as '서울특별시' have no third part.
        if len(parsed) > 2 and parsed[2].endswith('구'):
            parsed[1:3] = [' '.join(parsed[1:3])]
        key_table = given_keys.reset_index()
        for token in parsed:
            # Keep rows in which any column equals this path component.
            key_table = key_table[key_table.eq(token).any(axis=1)]
        target_codes = set(key_table['법정동코드'].tolist())
        target_codes.update(key_table['행정동코드'].tolist())

    # Pick the daily tables to search.
    dates = kwargs.get('date')
    if dates:
        if isinstance(dates, str):
            dates = [dates]
        target_dfs = []
        for date in dates:
            key = 'od_2023' + date + '_1'
            if key in DFS:
                target_dfs.append(DFS[key])
            else:
                warnings.warn(
                    f"no OD table loaded for date {date!r} ({key}); skipping",
                    stacklevel=2,
                )
    else:
        target_dfs = list(DFS.values())

    # Apply the region filter per table, then concatenate once.
    frames = []
    for item in target_dfs:
        if target_codes is None:
            frames.append(item)
            continue
        if kwargs.get('ori_only'):
            query = item['origin_hdong_cd'].isin(target_codes)
        elif kwargs.get('dest_only'):
            query = item['dest_hdong_cd'].isin(target_codes)
        else:
            query = (item['origin_hdong_cd'].isin(target_codes)
                     | item['dest_hdong_cd'].isin(target_codes))
        frames.append(item[query])
    rt_df = pd.concat(frames) if frames else pd.DataFrame()
    if rt_df.empty:
        # Nothing left to filter; also avoids column lookups on a frame that
        # has no columns at all.
        return rt_df

    for column in ('start_time', 'end_time'):
        prefix = kwargs.get(column)
        if prefix:
            rt_df = rt_df[rt_df[column].str.startswith(prefix)]

    # Compare against None so that a falsy value such as 0 still filters.
    gender = kwargs.get('gender')
    if gender is not None:
        rt_df = rt_df[rt_df['gender'] == gender]

    for column in ('age', 'modal', 'origin_purpose', 'dest_purpose',
                   'od_dist_avg', 'od_duration_avg', 'od_cnts'):
        allowed = kwargs.get(column)
        if allowed:
            rt_df = rt_df[rt_df[column].isin(allowed)]

    return rt_df

In [101]:
# Every date of the whole study period as 'MMDD' strings:
# September 1-30 followed by October 1-15.
DATES = [
    f'{month}{day:02}'
    for month, last_day in (('09', 30), ('10', 15))
    for day in range(1, last_day + 1)
]

In [102]:
from datetime import datetime, timedelta
from typing import Callable
def analyse(area_dict, groupby_method: Callable, all_dates=False):
    """Run one aggregation per festival and write the result to a CSV file.

    For every entry of ``area_dict`` the function queries ``searcher`` day by
    day for trips whose destination lies in the festival's region
    (``dest_only=True``), optionally aggregates each day's rows with
    ``groupby_method``, stacks the daily results and writes them to
    ``mined_data/{festival}_{analysis}_{interval}.csv``.

    Args:
        area_dict (dict[str, str]): Maps a region string (as understood by
            ``searcher``'s ``area`` argument) to
            ``'<festival name> <start MMDD> <end MMDD>'``. The festival name
            may itself contain spaces; the last two whitespace-separated
            tokens are taken as the dates.
        groupby_method (Callable | None): Function applied to each non-empty
            daily DataFrame. Its ``__name__`` becomes part of the file name.
            With ``None`` the raw rows are exported and the name part is
            ``'None'``.
        all_dates (bool): When true, use every date in the module-level
            ``DATES`` list instead of the festival's own date range, and
            ``'all'`` as the interval part of the file name.

    Raises:
        ValueError: If a value of ``area_dict`` does not contain a name and
            two dates, or a date is not in ``MMDD`` form.
    """
    # The output folder may be missing when this is called on its own.
    os.makedirs('mined_data', exist_ok=True)
    analysis = groupby_method.__name__ if groupby_method else 'None'

    for area, spec in area_dict.items():
        pieces = spec.rsplit(maxsplit=2)
        if len(pieces) != 3:
            raise ValueError(
                f"expected '<festival> <MMDD> <MMDD>' for {area!r}, got {spec!r}"
            )
        fest, first_day, last_day = pieces

        # Expand the inclusive start..end range into individual MMDD strings.
        current_date = datetime.strptime(first_day, '%m%d')
        end_date = datetime.strptime(last_day, '%m%d')
        dates = []
        while current_date <= end_date:
            dates.append(current_date.strftime('%m%d'))
            current_date += timedelta(days=1)
        if all_dates:
            dates = DATES

        # Collect the daily pieces and concatenate once at the end.
        daily_results = []
        for date in dates:
            rt_df = searcher(area=area, date=[date], dest_only=True)
            if rt_df.empty:
                continue
            if groupby_method:
                rt_df = groupby_method(rt_df)
            daily_results.append(rt_df)
        mined_data = pd.concat(daily_results) if daily_results else pd.DataFrame()

        ival = 'all' if all_dates else '_'.join([first_day, last_day])
        mined_data.to_csv(f'mined_data/{fest}_{analysis}_{ival}.csv')


In [103]:
def public_transport(rt_df):
    """Summarise trips by car versus every other travel mode.

    Rows with ``modal == 0`` are counted as car trips and all remaining rows
    as public transport. Per the original author's note, mode 4 (walking) is
    treated as public transport for now.

    Args:
        rt_df (pandas.DataFrame): Trip rows with the columns 'modal',
            'od_cnts', 'od_duration_avg', 'od_dist_avg' and 'date'.

    Returns:
        pandas.DataFrame: One row per unique value of ``rt_df['date']``
        (all rows carry the same figures) with trip counts, count-weighted
        total duration/distance and their per-trip averages for car, public
        and both combined. An average is 0 when its trip count is 0.
    """
    def _average(total, count):
        # Per-trip average rounded to 3 decimals; 0 when there are no trips.
        return round(total / count, 3) if count else 0

    is_car = rt_df['modal'].isin([0])
    car = rt_df[is_car]
    public = rt_df[~is_car]

    result = {}
    result['car_cnt'] = car['od_cnts'].sum()
    result['car_net_duration'] = (car['od_duration_avg'] * car['od_cnts']).sum()
    result['car_net_dist'] = (car['od_dist_avg'] * car['od_cnts']).sum()
    result['car_avg_duration'] = _average(result['car_net_duration'], result['car_cnt'])
    # Guarded like every other average so a day without car trips gives 0, not NaN.
    result['car_avg_dist'] = _average(result['car_net_dist'], result['car_cnt'])

    result['public_cnt'] = public['od_cnts'].sum()
    result['public_duration'] = (public['od_duration_avg'] * public['od_cnts']).sum()
    result['public_dist'] = (public['od_dist_avg'] * public['od_cnts']).sum()
    result['public_avg_duration'] = _average(result['public_duration'], result['public_cnt'])
    result['public_avg_dist'] = _average(result['public_dist'], result['public_cnt'])

    result['net_cnt'] = result['car_cnt'] + result['public_cnt']
    result['avg_duration'] = _average(
        result['car_net_duration'] + result['public_duration'], result['net_cnt'])
    result['avg_dist'] = _average(
        result['car_net_dist'] + result['public_dist'], result['net_cnt'])

    return pd.DataFrame(data=result, index=rt_df['date'].unique())



In [104]:
def hourly_rate(rt_df):
    """Aggregate trips by their 'end_time' value.

    The 'origin_hdong_cd' column is dropped, count-weighted distance and
    duration totals are built per row, and everything is summed per
    'end_time'. Per-trip averages are derived from those sums.

    Args:
        rt_df (pandas.DataFrame): Trip rows with the columns
            'origin_hdong_cd', 'end_time', 'od_cnts', 'od_dist_avg',
            'od_duration_avg' and 'date'.

    Returns:
        pandas.DataFrame: Indexed by 'end_time' with the columns 'date'
        (the first unique date found in ``rt_df``), 'od_cnts', 'od_dist',
        'od_duration', 'avg_dist' and 'avg_duration', in that order.
    """
    trips = rt_df.drop(columns='origin_hdong_cd')
    trips = trips.assign(
        od_dist=trips['od_dist_avg'] * trips['od_cnts'],
        od_duration=trips['od_duration_avg'] * trips['od_cnts'],
    )

    summed_columns = ['od_cnts', 'od_dist', 'od_duration']
    per_slot = trips.groupby(['end_time'])[summed_columns].sum()
    per_slot['avg_dist'] = per_slot['od_dist'] / per_slot['od_cnts']
    per_slot['avg_duration'] = per_slot['od_duration'] / per_slot['od_cnts']

    # Put the date in front so the column order is date, sums, averages.
    per_slot.insert(0, 'date', rt_df['date'].unique()[0])
    return per_slot

In [105]:
def inner_outer(rt_df):
    """Split trips into same-city, other-city and capital-area origins.

    Three (overlapping) subsets are summarised:

    * ``inner``: origin and destination codes agree after integer division
      by 1,000,000. The original author's note describes this as "same city:
      first 4 digits equal", which presumes 10-digit codes (TODO confirm).
    * ``outer``: the complement of ``inner``.
    * ``central``: the origin code divided by 100,000,000 is one of 41, 28
      or 11, which the original note calls the capital area (Seoul,
      Gyeonggi, Incheon).

    Args:
        rt_df (pandas.DataFrame): Trip rows with the columns
            'origin_hdong_cd', 'dest_hdong_cd', 'od_cnts', 'od_duration_avg',
            'od_dist_avg' and 'date'.

    Returns:
        pandas.DataFrame: One row per unique value of ``rt_df['date']``. For
        each subset it holds the trip count, count-weighted duration and
        distance totals, per-trip averages and the number of distinct origin
        codes, followed by 'net_cnt' and each subset's share of all trips.
        Averages and shares are 0 when their denominator is 0.
    """
    si = 1000000
    origin_city = rt_df['origin_hdong_cd'] // si
    dest_city = rt_df['dest_hdong_cd'] // si
    central_cd = [41, 28, 11]
    from_central = (rt_df['origin_hdong_cd'] // (si * 100)).isin(central_cd)

    # Explicit (name, rows) pairs; the order fixes the output column order.
    subsets = (
        ('inner', rt_df[origin_city == dest_city]),
        ('outer', rt_df[origin_city != dest_city]),
        ('central', rt_df[from_central]),
    )

    result = {}
    for name, df in subsets:
        cnt = df['od_cnts'].sum()
        duration = (df['od_duration_avg'] * df['od_cnts']).sum()
        dist = (df['od_dist_avg'] * df['od_cnts']).sum()
        result[name + '_cnt'] = cnt
        result[name + '_duration'] = duration
        result[name + '_dist'] = dist
        result[name + '_avg_duration'] = round(duration / cnt, 3) if cnt else 0
        result[name + '_avg_dist'] = round(dist / cnt, 3) if cnt else 0
        result[name + '_unique'] = df['origin_hdong_cd'].nunique()

    net_cnt = rt_df['od_cnts'].sum()
    result['net_cnt'] = net_cnt
    for name, _ in subsets:
        # Same zero guard as the averages above, so no NaN ratios appear.
        result[name + '_ratio'] = round(result[name + '_cnt'] / net_cnt, 3) if net_cnt else 0

    return pd.DataFrame(data=result, index=rt_df['date'].unique())

In [106]:
# Festivals to analyse.
# Each entry maps "region where the festival takes place" to
# "festival name, start date, end date" (dates written as MMDD).
# Author's note: the region does not have to be very precise; something as
# coarse as 강원-춘천시-강남동 is said to work as well.
# Entries that are commented out below are skipped by the analysis.
areas = {
    '강원특별자치도_춘천시_강남동_삼천동': '춘천술페스타 0922 0923',
    '강원특별자치도_원주시_명륜1동_명륜동': '원주댄싱카니발 0922 0924',
    #'강원특별자치도_원주시_중앙동_중앙동': '원주치맥축제 0904 0907',
    '강원특별자치도_평창군_용평면_장평리': '평창농악축제 0915 0918',
    '강원특별자치도_동해시_천곡동_천곡동': '동해무릉제 0922 0924',
    #'강원특별자치도_횡성군_횡성읍_북천리': '횡성한우축제 1006 1010',
    '강원특별자치도_횡성군_청일면_유동리': '횡성더덕축제 0915 0917',
    '강원특별자치도_정선군_정선읍_봉양리': '정선아리랑제 0914 0917',
    #'강원특별자치도_속초시_조양동_조양동': '설악문화제 1006 1008',
}


In [107]:
# Write one CSV per festival with the car vs. public-transport summary,
# covering each festival's own date range (all_dates is left at False).
analyse(areas, public_transport)

In [108]:
# Write one CSV per festival with trips aggregated by 'end_time',
# covering each festival's own date range.
analyse(areas, hourly_rate)

In [109]:
# Write one CSV per festival with the inner / outer / capital-area origin
# breakdown, covering each festival's own date range.
analyse(areas, inner_outer)

In [110]:
# Bundle every file in mined_data/ into mined_data.zip. The leading `!` runs
# the line as a shell command, so the `zip` tool must be installed.
!zip mined_data.zip mined_data/*

  adding: mined_data/정선아리랑제_hourly_rate_0914_0917.csv (deflated 56%)
  adding: mined_data/횡성더덕축제_hourly_rate_0915_0917.csv (deflated 56%)
  adding: mined_data/평창농악축제_hourly_rate_0915_0918.csv (deflated 57%)
  adding: mined_data/춘천술페스타_hourly_rate_0922_0923.csv (deflated 55%)
  adding: mined_data/원주댄싱카니발_hourly_rate_0922_0924.csv (deflated 56%)
  adding: mined_data/동해무릉제_hourly_rate_0922_0924.csv (deflated 53%)
  adding: mined_data/정선아리랑제_inner_outer_0914_0917.csv (deflated 53%)
  adding: mined_data/횡성더덕축제_inner_outer_0915_0917.csv (deflated 54%)
  adding: mined_data/평창농악축제_inner_outer_0915_0918.csv (deflated 53%)
  adding: mined_data/춘천술페스타_inner_outer_0922_0923.csv (deflated 55%)
  adding: mined_data/원주댄싱카니발_inner_outer_0922_0924.csv (deflated 54%)
  adding: mined_data/동해무릉제_inner_outer_0922_0924.csv (deflated 53%)
  adding: mined_data/정선아리랑제_public_transport_0914_0917.csv (deflated 51%)
  adding: mined_data/횡성더덕축제_public_transport_0915_0917.csv (deflated 49%)
  adding: mined_data/평창농