## 구글 드라이브 마운트

In [1]:
from google.colab import drive
import os

drive.mount('/content/drive')
os.chdir('/content/drive/MyDrive/BigContest')

MessageError: Error: credential propagation was unsuccessful

## 라이브러리

In [None]:
import dask.dataframe as dd
import time
import pandas as pd
import numpy as np
from tqdm import tqdm
from datetime import datetime

# -------------------------------------------------
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler

## 데이터 불러오기

In [None]:
od = dd.read_csv('data/OD_all.csv', assume_missing = True, header = 0)

In [None]:
od.head()

In [None]:
od.start_time = od.start_time.str[:2]
od.end_time = od.end_time.str[:2]
od.origin_hdong_cd = od.origin_hdong_cd.astype('str')
od.dest_hdong_cd = od.dest_hdong_cd.astype('str')
od.date = od.date.astype('int')
od.origin_purpose = od.origin_purpose.astype('str')
od.dest_purpose = od.dest_purpose.astype('str')

In [None]:
od.origin_hdong_cd = od.origin_hdong_cd.astype('str')
od.dest_hdong_cd = od.dest_hdong_cd.astype('str')
od.date = od.date.astype('int')
od.modal = od.modal.astype('str')
od.origin_purpose = od.origin_purpose.astype('str')
od.dest_purpose = od.dest_purpose.astype('str')

In [None]:
od.head()

In [None]:
od['origin'] = od.origin_hdong_cd.str[:2]
od.origin = od.origin.astype('string')
od.head()

## 방문 데이터

In [None]:
def get_visit_data(od, origin):
  od = change_types(od)
  filtered_od = od[od.origin_hdong_cd.str[:2] == origin]
  filtered_od = filtered_od[filtered_od.dest_purpose.isin(['3.0', '4.0', '5.0'])]
  filtered_od = filtered_od[filtered_od.modal != '0']
  return filtered_od

In [None]:
def change_types(od):
  od.modal = od.modal.astype('str')
  od.origin_purpose = od.origin_purpose.astype('str')
  od.dest_purpose = od.dest_purpose.astype('str')
  od.od_cnts = od.od_cnts.astype('int')

  return od

### 대전

In [None]:
od_d = od[od.dest_hdong_cd == '3020055000.0'].compute()

In [None]:
# 출발지: 서울, 도착지: 대전 축제 지역
od_sd = get_visit_data(od_d, '11')
od_sd.shape

(80, 14)

In [None]:
# 출발지: 부산역, 도착지: 대전 축제 지역
od_bd = get_visit_data(od_d, '26')
od_bd.shape

(19, 14)

In [None]:
# 출발지: 대전 전체, 도착지: 대전 축제 지역
od_dd = get_visit_data(od_d, '30')
od_dd.shape

(22716, 14)

In [None]:
d = pd.concat([od_sd, od_bd, od_dd])
d

Unnamed: 0,origin_hdong_cd,dest_hdong_cd,date,start_time,end_time,gender,age,modal,origin_purpose,dest_purpose,od_dist_avg,od_duration_avg,od_cnts,origin
416540,1117062500.0,3020055000.0,20230902,12,16,1.0,0.0,0.0,0.0,5.0,272315.0,285.0,21,11
418637,1168065500.0,3020055000.0,20230902,10,14,0.0,0.0,0.0,0.0,5.0,262264.0,233.0,27,11
257288,1150062000.0,3020055000.0,20230902,10,14,0.0,0.0,5.0,0.0,5.0,283507.0,214.0,27,11
467630,1171053200.0,3020055000.0,20230902,10,13,0.0,1.0,0.0,2.0,4.0,294463.0,172.0,9,11
516480,1165066000.0,3020055000.0,20230902,17,19,0.0,0.0,0.0,0.0,5.0,158665.0,156.0,27,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
700621,3017064000.0,3020055000.0,20231015,15,15,1.0,3.0,0.0,4.0,4.0,11082.0,19.0,5,30
701919,3020052700.0,3020055000.0,20231015,15,15,1.0,4.0,0.0,0.0,4.0,13946.0,30.0,5,30
704557,3020060000.0,3020055000.0,20231015,10,10,1.0,3.0,1.0,5.0,5.0,17542.0,26.0,5,30
708038,3017065000.0,3020055000.0,20231015,14,15,1.0,4.0,0.0,3.0,3.0,14900.0,40.0,5,30


### 부산

In [None]:
od_b = od[od.dest_hdong_cd == '2635052000.0'].compute()

In [None]:
# 출발지: 서울, 도착지: 부산 축제 지역
od_sb = get_visit_data(od_b, '11')
od_sb.shape

(10, 13)

In [None]:
# 출발지: 부산, 도착지: 부산 축제 지역
od_bb = get_visit_data(od_b, '26')
od_bb.shape

(11978, 13)

In [None]:
b = pd.concat([od_sb, od_bb])
b

Unnamed: 0,origin_hdong_cd,dest_hdong_cd,date,start_time,end_time,gender,age,modal,origin_purpose,dest_purpose,od_dist_avg,od_duration_avg,od_cnts
544556,1150062000.0,2635052000.0,20230908,19,22,0.0,3.0,7,4.0,5.0,801753.0,165.0,6
677801,1150062000.0,2635052000.0,20230910,10,13,1.0,3.0,7,5.0,5.0,775150.0,201.0,6
171622,1150062000.0,2635052000.0,20230929,19,21,0.0,3.0,7,4.0,5.0,738005.0,155.0,6
61462,1150062000.0,2635052000.0,20231004,15,18,0.0,2.0,7,4.0,5.0,709303.0,196.0,6
563449,1150062000.0,2635052000.0,20231005,09,12,0.0,3.0,7,4.0,5.0,624110.0,202.0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
659831,2635051000.0,2635052000.0,20231015,12,13,1.0,3.0,2,5.0,5.0,86961.0,67.0,6
667263,2671025000.0,2635052000.0,20231015,15,16,1.0,2.0,1,4.0,3.0,41651.0,55.0,5
670299,2650076000.0,2635052000.0,20231015,17,17,1.0,2.0,1,0.0,3.0,11408.0,17.0,5
689070,2635053000.0,2635052000.0,20231015,16,16,0.0,3.0,2,0.0,3.0,11429.0,26.0,6


### 임실

In [None]:
od_i = od[od.dest_hdong_cd == '4575034000.0'].compute()

In [None]:
# 출발지: 서울, 도착지: 임실 축제 지역
od_si = get_visit_data(od_i, '11')
od_si.shape

(16, 13)

In [None]:
# 출발지: 부산, 도착지: 임실 축제 지역
od_bi = get_visit_data(od_i, '26')
od_bi.shape

(6, 13)

In [None]:
# 출발지: 임실, 도착지: 임실 축제 지역
od_ii = get_visit_data(od_i, '45')
od_ii.shape

(4002, 13)

In [None]:
i = pd.concat([od_si, od_bi, od_ii])
i

Unnamed: 0,origin_hdong_cd,dest_hdong_cd,date,start_time,end_time,gender,age,modal,origin_purpose,dest_purpose,od_dist_avg,od_duration_avg,od_cnts
55522,1114065000.0,4575034000.0,20230901,17:00,23:00,0.0,3.0,0.0,4.0,5.0,824091.0,361.0,5
501264,1123056000.0,4575034000.0,20230918,09:00,15:00,0.0,3.0,0.0,4.0,5.0,521635.0,367.0,5
115509,1168064000.0,4575034000.0,20231001,13:00,18:00,0.0,3.0,0.0,5.0,5.0,662898.0,282.0,5
288839,1168065500.0,4575034000.0,20231001,14:00,18:00,0.0,3.0,0.0,4.0,5.0,646915.0,239.0,5
534714,1165065100.0,4575034000.0,20231003,09:00,14:00,0.0,3.0,0.0,0.0,5.0,636641.0,325.0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
579197,4511358000.0,4575034000.0,20231015,11:00,12:00,0.0,3.0,0.0,4.0,4.0,53509.0,60.0,5
615468,4511354000.0,4575034000.0,20231015,09:00,11:00,0.0,3.0,0.0,5.0,5.0,108036.0,111.0,5
629618,4511173000.0,4575034000.0,20231015,10:00,11:00,0.0,3.0,0.0,0.0,4.0,114664.0,112.0,5
640075,4511173000.0,4575034000.0,20231015,08:00,09:00,0.0,3.0,0.0,5.0,5.0,61838.0,41.0,5


### 서울

In [None]:
od_s = od[od.dest_hdong_cd == '1156054000.0'].compute()

In [None]:
# 출발지: 서울, 도착지: 서울 축제 지역
od_ss = get_visit_data(od_s, '11')
od_ss.shape

(39621, 13)

In [None]:
# 출발지: 부산, 도착지: 서울 축제 지역
od_bs = get_visit_data(od_s, '26')
od_bs.shape

(6, 13)

In [None]:
s = pd.concat([od_ss, od_bs])
s

Unnamed: 0,origin_hdong_cd,dest_hdong_cd,date,start_time,end_time,gender,age,modal,origin_purpose,dest_purpose,od_dist_avg,od_duration_avg,od_cnts
1324,1114055000.0,1156054000.0,20230901,16:00,17:00,1.0,3.0,2.0,1.0,3.0,28195.0,41.0,7
4181,1150061100.0,1156054000.0,20230901,12:00,13:00,1.0,0.0,0.0,5.0,5.0,66357.0,52.0,19
4648,1154551000.0,1156054000.0,20230901,17:00,18:00,0.0,3.0,0.0,1.0,3.0,50964.0,42.0,7
5032,1156051500.0,1156054000.0,20230901,13:00,13:00,1.0,2.0,1.0,0.0,3.0,12739.0,19.0,7
6486,1156053500.0,1156054000.0,20230901,18:00,18:00,0.0,3.0,0.0,4.0,4.0,17311.0,21.0,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...
43552,2671025000.0,1156054000.0,20230918,11:00,19:00,1.0,0.0,0.0,5.0,4.0,522036.0,445.0,19
319417,2644051000.0,1156054000.0,20230920,10:00,18:00,0.0,2.0,0.0,4.0,3.0,962175.0,470.0,5
125661,2644051000.0,1156054000.0,20230920,12:00,19:00,0.0,3.0,0.0,4.0,4.0,916380.0,443.0,5
72524,2620065000.0,1156054000.0,20230923,12:00,17:00,1.0,0.0,0.0,0.0,3.0,663418.0,257.0,19


### 강릉(1)

In [None]:
od_ga = od[od.dest_hdong_cd == '5115057200.0'].compute()

In [None]:
# 출발지: 서울, 도착지: 강릉 스피드 스케이팅 경기장
od_sga = get_visit_data(od_ga, '11')
od_sga.shape

(673, 13)

In [None]:
# 출발지: 부산, 도착지: 강릉 스피드 스케이팅 경기장
od_bga = get_visit_data(od_ga, '26')
od_bga.shape

(5, 13)

In [None]:
# 출발지: 강릉, 도착지: 강릉 스피드 스케이팅 경기장
od_gga = get_visit_data(od_ga, '51')
od_gga.shape

(39919, 13)

In [None]:
ga = pd.concat([od_bga, od_sga, od_gga])
ga

Unnamed: 0,origin_hdong_cd,dest_hdong_cd,date,start_time,end_time,gender,age,modal,origin_purpose,dest_purpose,od_dist_avg,od_duration_avg,od_cnts
118062,2671025600.0,5115057200.0,20230921,16:00,21:00,1.0,4.0,0.0,0.0,5.0,816508.0,321.0,5
379158,2671025600.0,5115057200.0,20230921,16:00,21:00,0.0,1.0,0.0,0.0,5.0,490979.0,316.0,6
378192,2623067000.0,5115057200.0,20230928,11:00,19:00,1.0,4.0,0.0,0.0,5.0,761377.0,468.0,5
646900,2626076100.0,5115057200.0,20230929,08:00,14:00,0.0,0.0,0.0,0.0,5.0,659219.0,357.0,35
274051,2638056200.0,5115057200.0,20231007,13:00,18:00,1.0,3.0,0.0,4.0,5.0,886946.0,306.0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
711120,5115052000.0,5115057200.0,20231015,12:00,13:00,1.0,4.0,0.0,3.0,3.0,40098.0,14.0,5
713687,5115057100.0,5115057200.0,20231015,20:00,20:00,0.0,2.0,0.0,3.0,3.0,7792.0,16.0,5
713795,5115056000.0,5115057200.0,20231015,17:00,17:00,1.0,3.0,1.0,0.0,3.0,14336.0,9.0,5
714799,5115066500.0,5115057200.0,20231015,13:00,15:00,1.0,1.0,0.0,3.0,5.0,320011.0,101.0,6


### 강릉(2)

In [None]:
od_gb = od[od.dest_hdong_cd == '5115058000.0'].compute()

In [None]:
# 출발지: 서울, 도착지: 강릉 경포호수광장
od_sgb = get_visit_data(od_gb, '11')
od_sgb.shape

(261, 13)

In [None]:
# 출발지: 부산, 도착지: 강릉 경포호수광장
od_bgb = get_visit_data(od_gb, '26')
od_bgb.shape

(5, 13)

In [None]:
# 출발지: 강릉, 도착지: 강릉 경포호수광장
od_ggb = get_visit_data(od_gb, '51')
od_ggb.shape

(23446, 13)

In [None]:
gb = pd.concat([od_sgb, od_bgb, od_ggb])
gb

Unnamed: 0,origin_hdong_cd,dest_hdong_cd,date,start_time,end_time,gender,age,modal,origin_purpose,dest_purpose,od_dist_avg,od_duration_avg,od_cnts
29620,1165058100.0,5115058000.0,20230901,17:00,21:00,1.0,3.0,0.0,4.0,5.0,567221.0,242.0,7
86698,1174054000.0,5115058000.0,20230901,11:00,14:00,0.0,1.0,0.0,0.0,5.0,356188.0,182.0,7
36586,1135071000.0,5115058000.0,20230901,10:00,17:00,0.0,1.0,0.0,4.0,5.0,205557.0,412.0,7
174691,1135062100.0,5115058000.0,20230901,15:00,18:00,0.0,1.0,0.0,0.0,5.0,244374.0,173.0,7
534441,1168059000.0,5115058000.0,20230901,10:00,15:00,0.0,4.0,0.0,4.0,5.0,530850.0,318.0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
630175,5176034000.0,5115058000.0,20231015,12:00,14:00,0.0,4.0,0.0,5.0,5.0,227797.0,94.0,6
635330,5115064500.0,5115058000.0,20231015,11:00,12:00,0.0,5.0,0.0,0.0,4.0,14623.0,22.0,5
640355,5176038000.0,5115058000.0,20231015,13:00,14:00,0.0,3.0,0.0,5.0,5.0,92098.0,45.0,5
663406,5115051000.0,5115058000.0,20231015,11:00,11:00,0.0,3.0,0.0,5.0,5.0,29664.0,19.0,5


### concat

In [None]:
od_dd.head()

Unnamed: 0,origin_hdong_cd,dest_hdong_cd,date,start_time,end_time,gender,age,modal,origin_purpose,dest_purpose,od_dist_avg,od_duration_avg,od_cnts,origin
98311,3020054000,3020055000,20230901,11,11,0,5,1,1,3,8539.0,15.0,12,30
99728,3017055500,3020055000,20230901,9,10,0,0,4,0,3,3878.0,40.0,27,30
99999,3020060000,3020055000,20230901,9,9,1,3,1,0,4,11536.0,21.0,8,30
122004,3017064000,3020055000,20230901,11,11,1,0,1,0,3,4817.0,13.0,21,30
129692,3020054000,3020055000,20230901,11,11,0,3,1,1,3,6842.0,8.0,19,30


In [None]:
visit = pd.concat([d, b, i, s, ga, gb])
visit

Unnamed: 0,origin_hdong_cd,dest_hdong_cd,date,start_time,end_time,gender,age,modal,origin_purpose,dest_purpose,od_dist_avg,od_duration_avg,od_cnts,origin
416540,1117062500.0,3020055000.0,20230902,12,16,1.0,0.0,0.0,0.0,5.0,272315.0,285.0,21,11
418637,1168065500.0,3020055000.0,20230902,10,14,0.0,0.0,0.0,0.0,5.0,262264.0,233.0,27,11
257288,1150062000.0,3020055000.0,20230902,10,14,0.0,0.0,5.0,0.0,5.0,283507.0,214.0,27,11
467630,1171053200.0,3020055000.0,20230902,10,13,0.0,1.0,0.0,2.0,4.0,294463.0,172.0,9,11
516480,1165066000.0,3020055000.0,20230902,17,19,0.0,0.0,0.0,0.0,5.0,158665.0,156.0,27,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
630175,5176034000.0,5115058000.0,20231015,12:00,14:00,0.0,4.0,0.0,5.0,5.0,227797.0,94.0,6,
635330,5115064500.0,5115058000.0,20231015,11:00,12:00,0.0,5.0,0.0,0.0,4.0,14623.0,22.0,5,
640355,5176038000.0,5115058000.0,20231015,13:00,14:00,0.0,3.0,0.0,5.0,5.0,92098.0,45.0,5,
663406,5115051000.0,5115058000.0,20231015,11:00,11:00,0.0,3.0,0.0,5.0,5.0,29664.0,19.0,5,


In [None]:
visit.to_csv('data/visit.csv')

## 귀가 데이터

In [None]:
def get_home_data(od, origin):
  od = change_types(od)
  filtered_od = od[od.origin_hdong_cd == origin].compute()
  filtered_od = filtered_od[filtered_od.origin_purpose.isin(['3.0', '4.0', '5.0'])]
  filtered_od = filtered_od[filtered_od.modal != '0']
  return filtered_od

In [None]:
def change_types(od):
  od.modal = od.modal.astype('str')
  od.origin_purpose = od.origin_purpose.astype('str')
  od.dest_purpose = od.dest_purpose.astype('str')
  od.od_cnts = od.od_cnts.astype('int')

  return od

### 대전

In [None]:
home_d = get_home_data(od, '3020055000.0')
home_d

Unnamed: 0,origin_hdong_cd,dest_hdong_cd,date,start_time,end_time,gender,age,modal,origin_purpose,dest_purpose,od_dist_avg,od_duration_avg,od_cnts
107,3020055000.0,4372039000.0,20230901,08:00,09:00,0.0,3.0,0.0,4.0,4.0,153144.0,56.0,7
1416,3020055000.0,3023052500.0,20230901,08:00,09:00,0.0,5.0,0.0,4.0,4.0,14091.0,31.0,8
2336,3020055000.0,3017058800.0,20230901,21:00,21:00,1.0,2.0,0.0,3.0,0.0,10497.0,27.0,7
2484,3020055000.0,3020054000.0,20230901,17:00,18:00,0.0,2.0,0.0,3.0,4.0,35412.0,49.0,7
11393,3020055000.0,3020054000.0,20230901,09:00,10:00,1.0,1.0,0.0,5.0,5.0,15195.0,60.0,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...
703813,3020055000.0,3017058800.0,20231015,16:00,17:00,0.0,4.0,0.0,4.0,0.0,17826.0,40.0,5
705053,3020055000.0,3017053500.0,20231015,16:00,17:00,1.0,3.0,1.0,5.0,5.0,9719.0,38.0,5
706148,3020055000.0,3017064000.0,20231015,17:00,18:00,0.0,1.0,0.0,4.0,3.0,23573.0,48.0,5
712563,3020055000.0,3014057500.0,20231015,19:00,19:00,0.0,3.0,0.0,3.0,0.0,15228.0,22.0,6


### 부산

In [None]:
home_b = get_home_data(od, '2635052000.0')
home_b

Unnamed: 0,origin_hdong_cd,dest_hdong_cd,date,start_time,end_time,gender,age,modal,origin_purpose,dest_purpose,od_dist_avg,od_duration_avg,od_cnts
1990,2635052000.0,4831039000.0,20230901,19:00,22:00,1.0,3.0,0.0,4.0,5.0,319689.0,170.0,7
4573,2635052000.0,2620064000.0,20230901,14:00,15:00,0.0,4.0,0.0,4.0,4.0,108069.0,49.0,8
5444,2635052000.0,2635051000.0,20230901,11:00,11:00,1.0,1.0,0.0,5.0,5.0,38530.0,27.0,14
5881,2635052000.0,2635051000.0,20230901,19:00,19:00,1.0,1.0,0.0,4.0,3.0,22282.0,14.0,7
11167,2635052000.0,2626052000.0,20230901,19:00,20:00,1.0,3.0,0.0,3.0,3.0,50249.0,54.0,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...
713609,2635052000.0,2629064500.0,20231015,18:00,19:00,1.0,5.0,0.0,4.0,0.0,25823.0,23.0,6
713624,2635052000.0,2647072000.0,20231015,19:00,19:00,1.0,2.0,0.0,3.0,4.0,30006.0,44.0,5
713848,2635052000.0,2635051000.0,20231015,14:00,15:00,0.0,3.0,3.0,4.0,4.0,27033.0,62.0,5
714024,2635052000.0,2635051000.0,20231015,09:00,09:00,1.0,2.0,0.0,4.0,0.0,12756.0,17.0,5


### 임실

In [None]:
home_i = get_home_data(od, '4575034000.0')
home_i

Unnamed: 0,origin_hdong_cd,dest_hdong_cd,date,start_time,end_time,gender,age,modal,origin_purpose,dest_purpose,od_dist_avg,od_duration_avg,od_cnts
425467,4575034000.0,4575035500.0,20230901,14:00,14:00,0.0,3.0,4.0,4.0,4.0,355.0,1.0,5
688918,4575034000.0,4617032000.0,20230901,13:00,15:00,0.0,3.0,0.0,4.0,4.0,216279.0,141.0,5
296995,4575034000.0,4574031000.0,20230901,17:00,17:00,0.0,3.0,0.0,5.0,5.0,32615.0,15.0,7
388078,4575034000.0,4575032000.0,20230901,12:00,14:00,0.0,4.0,1.0,5.0,3.0,55430.0,118.0,6
401602,4575034000.0,4511359000.0,20230901,11:00,12:00,1.0,1.0,0.0,4.0,0.0,33562.0,44.0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
649349,4575034000.0,4373025000.0,20231015,15:00,18:00,1.0,3.0,0.0,5.0,0.0,177750.0,127.0,5
675750,4575034000.0,4511354000.0,20231015,17:00,17:00,0.0,3.0,0.0,4.0,0.0,53267.0,40.0,5
687251,4575034000.0,4577034000.0,20231015,15:00,16:00,1.0,2.0,0.0,5.0,5.0,71232.0,30.0,5
689935,4575034000.0,4575032000.0,20231015,12:00,12:00,1.0,4.0,0.0,4.0,4.0,85154.0,45.0,6


In [None]:
b = pd.concat([home_d, home_b, home_i]).to_csv('data/temp.csv')

### 서울

In [None]:
home_s = get_home_data(od, '1156054000.0')
home_s

Unnamed: 0,origin_hdong_cd,dest_hdong_cd,date,start_time,end_time,gender,age,modal,origin_purpose,dest_purpose,od_dist_avg,od_duration_avg,od_cnts,origin
330,1156054000.0,4180032000.0,20230901,16,20,1.0,2.0,0.0,4.0,5.0,234893.0,233.0,8,11
1588,1156054000.0,1144074000.0,20230901,12,13,0.0,4.0,0.0,3.0,1.0,18723.0,30.0,13,11
8293,1156054000.0,1144066000.0,20230901,15,16,1.0,2.0,0.0,3.0,3.0,30630.0,58.0,7,11
13179,1156054000.0,4623036000.0,20230901,17,22,1.0,2.0,0.0,5.0,0.0,866133.0,322.0,15,11
14170,1156054000.0,1156066000.0,20230901,14,15,1.0,5.0,0.0,3.0,0.0,40629.0,61.0,7,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
709176,1156054000.0,4139052000.0,20231015,21,23,1.0,2.0,0.0,5.0,0.0,90806.0,126.0,5,11
709688,1156054000.0,1147062000.0,20231015,17,20,1.0,1.0,0.0,3.0,0.0,164178.0,172.0,5,11
710559,1156054000.0,4139052000.0,20231015,21,22,0.0,2.0,0.0,4.0,0.0,87742.0,99.0,5,11
711356,1156054000.0,4165025000.0,20231015,16,18,1.0,3.0,0.0,4.0,0.0,245594.0,126.0,5,11


### 강릉(1)

In [None]:
home_ga = get_home_data(od, '5115057200.0')
home_ga

Unnamed: 0,origin_hdong_cd,dest_hdong_cd,date,start_time,end_time,gender,age,modal,origin_purpose,dest_purpose,od_dist_avg,od_duration_avg,od_cnts,origin
5575,5115057200.0,5115064500.0,20230901,18,18,1.0,0.0,1.0,4.0,0.0,33727.0,24.0,14,51
58526,5115057200.0,5115059000.0,20230901,17,18,0.0,0.0,4.0,3.0,0.0,8065.0,45.0,16,51
59107,5115057200.0,5115059000.0,20230901,15,15,0.0,1.0,0.0,4.0,4.0,1962.0,3.0,8,51
65634,5115057200.0,5115059000.0,20230901,10,10,0.0,1.0,0.0,4.0,4.0,5282.0,8.0,8,51
115425,5115057200.0,5115058000.0,20230901,12,13,0.0,1.0,0.0,4.0,4.0,42160.0,39.0,28,51
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
680510,5115057200.0,5115059000.0,20231015,12,13,1.0,1.0,0.0,4.0,0.0,23789.0,40.0,5,51
684082,5115057200.0,5115066500.0,20231015,13,13,1.0,2.0,1.0,3.0,3.0,11640.0,12.0,5,51
691250,5115057200.0,5115052000.0,20231015,17,18,1.0,2.0,0.0,4.0,0.0,26447.0,37.0,5,51
694731,5115057200.0,5115064500.0,20231015,17,18,1.0,3.0,0.0,4.0,4.0,19561.0,72.0,5,51


### 강릉(2)

In [None]:
home_gb = get_home_data(od, '5115058000.0')
home_gb

Unnamed: 0,origin_hdong_cd,dest_hdong_cd,date,start_time,end_time,gender,age,modal,origin_purpose,dest_purpose,od_dist_avg,od_duration_avg,od_cnts,origin
3302,5115058000.0,5115066500.0,20230901,19,19,1.0,1.0,0.0,5.0,5.0,10232.0,7.0,8,51
6500,5115058000.0,5115025000.0,20230901,13,15,1.0,5.0,0.0,5.0,5.0,120666.0,114.0,9,51
46933,5115058000.0,5115066500.0,20230901,19,21,1.0,2.0,0.0,5.0,5.0,100035.0,99.0,14,51
80944,5115058000.0,5115066500.0,20230901,16,16,1.0,2.0,0.0,5.0,5.0,17613.0,12.0,10,51
81560,5115058000.0,5115066500.0,20230901,10,13,1.0,2.0,0.0,5.0,5.0,116246.0,161.0,10,51
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685147,5115058000.0,5115066500.0,20231015,12,13,1.0,2.0,0.0,4.0,4.0,44245.0,78.0,5,51
691071,5115058000.0,1156058500.0,20231015,13,19,1.0,2.0,0.0,5.0,0.0,483037.0,378.0,5,51
698302,5115058000.0,5115055000.0,20231015,20,22,0.0,2.0,0.0,4.0,0.0,177680.0,130.0,5,51
703637,5115058000.0,5115025000.0,20231015,14,15,1.0,7.0,0.0,5.0,5.0,66479.0,67.0,5,51


In [None]:
temp_2 = pd.concat([home_s, home_ga, home_gb])

In [None]:
temp_1 = pd.read_csv('data/temp.csv').drop(columns = 'Unnamed: 0')
temp_1

Unnamed: 0,origin_hdong_cd,dest_hdong_cd,date,start_time,end_time,gender,age,modal,origin_purpose,dest_purpose,od_dist_avg,od_duration_avg,od_cnts
0,3.020055e+09,4.372039e+09,20230901,08:00,09:00,0.0,3.0,0.0,4.0,4.0,153144.0,56.0,7
1,3.020055e+09,3.023052e+09,20230901,08:00,09:00,0.0,5.0,0.0,4.0,4.0,14091.0,31.0,8
2,3.020055e+09,3.017059e+09,20230901,21:00,21:00,1.0,2.0,0.0,3.0,0.0,10497.0,27.0,7
3,3.020055e+09,3.020054e+09,20230901,17:00,18:00,0.0,2.0,0.0,3.0,4.0,35412.0,49.0,7
4,3.020055e+09,3.020054e+09,20230901,09:00,10:00,1.0,1.0,0.0,5.0,5.0,15195.0,60.0,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...
268150,4.575034e+09,4.373025e+09,20231015,15:00,18:00,1.0,3.0,0.0,5.0,0.0,177750.0,127.0,5
268151,4.575034e+09,4.511354e+09,20231015,17:00,17:00,0.0,3.0,0.0,4.0,0.0,53267.0,40.0,5
268152,4.575034e+09,4.577034e+09,20231015,15:00,16:00,1.0,2.0,0.0,5.0,5.0,71232.0,30.0,5
268153,4.575034e+09,4.575032e+09,20231015,12:00,12:00,1.0,4.0,0.0,4.0,4.0,85154.0,45.0,6


In [None]:
temp_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 268155 entries, 0 to 268154
Data columns (total 13 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   origin_hdong_cd  268155 non-null  object 
 1   dest_hdong_cd    268155 non-null  object 
 2   date             268155 non-null  int64  
 3   start_time       268155 non-null  object 
 4   end_time         268155 non-null  object 
 5   gender           268155 non-null  float64
 6   age              268155 non-null  float64
 7   modal            268155 non-null  object 
 8   origin_purpose   268155 non-null  object 
 9   dest_purpose     268155 non-null  object 
 10  od_dist_avg      268155 non-null  float64
 11  od_duration_avg  268155 non-null  float64
 12  od_cnts          268155 non-null  int64  
dtypes: float64(4), int64(2), object(7)
memory usage: 26.6+ MB


In [None]:
temp_1.origin_hdong_cd = temp_1.origin_hdong_cd.astype('str')
temp_1.dest_hdong_cd = temp_1.dest_hdong_cd.astype('str')
temp_1.start_time = temp_1.start_time.str[:2]
temp_1.end_time = temp_1.end_time.str[:2]
temp_1.modal = temp_1.modal.astype('str')
temp_1.origin_purpose = temp_1.origin_purpose.astype('str')
temp_1.dest_purpose = temp_1.dest_purpose.astype('str')

In [None]:
temp_2 = temp_2.drop(columns = 'origin')
temp_2

Unnamed: 0,origin_hdong_cd,dest_hdong_cd,date,start_time,end_time,gender,age,modal,origin_purpose,dest_purpose,od_dist_avg,od_duration_avg,od_cnts
330,1156054000.0,4180032000.0,20230901,16,20,1.0,2.0,0.0,4.0,5.0,234893.0,233.0,8
1588,1156054000.0,1144074000.0,20230901,12,13,0.0,4.0,0.0,3.0,1.0,18723.0,30.0,13
8293,1156054000.0,1144066000.0,20230901,15,16,1.0,2.0,0.0,3.0,3.0,30630.0,58.0,7
13179,1156054000.0,4623036000.0,20230901,17,22,1.0,2.0,0.0,5.0,0.0,866133.0,322.0,15
14170,1156054000.0,1156066000.0,20230901,14,15,1.0,5.0,0.0,3.0,0.0,40629.0,61.0,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...
685147,5115058000.0,5115066500.0,20231015,12,13,1.0,2.0,0.0,4.0,4.0,44245.0,78.0,5
691071,5115058000.0,1156058500.0,20231015,13,19,1.0,2.0,0.0,5.0,0.0,483037.0,378.0,5
698302,5115058000.0,5115055000.0,20231015,20,22,0.0,2.0,0.0,4.0,0.0,177680.0,130.0,5
703637,5115058000.0,5115025000.0,20231015,14,15,1.0,7.0,0.0,5.0,5.0,66479.0,67.0,5


In [None]:
home = pd.concat([temp_1, temp_2])
home

Unnamed: 0,origin_hdong_cd,dest_hdong_cd,date,start_time,end_time,gender,age,modal,origin_purpose,dest_purpose,od_dist_avg,od_duration_avg,od_cnts
0,3020055000.0,4372039000.0,20230901,08,09,0.0,3.0,0.0,4.0,4.0,153144.0,56.0,7
1,3020055000.0,3023052500.0,20230901,08,09,0.0,5.0,0.0,4.0,4.0,14091.0,31.0,8
2,3020055000.0,3017058800.0,20230901,21,21,1.0,2.0,0.0,3.0,0.0,10497.0,27.0,7
3,3020055000.0,3020054000.0,20230901,17,18,0.0,2.0,0.0,3.0,4.0,35412.0,49.0,7
4,3020055000.0,3020054000.0,20230901,09,10,1.0,1.0,0.0,5.0,5.0,15195.0,60.0,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...
685147,5115058000.0,5115066500.0,20231015,12,13,1.0,2.0,0.0,4.0,4.0,44245.0,78.0,5
691071,5115058000.0,1156058500.0,20231015,13,19,1.0,2.0,0.0,5.0,0.0,483037.0,378.0,5
698302,5115058000.0,5115055000.0,20231015,20,22,0.0,2.0,0.0,4.0,0.0,177680.0,130.0,5
703637,5115058000.0,5115025000.0,20231015,14,15,1.0,7.0,0.0,5.0,5.0,66479.0,67.0,5


In [None]:
home.to_csv('data/home.csv')

In [None]:
df_visit = pd.read_csv('data/visit.csv')
df_home = pd.read_csv('data/home.csv')

final = pd.concat([df_visit, df_home]).drop(columns = 'Unnamed: 0')
final

Unnamed: 0,origin_hdong_cd,dest_hdong_cd,date,start_time,end_time,gender,age,modal,origin_purpose,dest_purpose,od_dist_avg,od_duration_avg,od_cnts,origin
0,1.117062e+09,3.020055e+09,20230902,12,16,1.0,0.0,0.0,0.0,5.0,272315.0,285.0,21,11.0
1,1.168066e+09,3.020055e+09,20230902,10,14,0.0,0.0,0.0,0.0,5.0,262264.0,233.0,27,11.0
2,1.150062e+09,3.020055e+09,20230902,10,14,0.0,0.0,5.0,0.0,5.0,283507.0,214.0,27,11.0
3,1.171053e+09,3.020055e+09,20230902,10,13,0.0,1.0,0.0,2.0,4.0,294463.0,172.0,9,11.0
4,1.165066e+09,3.020055e+09,20230902,17,19,0.0,0.0,0.0,0.0,5.0,158665.0,156.0,27,11.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
445420,5.115058e+09,5.115066e+09,20231015,12,13,1.0,2.0,0.0,4.0,4.0,44245.0,78.0,5,
445421,5.115058e+09,1.156058e+09,20231015,13,19,1.0,2.0,0.0,5.0,0.0,483037.0,378.0,5,
445422,5.115058e+09,5.115055e+09,20231015,20,22,0.0,2.0,0.0,4.0,0.0,177680.0,130.0,5,
445423,5.115058e+09,5.115025e+09,20231015,14,15,1.0,7.0,0.0,5.0,5.0,66479.0,67.0,5,


In [None]:
final.to_csv('data/OD_final.csv')

## JOIN & MERGE

### 방문 데이터

In [None]:
visit = pd.read_csv('data/visit.csv').drop(columns = ['Unnamed: 0', 'origin'])
visit

Unnamed: 0,origin_hdong_cd,dest_hdong_cd,date,start_time,end_time,gender,age,modal,origin_purpose,dest_purpose,od_dist_avg,od_duration_avg,od_cnts
0,1.117062e+09,3.020055e+09,20230902,12,16,1.0,0.0,0.0,0.0,5.0,272315.0,285.0,21
1,1.168066e+09,3.020055e+09,20230902,10,14,0.0,0.0,0.0,0.0,5.0,262264.0,233.0,27
2,1.150062e+09,3.020055e+09,20230902,10,14,0.0,0.0,5.0,0.0,5.0,283507.0,214.0,27
3,1.171053e+09,3.020055e+09,20230902,10,13,0.0,1.0,0.0,2.0,4.0,294463.0,172.0,9
4,1.165066e+09,3.020055e+09,20230902,17,19,0.0,0.0,0.0,0.0,5.0,158665.0,156.0,27
...,...,...,...,...,...,...,...,...,...,...,...,...,...
142758,5.176034e+09,5.115058e+09,20231015,12:00,14:00,0.0,4.0,0.0,5.0,5.0,227797.0,94.0,6
142759,5.115064e+09,5.115058e+09,20231015,11:00,12:00,0.0,5.0,0.0,0.0,4.0,14623.0,22.0,5
142760,5.176038e+09,5.115058e+09,20231015,13:00,14:00,0.0,3.0,0.0,5.0,5.0,92098.0,45.0,5
142761,5.115051e+09,5.115058e+09,20231015,11:00,11:00,0.0,3.0,0.0,5.0,5.0,29664.0,19.0,5


In [None]:
visit.origin_hdong_cd = visit.origin_hdong_cd.astype('int').astype('str')
visit.dest_hdong_cd = visit.dest_hdong_cd.astype('int').astype('str')

In [None]:
visit.origin_hdong_cd.str[:2].unique()

array(['11', '26', '30', '45', '51'], dtype=object)

In [None]:
region_cd = {'11': 'S',
            '26': 'B',
            '30': 'D',
            '45': 'I',
            '51': 'G'}
visit['origin'] = visit.origin_hdong_cd.str[:2].map(region_cd)
visit

Unnamed: 0,origin_hdong_cd,dest_hdong_cd,date,start_time,end_time,gender,age,modal,origin_purpose,dest_purpose,od_dist_avg,od_duration_avg,od_cnts,origin
0,1117062500,3020055000,20230902,12,16,1.0,0.0,0.0,0.0,5.0,272315.0,285.0,21,S
1,1168065500,3020055000,20230902,10,14,0.0,0.0,0.0,0.0,5.0,262264.0,233.0,27,S
2,1150062000,3020055000,20230902,10,14,0.0,0.0,5.0,0.0,5.0,283507.0,214.0,27,S
3,1171053200,3020055000,20230902,10,13,0.0,1.0,0.0,2.0,4.0,294463.0,172.0,9,S
4,1165066000,3020055000,20230902,17,19,0.0,0.0,0.0,0.0,5.0,158665.0,156.0,27,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142758,5176034000,5115058000,20231015,12:00,14:00,0.0,4.0,0.0,5.0,5.0,227797.0,94.0,6,G
142759,5115064500,5115058000,20231015,11:00,12:00,0.0,5.0,0.0,0.0,4.0,14623.0,22.0,5,G
142760,5176038000,5115058000,20231015,13:00,14:00,0.0,3.0,0.0,5.0,5.0,92098.0,45.0,5,G
142761,5115051000,5115058000,20231015,11:00,11:00,0.0,3.0,0.0,5.0,5.0,29664.0,19.0,5,G


In [None]:
hdong_cd = {'3020055000': 'D',
            '2635052000': 'B',
            '4575034000': 'I',
            '1156054000': 'S',
            '5115057200': 'GA',
            '5115058000': 'GB'}
visit['dest'] = visit.dest_hdong_cd.map(hdong_cd)
visit

Unnamed: 0,origin_hdong_cd,dest_hdong_cd,date,start_time,end_time,gender,age,modal,origin_purpose,dest_purpose,od_dist_avg,od_duration_avg,od_cnts,origin,dest
0,1117062500,3020055000,20230902,12,16,1.0,0.0,0.0,0.0,5.0,272315.0,285.0,21,S,D
1,1168065500,3020055000,20230902,10,14,0.0,0.0,0.0,0.0,5.0,262264.0,233.0,27,S,D
2,1150062000,3020055000,20230902,10,14,0.0,0.0,5.0,0.0,5.0,283507.0,214.0,27,S,D
3,1171053200,3020055000,20230902,10,13,0.0,1.0,0.0,2.0,4.0,294463.0,172.0,9,S,D
4,1165066000,3020055000,20230902,17,19,0.0,0.0,0.0,0.0,5.0,158665.0,156.0,27,S,D
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142758,5176034000,5115058000,20231015,12:00,14:00,0.0,4.0,0.0,5.0,5.0,227797.0,94.0,6,G,GB
142759,5115064500,5115058000,20231015,11:00,12:00,0.0,5.0,0.0,0.0,4.0,14623.0,22.0,5,G,GB
142760,5176038000,5115058000,20231015,13:00,14:00,0.0,3.0,0.0,5.0,5.0,92098.0,45.0,5,G,GB
142761,5115051000,5115058000,20231015,11:00,11:00,0.0,3.0,0.0,5.0,5.0,29664.0,19.0,5,G,GB


In [None]:
visit.start_time = visit.start_time.str[:2].astype('int')
visit.end_time = visit.end_time.str[:2].astype('int')

In [None]:
visit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142763 entries, 0 to 142762
Data columns (total 15 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   origin_hdong_cd  142763 non-null  object 
 1   dest_hdong_cd    142763 non-null  object 
 2   date             142763 non-null  int64  
 3   start_time       142763 non-null  int64  
 4   end_time         142763 non-null  int64  
 5   gender           142763 non-null  float64
 6   age              142763 non-null  float64
 7   modal            142763 non-null  float64
 8   origin_purpose   142696 non-null  float64
 9   dest_purpose     142763 non-null  float64
 10  od_dist_avg      142763 non-null  float64
 11  od_duration_avg  142763 non-null  float64
 12  od_cnts          142763 non-null  int64  
 13  origin           142763 non-null  object 
 14  dest             142763 non-null  object 
dtypes: float64(7), int64(4), object(4)
memory usage: 16.3+ MB


### 귀가 데이터

In [None]:
home = pd.read_csv('data/home.csv').drop(columns = 'Unnamed: 0')
home

Unnamed: 0,origin_hdong_cd,dest_hdong_cd,date,start_time,end_time,gender,age,modal,origin_purpose,dest_purpose,od_dist_avg,od_duration_avg,od_cnts
0,3.020055e+09,4.372039e+09,20230901,8,9,0.0,3.0,0.0,4.0,4.0,153144.0,56.0,7
1,3.020055e+09,3.023052e+09,20230901,8,9,0.0,5.0,0.0,4.0,4.0,14091.0,31.0,8
2,3.020055e+09,3.017059e+09,20230901,21,21,1.0,2.0,0.0,3.0,0.0,10497.0,27.0,7
3,3.020055e+09,3.020054e+09,20230901,17,18,0.0,2.0,0.0,3.0,4.0,35412.0,49.0,7
4,3.020055e+09,3.020054e+09,20230901,9,10,1.0,1.0,0.0,5.0,5.0,15195.0,60.0,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...
445420,5.115058e+09,5.115066e+09,20231015,12,13,1.0,2.0,0.0,4.0,4.0,44245.0,78.0,5
445421,5.115058e+09,1.156058e+09,20231015,13,19,1.0,2.0,0.0,5.0,0.0,483037.0,378.0,5
445422,5.115058e+09,5.115055e+09,20231015,20,22,0.0,2.0,0.0,4.0,0.0,177680.0,130.0,5
445423,5.115058e+09,5.115025e+09,20231015,14,15,1.0,7.0,0.0,5.0,5.0,66479.0,67.0,5


In [None]:
home.origin_hdong_cd = home.origin_hdong_cd.astype('int').astype('str')
home.dest_hdong_cd = home.dest_hdong_cd.astype('int').astype('str')

In [None]:
home.origin_hdong_cd.unique()

array(['3020055000', '2635052000', '4575034000', '1156054000',
       '5115057200', '5115058000'], dtype=object)

In [None]:
hdong_cd = {'3020055000': 'D',
            '2635052000': 'B',
            '4575034000': 'I',
            '1156054000': 'S',
            '5115057200': 'GA',
            '5115058000': 'GB'}
home['dest'] = home.origin_hdong_cd.map(hdong_cd)
home

Unnamed: 0,origin_hdong_cd,dest_hdong_cd,date,start_time,end_time,gender,age,modal,origin_purpose,dest_purpose,od_dist_avg,od_duration_avg,od_cnts,dest
0,3020055000,4372039000,20230901,8,9,0.0,3.0,0.0,4.0,4.0,153144.0,56.0,7,D
1,3020055000,3023052500,20230901,8,9,0.0,5.0,0.0,4.0,4.0,14091.0,31.0,8,D
2,3020055000,3017058800,20230901,21,21,1.0,2.0,0.0,3.0,0.0,10497.0,27.0,7,D
3,3020055000,3020054000,20230901,17,18,0.0,2.0,0.0,3.0,4.0,35412.0,49.0,7,D
4,3020055000,3020054000,20230901,9,10,1.0,1.0,0.0,5.0,5.0,15195.0,60.0,22,D
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
445420,5115058000,5115066500,20231015,12,13,1.0,2.0,0.0,4.0,4.0,44245.0,78.0,5,GB
445421,5115058000,1156058500,20231015,13,19,1.0,2.0,0.0,5.0,0.0,483037.0,378.0,5,GB
445422,5115058000,5115055000,20231015,20,22,0.0,2.0,0.0,4.0,0.0,177680.0,130.0,5,GB
445423,5115058000,5115025000,20231015,14,15,1.0,7.0,0.0,5.0,5.0,66479.0,67.0,5,GB


In [None]:
home.isna().sum()

Unnamed: 0,0
origin_hdong_cd,0
dest_hdong_cd,0
date,0
start_time,0
end_time,0
gender,0
age,0
modal,0
origin_purpose,0
dest_purpose,0


In [None]:
region_cd = {'11': 'S',
            '26': 'B',
            '30': 'D',
            '45': 'I',
            '51': 'G'}
home['origin'] = home.dest_hdong_cd.str[:2].map(region_cd)

In [None]:
home.origin.unique()

array([nan, 'D', 'I', 'S', 'B', 'G'], dtype=object)

In [None]:
home = home.dropna(subset = 'origin', how = 'any')
home.isna().sum()

Unnamed: 0,0
origin_hdong_cd,0
dest_hdong_cd,0
date,0
start_time,0
end_time,0
gender,0
age,0
modal,0
origin_purpose,0
dest_purpose,0


In [None]:
home.isna().sum()

Unnamed: 0,0
origin_hdong_cd,0
dest_hdong_cd,0
date,0
start_time,0
end_time,0
gender,0
age,0
modal,0
origin_purpose,0
dest_purpose,0


In [None]:
home = home[['origin_hdong_cd', 'dest_hdong_cd', 'date', 'start_time', 'end_time', 'gender', 'age', 'modal', 'origin_purpose', 'dest_purpose', 'od_dist_avg', 'od_duration_avg', 'od_cnts', 'origin', 'dest']]
home.info()

<class 'pandas.core.frame.DataFrame'>
Index: 375829 entries, 1 to 445424
Data columns (total 15 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   origin_hdong_cd  375829 non-null  object 
 1   dest_hdong_cd    375829 non-null  object 
 2   date             375829 non-null  int64  
 3   start_time       375829 non-null  int64  
 4   end_time         375829 non-null  int64  
 5   gender           375829 non-null  float64
 6   age              375829 non-null  float64
 7   modal            375829 non-null  float64
 8   origin_purpose   375829 non-null  float64
 9   dest_purpose     375829 non-null  float64
 10  od_dist_avg      375829 non-null  float64
 11  od_duration_avg  375829 non-null  float64
 12  od_cnts          375829 non-null  int64  
 13  origin           375829 non-null  object 
 14  dest             375829 non-null  object 
dtypes: float64(7), int64(4), object(4)
memory usage: 45.9+ MB


### visit + home = od_final

In [None]:
od_final = pd.concat([visit, home])
od_final

Unnamed: 0,origin_hdong_cd,dest_hdong_cd,date,start_time,end_time,gender,age,modal,origin_purpose,dest_purpose,od_dist_avg,od_duration_avg,od_cnts,origin,dest
0,1117062500,3020055000,20230902,12,16,1.0,0.0,0.0,0.0,5.0,272315.0,285.0,21,S,D
1,1168065500,3020055000,20230902,10,14,0.0,0.0,0.0,0.0,5.0,262264.0,233.0,27,S,D
2,1150062000,3020055000,20230902,10,14,0.0,0.0,5.0,0.0,5.0,283507.0,214.0,27,S,D
3,1171053200,3020055000,20230902,10,13,0.0,1.0,0.0,2.0,4.0,294463.0,172.0,9,S,D
4,1165066000,3020055000,20230902,17,19,0.0,0.0,0.0,0.0,5.0,158665.0,156.0,27,S,D
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
445420,5115058000,5115066500,20231015,12,13,1.0,2.0,0.0,4.0,4.0,44245.0,78.0,5,G,GB
445421,5115058000,1156058500,20231015,13,19,1.0,2.0,0.0,5.0,0.0,483037.0,378.0,5,S,GB
445422,5115058000,5115055000,20231015,20,22,0.0,2.0,0.0,4.0,0.0,177680.0,130.0,5,G,GB
445423,5115058000,5115025000,20231015,14,15,1.0,7.0,0.0,5.0,5.0,66479.0,67.0,5,G,GB


In [None]:
od_final = od_final.dropna(subset = 'origin_purpose', how = 'any')
od_final.isna().sum()

Unnamed: 0,0
origin_hdong_cd,0
dest_hdong_cd,0
date,0
start_time,0
end_time,0
gender,0
age,0
modal,0
origin_purpose,0
dest_purpose,0


In [None]:
od_final.date = od_final.date.astype('str')
od_final.start_time = od_final.start_time.astype('int')
od_final.end_time = od_final.end_time.astype('int')
od_final.gender = od_final.gender.astype('int').astype('str')
od_final.age = od_final.age.astype('int')
od_final.modal = od_final.modal.astype('int').astype('str')
od_final.origin_purpose = od_final.origin_purpose.astype('str')
od_final.dest_purpose = od_final.dest_purpose.astype('int').astype('str')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  od_final.date = od_final.date.astype('str')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  od_final.start_time = od_final.start_time.astype('int')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  od_final.end_time = od_final.end_time.astype('int')
A value is trying to be set on a copy of a slice from

In [None]:
od_final.shape

(518525, 15)

In [None]:
od_final.info()

<class 'pandas.core.frame.DataFrame'>
Index: 518525 entries, 0 to 445424
Data columns (total 15 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   origin_hdong_cd  518525 non-null  object 
 1   dest_hdong_cd    518525 non-null  object 
 2   date             518525 non-null  object 
 3   start_time       518525 non-null  int64  
 4   end_time         518525 non-null  int64  
 5   gender           518525 non-null  object 
 6   age              518525 non-null  int64  
 7   modal            518525 non-null  object 
 8   origin_purpose   518525 non-null  object 
 9   dest_purpose     518525 non-null  object 
 10  od_dist_avg      518525 non-null  float64
 11  od_duration_avg  518525 non-null  float64
 12  od_cnts          518525 non-null  int64  
 13  origin           518525 non-null  object 
 14  dest             518525 non-null  object 
dtypes: float64(2), int64(4), object(9)
memory usage: 63.3+ MB


In [None]:
od = od_final[['origin', 'dest', 'date', 'start_time', 'end_time', 'gender', 'age', 'modal', 'origin_purpose', 'dest_purpose', 'od_dist_avg', 'od_duration_avg', 'od_cnts']]
od

Unnamed: 0,origin,dest,date,start_time,end_time,gender,age,modal,origin_purpose,dest_purpose,od_dist_avg,od_duration_avg,od_cnts
0,S,D,20230902,12,16,1,0,0,0.0,5,272315.0,285.0,21
1,S,D,20230902,10,14,0,0,0,0.0,5,262264.0,233.0,27
2,S,D,20230902,10,14,0,0,5,0.0,5,283507.0,214.0,27
3,S,D,20230902,10,13,0,1,0,2.0,4,294463.0,172.0,9
4,S,D,20230902,17,19,0,0,0,0.0,5,158665.0,156.0,27
...,...,...,...,...,...,...,...,...,...,...,...,...,...
445420,G,GB,20231015,12,13,1,2,0,4.0,4,44245.0,78.0,5
445421,S,GB,20231015,13,19,1,2,0,5.0,0,483037.0,378.0,5
445422,G,GB,20231015,20,22,0,2,0,4.0,0,177680.0,130.0,5
445423,G,GB,20231015,14,15,1,7,0,5.0,5,66479.0,67.0,5


In [None]:
print(od.origin.unique())
print(od.dest.unique() )

['S' 'B' 'D' 'I' 'G']
['D' 'B' 'I' 'S' 'GA' 'GB']


In [None]:
od.origin = od.origin.map({'GA': 'G',
                    'GB': 'G'})
od.origin.unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  od.origin = od.origin.map({'GA': 'G',


array([nan, 'G', 'B'], dtype=object)

In [None]:
transit = pd.read_csv('data/transit_columns2.csv')
transit

Unnamed: 0,구분,max_time,min_time,num_trainsit_methods,median_time,예상 배차시간,최대 이동거리,최소 이동거리,이동거리 중간값,최대 교통수단 수,환승횟수,마지막 도보시간 중간값,마지막 도보시간 평균
0,BB,5413,2758,2,3066.0,287.931818,25962,13040,13699,3,3.013699,3.3,8.6
1,BD,14581,7437,4,8565.0,195.618421,211083,207278,209009,3,4.298507,4.15,5.05
2,BGA,23378,2471,5,22828.0,331.551724,308473,299594,303565,3,5.165605,2.333333,17.25
3,BGB,24011,2511,5,22093.0,193.017241,311981,30419,304010,3,5.165605,2.333333,5.2
4,BI,21593,16229,5,18940.0,290.399015,262567,230051,256362,4,5.705405,3.783333,21.733333
5,BS,23197,11864,3,16002.5,286.430769,338414,33448,335721,3,3.867257,4.55,3.65
6,DD,4145,1958,2,2387.0,210.227273,15483,8619,10085,2,2.958904,3.841667,5.05
7,GGA,2488,1904,2,2215.0,442.641509,5725,3754,4508,1,2.965909,3.1,17.25
8,GGB,6124,1844,3,2155.0,223.325581,31749,7262,7296,2,3.037209,3.1,4.083333
9,II,185,107,2,265.0,218.333333,347738,115,13547,1,3.0,1.966667,7.883333


In [None]:
transit['origin'] = transit.구분.str[0]
transit['dest'] = transit.구분.str[1:]
transit = transit.drop(columns = '구분')
transit

Unnamed: 0,max_time,min_time,num_trainsit_methods,median_time,예상 배차시간,최대 이동거리,최소 이동거리,이동거리 중간값,최대 교통수단 수,환승횟수,마지막 도보시간 중간값,마지막 도보시간 평균,origin,dest
0,5413,2758,2,3066.0,287.931818,25962,13040,13699,3,3.013699,3.3,8.6,B,B
1,14581,7437,4,8565.0,195.618421,211083,207278,209009,3,4.298507,4.15,5.05,B,D
2,23378,2471,5,22828.0,331.551724,308473,299594,303565,3,5.165605,2.333333,17.25,B,GA
3,24011,2511,5,22093.0,193.017241,311981,30419,304010,3,5.165605,2.333333,5.2,B,GB
4,21593,16229,5,18940.0,290.399015,262567,230051,256362,4,5.705405,3.783333,21.733333,B,I
5,23197,11864,3,16002.5,286.430769,338414,33448,335721,3,3.867257,4.55,3.65,B,S
6,4145,1958,2,2387.0,210.227273,15483,8619,10085,2,2.958904,3.841667,5.05,D,D
7,2488,1904,2,2215.0,442.641509,5725,3754,4508,1,2.965909,3.1,17.25,G,GA
8,6124,1844,3,2155.0,223.325581,31749,7262,7296,2,3.037209,3.1,4.083333,G,GB
9,185,107,2,265.0,218.333333,347738,115,13547,1,3.0,1.966667,7.883333,I,I


In [None]:
transit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16 entries, 0 to 15
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   max_time              16 non-null     int64  
 1   min_time              16 non-null     int64  
 2   num_trainsit_methods  16 non-null     int64  
 3   median_time           16 non-null     float64
 4   예상 배차시간               16 non-null     float64
 5   최대 이동거리               16 non-null     int64  
 6   최소 이동거리               16 non-null     int64  
 7   이동거리 중간값              16 non-null     int64  
 8   최대 교통수단 수             16 non-null     int64  
 9   환승횟수                  16 non-null     float64
 10  마지막 도보시간 중간값          16 non-null     float64
 11  마지막 도보시간 평균           16 non-null     float64
 12  origin                16 non-null     object 
 13  dest                  16 non-null     object 
dtypes: float64(5), int64(7), object(2)
memory usage: 1.9+ KB


In [None]:
transit = transit[['origin', 'dest', 'max_time', 'min_time', 'num_trainsit_methods', 'median_time', '예상 배차시간', '최대 이동거리', '이동거리 중간값',
                   '최대 교통수단 수', '환승횟수', '마지막 도보시간 중간값', '마지막 도보시간 평균']]
transit

Unnamed: 0,origin,dest,max_time,min_time,num_trainsit_methods,median_time,예상 배차시간,최대 이동거리,이동거리 중간값,최대 교통수단 수,환승횟수,마지막 도보시간 중간값,마지막 도보시간 평균
0,B,B,5413,2758,2,3066.0,287.931818,25962,13699,3,3.013699,3.3,8.6
1,B,D,14581,7437,4,8565.0,195.618421,211083,209009,3,4.298507,4.15,5.05
2,B,GA,23378,2471,5,22828.0,331.551724,308473,303565,3,5.165605,2.333333,17.25
3,B,GB,24011,2511,5,22093.0,193.017241,311981,304010,3,5.165605,2.333333,5.2
4,B,I,21593,16229,5,18940.0,290.399015,262567,256362,4,5.705405,3.783333,21.733333
5,B,S,23197,11864,3,16002.5,286.430769,338414,335721,3,3.867257,4.55,3.65
6,D,D,4145,1958,2,2387.0,210.227273,15483,10085,2,2.958904,3.841667,5.05
7,G,GA,2488,1904,2,2215.0,442.641509,5725,4508,1,2.965909,3.1,17.25
8,G,GB,6124,1844,3,2155.0,223.325581,31749,7296,2,3.037209,3.1,4.083333
9,I,I,185,107,2,265.0,218.333333,347738,13547,1,3.0,1.966667,7.883333


In [None]:
transit.columns = ['origin', 'dest', 'max_time', 'min_time', 'num_transit_methods', 'median_time', 'predict_time', 'max_distance', 'median_distance', 'max_transit_methods', 'num_transfer', 'median_last_walk', 'avg_last_walk']
transit

Unnamed: 0,origin,dest,max_time,min_time,num_transit_methods,median_time,predict_time,max_distance,median_distance,max_transit_methods,num_transfer,median_last_walk,avg_last_walk
0,B,B,5413,2758,2,3066.0,287.931818,25962,13699,3,3.013699,3.3,8.6
1,B,D,14581,7437,4,8565.0,195.618421,211083,209009,3,4.298507,4.15,5.05
2,B,GA,23378,2471,5,22828.0,331.551724,308473,303565,3,5.165605,2.333333,17.25
3,B,GB,24011,2511,5,22093.0,193.017241,311981,304010,3,5.165605,2.333333,5.2
4,B,I,21593,16229,5,18940.0,290.399015,262567,256362,4,5.705405,3.783333,21.733333
5,B,S,23197,11864,3,16002.5,286.430769,338414,335721,3,3.867257,4.55,3.65
6,D,D,4145,1958,2,2387.0,210.227273,15483,10085,2,2.958904,3.841667,5.05
7,G,GA,2488,1904,2,2215.0,442.641509,5725,4508,1,2.965909,3.1,17.25
8,G,GB,6124,1844,3,2155.0,223.325581,31749,7296,2,3.037209,3.1,4.083333
9,I,I,185,107,2,265.0,218.333333,347738,13547,1,3.0,1.966667,7.883333


In [None]:
od

Unnamed: 0,origin,dest,date,start_time,end_time,gender,age,modal,origin_purpose,dest_purpose,od_dist_avg,od_duration_avg,od_cnts
0,S,D,20230902,12,16,1,0,0,0.0,5,272315.0,285.0,21
1,S,D,20230902,10,14,0,0,0,0.0,5,262264.0,233.0,27
2,S,D,20230902,10,14,0,0,5,0.0,5,283507.0,214.0,27
3,S,D,20230902,10,13,0,1,0,2.0,4,294463.0,172.0,9
4,S,D,20230902,17,19,0,0,0,0.0,5,158665.0,156.0,27
...,...,...,...,...,...,...,...,...,...,...,...,...,...
445420,G,GB,20231015,12,13,1,2,0,4.0,4,44245.0,78.0,5
445421,S,GB,20231015,13,19,1,2,0,5.0,0,483037.0,378.0,5
445422,G,GB,20231015,20,22,0,2,0,4.0,0,177680.0,130.0,5
445423,G,GB,20231015,14,15,1,7,0,5.0,5,66479.0,67.0,5


In [None]:
tot = pd.merge(od, transit, how = 'left', on = ['origin', 'dest'])
tot

Unnamed: 0,origin,dest,date,start_time,end_time,gender,age,modal,origin_purpose,dest_purpose,...,min_time,num_transit_methods,median_time,predict_time,max_distance,median_distance,max_transit_methods,num_transfer,median_last_walk,avg_last_walk
0,S,D,20230902,12,16,1,0,0,0.0,5,...,6467.0,3.0,9568.0,205.338710,154480.0,152219.0,3.0,4.201835,4.150000,4.150000
1,S,D,20230902,10,14,0,0,0,0.0,5,...,6467.0,3.0,9568.0,205.338710,154480.0,152219.0,3.0,4.201835,4.150000,4.150000
2,S,D,20230902,10,14,0,0,5,0.0,5,...,6467.0,3.0,9568.0,205.338710,154480.0,152219.0,3.0,4.201835,4.150000,4.150000
3,S,D,20230902,10,13,0,1,0,2.0,4,...,6467.0,3.0,9568.0,205.338710,154480.0,152219.0,3.0,4.201835,4.150000,4.150000
4,S,D,20230902,17,19,0,0,0,0.0,5,...,6467.0,3.0,9568.0,205.338710,154480.0,152219.0,3.0,4.201835,4.150000,4.150000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
518520,G,GB,20231015,12,13,1,2,0,4.0,4,...,1844.0,3.0,2155.0,223.325581,31749.0,7296.0,2.0,3.037209,3.100000,4.083333
518521,S,GB,20231015,13,19,1,2,0,5.0,0,...,9004.0,5.0,13121.5,211.992126,206356.0,182214.0,4.0,5.861171,3.983333,4.083333
518522,G,GB,20231015,20,22,0,2,0,4.0,0,...,1844.0,3.0,2155.0,223.325581,31749.0,7296.0,2.0,3.037209,3.100000,4.083333
518523,G,GB,20231015,14,15,1,7,0,5.0,5,...,1844.0,3.0,2155.0,223.325581,31749.0,7296.0,2.0,3.037209,3.100000,4.083333


In [None]:
tot.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 518525 entries, 0 to 518524
Data columns (total 24 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   origin               518525 non-null  object 
 1   dest                 518525 non-null  object 
 2   date                 518525 non-null  object 
 3   start_time           518525 non-null  int64  
 4   end_time             518525 non-null  int64  
 5   gender               518525 non-null  object 
 6   age                  518525 non-null  int64  
 7   modal                518525 non-null  object 
 8   origin_purpose       518525 non-null  object 
 9   dest_purpose         518525 non-null  object 
 10  od_dist_avg          518525 non-null  float64
 11  od_duration_avg      518525 non-null  float64
 12  od_cnts              518525 non-null  int64  
 13  max_time             514580 non-null  float64
 14  min_time             514580 non-null  float64
 15  num_transit_metho

In [None]:
tot = tot[['origin', 'dest', 'date', 'start_time', 'end_time', 'gender', 'age', 'modal', 'origin_purpose', 'dest_purpose',
           'od_dist_avg', 'od_duration_avg', 'max_time', 'min_time', 'num_transit_methods', 'median_time', 'predict_time',
           'max_distance', 'median_distance', 'max_transit_methods', 'num_transfer', 'median_last_walk', 'avg_last_walk', 'od_cnts']]
tot

Unnamed: 0,origin,dest,date,start_time,end_time,gender,age,modal,origin_purpose,dest_purpose,...,num_transit_methods,median_time,predict_time,max_distance,median_distance,max_transit_methods,num_transfer,median_last_walk,avg_last_walk,od_cnts
0,S,D,20230902,12,16,1,0,0,0.0,5,...,3.0,9568.0,205.338710,154480.0,152219.0,3.0,4.201835,4.150000,4.150000,21
1,S,D,20230902,10,14,0,0,0,0.0,5,...,3.0,9568.0,205.338710,154480.0,152219.0,3.0,4.201835,4.150000,4.150000,27
2,S,D,20230902,10,14,0,0,5,0.0,5,...,3.0,9568.0,205.338710,154480.0,152219.0,3.0,4.201835,4.150000,4.150000,27
3,S,D,20230902,10,13,0,1,0,2.0,4,...,3.0,9568.0,205.338710,154480.0,152219.0,3.0,4.201835,4.150000,4.150000,9
4,S,D,20230902,17,19,0,0,0,0.0,5,...,3.0,9568.0,205.338710,154480.0,152219.0,3.0,4.201835,4.150000,4.150000,27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
518520,G,GB,20231015,12,13,1,2,0,4.0,4,...,3.0,2155.0,223.325581,31749.0,7296.0,2.0,3.037209,3.100000,4.083333,5
518521,S,GB,20231015,13,19,1,2,0,5.0,0,...,5.0,13121.5,211.992126,206356.0,182214.0,4.0,5.861171,3.983333,4.083333,5
518522,G,GB,20231015,20,22,0,2,0,4.0,0,...,3.0,2155.0,223.325581,31749.0,7296.0,2.0,3.037209,3.100000,4.083333,5
518523,G,GB,20231015,14,15,1,7,0,5.0,5,...,3.0,2155.0,223.325581,31749.0,7296.0,2.0,3.037209,3.100000,4.083333,5


In [None]:
final_data = tot.loc[np.repeat(tot.index.values, tot.od_cnts)]
final_data = final_data.drop(columns = 'od_cnts')
final_data

Unnamed: 0,origin,dest,date,start_time,end_time,gender,age,modal,origin_purpose,dest_purpose,...,min_time,num_transit_methods,median_time,predict_time,max_distance,median_distance,max_transit_methods,num_transfer,median_last_walk,avg_last_walk
0,S,D,20230902,12,16,1,0,0,0.0,5,...,6467.0,3.0,9568.0,205.338710,154480.0,152219.0,3.0,4.201835,4.15,4.150000
0,S,D,20230902,12,16,1,0,0,0.0,5,...,6467.0,3.0,9568.0,205.338710,154480.0,152219.0,3.0,4.201835,4.15,4.150000
0,S,D,20230902,12,16,1,0,0,0.0,5,...,6467.0,3.0,9568.0,205.338710,154480.0,152219.0,3.0,4.201835,4.15,4.150000
0,S,D,20230902,12,16,1,0,0,0.0,5,...,6467.0,3.0,9568.0,205.338710,154480.0,152219.0,3.0,4.201835,4.15,4.150000
0,S,D,20230902,12,16,1,0,0,0.0,5,...,6467.0,3.0,9568.0,205.338710,154480.0,152219.0,3.0,4.201835,4.15,4.150000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
518524,G,GB,20231015,21,22,1,2,0,5.0,5,...,1844.0,3.0,2155.0,223.325581,31749.0,7296.0,2.0,3.037209,3.10,4.083333
518524,G,GB,20231015,21,22,1,2,0,5.0,5,...,1844.0,3.0,2155.0,223.325581,31749.0,7296.0,2.0,3.037209,3.10,4.083333
518524,G,GB,20231015,21,22,1,2,0,5.0,5,...,1844.0,3.0,2155.0,223.325581,31749.0,7296.0,2.0,3.037209,3.10,4.083333
518524,G,GB,20231015,21,22,1,2,0,5.0,5,...,1844.0,3.0,2155.0,223.325581,31749.0,7296.0,2.0,3.037209,3.10,4.083333


In [None]:
final_data.to_csv('data/final_data(2).csv')

## 파생변수 추가

In [None]:
final_data = pd.read_csv('data/final_data(2).csv').drop(columns = 'Unnamed: 0')
final_data

Unnamed: 0,origin,dest,date,start_time,end_time,gender,age,modal,origin_purpose,dest_purpose,...,min_time,num_transit_methods,median_time,predict_time,max_distance,median_distance,max_transit_methods,num_transfer,median_last_walk,avg_last_walk
0,S,D,20230902,12,16,1,0,0,0.0,5,...,6467.0,3.0,9568.0,205.338710,154480.0,152219.0,3.0,4.201835,4.15,4.150000
1,S,D,20230902,12,16,1,0,0,0.0,5,...,6467.0,3.0,9568.0,205.338710,154480.0,152219.0,3.0,4.201835,4.15,4.150000
2,S,D,20230902,12,16,1,0,0,0.0,5,...,6467.0,3.0,9568.0,205.338710,154480.0,152219.0,3.0,4.201835,4.15,4.150000
3,S,D,20230902,12,16,1,0,0,0.0,5,...,6467.0,3.0,9568.0,205.338710,154480.0,152219.0,3.0,4.201835,4.15,4.150000
4,S,D,20230902,12,16,1,0,0,0.0,5,...,6467.0,3.0,9568.0,205.338710,154480.0,152219.0,3.0,4.201835,4.15,4.150000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3968786,G,GB,20231015,21,22,1,2,0,5.0,5,...,1844.0,3.0,2155.0,223.325581,31749.0,7296.0,2.0,3.037209,3.10,4.083333
3968787,G,GB,20231015,21,22,1,2,0,5.0,5,...,1844.0,3.0,2155.0,223.325581,31749.0,7296.0,2.0,3.037209,3.10,4.083333
3968788,G,GB,20231015,21,22,1,2,0,5.0,5,...,1844.0,3.0,2155.0,223.325581,31749.0,7296.0,2.0,3.037209,3.10,4.083333
3968789,G,GB,20231015,21,22,1,2,0,5.0,5,...,1844.0,3.0,2155.0,223.325581,31749.0,7296.0,2.0,3.037209,3.10,4.083333


In [None]:
final_data.date = final_data.date.astype('str')

### isFestival

In [None]:
def add_isFestival(df, origin, dest, f_dates):
  filtered_df = df[(df.origin == origin)&(df.dest == dest)]
  date_to_festival = {}
  filtered_df.date = filtered_df.date.astype('string')
  unique_dates = final_data.date.unique()
  for date in unique_dates:
    date_obj = datetime.strptime(date, '%Y%m%d')
    if date in f_dates:
      date_to_festival[date] = True
    else:
      date_to_festival[date] = False
  filtered_df['isFestival'] = filtered_df['date'].map(date_to_festival)
  return filtered_df

#### 대전

In [None]:
d_dates = ['20230908', '20230909', '20230910']


# 서울 -> 대전
sd = add_isFestival(final_data, 'S', 'D', d_dates)

# 부산 -> 대전
bd = add_isFestival(final_data, 'B', 'D', d_dates)

# 대전 -> 대전
dd = add_isFestival(final_data, 'D', 'D', d_dates)

# 강릉 -> 대전
gd = add_isFestival(final_data, 'G', 'D', d_dates)

# 임실 -> 대전
id = add_isFestival(final_data, 'I', 'D', d_dates)

d = pd.concat([sd, bd, dd, gd, id])
d

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df.date = filtered_df.date.astype('string')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['isFestival'] = filtered_df['date'].map(date_to_festival)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df.date = filtered_df.date.astype('string')
A value is trying to be set o

Unnamed: 0,origin,dest,date,start_time,end_time,gender,age,modal,origin_purpose,dest_purpose,...,num_transit_methods,median_time,predict_time,max_distance,median_distance,max_transit_methods,num_transfer,median_last_walk,avg_last_walk,isFestival
0,S,D,20230902,12,16,1,0,0,0.0,5,...,3.0,9568.0,205.33871,154480.0,152219.0,3.0,4.201835,4.15,4.15,False
1,S,D,20230902,12,16,1,0,0,0.0,5,...,3.0,9568.0,205.33871,154480.0,152219.0,3.0,4.201835,4.15,4.15,False
2,S,D,20230902,12,16,1,0,0,0.0,5,...,3.0,9568.0,205.33871,154480.0,152219.0,3.0,4.201835,4.15,4.15,False
3,S,D,20230902,12,16,1,0,0,0.0,5,...,3.0,9568.0,205.33871,154480.0,152219.0,3.0,4.201835,4.15,4.15,False
4,S,D,20230902,12,16,1,0,0,0.0,5,...,3.0,9568.0,205.33871,154480.0,152219.0,3.0,4.201835,4.15,4.15,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1772164,I,D,20231015,19,20,1,4,0,3.0,0,...,,,,,,,,,,False
1772165,I,D,20231015,19,20,1,4,0,3.0,0,...,,,,,,,,,,False
1772166,I,D,20231015,19,20,1,4,0,3.0,0,...,,,,,,,,,,False
1772167,I,D,20231015,19,20,1,4,0,3.0,0,...,,,,,,,,,,False


In [None]:
d.value_counts('isFestival')

Unnamed: 0_level_0,count
isFestival,Unnamed: 1_level_1
False,714596
True,75904


#### 부산

In [None]:
b_dates = ['20231004', '20231005', '20231006', '20231007', '20231008', '20231009', '20231010', '20231011', '20231012', '20231013']

# 서울 -> 부산
sb = add_isFestival(final_data, 'S', 'B', b_dates)

# 부산 -> 부산
bb = add_isFestival(final_data, 'B', 'B', b_dates)

# 강릉 -> 부산
gb = add_isFestival(final_data, 'G', 'B', b_dates)

# 대전 -> 부산
db = add_isFestival(final_data, 'D', 'B', b_dates)

# 임실 -> 부산
ib = add_isFestival(final_data, 'I', 'B', b_dates)

b = pd.concat([sb, bb, gb, db, ib])
b

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df.date = filtered_df.date.astype('string')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['isFestival'] = filtered_df['date'].map(date_to_festival)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df.date = filtered_df.date.astype('string')
A value is trying to be set o

Unnamed: 0,origin,dest,date,start_time,end_time,gender,age,modal,origin_purpose,dest_purpose,...,num_transit_methods,median_time,predict_time,max_distance,median_distance,max_transit_methods,num_transfer,median_last_walk,avg_last_walk,isFestival
229274,S,B,20230908,19,22,0,3,7,4.0,5,...,4.0,20332.0,276.149733,369479.0,340841.0,4.0,4.082317,5.05,10.066667,False
229275,S,B,20230908,19,22,0,3,7,4.0,5,...,4.0,20332.0,276.149733,369479.0,340841.0,4.0,4.082317,5.05,10.066667,False
229276,S,B,20230908,19,22,0,3,7,4.0,5,...,4.0,20332.0,276.149733,369479.0,340841.0,4.0,4.082317,5.05,10.066667,False
229277,S,B,20230908,19,22,0,3,7,4.0,5,...,4.0,20332.0,276.149733,369479.0,340841.0,4.0,4.082317,5.05,10.066667,False
229278,S,B,20230908,19,22,0,3,7,4.0,5,...,4.0,20332.0,276.149733,369479.0,340841.0,4.0,4.082317,5.05,10.066667,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2899614,I,B,20231015,15,19,0,2,0,5.0,0,...,,,,,,,,,,False
2899615,I,B,20231015,15,19,0,2,0,5.0,0,...,,,,,,,,,,False
2899616,I,B,20231015,15,19,0,2,0,5.0,0,...,,,,,,,,,,False
2899617,I,B,20231015,15,19,0,2,0,5.0,0,...,,,,,,,,,,False


In [None]:
b.value_counts('isFestival')

Unnamed: 0_level_0,count
isFestival,Unnamed: 1_level_1
False,938677
True,291314


#### 임실

In [None]:
i_dates = ['20231006', '20231007', '20231008', '20231009']

# 서울 -> 임실
si = add_isFestival(final_data, 'S', 'I', i_dates)

# 부산 -> 임실
bi = add_isFestival(final_data, 'B', 'I', i_dates)

# 전북 -> 임실
ii = add_isFestival(final_data, 'I', 'I', i_dates)

# 대전 -> 임실
di = add_isFestival(final_data, 'D', 'I', i_dates)

i = pd.concat([si, bi, ii, di])
i

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df.date = filtered_df.date.astype('string')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['isFestival'] = filtered_df['date'].map(date_to_festival)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df.date = filtered_df.date.astype('string')
A value is trying to be set o

Unnamed: 0,origin,dest,date,start_time,end_time,gender,age,modal,origin_purpose,dest_purpose,...,num_transit_methods,median_time,predict_time,max_distance,median_distance,max_transit_methods,num_transfer,median_last_walk,avg_last_walk,isFestival
319941,S,I,20230901,17,23,0,3,0,4.0,5,...,4.0,14428.0,481.507433,226695.0,221938.0,3.0,3.318584,4.383333,7.883333,False
319942,S,I,20230901,17,23,0,3,0,4.0,5,...,4.0,14428.0,481.507433,226695.0,221938.0,3.0,3.318584,4.383333,7.883333,False
319943,S,I,20230901,17,23,0,3,0,4.0,5,...,4.0,14428.0,481.507433,226695.0,221938.0,3.0,3.318584,4.383333,7.883333,False
319944,S,I,20230901,17,23,0,3,0,4.0,5,...,4.0,14428.0,481.507433,226695.0,221938.0,3.0,3.318584,4.383333,7.883333,False
319945,S,I,20230901,17,23,0,3,0,4.0,5,...,4.0,14428.0,481.507433,226695.0,221938.0,3.0,3.318584,4.383333,7.883333,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2949902,D,I,20231015,14,17,0,0,0,5.0,0,...,,,,,,,,,,False
2949903,D,I,20231015,14,17,0,0,0,5.0,0,...,,,,,,,,,,False
2949904,D,I,20231015,14,17,0,0,0,5.0,0,...,,,,,,,,,,False
2949905,D,I,20231015,14,17,0,0,0,5.0,0,...,,,,,,,,,,False


In [None]:
i.value_counts('isFestival')

Unnamed: 0_level_0,count
isFestival,Unnamed: 1_level_1
True,34113
False,30070


#### 서울

In [None]:
s_dates = ['20231007']

# 서울 -> 서울
ss = add_isFestival(final_data, 'S', 'S', s_dates)

# 부산 -> 서울
bs = add_isFestival(final_data, 'B', 'S', s_dates)

# 임실 -> 서울
iis = add_isFestival(final_data, 'I', 'S', s_dates)

# 강릉 -> 서울
gs = add_isFestival(final_data, 'G', 'S', s_dates)

# 대전 -> 서울
ds = add_isFestival(final_data, 'D', 'S', s_dates)

s = pd.concat([ss, bs, iis, gs, ds])
s

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df.date = filtered_df.date.astype('string')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['isFestival'] = filtered_df['date'].map(date_to_festival)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df.date = filtered_df.date.astype('string')
A value is trying to be set o

Unnamed: 0,origin,dest,date,start_time,end_time,gender,age,modal,origin_purpose,dest_purpose,...,num_transit_methods,median_time,predict_time,max_distance,median_distance,max_transit_methods,num_transfer,median_last_walk,avg_last_walk,isFestival
347701,S,S,20230901,16,17,1,3,2,1.0,3,...,2.0,1717.0,199.423729,10875.0,6775.0,2.0,2.969388,3.65,5.15,False
347702,S,S,20230901,16,17,1,3,2,1.0,3,...,2.0,1717.0,199.423729,10875.0,6775.0,2.0,2.969388,3.65,5.15,False
347703,S,S,20230901,16,17,1,3,2,1.0,3,...,2.0,1717.0,199.423729,10875.0,6775.0,2.0,2.969388,3.65,5.15,False
347704,S,S,20230901,16,17,1,3,2,1.0,3,...,2.0,1717.0,199.423729,10875.0,6775.0,2.0,2.969388,3.65,5.15,False
347705,S,S,20230901,16,17,1,3,2,1.0,3,...,2.0,1717.0,199.423729,10875.0,6775.0,2.0,2.969388,3.65,5.15,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3653501,D,S,20231015,16,19,1,2,0,3.0,0,...,,,,,,,,,,False
3653502,D,S,20231015,16,19,1,2,0,3.0,0,...,,,,,,,,,,False
3653503,D,S,20231015,16,19,1,2,0,3.0,0,...,,,,,,,,,,False
3653504,D,S,20231015,16,19,1,2,0,3.0,0,...,,,,,,,,,,False


#### 강릉(1)

In [None]:
g_dates = ['20231012', '20231013', '20231014', '20231015']

# 서울 -> 강릉(1)
sga = add_isFestival(final_data, 'S', 'GA', g_dates)

# 부산 -> 강릉(1)
bga = add_isFestival(final_data, 'B', 'GA', g_dates)

# 강릉 -> 강릉(1)
gga = add_isFestival(final_data, 'G', 'GA', g_dates)

# 임실 -> 강릉(1)
iga = add_isFestival(final_data, 'I', 'GA', g_dates)

# 대전 -> 강릉(1)
dga = add_isFestival(final_data, 'D', 'GA', g_dates)

ga = pd.concat([sga, bga, gga, iga, dga])
ga

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df.date = filtered_df.date.astype('string')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['isFestival'] = filtered_df['date'].map(date_to_festival)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df.date = filtered_df.date.astype('string')
A value is trying to be set o

Unnamed: 0,origin,dest,date,start_time,end_time,gender,age,modal,origin_purpose,dest_purpose,...,num_transit_methods,median_time,predict_time,max_distance,median_distance,max_transit_methods,num_transfer,median_last_walk,avg_last_walk,isFestival
693608,S,GA,20230901,15,19,0,0,0,0.0,5,...,3.0,11640.0,696.526316,220586.0,204386.0,3.0,3.305556,4.0,29.0,False
693609,S,GA,20230901,15,19,0,0,0,0.0,5,...,3.0,11640.0,696.526316,220586.0,204386.0,3.0,3.305556,4.0,29.0,False
693610,S,GA,20230901,15,19,0,0,0,0.0,5,...,3.0,11640.0,696.526316,220586.0,204386.0,3.0,3.305556,4.0,29.0,False
693611,S,GA,20230901,15,19,0,0,0,0.0,5,...,3.0,11640.0,696.526316,220586.0,204386.0,3.0,3.305556,4.0,29.0,False
693612,S,GA,20230901,15,19,0,0,0,0.0,5,...,3.0,11640.0,696.526316,220586.0,204386.0,3.0,3.305556,4.0,29.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3725039,D,GA,20231010,12,16,0,3,0,5.0,0,...,,,,,,,,,,False
3725040,D,GA,20231010,12,16,0,3,0,5.0,0,...,,,,,,,,,,False
3725041,D,GA,20231010,12,16,0,3,0,5.0,0,...,,,,,,,,,,False
3725042,D,GA,20231010,12,16,0,3,0,5.0,0,...,,,,,,,,,,False


#### 강릉(2)

In [None]:
g_dates = ['20231012', '20231013', '20231014', '20231015']

# 서울 -> 강릉(2)
sgb = add_isFestival(final_data, 'S', 'GB', g_dates)

# 부산 -> 강릉(2)
bgb = add_isFestival(final_data, 'B', 'GB', g_dates)

# 강릉 -> 강릉(2)
ggb = add_isFestival(final_data, 'G', 'GB', g_dates)

# 대전 -> 강릉(2)
dgb = add_isFestival(final_data, 'D', 'GB', g_dates)

# 임실 -> 강릉(2)
igb = add_isFestival(final_data, 'I', 'GB', g_dates)

gb = pd.concat([sgb, bgb, ggb, dgb, igb])
gb

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df.date = filtered_df.date.astype('string')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['isFestival'] = filtered_df['date'].map(date_to_festival)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df.date = filtered_df.date.astype('string')
A value is trying to be set o

Unnamed: 0,origin,dest,date,start_time,end_time,gender,age,modal,origin_purpose,dest_purpose,...,num_transit_methods,median_time,predict_time,max_distance,median_distance,max_transit_methods,num_transfer,median_last_walk,avg_last_walk,isFestival
975155,S,GB,20230901,17,21,1,3,0,4.0,5,...,5.0,13121.5,211.992126,206356.0,182214.0,4.0,5.861171,3.983333,4.083333,False
975156,S,GB,20230901,17,21,1,3,0,4.0,5,...,5.0,13121.5,211.992126,206356.0,182214.0,4.0,5.861171,3.983333,4.083333,False
975157,S,GB,20230901,17,21,1,3,0,4.0,5,...,5.0,13121.5,211.992126,206356.0,182214.0,4.0,5.861171,3.983333,4.083333,False
975158,S,GB,20230901,17,21,1,3,0,4.0,5,...,5.0,13121.5,211.992126,206356.0,182214.0,4.0,5.861171,3.983333,4.083333,False
975159,S,GB,20230901,17,21,1,3,0,4.0,5,...,5.0,13121.5,211.992126,206356.0,182214.0,4.0,5.861171,3.983333,4.083333,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3962710,I,GB,20231014,12,18,1,1,0,5.0,0,...,,,,,,,,,,True
3962711,I,GB,20231014,12,18,1,1,0,5.0,0,...,,,,,,,,,,True
3962712,I,GB,20231014,12,18,1,1,0,5.0,0,...,,,,,,,,,,True
3962713,I,GB,20231014,12,18,1,1,0,5.0,0,...,,,,,,,,,,True


In [None]:
final_data = pd.concat([d, b, i, s, ga, gb])
final_data

Unnamed: 0,origin,dest,date,start_time,end_time,gender,age,modal,origin_purpose,dest_purpose,...,num_transit_methods,median_time,predict_time,max_distance,median_distance,max_transit_methods,num_transfer,median_last_walk,avg_last_walk,isFestival
0,S,D,20230902,12,16,1,0,0,0.0,5,...,3.0,9568.0,205.33871,154480.0,152219.0,3.0,4.201835,4.15,4.15,False
1,S,D,20230902,12,16,1,0,0,0.0,5,...,3.0,9568.0,205.33871,154480.0,152219.0,3.0,4.201835,4.15,4.15,False
2,S,D,20230902,12,16,1,0,0,0.0,5,...,3.0,9568.0,205.33871,154480.0,152219.0,3.0,4.201835,4.15,4.15,False
3,S,D,20230902,12,16,1,0,0,0.0,5,...,3.0,9568.0,205.33871,154480.0,152219.0,3.0,4.201835,4.15,4.15,False
4,S,D,20230902,12,16,1,0,0,0.0,5,...,3.0,9568.0,205.33871,154480.0,152219.0,3.0,4.201835,4.15,4.15,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3962710,I,GB,20231014,12,18,1,1,0,5.0,0,...,,,,,,,,,,True
3962711,I,GB,20231014,12,18,1,1,0,5.0,0,...,,,,,,,,,,True
3962712,I,GB,20231014,12,18,1,1,0,5.0,0,...,,,,,,,,,,True
3962713,I,GB,20231014,12,18,1,1,0,5.0,0,...,,,,,,,,,,True


In [None]:
final_data.value_counts('isFestival')

Unnamed: 0_level_0,count
isFestival,Unnamed: 1_level_1
False,3363014
True,605777


### isWeekend

In [None]:
def add_isWeekend(filtered_df):
  filtered_df.date = filtered_df.date.astype('string')
  unique_dates = filtered_df.date.unique()
  date_to_weekend = {}
  for date in unique_dates:
    date_obj = datetime.strptime(date, '%Y%m%d')
    date_to_weekend[date] = date_obj.weekday() >= 5
  filtered_df['isWeekend'] = filtered_df['date'].map(date_to_weekend)
  return filtered_df

In [None]:
final_data = add_isWeekend(final_data)
final_data

Unnamed: 0,origin,dest,date,start_time,end_time,gender,age,modal,origin_purpose,dest_purpose,...,median_time,predict_time,max_distance,median_distance,max_transit_methods,num_transfer,median_last_walk,avg_last_walk,isFestival,isWeekend
0,S,D,20230902,12,16,1,0,0,0.0,5,...,9568.0,205.33871,154480.0,152219.0,3.0,4.201835,4.15,4.15,False,True
1,S,D,20230902,12,16,1,0,0,0.0,5,...,9568.0,205.33871,154480.0,152219.0,3.0,4.201835,4.15,4.15,False,True
2,S,D,20230902,12,16,1,0,0,0.0,5,...,9568.0,205.33871,154480.0,152219.0,3.0,4.201835,4.15,4.15,False,True
3,S,D,20230902,12,16,1,0,0,0.0,5,...,9568.0,205.33871,154480.0,152219.0,3.0,4.201835,4.15,4.15,False,True
4,S,D,20230902,12,16,1,0,0,0.0,5,...,9568.0,205.33871,154480.0,152219.0,3.0,4.201835,4.15,4.15,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3962710,I,GB,20231014,12,18,1,1,0,5.0,0,...,,,,,,,,,True,True
3962711,I,GB,20231014,12,18,1,1,0,5.0,0,...,,,,,,,,,True,True
3962712,I,GB,20231014,12,18,1,1,0,5.0,0,...,,,,,,,,,True,True
3962713,I,GB,20231014,12,18,1,1,0,5.0,0,...,,,,,,,,,True,True


In [None]:
final_data.value_counts('isWeekend')

Unnamed: 0_level_0,count
isWeekend,Unnamed: 1_level_1
False,2115100
True,1853691


In [None]:
final_data = final_data.dropna(how = 'any', subset = ['max_time', 'min_time'])
final_data

Unnamed: 0,origin,dest,date,start_time,end_time,gender,age,modal,origin_purpose,dest_purpose,...,median_time,predict_time,max_distance,median_distance,max_transit_methods,num_transfer,median_last_walk,avg_last_walk,isFestival,isWeekend
0,S,D,20230902,12,16,1,0,0,0.0,5,...,9568.0,205.338710,154480.0,152219.0,3.0,4.201835,4.15,4.150000,False,True
1,S,D,20230902,12,16,1,0,0,0.0,5,...,9568.0,205.338710,154480.0,152219.0,3.0,4.201835,4.15,4.150000,False,True
2,S,D,20230902,12,16,1,0,0,0.0,5,...,9568.0,205.338710,154480.0,152219.0,3.0,4.201835,4.15,4.150000,False,True
3,S,D,20230902,12,16,1,0,0,0.0,5,...,9568.0,205.338710,154480.0,152219.0,3.0,4.201835,4.15,4.150000,False,True
4,S,D,20230902,12,16,1,0,0,0.0,5,...,9568.0,205.338710,154480.0,152219.0,3.0,4.201835,4.15,4.150000,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3968786,G,GB,20231015,21,22,1,2,0,5.0,5,...,2155.0,223.325581,31749.0,7296.0,2.0,3.037209,3.10,4.083333,True,True
3968787,G,GB,20231015,21,22,1,2,0,5.0,5,...,2155.0,223.325581,31749.0,7296.0,2.0,3.037209,3.10,4.083333,True,True
3968788,G,GB,20231015,21,22,1,2,0,5.0,5,...,2155.0,223.325581,31749.0,7296.0,2.0,3.037209,3.10,4.083333,True,True
3968789,G,GB,20231015,21,22,1,2,0,5.0,5,...,2155.0,223.325581,31749.0,7296.0,2.0,3.037209,3.10,4.083333,True,True


In [None]:
final_data.value_counts('isWeekend')

Unnamed: 0_level_0,count
isWeekend,Unnamed: 1_level_1
False,2100867
True,1840126


In [None]:
final_data.to_csv('data/final_model_data.csv')