## 구글 드라이브 마운트

In [1]:
from google.colab import drive
import os

drive.mount('/content/drive')
os.chdir('/content/drive/MyDrive/BigContest')

Mounted at /content/drive


## 라이브러리

In [2]:
import dask.dataframe as dd
import time
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



## 데이터 불러오기

In [3]:
transit = pd.read_csv('data/transit.csv')
transit

Unnamed: 0,routeID,transitID,transit,description,time
0,SD01,1,WALK,-,303.0
1,SD01,2,TRAIN_1,KTX산천,4215.0
2,SD01,3,WALK,-,254.0
3,SD01,4,BUS,일반:707,1974.0
4,SD01,5,WALK,-,249.0
...,...,...,...,...,...
5820,II10,4,BUS_2,농어촌:임실-신흥촌,185.0
5821,II10,5,WALK,-,473.0
5822,II10,99,totalDistance,-,2991.0
5823,II10,99,totalTime,-,990.0


In [4]:
transit['origin_cd'] = transit.routeID.str[0]
transit['dest_cd'] = transit.routeID.apply(lambda x: x[1:3] if len(x) == 5 else x[1])
transit

Unnamed: 0,routeID,transitID,transit,description,time,origin_cd,dest_cd
0,SD01,1,WALK,-,303.0,S,D
1,SD01,2,TRAIN_1,KTX산천,4215.0,S,D
2,SD01,3,WALK,-,254.0,S,D
3,SD01,4,BUS,일반:707,1974.0,S,D
4,SD01,5,WALK,-,249.0,S,D
...,...,...,...,...,...,...,...
5820,II10,4,BUS_2,농어촌:임실-신흥촌,185.0,I,I
5821,II10,5,WALK,-,473.0,I,I
5822,II10,99,totalDistance,-,2991.0,I,I
5823,II10,99,totalTime,-,990.0,I,I


In [5]:
cd = {'S': '서울',
      'B': '부산',
      'D': '대전',
      'GA': '강릉(스피드 스케이트 경기장)',
      'GB': '강릉(경포호수광장)',
      'I': '임실',
      }
transit.origin_cd = transit.origin_cd.map(cd)
transit.dest_cd = transit.dest_cd.map(cd)

In [None]:
df

Unnamed: 0,routeID,time
0,BB01,56.150000
1,BB02,47.350000
2,BB03,47.350000
3,BB04,49.716667
4,BB05,48.900000
...,...,...
464,SS16,28.766667
465,SS17,28.766667
466,SS18,42.966667
467,SS19,42.966667


In [6]:
transit.transit = transit.transit.apply(lambda x: x.split('_')[0])

## EDA

In [7]:
transit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5825 entries, 0 to 5824
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   routeID      5825 non-null   object 
 1   transitID    5825 non-null   int64  
 2   transit      5825 non-null   object 
 3   description  5798 non-null   object 
 4   time         5821 non-null   float64
 5   origin_cd    5197 non-null   object 
 6   dest_cd      5825 non-null   object 
dtypes: float64(1), int64(1), object(5)
memory usage: 318.7+ KB


### 도보(환승) 접근성

#### (전체) 도보시간 합계 분포

In [8]:
import plotly.express as px
df = transit[transit.transit == 'WALK'].groupby('routeID').time.sum().reset_index()
df.time = df.time / 60
fig = px.box(df,y="time", title = '(전체) 도보시간 합계 분포 (단위: 분)')
fig.update_yaxes(tickformat=',')

fig.show()

In [None]:
df2 = pd.qcut(df.time, q = 3).value_counts()
df2

Unnamed: 0_level_0,count
time,Unnamed: 1_level_1
"(5.4159999999999995, 14.083]",158
"(19.483, 47.0]",156
"(14.083, 19.483]",154


In [None]:
df.describe()

Unnamed: 0,time
count,468.0
mean,18.727778
std,7.313284
min,5.416667
25%,13.729167
50%,16.183333
75%,23.154167
max,47.0


#### 마지막 도보시간 분포

In [None]:
import plotly.express as px
df = transit[transit.transit == 'WALK']
df.time = df.time / 60
fig = px.box(df,y="time", title = '마지막 도보시간  분포 (단위: 분)')
fig.update_yaxes(tickformat=',')

fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [None]:
df2 = pd.qcut(df.time, q = 3).value_counts()
df2

Unnamed: 0_level_0,count
time,Unnamed: 1_level_1
"(-0.001, 2.333]",726
"(2.333, 4.55]",660
"(4.55, 31.0]",640


In [None]:
df.describe()

Unnamed: 0,transitID,time
count,2026.0,2026.0
mean,4.570582,4.326061
std,2.860925,4.300948
min,1.0,0.0
25%,3.0,1.933333
50%,5.0,3.733333
75%,7.0,5.05
max,13.0,31.0


#### 축제별 도보시간 합계 분포

In [9]:
df = transit[transit.transit == 'WALK'].groupby(['dest_cd', 'routeID']).time.sum().reset_index()
df.time = df.time/60
fig = px.box(df,x = 'dest_cd', y = "time", title = '축제별 도보시간 합계 분포 (단위: 분)')
fig.update_yaxes(tickformat=',')

fig.show()

In [None]:
df2 = pd.qcut(df.time, q = 3).value_counts()
df2

Unnamed: 0_level_0,count
time,Unnamed: 1_level_1
"(5.4159999999999995, 14.083]",158
"(19.483, 47.0]",156
"(14.083, 19.483]",154


In [None]:
df[df.dest_cd == '강릉(경포호수광장)'].describe()

Unnamed: 0,time
count,80.0
mean,18.214792
std,6.543904
min,10.1
25%,13.1
50%,14.041667
75%,26.883333
max,29.516667


In [None]:
df[df.dest_cd == '강릉(스피드 스케이트 경기장)'].describe()

Unnamed: 0,time
count,47.0
mean,29.843262
std,7.731326
min,18.133333
25%,24.316667
50%,26.316667
75%,37.0
max,47.0


In [None]:
df[df.dest_cd == '대전'].describe()

Unnamed: 0,time
count,135.0
mean,13.892346
std,1.059549
min,8.0
25%,13.441667
50%,13.666667
75%,14.491667
max,18.1


In [None]:
df[df.dest_cd == '부산'].describe()

Unnamed: 0,time
count,61.0
mean,17.570765
std,3.983032
min,11.216667
25%,16.433333
50%,16.433333
75%,16.433333
max,30.366667


In [None]:
df[df.dest_cd == '서울'].describe()

Unnamed: 0,time
count,88.0
mean,16.332955
std,4.730708
min,5.416667
25%,13.75
50%,16.183333
75%,21.733333
max,24.583333


In [None]:
df[df.dest_cd == '임실'].describe()

Unnamed: 0,time
count,57.0
mean,26.670175
std,6.851218
min,18.366667
25%,24.016667
50%,26.116667
75%,28.433333
max,45.383333


#### 축제별 마지막 도보시간 분포

In [13]:
df = transit[transit.transit == 'WALK'].drop_duplicates(subset = 'routeID', keep = 'last')
df.time = df.time / 60
fig = px.box(df, x = 'dest_cd', y = 'time', title = '축제별 마지막 도보시간 분포(단위: 분)')
fig.update_yaxes(tickformat = ',')
fig.update_layout(
    xaxis_title="도착지", yaxis_title="도보시간"
)
fig.show()

In [None]:
df[df.dest_cd == '대전'].describe()

Unnamed: 0,transitID,time
count,135.0,135.0
mean,7.325926,4.556667
std,1.17708,0.449577
min,1.0,4.15
25%,7.0,4.15
50%,7.0,4.15
75%,7.0,5.05
max,9.0,5.05


In [None]:
df[df.dest_cd == '부산'].describe()

Unnamed: 0,transitID,time
count,61.0,61.0
mean,6.57377,8.864481
std,1.161307,0.628457
min,3.0,7.133333
25%,7.0,8.6
50%,7.0,8.6
75%,7.0,8.6
max,9.0,10.066667


In [None]:
df[df.dest_cd == '강릉(스피드 스케이트 경기장)'].describe()

Unnamed: 0,transitID,time
count,47.0,47.0
mean,8.212766,19.442553
std,1.887443,5.45
min,4.0,14.0
25%,6.0,17.25
50%,9.0,17.25
75%,9.0,17.25
max,11.0,31.0


In [None]:
df[df.dest_cd == '강릉(경포호수광장)'].describe()

Unnamed: 0,transitID,time
count,80.0,80.0
mean,9.525,4.878958
std,2.375908,0.508588
min,5.0,4.083333
25%,7.0,4.083333
50%,9.0,5.2
75%,11.0,5.2
max,13.0,5.2


In [None]:
df[df.dest_cd == '임실'].describe()

Unnamed: 0,transitID,time
count,57.0,57.0
mean,8.473684,16.144737
std,2.659923,6.855225
min,5.0,7.883333
25%,5.0,7.883333
50%,9.0,21.733333
75%,11.0,21.733333
max,11.0,21.733333


In [None]:
df[df.dest_cd == '서울'].describe()

Unnamed: 0,transitID,time
count,88.0,88.0
mean,6.25,3.987879
std,1.019917,0.9488
min,3.0,3.216667
25%,5.0,3.216667
50%,7.0,3.433333
75%,7.0,5.15
max,7.0,5.466667


#### 평균 환승 횟수

In [None]:
df = transit[(transit.transitID != 99)].groupby('dest_cd').transitID.mean().reset_index()
layout = go.Layout(title = '<b> 평균 환승 횟수 <b>'
                  , xaxis_title = '목적지'
                  , yaxis_title = '환승 수'
                  , font_size = 14)
fig = go.Figure(data = go.Bar(x = df.dest_cd, y = df.transitID, width = 0.3, marker_color = '#80b0d3'), layout = layout)

fig.update_yaxes(tickformat=',')
fig.show()

In [None]:
df.describe()

Unnamed: 0,transitID
count,6.0
mean,4.56781
std,0.736909
min,3.707273
25%,3.982536
50%,4.542727
75%,5.064912
max,5.579355


In [None]:
df2 = pd.qcut(df.transitID, q = 3).value_counts()
df2


Unnamed: 0_level_0,count
transitID,Unnamed: 1_level_1
"(3.706, 4.14]",2
"(4.14, 4.928]",2
"(4.928, 5.579]",2


### 교통수단 접근성 관련

#### 대중교통 경로 비교

In [None]:
df = transit.groupby(['origin_cd', 'dest_cd']).routeID.nunique().reset_index()

df_S = df[df.origin_cd == '서울']
df_B = df[df.origin_cd == '부산']

data_S = go.Bar(x = df_S.dest_cd, y = df_S.routeID, name = '서울', width = 0.4, marker_color = '#80b0d3')
data_B = go.Bar(x = df_B.dest_cd, y = df_B.routeID, name = '부산', width = 0.4, marker_color = '#fb8072')

layout = go.Layout(title = '대중교통 경로 수 비교'
                  , xaxis_title = '축제'
                  , yaxis_title = '경로')

fig = go.Figure(data = [data_S, data_B], layout = layout)
fig.show()

#### 대중교통 수

In [None]:
df = transit[(transit.transitID != 99)&(transit.transit != 'WALK')].groupby(['dest_cd']).transit.nunique().reset_index()

layout = go.Layout(title = '<b> 이용 가능 대중교통 <b>'
                  , xaxis_title = '목적지'
                  , yaxis_title = '대중교통 수'
                  , font_size = 14)
fig = go.Figure(data = go.Bar(x = df.dest_cd, y = df.transit, width = 0.3, marker_color = '#80b0d3'), layout = layout)

fig.update_yaxes(tickformat=',')
fig.show()

In [None]:
df

Unnamed: 0,dest_cd,transit
0,강릉(경포호수광장),4
1,강릉(스피드 스케이트 경기장),4
2,대전,3
3,부산,4
4,서울,3
5,임실,4


In [None]:
df = transit[(transit.transitID != 99)&(transit.transit != 'WALK')].groupby(['dest_cd']).transit.value_counts().reset_index()
df


Unnamed: 0,dest_cd,transit,count
0,강릉(경포호수광장),BUS,220
1,강릉(경포호수광장),EXPRESSBUS,58
2,강릉(경포호수광장),SUBWAY,46
3,강릉(경포호수광장),TRAIN,23
4,강릉(스피드 스케이트 경기장),BUS,91
5,강릉(스피드 스케이트 경기장),SUBWAY,35
6,강릉(스피드 스케이트 경기장),EXPRESSBUS,34
7,강릉(스피드 스케이트 경기장),TRAIN,14
8,대전,BUS,198
9,대전,TRAIN,135


### 거리 접근성 관련

In [None]:
df = transit[(transit.transitID == 99)&((transit.transit == 'totalDistance'))]
df.time.describe()
fig = px.box(df,y="time", title = '(전체) 이동거리 분포')
fig.update_yaxes(tickformat=',')

fig.show()

In [None]:
df.time.describe()

Unnamed: 0,time
count,467.0
mean,233046.691649
std,92572.788266
min,6202.0
25%,196838.0
50%,217913.0
75%,308473.0
max,369479.0


In [None]:
df2 = pd.qcut(df.time, q = 3).value_counts()
df2

Unnamed: 0_level_0,count
time,Unnamed: 1_level_1
"(6201.999, 208146.0]",163
"(303365.0, 369479.0]",154
"(208146.0, 303365.0]",150


In [None]:
df = transit[(transit.transitID == 99)&((transit.transit == 'totalDistance'))].groupby(['origin_cd', 'dest_cd']).time.mean().reset_index()

df = df.groupby(['origin_cd', 'dest_cd']).time.mean().reset_index()
df.time = round(df.time) / 1000
df_S = df[df.origin_cd == '서울']
df_B = df[df.origin_cd == '부산']

data_S = go.Bar(x = df_S.time, y = df_S.dest_cd, name = '서울', width = 0.3, orientation='h', marker_color = '#80b0d3')
data_B = go.Bar(x = df_B.time, y = df_B.dest_cd, name = '부산', width = 0.3, orientation='h', marker_color = '#fb8072')

layout = go.Layout(title = '<b> 평균 이동거리 비교 <b>'
                  , xaxis_title = '이동거리(단위: km)'
                  , yaxis_title = '목적지'
                  , font_size = 14)

fig = go.Figure(data = [data_S, data_B], layout = layout)
fig.update_yaxes(tickformat=',')
fig.show()

### 시간 접근성 관련

#### 소요시간 분포

In [None]:
import plotly.express as px
df = transit[transit.transitID != 99].groupby('routeID').time.sum().reset_index()
df.time = df.time / 60
fig = px.box(df,y="time", title = '이동시간 합계 분포 (단위: 분)')
fig.update_yaxes(tickformat=',')

fig.show()

In [None]:
df.describe()

Unnamed: 0,time
count,468.0
mean,239.110399
std,106.775193
min,25.7
25%,158.5
50%,216.575
75%,338.133333
max,565.6


In [None]:
df[df.time == 4.3]

Unnamed: 0,routeID,time


In [None]:
transit[transit.routeID == 'BI33']

Unnamed: 0,routeID,transitID,transit,description,time,origin_cd,dest_cd


#### 평균 소요시간 비교

In [None]:
df = transit[(transit.transitID == 99) & (transit.transit == 'totalTime')]
df = df.groupby(['origin_cd', 'dest_cd']).time.mean().reset_index()
df.time = round(df.time) /60
df_S = df[df.origin_cd == '서울']
df_B = df[df.origin_cd == '부산']

data_S = go.Bar(x = df_S.time, y = df_S.dest_cd, name = '서울', width = 0.3, orientation='h', marker_color = '#80b0d3')
data_B = go.Bar(x = df_B.time, y = df_B.dest_cd, name = '부산', width = 0.3, orientation='h', marker_color = '#fb8072')

layout = go.Layout(title = '<b> 평균 소요시간 비교 (단위: 분) <b>'
                  , xaxis_title = '소요시간(단위: 분)'
                  , yaxis_title = '목적지'
                  , font_size = 14)

fig = go.Figure(data = [data_S, data_B], layout = layout)
fig.update_yaxes(tickformat=',')
fig.show()

#### 축제별 이동시간

In [None]:
df = transit[transit.transitID != '99'].groupby(['dest_cd','origin_cd','routeID']).time.sum().reset_index()
df.time = df.time / 60

fig = px.box(df,x = 'dest_cd', y = "time",color = 'origin_cd', title = '축제별 이동시간 합계 분포 (단위: 분)')
fig.update_yaxes(tickformat=',')

fig.show()

#### 이동수단별 이동시간

In [None]:
df = transit[transit.transitID != 99].groupby(['origin_cd','transit', 'routeID']).time.sum().reset_index()
fig = px.box(df, x = 'transit', y = 'time', color = 'origin_cd', title = '이동수단별 평균 이동시간')
fig.update_yaxes(tickformat = ',')
fig.show()