In [1]:
import copy
import collections
import glob
import re

import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm

## Preprocess downloaded data

In [2]:
%%time
data_dir = "data_2mins"

df_list = list()
for i, file in tqdm(enumerate(glob.glob(f"{data_dir}/*.csv"))):
    try:
        df_sub = pd.read_csv(file)
        df_list.append(df_sub)
    except:
        print(f"Error reading {file}")
        pass
    
df = pd.concat(df_list)

0it [00:00, ?it/s]

Error reading data_2mins/20240510232222.csv
CPU times: user 1min 34s, sys: 12 s, total: 1min 46s
Wall time: 3min 45s


## Check Data
- sno(站點代號)
- sna(場站中文名稱)、snaen(場站名稱英文)
- sarea(場站區域)、sareaen(場站區域英文)
- mday(資料更新時間)
- ar(地點)、aren(地址英文)
- latitude(緯度)、longitude(經度)
- available_rent_bikes、available_return_bikes
- srcUpdateTime(YouBike2.0系統發布資料更新的時間)、updateTime(大數據平台經過處理後將資料存入DB的時間)
- infoTime(各場站來源資料更新時間)、infoDate(各場站來源資料更新時間)
- tot(場站總停車格)、sbi(場站目前車輛數量)
- lat(緯度)、lng(經度)
- bemp(空位數量)、act(全站禁用狀態)

In [3]:
df.head(1)

Unnamed: 0,sno,sna,sarea,mday,ar,sareaen,snaen,aren,act,srcUpdateTime,...,total,available_rent_bikes,latitude,longitude,available_return_bikes,tot,sbi,lat,lng,bemp
0,500101001,YouBike2.0_捷運科技大樓站,大安區,2024-05-04 00:52:13,復興南路二段235號前,Daan Dist.,YouBike2.0_MRT Technology Bldg. Sta.,No.235， Sec. 2， Fuxing S. Rd.,1,2024-05-04 03:15:23,...,28.0,0.0,25.02605,121.5436,28.0,,,,,


### Check NaN

In [4]:
df.isna().sum()
# df.info()

sno                              0
sna                              0
sarea                            0
mday                             0
ar                               0
sareaen                          0
snaen                            0
aren                             0
act                              0
srcUpdateTime                    0
updateTime                       0
infoTime                         0
infoDate                         0
total                       423900
available_rent_bikes        423900
latitude                    423900
longitude                   423900
available_return_bikes      423900
tot                       16058638
sbi                       16058638
lat                       16058638
lng                       16058638
bemp                      16058638
dtype: int64

### Time Range

In [5]:
df['srcUpdateTime'].min(), df['srcUpdateTime'].max()

('2024-05-03 07:56:26', '2024-05-20 12:49:24')

## Preprocess Data Types

In [6]:
# DateTime
df['srcUpdateTime'] = pd.to_datetime(df['srcUpdateTime'])
df['mday'] = pd.to_datetime(df['mday'])
df['infoTime'] = pd.to_datetime(df['infoTime'])
df['infoDate'] = pd.to_datetime(df['infoDate'])

In [7]:
df.dtypes

sno                                int64
sna                               object
sarea                             object
mday                      datetime64[ns]
ar                                object
sareaen                           object
snaen                             object
aren                              object
act                                int64
srcUpdateTime             datetime64[ns]
updateTime                        object
infoTime                  datetime64[ns]
infoDate                  datetime64[ns]
total                            float64
available_rent_bikes             float64
latitude                         float64
longitude                        float64
available_return_bikes           float64
tot                              float64
sbi                              float64
lat                              float64
lng                              float64
bemp                             float64
dtype: object

## Filter Data

### By sarea

In [8]:
# area: 大安區
mask = df["sarea"].isin(['大安區'])
df_area = df[mask]

### By date

In [9]:
# one day data
start_date = '2024-05-13'
end_date = pd.to_datetime(start_date) + pd.DateOffset(days=7)

df_area_day = df_area.loc[(df_area['srcUpdateTime'] >= start_date) & (df_area['srcUpdateTime'] < end_date)]

df_area_day['srcUpdateTime'].min(), df_area_day['srcUpdateTime'].max(), len(df_area_day)

(Timestamp('2024-05-13 00:00:31'), Timestamp('2024-05-19 23:59:23'), 900720)

In [10]:
len(df_area_day), len(df_area), len(df)

(900720, 2099520, 16482538)

## Write filtered data to file
- 大安區, 2024-05-13 ~ 2024-05-19

In [12]:
%%time
df_area_day_sorted = df_area_day.sort_values(by=["srcUpdateTime"])
filename = f"merged_{df_area_day['srcUpdateTime'].min()}_{df_area_day['srcUpdateTime'].max()}_2mins.csv"
filename = re.sub(":|-|\s", "", filename)

df_area_day_sorted.to_csv(filename)

print(f"Wrote {len(df_area_day_sorted)} data to {filename}.")

Wrote 900720 data to merged_20240513000031_20240519235923_2mins.csv.
CPU times: user 10.9 s, sys: 699 ms, total: 11.6 s
Wall time: 11.7 s
