# Environment

In [None]:
# Colab-only: mount Google Drive so the project files referenced below
# (all under /content/drive/MyDrive/...) are readable and writable.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
import json
import zipfile
import pandas as pd
from datetime import timedelta

In [None]:
# Station display name -> station number (matched against the `sno` column).
# Insertion order is significant: the feature cell scores stations by their
# position in this dict, so the first entry receives the highest popularity
# score and every station not listed here gets 0.
popularity_data = {
    '捷運公館站(2號出口)': 500101022,
    '捷運公館站(3號出口)': 500101181,
    '捷運科技大樓站': 500101001,
    '捷運芝山站(2號出口)_1': 500104108,
    '捷運中山國中站': 500107024,
    '捷運永春站(2號出口)': 500112070,
    '捷運圓山站(1號出口)': 500103009,
    '捷運六張犁站': 500101101,
    '捷運台北101/世貿站(2號出口)': 500112076,
    '捷運行天宮站(3號出口)': 500107036
}

# ISO date string -> rainfall amount for that day, one entry per calendar day
# from 2024-04-25 through 2024-05-25. Looked up per row by date in the feature
# cell; days missing from this dict end up as NaN in the `rainfall` column.
# NOTE(review): presumably daily totals in millimetres — confirm source and units.
rainfall_data = {
    '2024-04-25': 10.5,
    '2024-04-26': 29.5,
    '2024-04-27': 0.5,
    '2024-04-28': 1.0,
    '2024-04-29': 0.0,
    '2024-04-30': 4.5,
    '2024-05-01': 7.5,
    '2024-05-02': 9.5,
    '2024-05-03': 0.0,
    '2024-05-04': 0.1,
    '2024-05-05': 0.1,
    '2024-05-06': 0.0,
    '2024-05-07': 0.0,
    '2024-05-08': 0.0,
    '2024-05-09': 0.0,
    '2024-05-10': 0.0,
    '2024-05-11': 0.0,
    '2024-05-12': 16.0,
    '2024-05-13': 14.0,
    '2024-05-14': 0.5,
    '2024-05-15': 0.0,
    '2024-05-16': 0.0,
    '2024-05-17': 0.0,
    '2024-05-18': 0.0,
    '2024-05-19': 0.0,
    '2024-05-20': 0.0,
    '2024-05-21': 0.0,
    '2024-05-22': 0.0,
    '2024-05-23': 0.0,
    '2024-05-24': 0.0,
    '2024-05-25': 0.1
}

# Inclusive date window (ISO strings) used to build the weekday lookup.
START = "2024-04-25"
END = "2024-05-19"

# Snapshot spacing in minutes, and the resulting number of snapshots per hour.
INTERVAL = 2
DURATION = 60 // INTERVAL

# Optional station filter: set to a station number (e.g. 500101001) to process
# a single station only; the empty string keeps every station. Despite the
# name, this value is compared against the `sno` column, not a date.
# TMPDATE= 500101001
TMPDATE = ""
# Sampling fraction. The sampling step itself is currently commented out, so
# this only shows up in the name of the saved feature file.
RATE = 1

# Single source of truth for the Drive project folder, so relocating the
# project means editing one line instead of every path below.
PROJECT_DIR = "/content/drive/MyDrive/AITermProject"

# Weekly raw snapshot archives (one zipped CSV per week).
data_name_0513_0519 = "data_0513_0519_2mins.zip"
data_name_0506_0512 = "data_0506_0512_2mins.zip"

# Base name of the generated feature file; encodes the station filter and rate.
feature_name = f"feature_{TMPDATE}_{RATE}"

# Inputs.
data_path_0506_0512 = f"{PROJECT_DIR}/{data_name_0506_0512}"
data_path_0513_0519 = f"{PROJECT_DIR}/{data_name_0513_0519}"
see_rate_path = f"{PROJECT_DIR}/see_rate.csv"
mrt_distance_path = f"{PROJECT_DIR}/mrt_ubike_shortest_dist.csv"

# Outputs.
feature_save_path = f"{PROJECT_DIR}/{feature_name}.csv"
day_seq_path = f"{PROJECT_DIR}/day_{TMPDATE}.txt"

# Data

In [None]:
def data_reading(data_path):
  """Load the CSV stored inside a zip archive into a DataFrame.

  The archive listing is printed first (via ``ZipFile.printdir``). Directory
  entries and macOS ``__MACOSX/`` sidecar files are ignored when choosing the
  member to read; the first ``.csv`` member is preferred, and if no member has
  that extension the first remaining file is parsed as CSV.

  Args:
    data_path: Path to the zip archive.

  Returns:
    pandas.DataFrame parsed from the selected archive member.

  Raises:
    ValueError: If the archive contains no file that could be read.
  """
  with zipfile.ZipFile(data_path, 'r') as z:
    z.printdir()
    # Blindly taking namelist()[0] breaks when the first entry is a folder or
    # an OS metadata file, so filter those out before picking a member.
    members = [
        info.filename for info in z.infolist()
        if not info.is_dir() and not info.filename.startswith('__MACOSX/')
    ]
    if not members:
      raise ValueError(f"No readable file found in archive: {data_path}")
    csv_members = [name for name in members if name.lower().endswith('.csv')]
    csv_filename = csv_members[0] if csv_members else members[0]
    with z.open(csv_filename) as f:
      return pd.read_csv(f)

In [None]:
# Read both weekly archives and stack them into one frame with a fresh index.
df_0506_0512 = data_reading(data_path_0506_0512)
df_0513_0519 = data_reading(data_path_0513_0519)
df = pd.concat([df_0506_0512, df_0513_0519], axis=0, ignore_index=True)

# Optional single-station run: keep only rows of the configured station number
# and save that raw subset next to the other project files.
if TMPDATE:
  df = df[df['sno'] == TMPDATE]
  df.to_csv(f"/content/drive/MyDrive/AITermProject/{TMPDATE}.csv")

# keep columns in need
columns_to_keep = ['sno', 'updateTime', 'act', 'total', 'available_rent_bikes', 'latitude', 'longitude']
# columns_to_keep = ['sno', 'updateTime', 'act', 'tot', 'sbi', 'lat', 'lng']
df = df[columns_to_keep]

# Rename columns to the short names used by the rest of the notebook
columns_to_rename = {
    'total': 'tot',
    'available_rent_bikes': 'sbi',
    'latitude': 'lat',
    'longitude': 'lng'
    }
df = df.rename(columns=columns_to_rename)

# Count total rows
total_rows = len(df)
print(f"Total rows: {total_rows}")
# Count rows that contain at least one NaN. Summing isna() over the whole frame
# would count NaN *cells*, which overstates the number of rows dropna() removes.
nan_rows_count = df.isna().any(axis=1).sum()
print(f"Rows with NaN values: {nan_rows_count}")
df = df.dropna()
print(df.head())

In [None]:
# @title sequence time data

# data features
# Parse the timestamp column once; all time-derived features read from it.
df['updateTime'] = pd.to_datetime(df['updateTime'])
# Calendar day as datetime.date objects; used as the key for per-day lookups.
df['date'] = df['updateTime'].dt.date
# Month and day packed into one integer (e.g. May 6 -> 506). Computed with the
# vectorized .dt accessors rather than a per-row strftime/int round trip, which
# is very slow on a frame of this size; cast keeps the int64 dtype.
df['date_value'] = (df['updateTime'].dt.month * 100 + df['updateTime'].dt.day).astype('int64')

# # data position in time sequence
# df['position'] = df['updateTime'].dt.hour * (60//INTERVAL) + df['updateTime'].dt.minute // INTERVAL
# print(df.head())

# day_point = {}
# for i, row in df.iterrows():
#   key = f"{row['sno']}_{row['date'].strftime('%Y-%m-%d')}"
#   if key not in day_point:
#     day_point[key] = [-1] * (24 * DURATION)
#   position = row['position']
#   day_point[key][position] = row['sbi']
# print('length of key list:', len(day_point))

# # Save day_point as a text file
# with open(os.path.join(day_seq_path), 'w') as file:
#     json.dump(day_point, file)
# print("day_point has been saved.")

In [None]:
# @title mask
def generate_padding_mask(sequence, pad_value=-1):
    """Build a mask flagging the padded slots of a sequence.

    Args:
        sequence: Iterable of values to inspect.
        pad_value: Sentinel that marks a padded position (defaults to -1).

    Returns:
        A list with one entry per element of ``sequence`` holding the result
        of comparing that element with ``pad_value`` (True where it is padding).
    """
    mask = []
    for item in sequence:
        mask.append(item == pad_value)
    return mask

In [None]:
# # @title get one-hour-before and day-before history data as features

# df = df.sample(frac=RATE, random_state=42)

# df['sbi_onehour'] = None
# df['sbi_history'] = None
# df['sbi_prediction'] = None
# # mask for padded position with 1
# df['sbi_onehour_mask'] = None
# df['sbi_history_mask'] = None
# df['sbi_prediction_mask'] = None

# for i, row in df.iterrows():
#   current_date = row['date']
#   pred_position = row['position'] + 1
#   sno = row['sno']
#   key = f"{sno}_{current_date.strftime('%Y-%m-%d')}"

#   # to get the prediction data
#   if pred_position + DURATION < 24 * DURATION :
#     sbi_prediction = day_point[key][pred_position:pred_position+DURATION]
#   else:
#     sbi_prediction = day_point[key][pred_position:]
#     remaining_points = DURATION - len(sbi_prediction)
#     next_day_key = f"{sno}_{(current_date + pd.Timedelta(days=1)).strftime('%Y-%m-%d')}"
#     if next_day_key in day_point and remaining_points > 0:
#       sbi_prediction += day_point[next_day_key][:remaining_points]
#     else:
#       sbi_prediction += [-1] * remaining_points

#   # to get the previous one hour data
#   if pred_position >= DURATION:
#     sbi_onehour = day_point[key][pred_position-DURATION:pred_position]
#   else:
#     prev_day_key = f"{sno}_{(current_date - pd.Timedelta(days=1)).strftime('%Y-%m-%d')}"
#     if prev_day_key in day_point:
#       sbi_onehour = day_point[prev_day_key][-(DURATION-pred_position):] + day_point[key][:pred_position]
#     else:
#       sbi_onehour = [-1] * (DURATION - pred_position) + day_point[key][:pred_position]

#   # to get the history data
#   sbi_history_list = []
#   sbi_history_mask = []
#   previous_days = [current_date - timedelta(days=i) for i in range(1, 6)] # last 5 days
#   for previous_day in previous_days:
#     history_key = f"{sno}_{previous_day.strftime('%Y-%m-%d')}"
#     if history_key in day_point:
#       start_pos = max(0, pred_position - DURATION)
#       end_pos = min(24 * DURATION, pred_position + DURATION)
#       sbi_history = day_point[history_key][start_pos:end_pos]
#       next_day_key = f"{sno}_{(previous_day + timedelta(days=1)).strftime('%Y-%m-%d')}"
#       prev_day_key = f"{sno}_{(previous_day - timedelta(days=1)).strftime('%Y-%m-%d')}"
#       if len(sbi_history) < 2 * DURATION:
#         if pred_position - DURATION < 0:
#           gap = DURATION - pred_position
#           if prev_day_key in day_point:
#             sbi_history = day_point[prev_day_key][-gap:] + sbi_history
#           else:
#             sbi_history = [-1] * (gap) + sbi_history
#         if pred_position + DURATION > 24 * DURATION:
#           gap = pred_position + DURATION - 24 * DURATION
#           if next_day_key in day_point:
#             sbi_history += day_point[next_day_key][:gap]
#           else:
#             sbi_history += [-1] * (gap)
#       sbi_history_list.append(sbi_history)
#       sbi_history_mask.append(generate_padding_mask(sbi_history))

#   # store sequence in dataframe
#   df.at[i, 'sbi_prediction'] = sbi_prediction
#   df.at[i, 'sbi_onehour'] = sbi_onehour
#   df.at[i, 'sbi_history'] = sbi_history_list
#   df.at[i, 'sbi_prediction_mask'] = generate_padding_mask(sbi_prediction)
#   df.at[i, 'sbi_onehour_mask'] = generate_padding_mask(sbi_onehour)
#   df.at[i, 'sbi_history_mask'] = sbi_history_mask

# Preview the frame. The sequence-feature extraction in this cell is commented
# out, so no sbi_* sequence or mask columns have been added at this point.
print(df.head())

In [None]:
# @title get simple features

# time feature: clock time packed as HHMM (e.g. 08:30 -> 830)
df['time'] = df['updateTime'].dt.hour * 100 + df['updateTime'].dt.minute

# weekday flag: 1 for Monday-Friday, 0 for Saturday/Sunday. Days outside
# START..END are not in the lookup and come out as NaN.
# The lookup is keyed by datetime.date so it matches the objects stored in
# df['date'] exactly, instead of depending on implicit date <-> Timestamp
# coercion during the index lookup.
date_range = pd.date_range(START, END)
date_to_week = {day.date(): 1 if day.weekday() < 5 else 0 for day in date_range}
df['week'] = df['date'].map(date_to_week)

# popularity: stations listed in popularity_data are scored by position
# (first entry = highest score, last entry = 1); all other stations get 0.
# The top score is derived from the dict size rather than a hard-coded 10, so
# adding or removing a station keeps every listed station above 0.
station_count = len(popularity_data)
sno_to_popularity = {sno: station_count - i for i, sno in enumerate(popularity_data.values())}
df['popularity'] = df['sno'].map(sno_to_popularity).fillna(0).astype(int)

# rainfall: per-day value looked up by calendar date (keyed by datetime.date
# for the same reason as the weekday lookup); unknown days stay NaN.
date_to_rainfall = {pd.to_datetime(date).date(): rainfall for date, rainfall in rainfall_data.items()}
df['rainfall'] = df['date'].map(date_to_rainfall)

# see rate: per-station value joined on sno; stations without one get -1.
# NOTE(review): a left merge duplicates rows if the CSV has repeated sno
# values — confirm sno is unique in see_rate.csv.
see_rate = pd.read_csv(see_rate_path)
df = df.merge(see_rate[['sno', 'value']], on='sno', how='left')
df = df.rename(columns={'value': 'see_rate_value'})
# Assign the filled column back: calling fillna(inplace=True) on df[col] is a
# chained in-place update, which warns on recent pandas and silently does
# nothing once Copy-on-Write is enabled.
df['see_rate_value'] = df['see_rate_value'].fillna(-1)

# distance data: per-station MRT distance joined on sno; missing -> -1.
# NOTE(review): same uniqueness assumption on sno as above — confirm.
distances = pd.read_csv(mrt_distance_path)
df = df.merge(distances[['sno', 'mrt_distances']], on='sno', how='left')
df['mrt_distances'] = df['mrt_distances'].fillna(-1)

print(df.head())
df.to_csv(feature_save_path, index=False)