In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, random_split, Dataset
import matplotlib.pyplot as plt
from datetime import datetime
import seaborn as sns
import numpy as np

## Import Data

In [2]:
def parse_time(x):
    '''
        Break a row's unix timestamp into calendar components.
        Input: row-like object whose "TIMESTAMP" entry is a unix time (seconds)
        Output: year, month, day, hour, weekday (Monday == 0)
        Note: the conversion uses the local timezone of the machine running it.
    '''
    fields = datetime.fromtimestamp(x["TIMESTAMP"]).timetuple()
    # tm_wday uses the same Monday == 0 convention as datetime.weekday()
    return fields.tm_year, fields.tm_mon, fields.tm_mday, fields.tm_hour, fields.tm_wday

def polyline_to_trip_duration(polyline):
    '''
        Convert a polyline string to a trip duration.
        Counts the "[" characters (one for the outer list plus one per point),
        so count - 2 is the number of gaps between consecutive points; each gap
        contributes 15 to the result. Empty or single-point polylines give 0.
    '''
    gaps = polyline.count("[") - 2
    if gaps < 0:
        return 0
    return gaps * 15

def visualize_data(Xs, ys, title="", colors=None):
    '''
        Scatter-plot several groups of points on one new 12x9 figure.
        Input:
            Xs, ys: parallel iterables; the i-th elements are the x and y values of group i
            title: optional figure title (skipped when empty)
            colors: optional sequence with one color per group. When omitted, a
                    notebook-level `colors` variable is used if one exists;
                    otherwise matplotlib picks the next color of its default
                    cycle for each group.
        Output: None (draws on the current pyplot figure)
    '''
    if colors is None:
        # Previously the function read a global `colors` unconditionally and
        # raised NameError when the notebook had not defined it.
        colors = globals().get("colors")

    plt.figure(figsize=(12,9))
    # Red reference lines through the origin
    plt.axhline(color="red")
    plt.axvline(color="red")
    for points_idx, (X, y) in enumerate(zip(Xs, ys)):
        # c=None lets matplotlib choose the color from its property cycle
        group_color = colors[points_idx] if colors is not None else None
        plt.scatter(X, y, s=10, c=group_color)
    if title:
        plt.title(title, fontsize=24)
    plt.xlabel("X", fontsize=18)
    plt.ylabel("Y", fontsize=18)
    
def getFirstCoord(polyline):
    '''
        Extract the first coordinate pair from a polyline string such as
        "[[lng,lat],[lng,lat],...]".
        Input: polyline string
        Output: (lng, lat) tuple of floats; (0.0, 0.0) when the polyline holds
                no point (e.g. "[]" or "")
    '''
    # Text before the first "]" is "[[lng,lat"; dropping two characters leaves "lng,lat"
    parts = polyline.split("]")[0][2:].split(",")
    if parts[0] == "":
        # Same tuple type as the normal path (this used to be a list)
        return 0.0, 0.0
    return float(parts[0]), float(parts[1])

def expandTaxiStand(x):
    '''
        Look up the stand referenced by a row in the global taxiStand_to_geo dict.
        Input: row-like object with an "ORIGIN_STAND" entry
        Output: the three values stored for that stand
                (assigned by callers to STAND_NAME, STAND_LAT, STAND_LNG)
    '''
    stand_id = x["ORIGIN_STAND"]
    name, lat, lng = taxiStand_to_geo[stand_id]
    return (name, lat, lng)

# Read data

In [3]:
# RAW data: both CSVs exactly as stored on disk, kept for inspection
df_train_raw, df_test_raw = (
    pd.read_csv(csv_path)
    for csv_path in ("dataset/train.csv", "dataset/test_public.csv")
)

#### How many training and test rows are there, and what are the original dimensions?

In [None]:
# Preview the first rows of the raw training table
df_train_raw.head()

In [None]:
# Preview the first rows of the raw test table
df_test_raw.head()

In [None]:
# List the distinct DAY_TYPE values present in the raw training data
df_train_raw["DAY_TYPE"].unique()

In [None]:
# (rows, columns) of the raw test table
df_test_raw.shape

In [None]:
#  Geo data
df_taxiStand = pd.read_csv("dataset/metaData_taxistandsID_name_GPSlocation.csv")
# convert the meta information to dict
# Key 0 is the placeholder for rows whose ORIGIN_STAND was missing and filled with 0
taxiStand_to_geo = {0:("None", 0, 0)}
for _, row in df_taxiStand.iterrows():
    # taxiStand_to_geo[id] = (stand name, lat, lng)
    # .iloc makes the positional access explicit: plain row[0] on a
    # label-indexed Series is deprecated in pandas.
    taxiStand_to_geo[row.iloc[0]] = (row.iloc[1], float(row.iloc[2]), float(row.iloc[3]))


def _prepare_trips(csv_path, with_duration):
    '''
        Load a trips CSV and derive the feature columns used in this notebook.
        Input:
            csv_path: path of the CSV file to read
            with_duration: when True, also add TIME_DURATION computed from POLYLINE
        Output: DataFrame with missing values filled with 0, YR/MON/DAY/HR/WK
                columns, one-hot CALL_TYPE columns, DAY_TYPE and TIMESTAMP
                removed, and STAND_NAME/STAND_LAT/STAND_LNG added
    '''
    df = pd.read_csv(csv_path)
    df = df.fillna(0)
    df[["YR", "MON", "DAY", "HR", "WK"]] = df[["TIMESTAMP"]].apply(parse_time, axis=1, result_type="expand")
    if with_duration:
        df["TIME_DURATION"] = df["POLYLINE"].apply(polyline_to_trip_duration)
    df = pd.get_dummies(df, columns = ['CALL_TYPE'])
    df = df.drop(['DAY_TYPE', 'TIMESTAMP'], axis=1)
    df[["STAND_NAME", "STAND_LAT", "STAND_LNG"]] = df[["ORIGIN_STAND"]].apply(expandTaxiStand, axis=1, result_type="expand")
    return df


# Read data and select some columns
# Not all columns are kept for now
# Only the training frame gets the TIME_DURATION column
df_train = _prepare_trips("dataset/train.csv", with_duration=True)
df_test = _prepare_trips("dataset/test_public.csv", with_duration=False)


In [None]:
# global dictionaries for mapping id to index
from collections import defaultdict

# Taxi ID: index = position of the id among the sorted unique training ids
taxiId = sorted(df_train["TAXI_ID"].unique())
taxiId_to_ix = defaultdict(lambda: -1, {taxi: ix for ix, taxi in enumerate(taxiId)})
ix_to_taxiId = dict(enumerate(taxiId))

# Use .get(..., -1) rather than indexing: indexing a defaultdict inserts every
# unseen id (e.g. ids that only occur in the test set) into the mapping.
df_train["TAXI_ID_ix"] = df_train["TAXI_ID"].apply(lambda x : taxiId_to_ix.get(x, -1))
df_test["TAXI_ID_ix"] = df_test["TAXI_ID"].apply(lambda x : taxiId_to_ix.get(x, -1))

# Call ID
# 0 is the fill value for a missing ORIGIN_CALL, not a real caller. Filter it
# out explicitly instead of slicing off the first sorted element, which would
# discard a real id whenever 0 is absent.
callId = sorted(call for call in df_train["ORIGIN_CALL"].unique() if call != 0)
callId_to_ix = defaultdict(lambda: -1, {call: ix for ix, call in enumerate(callId)})
ix_to_callId = dict(enumerate(callId))

df_train["CALL_ID_ix"] = df_train["ORIGIN_CALL"].apply(lambda x : callId_to_ix.get(x, -1))
df_test["CALL_ID_ix"] = df_test["ORIGIN_CALL"].apply(lambda x : callId_to_ix.get(x, -1))

# Stand ID is just fine

In [None]:
# Spot-check rows 35-39: original ids next to their index encodings
df_train.loc[:, ["ORIGIN_CALL", "TAXI_ID", "TAXI_ID_ix", "CALL_ID_ix"]].iloc[35:40]

# Analysis

## Visualize and Select outliers

In [None]:
# raw_data time distribution
# Box plot of TIME_DURATION over df_train, before any outlier filtering
sns.boxplot(data=df_train, x="TIME_DURATION")

In [None]:
# Histogram (with KDE overlay) of TIME_DURATION over df_train
sns.histplot(data=df_train, x='TIME_DURATION', kde=True)

In [None]:
# Summary statistics of the trip durations, used below to choose an outlier bound
mean = np.mean(df_train["TIME_DURATION"])
# Compute both quartiles in a single percentile call
first_quartile, third_quartile = np.percentile(df_train["TIME_DURATION"], [25, 75])
IQR = third_quartile - first_quartile
print("IQR = ", IQR)
print("third_quartile = ", third_quartile)


In [None]:
# Outlier bound: three IQRs above the third quartile
upper_bound = third_quartile + 3 * IQR
upper_bound

In [None]:
# Keep only trips shorter than 5000 (a fixed cut-off; upper_bound is not used here)
df_cleaned = df_train.loc[df_train["TIME_DURATION"].lt(5000)]
df_cleaned.shape

In [None]:
# Remove trips with an empty polyline and trips whose duration is 0.
# The filtered frame is assigned first and its shape shown separately;
# assigning the result of `.shape` would turn df_cleaned into a tuple and
# break every later cell that uses it.
has_points = df_cleaned["POLYLINE"] != "[]"
has_duration = df_cleaned["TIME_DURATION"] != 0
df_cleaned = df_cleaned[has_points & has_duration]
df_cleaned.shape

In [None]:
# Histogram (with KDE overlay) of TIME_DURATION over df_cleaned
sns.histplot(data=df_cleaned, x='TIME_DURATION', kde=True)

In [None]:
# Discard rows whose POLYLINE is the empty list literal "[]"
df_cleaned = df_cleaned.loc[df_cleaned["POLYLINE"].ne("[]")]

In [None]:
# Show any rows of df_cleaned whose TIME_DURATION is 0
df_cleaned[df_cleaned["TIME_DURATION"] == 0]

In [None]:
# Split the durations by the CALL_TYPE_B indicator and keep the pooled values too
data1 = df_cleaned.loc[df_cleaned["CALL_TYPE_B"] == 1, "TIME_DURATION"]
data2 = df_cleaned.loc[df_cleaned["CALL_TYPE_B"] == 0, "TIME_DURATION"]
combined_data = np.concatenate((data1, data2))

# One histogram per group, drawn in the same order on the same axes
for series, colour, tag in (
    (data1, 'red', 'From Stand'),
    (data2, 'green', "Not From Stand"),
    (combined_data, 'blue', 'Total'),
):
    sns.histplot(data=series, bins=30, kde=True, color=colour, label=tag)

plt.legend()
plt.show()

## Feature vs. Time

In [None]:
mean, std = df_cleaned["TIME_DURATION"].mean(), df_cleaned["TIME_DURATION"].std()
median = df_cleaned["TIME_DURATION"].median()
print(f"{mean=} {median=} {std=}")

# First n samples to analyze. Set to -1 to use all data
end = -1

outlier_threshold = 3

# -1 means "everything". Slicing with [:-1] would silently drop the last row,
# so the sentinel is handled explicitly.
df_used = df_cleaned if end == -1 else df_cleaned.iloc[:end]

# Because our y-values only take on multiples of 15, we want just enough buckets in a histogram
# such that each bucket counts one value's frequency (e.g. one bucket counts how many 15s trips,
# how many 30s trips, etc.)
# NOTE(review): the bucket count is derived from mean + 3 * std, but the rows are not
# trimmed to that bound in this cell, so buckets may span more than one 15 s step.
buckets = (int(mean + outlier_threshold * std) // 15)

# Rows analysed / rows available (this line used to reference an undefined `df_copy`)
print(f"Using: {len(df_used)}/{len(df_cleaned)}")

fig, axs = plt.subplots(nrows=2, ncols=4, figsize=(22,14))

# Placeholder values that stand for "no value" in these columns: missing stand
# ids were filled with 0, and call ids without an index were mapped to -1.
missing_marker = {"ORIGIN_STAND": 0, "CALL_ID_ix": -1}

# Now, we visualize some features that we think might be useful
for idx, v in enumerate(["YR", "MON", "DAY", "HR", "WK", "ORIGIN_STAND", "CALL_ID_ix", "TAXI_ID_ix"]):
    # idx // 4 = row, idx % 4 = column
    ax = axs[idx // 4, idx % 4]

    # Boolean filtering returns a new frame, so no per-feature copy is needed
    df_subset = df_used
    if v in missing_marker:
        df_subset = df_subset[df_subset[v] != missing_marker[v]]

    # 2D histogram: feature value on x, trip duration on y
    ax.hist2d(df_subset[v], df_subset["TIME_DURATION"], cmap="CMRmap", bins=(120,buckets))

    # Some stylistic things to make the graphs look nice
    ax.set_xlim(ax.get_xlim()[0], ax.get_xlim()[1] + 1)
    ax.set_facecolor("black")
    ax.set_ylabel("seconds", fontsize=18)
    ax.set_title(f"Feature: {v}", fontsize=20)


## Heatmap

In [None]:
df_taxiStand = pd.read_csv("dataset/metaData_taxistandsID_name_GPSlocation.csv")

# convert the meta information to dict
# Key 0 is the placeholder entry used for rows without a stand
taxiStand_to_geo = {0:("None", 0, 0)}
for _, row in df_taxiStand.iterrows():
    # taxiStand_to_geo[id] = (stand name, lat, lng)
    # .iloc makes the positional access explicit: plain row[0] on a
    # label-indexed Series is deprecated in pandas.
    taxiStand_to_geo[row.iloc[0]] = (row.iloc[1], float(row.iloc[2]), float(row.iloc[3]))

# Attach the stand name and coordinates to every training row
df_train[["STAND_NAME", "STAND_LAT", "STAND_LNG"]] = \
    df_train[["ORIGIN_STAND"]].apply(expandTaxiStand, axis=1, result_type="expand")

In [None]:
# Heatmap for start position of each trip
import folium
from folium.plugins import HeatMap

# getFirstCoord is already defined in the helper-function cell near the top of
# the notebook; it is not redefined here so that only one definition exists.

lngs = []
lats = []
for p in df_train["POLYLINE"]:
    lng, lat = getFirstCoord(p)
    # (0, 0) is getFirstCoord's placeholder for an empty polyline, not a real
    # start position. Skip it so it neither shows up on the heat map nor
    # becomes the map centre.
    if lng == 0 and lat == 0:
        continue
    lngs.append(lng)
    lats.append(lat)

# Centre the map on the first valid start position
map_obj = folium.Map(location=[lats[0], lngs[0]], zoom_start=10)
heat_data = list(zip(lats, lngs))

# Color ramp passed to the heat map layer
gradient = {
    0.2: 'blue',
    0.4: 'cyan',
    0.6: 'lime',
    0.8: 'yellow',
    1.0: 'red'
}

HeatMap(heat_data, gradient=gradient).add_to(map_obj)
# Saved before the stand markers are added, so this file holds the heat layer only
map_obj.save('heatmap_raw.html')

# One marker per taxi stand, skipping the "None" placeholder entry
for key, (name, lat, lon) in taxiStand_to_geo.items():
    if name == "None": continue
    folium.Marker([lat, lon], popup=str(key)+":"+name).add_to(map_obj)

map_obj.save('heatmapWithStand.html')
display(map_obj)

In [None]:
import folium

# Vertices of the polyline, as [lat, lng] pairs
points = [[51.5074, -0.1278], [51.5085, -0.1225], [51.505, -0.1234], [51.506, -0.129]]

# Map object centred on a fixed location
m = folium.Map(location=[51.5074, -0.1278], zoom_start=12)

# Build the polyline and attach it to the map
line = folium.PolyLine(locations=points, color='red', weight=2)
line.add_to(m)

# Display the map
m
