In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, random_split, Dataset
import matplotlib.pyplot as plt
from datetime import datetime
from tqdm import tqdm

# Data Preprocessing

In [2]:
def parse_time(x):
    '''
        Break a unix timestamp into its calendar components.
        Input: x - row / mapping exposing the unix time under the "TIMESTAMP" key
        Output: (year, month, day, hour, minute, second, weekday) where weekday
                follows datetime.weekday() (Monday == 0)
        Note: datetime.fromtimestamp interprets the value in the machine's
              local time zone.
    '''
    moment = datetime.fromtimestamp(x["TIMESTAMP"])
    calendar_fields = ("year", "month", "day", "hour", "minute", "second")
    return tuple(getattr(moment, field) for field in calendar_fields) + (moment.weekday(),)

def polyline_to_trip_duration(polyline):
    '''
        Estimate a trip duration from the textual polyline.

        Every "[" beyond the outer one opens one coordinate pair, so
        count("[") - 2 is the number of intervals between consecutive points;
        each interval is weighted by 15. Polylines with fewer than two points
        yield 0.
    '''
    bracket_count = polyline.count("[")
    interval_count = bracket_count - 2
    if interval_count < 0:
        interval_count = 0
    return interval_count * 15

def visualize_data(Xs, ys, title="", colors=None):
    '''
        Scatter-plot several point sets on one figure with red axis lines.
        Input: Xs, ys - parallel iterables; the i-th entries are the x and y
                        coordinates of the i-th point set
               title  - optional figure title (skipped when empty)
               colors - optional sequence indexed by point-set position; when
                        omitted, a module-level `colors` is used if one exists,
                        otherwise matplotlib picks the colors itself
    '''
    # No visible cell defines a module-level `colors`, so the lookup must not
    # be unconditional (it would raise NameError on a fresh kernel).
    palette = colors if colors is not None else globals().get("colors")

    plt.figure(figsize=(12,9))
    plt.axhline(color="red")
    plt.axvline(color="red")
    for points_idx, (X, y) in enumerate(zip(Xs, ys)):
        if palette is None:
            plt.scatter(X, y, s=10)
        else:
            plt.scatter(X, y, s=10, c=palette[points_idx])
    if title:
        plt.title(title, fontsize=24)
    plt.xlabel("X", fontsize=18)
    plt.ylabel("Y", fontsize=18)
    
def expandTaxiStand(x):
    '''
        Look up the taxi stand referenced by a row.
        Input: x - row / mapping exposing the stand id under "ORIGIN_STAND"
        Output: (stand name, latitude, longitude) as stored in taxiStand_to_geo
    '''
    stand_id = x["ORIGIN_STAND"]
    geo_entry = taxiStand_to_geo[stand_id]
    # Unpack explicitly so a malformed entry (not exactly 3 items) still fails loudly.
    stand_name, stand_lat, stand_lng = geo_entry
    return (stand_name, stand_lat, stand_lng)

def unstandardize_data(standardized_data, original_mean, original_std):
    '''
        Undo a z-score transform element by element.
        Input: standardized_data - iterable of standardized values
               original_mean, original_std - statistics used for standardizing
        Output: list with value * original_std + original_mean for every element
    '''
    original_data = []
    for value in standardized_data:
        restored = (value * original_std) + original_mean
        original_data.append(restored)
    return original_data

In [3]:
#  Geo data
df_taxiStand = pd.read_csv("dataset/metaData_taxistandsID_name_GPSlocation.csv")
# convert the meta information to dict
# Key 0 is the placeholder stand used for trips whose ORIGIN_STAND was NaN
# (the trip frames are loaded with fillna(0)).
taxiStand_to_geo = {0:("None", 0, 0)}
# itertuples yields plain positional tuples, which avoids integer indexing on
# a label-indexed Series (row[0] on an iterrows() row is deprecated in pandas).
for stand in df_taxiStand.itertuples(index=False, name=None):
    # taxiStand_to_geo[id] = (stand name, lat, lng)
    taxiStand_to_geo[stand[0]] = (stand[1], float(stand[2]), float(stand[3]))
    
# Read data and select some columns
# We currently select not all columns

def _load_trips(csv_path, with_duration):
    '''
        Read one trip CSV and derive the columns used later in the notebook.
        Input: csv_path      - path of the CSV file to read
               with_duration - when True, also derive TIME_DURATION from POLYLINE
        Output: DataFrame with NaNs replaced by 0, calendar columns
                (YR, MON, DAY, HR, MIN, SEC, WK), one-hot CALL_TYPE_* columns,
                the taxi-stand columns (STAND_NAME, STAND_LAT, STAND_LNG) and
                without DAY_TYPE / TIMESTAMP
    '''
    trips = pd.read_csv(csv_path)
    trips = trips.fillna(0)
    trips[["YR", "MON", "DAY", "HR","MIN","SEC", "WK"]] = trips[["TIMESTAMP"]].apply(parse_time, axis=1, result_type="expand")
    if with_duration:
        trips["TIME_DURATION"] = trips["POLYLINE"].apply(polyline_to_trip_duration)
    trips = pd.get_dummies(trips, columns = ['CALL_TYPE'])
    # A file that happens to lack one call type would otherwise miss that
    # dummy column and break the feature selection further down.
    for call_type_column in ("CALL_TYPE_A", "CALL_TYPE_B", "CALL_TYPE_C"):
        if call_type_column not in trips.columns:
            trips[call_type_column] = 0
    trips = trips.drop(['DAY_TYPE', 'TIMESTAMP'], axis=1)
    trips[["STAND_NAME", "STAND_LAT", "STAND_LNG"]] = trips[["ORIGIN_STAND"]].apply(expandTaxiStand, axis=1, result_type="expand")
    return trips


# Only the training file gets a TIME_DURATION label derived from its polyline.
df_train = _load_trips("dataset/train.csv", with_duration=True)
df_test = _load_trips("dataset/test_public.csv", with_duration=False)


In [4]:
##
# global dictionaries for mapping id to index
from collections import defaultdict

# Taxi ID: only taxis with more than 2000 training trips get their own index;
# every other taxi resolves to index 0 through the defaultdict.
TAXI_ID_Frequency = {
    taxi: trip_count
    for taxi, trip_count in df_train["TAXI_ID"].value_counts().to_dict().items()
    if trip_count > 2000
}
taxiId = sorted(TAXI_ID_Frequency)
ix_to_taxiId = dict(enumerate(taxiId, start=1))
taxiId_to_ix = defaultdict(lambda: 0, {taxi: ix for ix, taxi in ix_to_taxiId.items()})

for frame in (df_train, df_test):
    frame["TAXI_ID_ix"] = frame["TAXI_ID"].apply(taxiId_to_ix.__getitem__)

# Call ID: the smallest value is dropped (0, the NaN filler) so it maps to index 0.
callId = sorted(df_train["ORIGIN_CALL"].unique())[1:] # remove 0 in the first
ix_to_callId = dict(enumerate(callId, start=1))
callId_to_ix = defaultdict(lambda: 0, {call: ix for ix, call in ix_to_callId.items()})

for frame in (df_train, df_test):
    frame["CALL_ID_ix"] = frame["ORIGIN_CALL"].apply(callId_to_ix.__getitem__)

In [5]:
# Shuffle all training rows reproducibly, then hold out the first
# `train_valid_cutoff` shuffled rows as the validation set.
train_valid_cutoff = 10000
df_train_sub = df_train.sample(frac=1.0, random_state = 23)

df_valid = df_train_sub.iloc[:train_valid_cutoff].copy()
df_train = df_train_sub.iloc[train_valid_cutoff:]

In [6]:
# Quartile statistics of the training durations (kept for inspection; the
# filter below uses the fixed `upper_bound` instead of an IQR rule).
first_quartile = np.percentile(df_train["TIME_DURATION"], 25)
third_quartile = np.percentile(df_train["TIME_DURATION"], 75)
time_IQR = third_quartile - first_quartile
upper_bound = 5000

# Keep a training row only when its duration is below the bound, it is not
# flagged as missing data and its polyline is not empty.
keep_rows = (
    (df_train["TIME_DURATION"] < upper_bound)
    & (df_train["MISSING_DATA"] != True)
    & (df_train["POLYLINE"] != '[]')
)
# .copy() detaches the result from the shuffled parent frame so the column
# assignments in later cells do not raise SettingWithCopyWarning.
df_train = df_train[keep_rows].copy()

In [7]:
# Rebuild the id -> index vocabularies from the training frame as it stands
# after the split / outlier filtering, and re-encode train, test and
# validation frames with them. This replaces the mappings built earlier.


##
# global dictionaries for mapping id to index
from collections import defaultdict

encoded_frames = (df_train, df_test, df_valid)

# Taxi ID: every taxi present in the training frame gets an index >= 1;
# unseen taxis resolve to 0 through the defaultdict.
taxiId = sorted(df_train["TAXI_ID"].unique())
ix_to_taxiId = dict(enumerate(taxiId, start=1))
taxiId_to_ix = defaultdict(lambda: 0, {taxi: ix for ix, taxi in ix_to_taxiId.items()})

for frame in encoded_frames:
    frame["TAXI_ID_ix"] = frame["TAXI_ID"].apply(taxiId_to_ix.__getitem__)

# Call ID: the smallest value is dropped (0, the NaN filler) so it maps to index 0.
callId = sorted(df_train["ORIGIN_CALL"].unique())[1:] # remove 0 in the first
ix_to_callId = dict(enumerate(callId, start=1))
callId_to_ix = defaultdict(lambda: 0, {call: ix for ix, call in ix_to_callId.items()})

for frame in encoded_frames:
    frame["CALL_ID_ix"] = frame["ORIGIN_CALL"].apply(callId_to_ix.__getitem__)

In [8]:
# df_train.to_csv('dataframe/df_train.csv', index=False)
# df_test.to_csv('dataframe/df_test.csv', index=False)
# df_valid.to_csv('dataframe/df_valid.csv', index=False)


In [14]:
len(df_train)

1689787

# First Model

In [9]:
# Dataset
class BaseDataset(Dataset):
    """Map-style dataset over a feature DataFrame.

    With ``test == False`` every column but the last is an input and the last
    column is the label. Otherwise all columns are inputs; ``labels`` is still
    taken from the last column so ``__getitem__`` keeps returning a pair, but
    it is only a placeholder in that mode.

    The frame is converted to NumPy once at construction time, so fetching a
    sample does not pay for a pandas row lookup on every call.
    """

    def __init__(self, data, test = False):
        if test == False:
            self.inputs = data.iloc[:, :-1]
        else:
            self.inputs = data.iloc[:, :]
        self.labels = data.iloc[:, -1]

        # One-off conversion; these arrays are what __getitem__ reads.
        self._input_rows = self.inputs.to_numpy()
        self._label_values = self.labels.to_numpy()

    def __len__(self):
        return self.inputs.shape[0]

    def __getitem__(self, index, test = False):
        """Return ``(input_tensor, label_tensor)`` for position ``index``.

        ``test`` is unused and kept only for signature compatibility.
        The label tensor gets a leading dimension of size 1.
        """
        # .tolist() hands torch plain Python scalars, so dtype inference is
        # the same as when building the tensor from a pandas row.
        input_tensor = torch.tensor(self._input_rows[index].tolist())
        label_tensor = torch.tensor(self._label_values[index].tolist()).unsqueeze(dim=0)

        return input_tensor, label_tensor
    
# split
def train_val_split(train_dataset, portion = 0.8):
    """Randomly split a dataset into train and validation subsets.

    ``portion`` is the fraction of samples (rounded down) assigned to the
    training subset; the remainder goes to validation. The split is drawn
    from a generator seeded with 42, so repeated calls give the same split.
    Returns ``(train_subset, val_subset)``.
    """
    total = len(train_dataset)
    train_size = int(total * portion)
    lengths = [train_size, total - train_size]
    seeded_generator = torch.Generator().manual_seed(42)
    subsets = random_split(train_dataset, lengths, generator=seeded_generator)
    return subsets[0], subsets[1]


In [10]:
# Standardization statistics are taken from the training frame only and are
# reused unchanged for the validation and test frames.
time_mean = df_train['TIME_DURATION'].mean()
time_std = df_train['TIME_DURATION'].std()
lat_mean = df_train['STAND_LAT'].mean()
lat_std = df_train['STAND_LAT'].std()
lng_mean = df_train['STAND_LNG'].mean()
lng_std = df_train['STAND_LNG'].std()

# Standardize time duration
df_train['TIME_DURATION_STD'] = (df_train['TIME_DURATION'] - time_mean) / time_std

# Standardize lat/lng of the origin stand
df_train['LAT_STD'] = (df_train['STAND_LAT'] - lat_mean) / lat_std
df_train['LNG_STD'] = (df_train['STAND_LNG'] - lng_mean) / lng_std

# Standardize test
# The test frame has no TIME_DURATION column, so no TIME_DURATION_STD is
# derived for it (assigning the training series here would only copy
# training labels into the test frame by index alignment).
df_test['LAT_STD'] = (df_test['STAND_LAT'] - lat_mean) / lat_std
df_test['LNG_STD'] = (df_test['STAND_LNG'] - lng_mean) / lng_std

# Standardize valid
df_valid['TIME_DURATION_STD'] = (df_valid['TIME_DURATION'] - time_mean) / time_std
df_valid['LAT_STD'] = (df_valid['STAND_LAT'] - lat_mean) / lat_std
df_valid['LNG_STD'] = (df_valid['STAND_LNG'] - lng_mean) / lng_std

In [11]:
# Model inputs, in the column order the networks index into.
feature_columns = ['MON','HR', 'WK', 'ORIGIN_STAND',"MIN",'SEC', 'CALL_TYPE_A', 'CALL_TYPE_B', 
                   'CALL_TYPE_C', 'LAT_STD', 'LNG_STD', 'TAXI_ID_ix', 'CALL_ID_ix']
# The standardized duration goes last because BaseDataset treats the final
# column as the label.
labelled_columns = feature_columns + ['TIME_DURATION_STD']

# Train: features + label, with the stand id stored as an integer column.
df_features = df_train[labelled_columns].copy()
df_features["ORIGIN_STAND"] = df_features['ORIGIN_STAND'].astype(int)

# Validation: features + label. Test: features only.
df_features_valid = df_valid[labelled_columns].copy()
df_features_test = df_test[feature_columns].copy()

test_dataset = BaseDataset(df_features_test, test = True)

base_train_subset = BaseDataset(df_features, test = False)
base_val_subset = BaseDataset(df_features_valid, test = False)

# del df_features
# del df_features_test
# del time_duration

In [13]:
df_features

Unnamed: 0,MON,HR,WK,ORIGIN_STAND,MIN,SEC,CALL_TYPE_A,CALL_TYPE_B,CALL_TYPE_C,LAT_STD,LNG_STD,TAXI_ID_ix,CALL_ID_ix,TIME_DURATION_STD
1410946,5,23,5,23,44,29,0,1,0,1.055051,-1.055215,427,0,2.574508
750701,12,7,0,9,52,51,0,1,0,1.054965,-1.053621,344,0,-0.801193
135975,7,19,0,0,8,46,1,0,0,-0.947396,0.947391,402,2,-0.012478
1107893,2,19,2,47,51,27,0,1,0,1.056404,-1.064962,217,0,0.839335
381465,9,18,0,25,21,42,0,1,0,1.055065,-1.056392,107,0,0.271460
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
727071,12,12,2,11,13,6,0,1,0,1.056074,-1.055376,305,0,-0.264866
1387446,4,10,1,21,35,22,0,1,0,1.055775,-1.059165,389,0,-0.643450
795688,12,19,2,0,35,20,1,0,0,-0.947396,0.947391,43,16470,1.880438
652006,11,8,6,0,38,34,1,0,0,-0.947396,0.947391,44,26707,0.145265


In [13]:
# Mini-batch loaders: only the training loader shuffles.
batch_size = 128
base_trainloader = DataLoader(
    base_train_subset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=3,
)
base_valloader = DataLoader(
    base_val_subset,
    batch_size=batch_size,
    num_workers=3,
)

# The test loader yields one trip per batch, in file order.
base_testloader = DataLoader(
    test_dataset,
    batch_size = 1,
    shuffle = False,
    num_workers = 3,
)

In [61]:
def train(model, optimizer, name,loss_fn = torch.nn.MSELoss(), early_stop = False, epoch = 3):
    """Train ``model`` on ``base_trainloader`` and validate on ``base_valloader``.

    Parameters
    ----------
    model : torch.nn.Module
        Network to optimise. Batches are moved to the device of its first
        parameter ("cuda" is assumed if the model has no parameters).
    optimizer : torch.optim.Optimizer
        Optimizer already bound to ``model``'s parameters.
    name : str
        Path handed to ``torch.save``; it receives the whole model every time
        the epoch-level validation RMSE improves on the best value seen so far
        in this call.
    loss_fn : callable
        Loss used for the gradient step (on standardized targets) and, applied
        to the un-standardized values, for the reported RMSE.
    early_stop : bool
        Accepted for backward compatibility; training always runs for
        ``epoch`` epochs.
    epoch : int
        Number of passes over the training loader.

    Returns
    -------
    (model, train_loss_perStep_records, val_loss_records, train_loss_records)
        The per-step list holds one RMSE per training batch; the two
        per-epoch lists hold the mean of the per-batch RMSEs of that epoch.
        RMSEs are expressed in the original (un-standardized) duration units
        using the module-level ``time_mean`` / ``time_std``.
    """
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cuda")

    def _rmse_original_units(pred, target):
        # Undo the z-score on whole tensors, then take sqrt of the loss.
        with torch.no_grad():
            restored_pred = pred * time_std + time_mean
            restored_target = target * time_std + time_mean
            return loss_fn(restored_pred, restored_target).item() ** 0.5

    def _mean(values):
        # An empty loader yields NaN instead of a ZeroDivisionError.
        return sum(values) / len(values) if values else float("nan")

    # train the model for _ epochs
    train_loss_perStep_records = []
    val_loss_records = []
    train_loss_records = []

    # Best epoch-level validation RMSE so far; lives across epochs so that
    # `name` ends up holding the best model rather than the latest one.
    best_val_rmse = float("inf")

    for i in range(epoch):
        model.train()
        train_err = []

        for d_train in tqdm(base_trainloader):
            X_train = d_train[0].to(device)
            y_train = d_train[1].to(device)

            y_pred = model(X_train)
            loss = loss_fn(y_pred, y_train) # Compute MSE
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # show RMSE on train; the model deliberately stays in train mode
            # here so later batches of the epoch are not run in eval mode.
            rmse_train_err = _rmse_original_units(y_pred.detach(), y_train)
            train_err.append(rmse_train_err)
            train_loss_perStep_records.append(rmse_train_err)

        epoch_train_rmse = _mean(train_err)
        print(f'Epoch: {i+1} train: {epoch_train_rmse}')
        train_loss_records.append(epoch_train_rmse)

        model.eval()
        val_err = []
        with torch.no_grad():
            for d_val in tqdm(base_valloader):
                X_val = d_val[0].to(device)
                y_val = d_val[1].to(device)
                val_err.append(_rmse_original_units(model(X_val), y_val))

        epoch_val_rmse = _mean(val_err)
        if epoch_val_rmse < best_val_rmse:
            best_val_rmse = epoch_val_rmse
            torch.save(model, name)

        print(f'Epoch: {i+1} validation: {epoch_val_rmse}')
        val_loss_records.append(epoch_val_rmse)
    return model, train_loss_perStep_records, val_loss_records, train_loss_records

def predict_test(model):
    """Run ``model`` over ``base_testloader`` and un-standardize the outputs.

    Batches are moved to the device of the model's first parameter ("cuda" is
    assumed if the model has no parameters). Returns a list with one entry per
    test batch; each entry is the list produced by ``unstandardize_data`` for
    that batch's predictions.
    """
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cuda")

    model.eval()
    predict_lst = []
    with torch.no_grad():
        for d_val in tqdm(base_testloader):
            X_test = d_val[0].to(device)
            predict_y_test = unstandardize_data(model(X_test), time_mean, time_std)
            predict_lst.append(predict_y_test)
    return predict_lst

# Different models

## LR

In [62]:
# Define Model: a single affine layer over the 13 input features
# (plain linear regression), placed on the GPU.
model = torch.nn.Sequential(torch.nn.Linear(13, 1)).to("cuda")

# Define Loss Function / Objective Function
# (the train() calls in this notebook do not pass it and rely on train()'s
# own MSELoss default)
loss_fn = torch.nn.MSELoss()

# Define optimizer: Adam with weight decay
lr = 1e-3
opt = torch.optim.Adam(
    model.parameters(),
    lr=lr,
    weight_decay=1e-4,
)

In [63]:
# Start from empty histories, then train the LR model for 10 epochs;
# train() checkpoints to "final/LR" and returns the filled histories.
train_loss_perStep_records, val_loss_records, train_loss_records = [], [], []

model, train_loss_perStep_records, val_loss_records, train_loss_records = train(
    model,
    optimizer = opt,
    name = "final/LR",
    epoch = 10,
)

100%|██████████| 13202/13202 [02:41<00:00, 81.63it/s]


Epoch: 1 train: 16701.71253843638


100%|██████████| 79/79 [00:01<00:00, 57.77it/s]


Epoch: 1 validation: 717.6069399033688


100%|██████████| 13202/13202 [02:42<00:00, 81.49it/s]


Epoch: 2 train: 566.1977884615641


100%|██████████| 79/79 [00:01<00:00, 57.79it/s]


Epoch: 2 validation: 756.9661509333638


100%|██████████| 13202/13202 [02:43<00:00, 80.84it/s]


Epoch: 3 train: 574.6801813443094


100%|██████████| 79/79 [00:01<00:00, 55.33it/s]


Epoch: 3 validation: 780.2134191496309


100%|██████████| 13202/13202 [02:43<00:00, 80.61it/s]


Epoch: 4 train: 567.6786507459922


100%|██████████| 79/79 [00:01<00:00, 56.70it/s]


Epoch: 4 validation: 812.7746010631564


100%|██████████| 13202/13202 [02:41<00:00, 81.70it/s]


Epoch: 5 train: 573.2521323061623


100%|██████████| 79/79 [00:01<00:00, 57.74it/s]


Epoch: 5 validation: 718.6990987495615


100%|██████████| 13202/13202 [02:40<00:00, 82.02it/s]


Epoch: 6 train: 574.0751429981914


100%|██████████| 79/79 [00:01<00:00, 55.87it/s]


Epoch: 6 validation: 704.0827911242944


100%|██████████| 13202/13202 [02:43<00:00, 80.74it/s]


Epoch: 7 train: 571.0937233388845


100%|██████████| 79/79 [00:01<00:00, 60.34it/s]


Epoch: 7 validation: 714.2559519835005


100%|██████████| 13202/13202 [02:41<00:00, 81.73it/s]


Epoch: 8 train: 570.3819283744056


100%|██████████| 79/79 [00:01<00:00, 55.04it/s]


Epoch: 8 validation: 710.7968926876084


100%|██████████| 13202/13202 [02:41<00:00, 81.62it/s]


Epoch: 9 train: 570.6192210526655


100%|██████████| 79/79 [00:01<00:00, 60.91it/s]


Epoch: 9 validation: 738.9802967065166


100%|██████████| 13202/13202 [03:00<00:00, 73.00it/s]


Epoch: 10 train: 572.207268166419


100%|██████████| 79/79 [00:01<00:00, 58.88it/s]

Epoch: 10 validation: 771.8433626650408





In [64]:
# Persist the trained LR model and its per-epoch RMSE histories.
torch.save(model, 'final/model_LR.pth')
for npy_path, loss_history in (
    ('final/val_loss_records_LR.npy', val_loss_records),
    ('final/train_loss_records_LR.npy', train_loss_records),
):
    np.save(npy_path, loss_history)

## Model with 3FC

In [82]:
# Define Model: three fully connected layers (13 -> 1024 -> 1024 -> 1)
# with ReLU between them, placed on the GPU.
model = nn.Sequential(
    nn.Linear(13, 1024),
    nn.ReLU(),
    nn.Linear(1024, 1024),
    nn.ReLU(),
    nn.Linear(1024, 1),
).to("cuda")

# Define Loss Function / Objective Function
# (the train() calls in this notebook do not pass it and rely on train()'s
# own MSELoss default)
loss_fn = nn.MSELoss()

# Define optimizer: plain Adam
lr = 1e-3
opt = torch.optim.Adam(model.parameters(), lr=lr)

In [83]:
# Start from empty histories, then train the 3FC model for 10 epochs;
# train() checkpoints to "final/3FC" and returns the filled histories.
train_loss_perStep_records, val_loss_records, train_loss_records = [], [], []

model, train_loss_perStep_records, val_loss_records, train_loss_records = train(
    model,
    optimizer = opt,
    name = "final/3FC",
    epoch = 10,
)

100%|██████████| 13202/13202 [02:55<00:00, 75.44it/s]


Epoch: 1 train: 3217.3643650649374


100%|██████████| 79/79 [00:01<00:00, 49.49it/s]


Epoch: 1 validation: 712.7780001021491


100%|██████████| 13202/13202 [02:55<00:00, 75.42it/s]


Epoch: 2 train: 473.85863129241386


100%|██████████| 79/79 [00:01<00:00, 47.37it/s]


Epoch: 2 validation: 698.8427127628289


100%|██████████| 13202/13202 [03:01<00:00, 72.90it/s]


Epoch: 3 train: 478.4021961145612


100%|██████████| 79/79 [00:01<00:00, 50.35it/s]


Epoch: 3 validation: 698.1065293392489


100%|██████████| 13202/13202 [02:55<00:00, 75.34it/s]


Epoch: 4 train: 470.9619410147333


100%|██████████| 79/79 [00:01<00:00, 54.87it/s]


Epoch: 4 validation: 696.796651651599


100%|██████████| 13202/13202 [02:50<00:00, 77.49it/s]


Epoch: 5 train: 461.6372519526838


100%|██████████| 79/79 [00:01<00:00, 51.24it/s]


Epoch: 5 validation: 696.3170944261772


100%|██████████| 13202/13202 [02:55<00:00, 75.12it/s]


Epoch: 6 train: 460.6690718527447


100%|██████████| 79/79 [00:01<00:00, 49.51it/s]


Epoch: 6 validation: 695.3150231802782


100%|██████████| 13202/13202 [02:50<00:00, 77.34it/s]


Epoch: 7 train: 460.40463420868446


100%|██████████| 79/79 [00:01<00:00, 51.65it/s]


Epoch: 7 validation: 693.5066846297901


100%|██████████| 13202/13202 [02:52<00:00, 76.61it/s]


Epoch: 8 train: 461.3148168494663


100%|██████████| 79/79 [00:01<00:00, 51.53it/s]


Epoch: 8 validation: 693.729721471331


100%|██████████| 13202/13202 [02:51<00:00, 77.07it/s]


Epoch: 9 train: 459.80185868549813


100%|██████████| 79/79 [00:01<00:00, 49.87it/s]


Epoch: 9 validation: 692.7013373536317


100%|██████████| 13202/13202 [02:49<00:00, 77.72it/s]


Epoch: 10 train: 460.16716186485763


100%|██████████| 79/79 [00:01<00:00, 50.85it/s]

Epoch: 10 validation: 693.7246529937734





In [84]:
# Persist the trained 3FC model and its per-epoch RMSE histories.
torch.save(model, 'final/model_3FC.pth')
for npy_path, loss_history in (
    ('final/val_loss_records_3FC.npy', val_loss_records),
    ('final/train_loss_records_3FC.npy', train_loss_records),
):
    np.save(npy_path, loss_history)

## Model with 6FC and leaky relu

In [68]:
# Define Model: six fully connected layers (13 -> 1024 x5 -> 1) with a
# LeakyReLU after every layer except the last, placed on the GPU.
fc_layers = [nn.Linear(13, 1024)]
for _ in range(4):
    fc_layers.append(nn.LeakyReLU())
    fc_layers.append(nn.Linear(1024, 1024))
fc_layers.append(nn.LeakyReLU())
fc_layers.append(nn.Linear(1024, 1))
model = nn.Sequential(*fc_layers).to("cuda")

# Define Loss Function / Objective Function
# (the train() calls in this notebook do not pass it and rely on train()'s
# own MSELoss default)
loss_fn = nn.MSELoss()

# Define optimizer: Adam with a smaller step size than the shallower models
lr = 1e-5
opt = torch.optim.Adam(model.parameters(), lr=lr)

In [69]:
# Start from empty histories, then train the 6FC model for 10 epochs;
# train() checkpoints to "final/6FC" and returns the filled histories.
train_loss_perStep_records, val_loss_records, train_loss_records = [], [], []

model, train_loss_perStep_records, val_loss_records, train_loss_records = train(
    model,
    optimizer = opt,
    name = "final/6FC",
    epoch = 10,
)

100%|██████████| 13202/13202 [03:20<00:00, 65.98it/s]


Epoch: 1 train: 502.06208034256997


100%|██████████| 79/79 [00:01<00:00, 42.84it/s]


Epoch: 1 validation: 699.8854670023753


100%|██████████| 13202/13202 [03:05<00:00, 71.18it/s]


Epoch: 2 train: 469.090923683275


100%|██████████| 79/79 [00:01<00:00, 47.31it/s]


Epoch: 2 validation: 698.8527863989389


100%|██████████| 13202/13202 [03:07<00:00, 70.43it/s]


Epoch: 3 train: 464.8681984664875


100%|██████████| 79/79 [00:01<00:00, 48.41it/s]


Epoch: 3 validation: 697.6340531625807


100%|██████████| 13202/13202 [03:05<00:00, 71.33it/s]


Epoch: 4 train: 463.73499713036097


100%|██████████| 79/79 [00:01<00:00, 43.11it/s]


Epoch: 4 validation: 697.4818303971421


100%|██████████| 13202/13202 [03:00<00:00, 73.13it/s]


Epoch: 5 train: 463.0079803550281


100%|██████████| 79/79 [00:01<00:00, 43.75it/s]


Epoch: 5 validation: 697.0085424853169


100%|██████████| 13202/13202 [03:07<00:00, 70.53it/s]


Epoch: 6 train: 462.1928469719196


100%|██████████| 79/79 [00:01<00:00, 43.81it/s]


Epoch: 6 validation: 697.9290113207356


100%|██████████| 13202/13202 [03:08<00:00, 69.95it/s]


Epoch: 7 train: 461.7796932982435


100%|██████████| 79/79 [00:01<00:00, 48.87it/s]


Epoch: 7 validation: 695.8515080741756


100%|██████████| 13202/13202 [03:11<00:00, 68.79it/s]


Epoch: 8 train: 461.38904005098925


100%|██████████| 79/79 [00:01<00:00, 44.50it/s]


Epoch: 8 validation: 696.5459809171831


100%|██████████| 13202/13202 [02:56<00:00, 74.77it/s]


Epoch: 9 train: 460.939955715363


100%|██████████| 79/79 [00:01<00:00, 42.61it/s]


Epoch: 9 validation: 697.6953855404915


100%|██████████| 13202/13202 [02:54<00:00, 75.56it/s]


Epoch: 10 train: 460.6475692364042


100%|██████████| 79/79 [00:01<00:00, 43.50it/s]

Epoch: 10 validation: 694.7965522791385





In [70]:
# Persist the trained 6FC model and its per-epoch RMSE histories.
torch.save(model, 'final/model_6FC.pth')
for npy_path, loss_history in (
    ('final/val_loss_records_6FC.npy', val_loss_records),
    ('final/train_loss_records_6FC.npy', train_loss_records),
):
    np.save(npy_path, loss_history)

## Embedding

In [71]:
# Input column order (same as feature_columns):
#   0 MON  1 HR  2 WK  3 ORIGIN_STAND  4 MIN  5 SEC
#   6 CALL_TYPE_A  7 CALL_TYPE_B  8 CALL_TYPE_C  9 LAT_STD  10 LNG_STD
#   11 TAXI_ID_ix  12 CALL_ID_ix
# Columns 0-5, 11 and 12 are categorical and go through embeddings;
# columns 6-10 are fed to the network unchanged.

# Number of rows of each embedding table (must exceed the largest index seen).
# NOTE(review): the TAXIID / CALLID sizes are hard-coded; presumably they match
# the id vocabularies built for this run - re-check them if the data changes.
MON_embedding_input_size = 13
HR_embedding_input_size = 24
WK_embedding_input_size = 7
MIN_embedding_input_size = 60
SEC_embedding_input_size = 60
STANDID_embedding_input_size = 65
TAXIID_embedding_input_size = 449
CALLID_embedding_input_size = 57105 


# Width of the vector each categorical value is mapped to.
MON_embedding_dim = 6
HR_embedding_dim = 4
WK_embedding_dim = 3
MIN_embedding_dim = 10
SEC_embedding_dim = 10
STANDID_embedding_dim = 6
TAXIID_embedding_dim = 9
CALLID_embedding_dim = 16

# (table size, vector width) pairs, listed in the order of the embedded
# input columns 0, 1, 2, 3, 4, 5, 11, 12.
embedding_dim_list = [
    (MON_embedding_input_size, MON_embedding_dim),          # input column 0
    (HR_embedding_input_size, HR_embedding_dim),            # input column 1
    (WK_embedding_input_size, WK_embedding_dim),            # input column 2
    (STANDID_embedding_input_size, STANDID_embedding_dim),  # input column 3
    (MIN_embedding_input_size, MIN_embedding_dim),          # input column 4
    (SEC_embedding_input_size, SEC_embedding_dim),          # input column 5
    (TAXIID_embedding_input_size, TAXIID_embedding_dim),    # input column 11
    (CALLID_embedding_input_size, CALLID_embedding_dim),    # input column 12
]


class TripTimePredictor(nn.Module):
    """Embedding model: categorical columns are embedded, concatenated with
    the five pass-through columns and fed to a 3-layer MLP with one output."""

    def __init__(self, embedding_dim_list = embedding_dim_list):
        super().__init__()
        self.embeddings = nn.ModuleList(
            [nn.Embedding(table_size, width) for table_size, width in embedding_dim_list]
        )
        self.relu = nn.ReLU()
        # fc1 sees every embedding vector plus the 5 non-embedded columns.
        embedded_width = sum(width for _, width in embedding_dim_list)
        self.fc1 = nn.Linear(embedded_width + 5, 1024)
        self.fc2 = nn.Linear(1024, 1024)
        self.fc3 = nn.Linear(1024, 1)

    def forward(self, input):
        """Map a (batch, 13) feature tensor to a (batch, 1) prediction."""
        i_embedding = [0, 1, 2, 3, 4, 5, 11, 12]

        # Look up each categorical column in its own embedding table.
        pieces = [
            emb_layer(input[:, i_embedding[position]].long())
            for position, emb_layer in enumerate(self.embeddings)
        ]
        # Columns 6..10 (call-type flags, standardized lat/lng) are appended as-is.
        pieces.append(input[:, 6:11])
        combined_features = torch.cat(pieces, dim=1)

        hidden = self.relu(self.fc1(combined_features))
        hidden = self.relu(self.fc2(hidden))
        return self.fc3(hidden)
        

In [72]:
# Embedding model with the default embedding table, placed on the GPU.
model = TripTimePredictor()
model = model.to("cuda")

# Define Loss Function / Objective Function
# (the train() calls in this notebook do not pass it and rely on train()'s
# own MSELoss default)
loss_fn = nn.MSELoss()

# Define optimizer: plain Adam
lr = 5e-4
opt = torch.optim.Adam(model.parameters(), lr=lr)

In [73]:
# Reset the per-epoch histories, then train the embedding model for
# 10 epochs; train() checkpoints to "final/embedding".
val_loss_records, train_loss_records = [], []

model, train_loss_perStep_records, val_loss_records, train_loss_records = train(
    model,
    optimizer = opt,
    name = "final/embedding",
    early_stop = True,
    epoch = 10,
)

100%|██████████| 13202/13202 [03:12<00:00, 68.55it/s]


Epoch: 1 train: 450.708703133575


100%|██████████| 79/79 [00:01<00:00, 56.71it/s]


Epoch: 1 validation: 679.5103063669187


100%|██████████| 13202/13202 [03:15<00:00, 67.61it/s]


Epoch: 2 train: 441.2435533304968


100%|██████████| 79/79 [00:01<00:00, 47.67it/s]


Epoch: 2 validation: 672.993225836999


100%|██████████| 13202/13202 [03:19<00:00, 66.13it/s]


Epoch: 3 train: 436.9352024057794


100%|██████████| 79/79 [00:01<00:00, 47.77it/s]


Epoch: 3 validation: 672.2135593620485


100%|██████████| 13202/13202 [03:11<00:00, 68.92it/s]


Epoch: 4 train: 433.24367684019273


100%|██████████| 79/79 [00:01<00:00, 46.66it/s]


Epoch: 4 validation: 670.3962937849956


100%|██████████| 13202/13202 [03:06<00:00, 70.84it/s]


Epoch: 5 train: 430.2541492268549


100%|██████████| 79/79 [00:01<00:00, 58.30it/s]


Epoch: 5 validation: 671.2496770585319


100%|██████████| 13202/13202 [02:57<00:00, 74.19it/s]


Epoch: 6 train: 426.98163716896767


100%|██████████| 79/79 [00:01<00:00, 45.68it/s]


Epoch: 6 validation: 672.035293047175


100%|██████████| 13202/13202 [03:03<00:00, 71.99it/s]


Epoch: 7 train: 423.4328762790516


100%|██████████| 79/79 [00:01<00:00, 50.11it/s]


Epoch: 7 validation: 671.5171324590293


100%|██████████| 13202/13202 [03:17<00:00, 66.97it/s]


Epoch: 8 train: 420.3453489738823


100%|██████████| 79/79 [00:01<00:00, 45.36it/s]


Epoch: 8 validation: 671.3380288298015


100%|██████████| 13202/13202 [03:23<00:00, 64.83it/s]


Epoch: 9 train: 416.48719885102594


100%|██████████| 79/79 [00:01<00:00, 46.27it/s]


Epoch: 9 validation: 673.3310384673975


100%|██████████| 13202/13202 [03:21<00:00, 65.41it/s]


Epoch: 10 train: 412.8944379243012


100%|██████████| 79/79 [00:01<00:00, 46.67it/s]

Epoch: 10 validation: 672.3414121499327





In [74]:
# Persist the trained embedding model and its per-epoch RMSE histories.
torch.save(model, 'final/model_embedding.pth')
for npy_path, loss_history in (
    ('final/val_loss_records_embedding.npy', val_loss_records),
    ('final/train_loss_records_embedding.npy', train_loss_records),
):
    np.save(npy_path, loss_history)

# Write the result

In [80]:
# Load the LR model saved by this notebook, predict every test trip and write
# the submission file. 'final/model_LR.pth' is the file the LR save cell
# writes; no cell produces a 'final/model_LR_best.pth'.
# NOTE(review): torch.load unpickles a complete nn.Module here - only load
# files you created yourself; newer PyTorch releases may require
# weights_only=False for this kind of file.
model = torch.load('final/model_LR.pth')
predict_lst = predict_test(model)

# base_testloader uses batch_size = 1, so every entry holds one prediction.
result = [batch_prediction[0][0].item() for batch_prediction in predict_lst]
submit = pd.read_csv("dataset/sampleSubmission.csv")
submit['TRAVEL_TIME'] = result
submit.to_csv('final/submit_LR.csv', sep=',', index = False)

100%|██████████| 320/320 [00:00<00:00, 328.07it/s]


In [None]:
# Load the saved 3FC model, predict every test trip and write the submission
# file. Inference goes through the shared predict_test helper.
# NOTE(review): torch.load unpickles a complete nn.Module here - only load
# files you created yourself; newer PyTorch releases may require
# weights_only=False for this kind of file.
model = torch.load('final/model_3FC.pth')
predict_lst = predict_test(model)

# base_testloader uses batch_size = 1, so every entry holds one prediction.
result = [batch_prediction[0][0].item() for batch_prediction in predict_lst]
submit = pd.read_csv("dataset/sampleSubmission.csv")
submit['TRAVEL_TIME'] = result
submit.to_csv('final/submit_3FC.csv', sep=',', index = False)

In [None]:
# Load the saved 6FC model, predict every test trip and write the submission
# file. Inference goes through the shared predict_test helper.
# NOTE(review): torch.load unpickles a complete nn.Module here - only load
# files you created yourself; newer PyTorch releases may require
# weights_only=False for this kind of file.
model = torch.load('final/model_6FC.pth')
predict_lst = predict_test(model)

# base_testloader uses batch_size = 1, so every entry holds one prediction.
result = [batch_prediction[0][0].item() for batch_prediction in predict_lst]
submit = pd.read_csv("dataset/sampleSubmission.csv")
submit['TRAVEL_TIME'] = result
submit.to_csv('final/submit_6FC.csv', sep=',', index = False)

In [None]:
# Load the saved embedding model, predict every test trip and write the
# submission file. Inference goes through the shared predict_test helper.
# NOTE(review): torch.load unpickles a complete nn.Module here - only load
# files you created yourself; newer PyTorch releases may require
# weights_only=False for this kind of file.
model = torch.load('final/model_embedding.pth')
predict_lst = predict_test(model)

# base_testloader uses batch_size = 1, so every entry holds one prediction.
result = [batch_prediction[0][0].item() for batch_prediction in predict_lst]
submit = pd.read_csv("dataset/sampleSubmission.csv")
submit['TRAVEL_TIME'] = result
submit.to_csv('final/submit_EMB.csv', sep=',', index = False)