# CSE151B Project

In [1]:
import numpy as np
import pandas as pd
import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error as MSE
import matplotlib.pyplot as plt

## Task Description

<center>regression task to predict the duration of a taxi ride given information about the time it was called.</center>

## Preprocessing

In [2]:
df_tr = pd.read_csv("data/train.csv")
df_tr.head()

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE
0,1372636858620000589,C,,,20000589,1372636858,A,False,"[[-8.618643,41.141412],[-8.618499,41.141376],[..."
1,1372637303620000596,B,,7.0,20000596,1372637303,A,False,"[[-8.639847,41.159826],[-8.640351,41.159871],[..."
2,1372636951620000320,C,,,20000320,1372636951,A,False,"[[-8.612964,41.140359],[-8.613378,41.14035],[-..."
3,1372636854620000520,C,,,20000520,1372636854,A,False,"[[-8.574678,41.151951],[-8.574705,41.151942],[..."
4,1372637091620000337,C,,,20000337,1372637091,A,False,"[[-8.645994,41.18049],[-8.645949,41.180517],[-..."


In [3]:
# Over every single 
def polyline_to_trip_duration(polyline):
  return max(polyline.count("[") - 2, 0) * 15

# This code creates a new column, "LEN", in our dataframe. The value is
# the (polyline_length - 1) * 15, where polyline_length = count("[") - 1
df_tr["LEN"] = df_tr["POLYLINE"].apply(polyline_to_trip_duration)

In [4]:
def parse_time(x):
  # Each x is essentially a 1 row, 1 column pandas Series
  dt = datetime.fromtimestamp(x["TIMESTAMP"])
  return dt.year, dt.month, dt.day, dt.hour, dt.weekday()

# Because we are assigning multiple values at a time, we need to "expand" our computed 
#(year, month, day, hour, weekday) tuples on the column axis, or axis 1
df_tr[["YR", "MON", "DAY", "HR", "WK"]] = df_tr[["TIMESTAMP"]].apply(parse_time, axis=1, result_type="expand")

In [5]:
df_tr.head()

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE,LEN,YR,MON,DAY,HR,WK
0,1372636858620000589,C,,,20000589,1372636858,A,False,"[[-8.618643,41.141412],[-8.618499,41.141376],[...",330,2013,6,30,17,6
1,1372637303620000596,B,,7.0,20000596,1372637303,A,False,"[[-8.639847,41.159826],[-8.640351,41.159871],[...",270,2013,6,30,17,6
2,1372636951620000320,C,,,20000320,1372636951,A,False,"[[-8.612964,41.140359],[-8.613378,41.14035],[-...",960,2013,6,30,17,6
3,1372636854620000520,C,,,20000520,1372636854,A,False,"[[-8.574678,41.151951],[-8.574705,41.151942],[...",630,2013,6,30,17,6
4,1372637091620000337,C,,,20000337,1372637091,A,False,"[[-8.645994,41.18049],[-8.645949,41.180517],[-...",420,2013,6,30,17,6


In [6]:
# only 10 entries supposely had missing data
X = (
    df_tr[df_tr['MISSING_DATA']==False]
     [['CALL_TYPE','TIMESTAMP','YR','MON','DAY','HR','WK']]
    )

# testing a simplified dataset
X = (
    df_tr[df_tr['MISSING_DATA']==False]
     [['CALL_TYPE','TIMESTAMP']]
    )

# encode category
X['CALL_TYPE'] = X['CALL_TYPE'].apply(lambda x: 0 if x=='A' else 1 if x=='B' else 2)

y = df_tr[df_tr['MISSING_DATA']==False]['LEN']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

## Baseline

In [7]:
regr = RandomForestRegressor(max_depth=100, random_state=42, criterion = 'squared_error', n_jobs=-1)
regr.fit(X_train, y_train);

In [8]:
preds = regr.predict(X_test)
MSE(preds, y_test, squared=False)

782.554634729751

In [None]:
tuned_params = {'n_estimators': [100, 200, 300, 400, 500], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]}
random_regressor = RandomizedSearchCV(RandomForestRegressor(), tuned_params, n_iter = 20, scoring = 'neg_mean_absolute_error', cv = 5, n_jobs = -1)
random_regressor.fit(X_train, y_train)

In [None]:
preds = random_regressor.predict(X_test)
MSE(preds, y_test)

## Deep Learning Model

In [None]:
# convert to tensors
X_train = torch.from_numpy(X_train.values).to(torch.float32)
y_train = torch.tensor(np.array(y_train)).to(torch.float32).reshape(-1,1)
X_test = torch.from_numpy(X_test.values).to(torch.float32)
y_test = torch.tensor(np.array(y_test)).to(torch.float32).reshape(-1,1)

In [None]:
data_train = TensorDataset(X_train, y_train)
data_test = TensorDataset(X_test, y_test)

In [None]:
batch_size = 64
num_workers = 2
train_loader = DataLoader(data_train, batch_size=batch_size, shuffle=True, num_workers=num_workers)

In [None]:
class RMSELoss(nn.Module):
    def __init__(self):
        super(RMSELoss,self).__init__()

    def forward(self,x,y):
        criterion = nn.MSELoss()
        eps = 1e-6
        loss = torch.sqrt(criterion(x, y) + eps)
        return loss

In [None]:
class model(nn.Module):
    def __init__(self):
        super(model, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Linear(2,256),
        )
        
        self.layer2 = nn.Sequential(
            nn.Linear(256,64),
        )

        self.layer3 = nn.Sequential(
            nn.Linear(64,16),
        )
        
        self.layer4 = nn.Sequential(
            nn.Linear(16,1)
        )
        
    def forward(self, x):
        x = self.layer1(x)      
        x = self.layer2(x)      
        x = self.layer3(x)      
        x = self.layer4(x)    
        return x

In [None]:
# simplified model because my hair is leaving my head
class model(nn.Module):
    def __init__(self):
        super(model, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Linear(2,8),
        )
        
        self.layer2 = nn.Sequential(
            nn.Linear(8,4),
        )
        
        self.layer3 = nn.Sequential(
            nn.Linear(4,1),
        )
        
    def forward(self, x):
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        return x

## Training

In [None]:
from tqdm import tqdm

# train step
criterion = RMSELoss()
net = model()
optimizer = torch.optim.SGD(net.parameters(), lr=0.001, momentum=0.9, weight_decay=1e-4)

for epoch in tqdm(range(10)):
    for x,y in train_loader:
        y_pred = net(x)
        loss  = criterion(y_pred, y)
        loss.backward()

        optimizer.step()

        optimizer.zero_grad()
        

    print('epoch {}, loss {}'.format(epoch, loss.item()))

print('Finished Training')