In [1]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv('NYCTaxiFares.csv')
df.head()

Unnamed: 0,pickup_datetime,fare_amount,fare_class,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2010-04-19 08:17:56 UTC,6.5,0,-73.992365,40.730521,-73.975499,40.744746,1
1,2010-04-17 15:43:53 UTC,6.9,0,-73.990078,40.740558,-73.974232,40.744114,1
2,2010-04-17 11:23:26 UTC,10.1,1,-73.994149,40.751118,-73.960064,40.766235,2
3,2010-04-11 21:25:03 UTC,8.9,0,-73.990485,40.756422,-73.971205,40.748192,1
4,2010-04-17 02:19:01 UTC,19.7,1,-73.990976,40.734202,-73.905956,40.743115,1


In [10]:
 def haversine_distance(df, lat1, long1, lat2, long2):
        phi1 = np.radians(df[lat1])
        phi2 = np.radians(df[lat2])
        delta_phi = np.radians(df[lat2] - df[lat1])
        delta_lambda = np.radians(df[long2]-df[long1])
        a = np.sin(delta_phi/2)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda/2)**2
        return 6371* 2*np.arctan2(np.sqrt(a), np.sqrt(1-a))

In [11]:
df['dist_km'] = haversine_distance(df,'pickup_latitude', 'pickup_longitude', 
                                   'dropoff_latitude', 'dropoff_longitude')

In [12]:
df.head()

Unnamed: 0,pickup_datetime,fare_amount,fare_class,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,dist_km
0,2010-04-19 08:17:56 UTC,6.5,0,-73.992365,40.730521,-73.975499,40.744746,1,2.126312
1,2010-04-17 15:43:53 UTC,6.9,0,-73.990078,40.740558,-73.974232,40.744114,1,1.392307
2,2010-04-17 11:23:26 UTC,10.1,1,-73.994149,40.751118,-73.960064,40.766235,2,3.326763
3,2010-04-11 21:25:03 UTC,8.9,0,-73.990485,40.756422,-73.971205,40.748192,1,1.864129
4,2010-04-17 02:19:01 UTC,19.7,1,-73.990976,40.734202,-73.905956,40.743115,1,7.231321


In [13]:
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])

In [22]:
df['EDTdate'] = df['pickup_datetime'] - pd.Timedelta(hours=4)
df['Hour'] = df['EDTdate'].dt.hour
df['AMorPM'] = np.where(df['Hour']<12, 'am', 'pm')
df['Weekday'] = df['EDTdate'].dt.strftime("%a")

In [23]:
df.head()

Unnamed: 0,pickup_datetime,fare_amount,fare_class,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,dist_km,EDTdate,Hour,AMorPM,Weekday
0,2010-04-19 08:17:56+00:00,6.5,0,-73.992365,40.730521,-73.975499,40.744746,1,2.126312,2010-04-19 04:17:56+00:00,4,am,Mon
1,2010-04-17 15:43:53+00:00,6.9,0,-73.990078,40.740558,-73.974232,40.744114,1,1.392307,2010-04-17 11:43:53+00:00,11,am,Sat
2,2010-04-17 11:23:26+00:00,10.1,1,-73.994149,40.751118,-73.960064,40.766235,2,3.326763,2010-04-17 07:23:26+00:00,7,am,Sat
3,2010-04-11 21:25:03+00:00,8.9,0,-73.990485,40.756422,-73.971205,40.748192,1,1.864129,2010-04-11 17:25:03+00:00,17,pm,Sun
4,2010-04-17 02:19:01+00:00,19.7,1,-73.990976,40.734202,-73.905956,40.743115,1,7.231321,2010-04-16 22:19:01+00:00,22,pm,Fri


In [26]:
cat_cols = ['Hour', 'AMorPM','Weekday']
cont_cols = ['pickup_longitude','pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
       'passenger_count']

In [27]:
y_col = ['fare_amount']

In [31]:
for c in cat_cols:
    df[c] = df[c].astype('category')
df.dtypes

pickup_datetime      datetime64[ns, UTC]
fare_amount                      float64
fare_class                         int64
pickup_longitude                 float64
pickup_latitude                  float64
dropoff_longitude                float64
dropoff_latitude                 float64
passenger_count                    int64
dist_km                          float64
EDTdate              datetime64[ns, UTC]
Hour                            category
AMorPM                          category
Weekday                         category
dtype: object

In [43]:
cats = np.stack([df[c].cat.codes.values for c in cat_cols],1)
conts = np.stack([df[c].values for c in cont_cols],1)
cats = torch.tensor(cats, dtype=torch.int64)
conts = torch.tensor(conts, dtype=torch.float)
cats,conts

(tensor([[ 4,  0,  1],
         [11,  0,  2],
         [ 7,  0,  2],
         ...,
         [14,  1,  3],
         [ 4,  0,  5],
         [12,  1,  2]]),
 tensor([[-73.9924,  40.7305, -73.9755,  40.7447,   1.0000],
         [-73.9901,  40.7406, -73.9742,  40.7441,   1.0000],
         [-73.9941,  40.7511, -73.9601,  40.7662,   2.0000],
         ...,
         [-73.9886,  40.7498, -74.0115,  40.7078,   3.0000],
         [-74.0044,  40.7245, -73.9927,  40.7308,   1.0000],
         [-73.9554,  40.7719, -73.9676,  40.7630,   3.0000]]))

In [45]:
y=torch.tensor(df[y_col].values, dtype=torch.float).reshape(-1,1)

In [46]:
cats.shape, conts.shape, y.shape

(torch.Size([120000, 3]), torch.Size([120000, 5]), torch.Size([120000, 1]))

In [56]:
cat_size = [len(df[col].cat.categories) for col in cat_cols]
embedsize = [(size,min(50,(size+1)//2)) for size in cat_size]
embedsize, cat_size

([(24, 12), (2, 1), (7, 4)], [24, 2, 7])

In [59]:
selfembed = nn.ModuleList([nn.Embedding(n1,n2) for n1,n2 in embedsize])
selfembed

ModuleList(
  (0): Embedding(24, 12)
  (1): Embedding(2, 1)
  (2): Embedding(7, 4)
)

In [73]:
catz=cats[:2]
embeddings = []
for i,e in enumerate(selfembed):
    embeddings.append(e(catz[:,i]))
z = torch.cat(embeddings,1)
selfembeddropout = nn.Dropout(0.4)
z = selfembeddropout(z)
z

tensor([[ 0.0000, -0.0000, -0.0000,  0.0935, -0.0000,  1.3838, -0.3798, -0.0000,
         -1.2266, -0.0000, -3.7537, -0.0000, -2.9127,  1.2878, -0.0000, -0.0000,
         -0.0000],
        [ 0.4353, -0.0000, -1.6940, -0.0000,  0.0000,  0.0518, -0.6160, -2.2536,
         -0.0000, -0.0000, -0.5180,  0.9458, -0.0000,  1.3437,  0.0000, -0.0000,
          2.7875]], grad_fn=<MulBackward0>)

In [110]:
class Model(nn.Module):
    def __init__(self, emb_size, n_cont, out_size, layers, drop=0.5):
        super().__init__()
        self.embeds = nn.ModuleList([nn.Embedding(n1,n2) for n1,n2 in emb_size])
        self.embdrop = nn.Dropout(drop)
        self.batchn_cont = nn.BatchNorm1d(n_cont)
        layerlist = []
        n_embeds = sum((n2 for n1,n2 in emb_size))
        n_in = n_embeds + n_cont
        
        for i in layers:
            layerlist.append(nn.Linear(n_in,i))
            layerlist.append(nn.ReLU(inplace=True))
            layerlist.append(nn.BatchNorm1d(i))
            layerlist.append(nn.Dropout(drop))
            n_in = i
        layerlist.append(nn.Linear(layers[-1], out_size))
        self.layers = nn.Sequential(*layerlist)
        
    def forward(self, x_cat, x_cont):
        embeddings = []
        for i,e in enumerate(self.embeds):
            embeddings.append(e(x_cat[:,i]))
        x = torch.cat(embeddings, 1)
        x = self.embdrop(x)
        x_cont = self.batchn_cont(x_cont)
        x = torch.cat([x, x_cont], 1)
        x = self.layers(x)
        return x
    

In [111]:
torch.manual_seed(33)
model = Model(embedsize, conts.shape[1], 1, [300,200], drop=0.4)
model

Model(
  (embeds): ModuleList(
    (0): Embedding(24, 12)
    (1): Embedding(2, 1)
    (2): Embedding(7, 4)
  )
  (embdrop): Dropout(p=0.4, inplace=False)
  (batchn_cont): BatchNorm1d(5, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): Linear(in_features=22, out_features=300, bias=True)
    (1): ReLU(inplace=True)
    (2): BatchNorm1d(300, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.4, inplace=False)
    (4): Linear(in_features=300, out_features=200, bias=True)
    (5): ReLU(inplace=True)
    (6): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Dropout(p=0.4, inplace=False)
    (8): Linear(in_features=200, out_features=1, bias=True)
  )
)

In [112]:
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [113]:
batch_size = 60000
test_size = int(batch_size*0.2)

In [114]:
cat_train = cats[:batch_size-test_size]
cat_test = cats[batch_size-test_size:batch_size]
cont_train = conts[:batch_size-test_size]
cont_test = conts[batch_size-test_size:batch_size]

In [115]:
y_train = y[:batch_size-test_size]
y_test = y[batch_size-test_size:batch_size]
len(cat_train),len(cat_test),len(cont_train),len(cont_test),len(y_train),len(y_test) 

(48000, 12000, 48000, 12000, 48000, 12000)

In [118]:
losses=[]
epochs=500
for i in range(epochs):
    #forward pass
    y_pred = model(cat_train, cont_train)
    loss = torch.sqrt(criterion(y_pred, y_train))
    losses.append(loss)
    if i%10:
        print(f'epoch : {epochs} | loss : {loss}')
    #backward pass
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    

epoch : 500 | loss : 12.175262451171875
epoch : 500 | loss : 12.067959785461426
epoch : 500 | loss : 11.967463493347168
epoch : 500 | loss : 11.873384475708008
epoch : 500 | loss : 11.79959774017334
epoch : 500 | loss : 11.739005088806152
epoch : 500 | loss : 11.68205451965332
epoch : 500 | loss : 11.625597953796387
epoch : 500 | loss : 11.587005615234375
epoch : 500 | loss : 11.51727294921875
epoch : 500 | loss : 11.482474327087402
epoch : 500 | loss : 11.46023941040039
epoch : 500 | loss : 11.43091869354248
epoch : 500 | loss : 11.415557861328125
epoch : 500 | loss : 11.397176742553711
epoch : 500 | loss : 11.357619285583496
epoch : 500 | loss : 11.314596176147461
epoch : 500 | loss : 11.294026374816895
epoch : 500 | loss : 11.234041213989258
epoch : 500 | loss : 11.201664924621582
epoch : 500 | loss : 11.15358829498291
epoch : 500 | loss : 11.127034187316895
epoch : 500 | loss : 11.081847190856934
epoch : 500 | loss : 11.038448333740234
epoch : 500 | loss : 10.994909286499023
epoch 

In [122]:
with torch.no_grad():
    y_val = model(cat_test, cont_test)
    loss = torch.sqrt(criterion(y_val, y_test))
loss

tensor(3.2350)

In [125]:
for i in range(10):
    print(f'{i}. ) Predicted : {y_val[i].item():2f}       Target : {y_test[i].item():2f}')

0. ) Predicted : 4.680586       Target : 2.900000
1. ) Predicted : 21.180355       Target : 5.700000
2. ) Predicted : 5.083493       Target : 7.700000
3. ) Predicted : 11.864877       Target : 12.500000
4. ) Predicted : 5.442344       Target : 4.100000
5. ) Predicted : 4.351744       Target : 5.300000
6. ) Predicted : 4.507971       Target : 3.700000
7. ) Predicted : 13.955557       Target : 14.500000
8. ) Predicted : 5.368070       Target : 5.700000
9. ) Predicted : 13.905837       Target : 10.100000


In [126]:
torch.save(model.state_dict(), 'TaxiModel.pt')