In [1]:
import torch
import torch.nn as nn
import modin.pandas as pd
import numpy as np

In [2]:
df = pd.read_csv(r"D:\PyTorch for Deep Learning with Python Bootcamp\1. Course Overview, Installs, and Setup\PYTORCH_NOTEBOOKS\Data\NYCTaxiFares.csv")

In [3]:
df.info()

<class 'modin.pandas.dataframe.DataFrame'>
RangeIndex: 120000 entries, 0 to 119999
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype  
---  -----------------  ---------------  -----  
 0   pickup_datetime    120000 non-null  object
 1   fare_amount        120000 non-null  float64
 2   fare_class         120000 non-null  int64
 3   pickup_longitude   120000 non-null  float64
 4   pickup_latitude    120000 non-null  float64
 5   dropoff_longitude  120000 non-null  float64
 6   dropoff_latitude   120000 non-null  float64
 7   passenger_count    120000 non-null  int64
dtypes: float64(5), int64(2), object(1)
memory usage: 7.3 MB




In [4]:
df['pickup_hour'] = pd.DatetimeIndex(df['pickup_datetime']).hour

In [5]:
df['AM_or_PM'] = df.apply(lambda x: 'AM' if x['pickup_hour']<12 else 'PM', axis = 1)



In [6]:
def haversine_distance(lat1, long1, lat2, long2):
    r = 6371
    phi1 = np.radians(lat1)
    phi2 = np.radians(lat2)
    
    delta_phi = np.radians(lat2 - lat1)
    delta_lambda = np.radians(long2 -long1)
    
    a = np.sin(delta_phi/2)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda/2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
    d = (r * c)
    return d

In [7]:
df['distance_km'] = list(map(haversine_distance, df['pickup_latitude'], df['pickup_longitude'], df['dropoff_latitude'],df['dropoff_longitude']))

In [8]:
df['weekday'] = pd.DatetimeIndex(df['pickup_datetime']).weekday # 6 sun 0 mon

In [9]:
df.head()

Unnamed: 0,pickup_datetime,fare_amount,fare_class,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_hour,AM_or_PM,distance_km,weekday
0,2010-04-19 08:17:56 UTC,6.5,0,-73.992365,40.730521,-73.975499,40.744746,1,8,AM,2.126312,0
1,2010-04-17 15:43:53 UTC,6.9,0,-73.990078,40.740558,-73.974232,40.744114,1,15,PM,1.392307,5
2,2010-04-17 11:23:26 UTC,10.1,1,-73.994149,40.751118,-73.960064,40.766235,2,11,AM,3.326763,5
3,2010-04-11 21:25:03 UTC,8.9,0,-73.990485,40.756422,-73.971205,40.748192,1,21,PM,1.864129,6
4,2010-04-17 02:19:01 UTC,19.7,1,-73.990976,40.734202,-73.905956,40.743115,1,2,AM,7.231321,5


In [10]:
for i in ['pickup_hour','AM_or_PM','weekday']:
    df[i] = df[i].astype('category')

In [11]:
df = df.drop(columns=['pickup_datetime','pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude'])

In [12]:
df.info()

<class 'modin.pandas.dataframe.DataFrame'>
RangeIndex: 120000 entries, 0 to 119999
Data columns (total 7 columns):
 #   Column           Non-Null Count   Dtype   
---  ---------------  ---------------  -----   
 0   fare_amount      120000 non-null  float64
 1   fare_class       120000 non-null  int64
 2   passenger_count  120000 non-null  int64
 3   pickup_hour      120000 non-null  category
 4   AM_or_PM         120000 non-null  category
 5   distance_km      120000 non-null  float64
 6   weekday          120000 non-null  category
dtypes: int64(2), float64(2), category(1), category(1), category(1)
memory usage: 4.0 MB




In [13]:
df['AM_or_PM'].cat.categories

To request implementation, send an email to feature_requests@modin.org.


Index(['AM', 'PM'], dtype='object')

In [14]:
df['weekday'].cat.codes



0         0
1         5
2         5
3         6
4         5
         ..
119995    6
119996    4
119997    6
119998    1
119999    5
Length: 120000, dtype: int8

In [15]:
cat_c = np.stack([df[i].cat.codes.values for i in ['pickup_hour','AM_or_PM','weekday']], axis = 1)



In [16]:
conc_c = np.stack([df[i].values for i in ['fare_class','passenger_count','distance_km']], axis = 1)

In [17]:
cat_c = torch.tensor(cat_c, dtype =torch.int64 )
conc_c = torch.tensor(conc_c, dtype =torch.float )
y = torch.tensor(df['fare_amount'].values, dtype = torch.float).reshape(-1,1)

In [18]:
y

tensor([[ 6.5000],
        [ 6.9000],
        [10.1000],
        ...,
        [12.5000],
        [ 4.9000],
        [ 5.3000]])

In [19]:
conc_c.shape

torch.Size([120000, 3])

# Now One hot encode using torch embedding which is a lookup table containg key value pair where key is the category and value is the vector of dim n

In [20]:
cat_size = [len(df[i].cat.categories) for i in ['pickup_hour','AM_or_PM','weekday']]



In [21]:
9//2

4

In [22]:
9/2

4.5

In [23]:
embed_size = [(size, min(50, (size+1)//2)) for size in cat_size] # // rounds down 50 because we want vector of size 50 or less

In [24]:
embed_size

[(24, 12), (2, 1), (7, 4)]

In [25]:
selfembed = nn.ModuleList([nn.Embedding(ni,nf) for ni,nf in embed_size])

In [26]:
cat_c[4]

tensor([2, 0, 5])

In [27]:
test_cat_c = torch.tensor([[2,1,2]])
test_cat_c

tensor([[2, 1, 2]])

In [28]:
embedding = []
for i,e in enumerate(selfembed):
    embedding.append(e(test_cat_c[:,i]))

In [29]:
embedding

[tensor([[ 1.6735,  0.2573, -0.3181, -0.1085, -1.3318, -2.3106,  0.8655, -1.7698,
           0.6201,  0.0875, -0.4120, -1.7357]], grad_fn=<EmbeddingBackward>),
 tensor([[1.4280]], grad_fn=<EmbeddingBackward>),
 tensor([[ 1.0426, -0.0650, -2.1356, -0.1983]], grad_fn=<EmbeddingBackward>)]

In [30]:
embedding = []
for i,e in enumerate(selfembed):
    embedding.append(e(cat_c[4,i]))
embedding

[tensor([ 1.6735,  0.2573, -0.3181, -0.1085, -1.3318, -2.3106,  0.8655, -1.7698,
          0.6201,  0.0875, -0.4120, -1.7357], grad_fn=<EmbeddingBackward>),
 tensor([-0.5656], grad_fn=<EmbeddingBackward>),
 tensor([-0.8459,  0.6071,  2.9163, -0.6494], grad_fn=<EmbeddingBackward>)]

# from the above experiment we saw that the vector for 2 remains the same 

In [31]:
torch.cat(embedding)

tensor([ 1.6735,  0.2573, -0.3181, -0.1085, -1.3318, -2.3106,  0.8655, -1.7698,
         0.6201,  0.0875, -0.4120, -1.7357, -0.5656, -0.8459,  0.6071,  2.9163,
        -0.6494], grad_fn=<CatBackward>)

# Setting up Tabular model

In [32]:
import torch.nn as nn

In [33]:
class TabularModel(nn.Module):
    
    def __init__(self, embed_size, n_cont, n_out, neuron_layer, prob = 0.5):
        super().__init__()
        self.embed = nn.ModuleList([nn.Embedding(ni,nf) for ni,nf in embed_size])
        self.embed_drop = nn.Dropout(prob)
        self.batchnorm_cont = nn.BatchNorm1d(n_cont)
        
        num_embed = sum([nf for ni,nf in embed_size])
        n_input = num_embed + n_cont
        
        layerlist = []
        for i in neuron_layer:
            layerlist.append(nn.Linear(n_input,i))
            layerlist.append(nn.ReLU(inplace=True))
            layerlist.append(nn.BatchNorm1d(i))
            layerlist.append(nn.Dropout(prob))
            n_input = i
        layerlist.append(nn.Linear(neuron_layer[-1], n_out))
        self.layers = nn.Sequential(*layerlist)
        
    def forward(self, x_cat, x_cont):
        embeddings = []
        for i,e in enumerate(self.embed):
            embeddings.append(e(x_cat[:,i]))
        x = torch.cat(embeddings, axis = 1)
        x = self.embed_drop(x)
        x_cont = self.batchnorm_cont(x_cont)
        x = torch.cat([x,x_cont],axis = 1)
        x = self.layers(x)
        return x
        

In [34]:
model = TabularModel(embed_size,conc_c.shape[1],1,[32,64,128])

In [35]:
loss_func = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(),lr = 1e-3)

In [36]:
train_size = 60000
test_size = int(train_size*0.2)

In [37]:
train_cat = cat_c[:train_size-test_size]
test_cat = cat_c[train_size-test_size:train_size]
train_cont = conc_c[:train_size-test_size]
test_cont = conc_c[train_size-test_size:train_size]
y_train = y[:train_size-test_size]
y_test = y[train_size-test_size:train_size]

In [38]:
train_cat.shape

torch.Size([48000, 3])

In [40]:


epochs = 300
losses = []

for i in range(epochs):
    i = i+1
    
    y_pred = model(train_cat, train_cont)
    loss = torch.sqrt(loss_func(y_pred, y_train))
    losses.append(loss)
    
    if i%10 == 0:
        print(f'Epoch {i} loss is {loss}')
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

Epoch 10 loss is 12.246439933776855
Epoch 20 loss is 11.748432159423828
Epoch 30 loss is 11.184530258178711
Epoch 40 loss is 10.802921295166016
Epoch 50 loss is 10.530203819274902
Epoch 60 loss is 10.295490264892578
Epoch 70 loss is 10.101095199584961
Epoch 80 loss is 9.819140434265137
Epoch 90 loss is 9.524518966674805
Epoch 100 loss is 9.185430526733398
Epoch 110 loss is 8.785958290100098
Epoch 120 loss is 8.312644958496094
Epoch 130 loss is 7.797293186187744
Epoch 140 loss is 7.269660949707031
Epoch 150 loss is 6.649996757507324
Epoch 160 loss is 6.114395618438721
Epoch 170 loss is 5.595784664154053
Epoch 180 loss is 5.084903240203857
Epoch 190 loss is 4.686479568481445
Epoch 200 loss is 4.425530910491943
Epoch 210 loss is 4.246530532836914
Epoch 220 loss is 4.1729888916015625
Epoch 230 loss is 4.099949836730957
Epoch 240 loss is 4.0685319900512695
Epoch 250 loss is 3.9654102325439453
Epoch 260 loss is 3.9484739303588867
Epoch 270 loss is 3.8836255073547363
Epoch 280 loss is 3.85851