In [24]:
import torch
import torch.nn as nn
import modin.pandas as pd
import numpy as np

In [10]:
# NOTE(review): absolute, machine-specific Windows path; the notebook will not
# run elsewhere until this points at a local copy of NYCTaxiFares.csv.
csv_path = r"D:\PyTorch for Deep Learning with Python Bootcamp\1. Course Overview, Installs, and Setup\PYTORCH_NOTEBOOKS\Data\NYCTaxiFares.csv"
df = pd.read_csv(csv_path)

In [11]:
df.info()

<class 'modin.pandas.dataframe.DataFrame'>
RangeIndex: 120000 entries, 0 to 119999
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype  
---  -----------------  ---------------  -----  
 0   pickup_datetime    120000 non-null  object
 1   fare_amount        120000 non-null  float64
 2   fare_class         120000 non-null  int64
 3   pickup_longitude   120000 non-null  float64
 4   pickup_latitude    120000 non-null  float64
 5   dropoff_longitude  120000 non-null  float64
 6   dropoff_latitude   120000 non-null  float64
 7   passenger_count    120000 non-null  int64
dtypes: float64(5), int64(2), object(1)
memory usage: 7.3 MB




In [18]:
# Hour of day (0-23) parsed from the pickup timestamp strings.
# NOTE(review): the sample timestamps carry a "UTC" suffix, so this is the UTC
# hour, not New York local time; confirm that is what the model should see.
pickup_ts = pd.to_datetime(df['pickup_datetime'])
df['pickup_hour'] = pickup_ts.dt.hour

In [22]:
# Label each ride 'AM' (pickup_hour < 12) or 'PM' (otherwise).
# Vectorised with np.where instead of a row-wise df.apply(..., axis=1), which
# invokes a Python lambda once per row.
df['AM_or_PM'] = np.where(df['pickup_hour'].values < 12, 'AM', 'PM')



In [25]:
def haversine_distance(lat1, long1, lat2, long2, r=6371):
    """Great-circle distance between two points using the haversine formula.

    All coordinate arguments are in degrees and may be scalars or NumPy
    arrays (they are combined with NumPy ufuncs, so arrays broadcast
    element-wise).

    Args:
        lat1, long1: latitude / longitude of the first point, in degrees.
        lat2, long2: latitude / longitude of the second point, in degrees.
        r: sphere radius; the result is in the same unit. Defaults to 6371,
            Earth's mean radius in kilometres.

    Returns:
        The distance along the sphere's surface, same shape as the
        broadcast inputs.
    """
    phi1 = np.radians(lat1)
    phi2 = np.radians(lat2)

    delta_phi = np.radians(lat2 - lat1)
    delta_lambda = np.radians(long2 - long1)

    a = np.sin(delta_phi / 2) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda / 2) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - a) NaN; clamp it.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return r * c

In [27]:
# haversine_distance is built entirely from NumPy ufuncs, so it can be called
# once on the whole coordinate columns instead of once per row through map().
df['distance_km'] = haversine_distance(
    df['pickup_latitude'].values,
    df['pickup_longitude'].values,
    df['dropoff_latitude'].values,
    df['dropoff_longitude'].values,
)

In [34]:
# Day of week as an integer: Monday=0 ... Sunday=6.
pickup_ts = pd.to_datetime(df['pickup_datetime'])
df['weekday'] = pickup_ts.dt.weekday

In [36]:
df.head()

Unnamed: 0,pickup_datetime,fare_amount,fare_class,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_hour,AM_or_PM,distance_km,weekday
0,2010-04-19 08:17:56 UTC,6.5,0,-73.992365,40.730521,-73.975499,40.744746,1,8,AM,2.126312,0
1,2010-04-17 15:43:53 UTC,6.9,0,-73.990078,40.740558,-73.974232,40.744114,1,15,PM,1.392307,5
2,2010-04-17 11:23:26 UTC,10.1,1,-73.994149,40.751118,-73.960064,40.766235,2,11,AM,3.326763,5
3,2010-04-11 21:25:03 UTC,8.9,0,-73.990485,40.756422,-73.971205,40.748192,1,21,PM,1.864129,6
4,2010-04-17 02:19:01 UTC,19.7,1,-73.990976,40.734202,-73.905956,40.743115,1,2,AM,7.231321,5


In [49]:
# Store the engineered time features as pandas categoricals so that their
# integer codes (.cat.codes) can later be used as embedding indices.
categorical_cols = ['pickup_hour', 'AM_or_PM', 'weekday']
for col in categorical_cols:
    df[col] = df[col].astype('category')

In [43]:
# The raw timestamp and coordinates have been replaced by the engineered
# pickup_hour / AM_or_PM / weekday / distance_km columns.
# Note: re-running this cell on the already-trimmed frame raises KeyError.
raw_columns = [
    'pickup_datetime',
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
]
df = df.drop(columns=raw_columns)

In [50]:
df.info()

<class 'modin.pandas.dataframe.DataFrame'>
RangeIndex: 120000 entries, 0 to 119999
Data columns (total 7 columns):
 #   Column           Non-Null Count   Dtype   
---  ---------------  ---------------  -----   
 0   fare_amount      120000 non-null  float64
 1   fare_class       120000 non-null  int64
 2   passenger_count  120000 non-null  int64
 3   pickup_hour      120000 non-null  category
 4   AM_or_PM         120000 non-null  category
 5   distance_km      120000 non-null  float64
 6   weekday          120000 non-null  category
dtypes: float64(2), int64(2), category(1), category(1), category(1)
memory usage: 4.0 MB


In [53]:
df['AM_or_PM'].cat.categories

Index(['AM', 'PM'], dtype='object')

In [51]:
df['weekday'].cat.codes



0         0
1         5
2         5
3         6
4         5
         ..
119995    6
119996    4
119997    6
119998    1
119999    5
Length: 120000, dtype: int8

In [69]:
# One column of integer category codes per categorical feature, shape (N, 3).
cat_cols = ['pickup_hour', 'AM_or_PM', 'weekday']
cat_c = np.column_stack([df[col].cat.codes.values for col in cat_cols])

In [75]:
# Continuous inputs stacked column-wise, shape (N, 3).
# NOTE(review): in the rows shown by df.head(), fare_class is 1 exactly where
# fare_amount is >= 10, so it looks derived from the regression target
# (possible label leakage). Confirm before using it as an input feature.
cont_cols = ['fare_class', 'passenger_count', 'distance_km']
conc_c = np.column_stack([df[col].values for col in cont_cols])

In [78]:
# Convert the stacked NumPy arrays to tensors: int64 category codes (the index
# dtype nn.Embedding expects), float64 continuous features, and the fare target
# reshaped into a single column.
# NOTE(review): nn.Module parameters are float32 by default, so feeding these
# float64 tensors to a model will need either model.double() or float32 here;
# confirm before training.
# Note: cat_c / conc_c are rebound from ndarray to tensor, so this cell is not
# meant to be re-run without re-running the two cells that build them.
cat_c = torch.tensor(cat_c, dtype =torch.int64 )
conc_c = torch.tensor(conc_c, dtype =torch.float64 )
y = torch.tensor(df['fare_amount'].values, dtype = torch.float64).reshape(-1,1)



In [79]:
y

tensor([[ 6.5000],
        [ 6.9000],
        [10.1000],
        ...,
        [12.5000],
        [ 4.9000],
        [ 5.3000]], dtype=torch.float64)

In [81]:
conc_c.shape

torch.Size([120000, 3])

# Now encode the categorical columns with torch embeddings (a dense alternative to one-hot encoding): an embedding is a lookup table of key-value pairs, where the key is the category code and the value is a vector of dimension n

In [86]:
# Number of distinct categories in each categorical column, in column order.
embedding_cols = ['pickup_hour', 'AM_or_PM', 'weekday']
cat_size = [len(df[col].cat.categories) for col in embedding_cols]

In [92]:
9//2

4

In [93]:
9/2

4.5

In [120]:
# (num_categories, embedding_dim) per categorical column. The embedding
# dimension is half the category count rounded up ((size+1)//2, where // is
# floor division), capped at 50 so no embedding vector is longer than 50.
embed_size = [(size, min(50, (size+1)//2)) for size in cat_size]

In [121]:
embed_size

[(24, 12), (2, 1), (7, 4)]

In [97]:
selfembed = nn.ModuleList([nn.Embedding(ni,nf) for ni,nf in embed_size])

In [114]:
cat_c[4]

tensor([2, 0, 5])

In [115]:
test_cat_c = torch.tensor([[2,1,2]])
test_cat_c

tensor([[2, 1, 2]])

In [116]:
embedding = []
for i,e in enumerate(selfembed):
    embedding.append(e(test_cat_c[:,i]))

In [117]:
embedding

[tensor([[-0.9741, -0.2391,  0.5787, -0.0100,  0.7568, -1.3769, -1.6004,  0.6238,
           0.2791, -0.5105, -0.7860,  0.1719]], grad_fn=<EmbeddingBackward>),
 tensor([[0.4045]], grad_fn=<EmbeddingBackward>),
 tensor([[-1.0774, -1.3415,  0.6052, -0.2177]], grad_fn=<EmbeddingBackward>)]

In [118]:
embedding = []
for i,e in enumerate(selfembed):
    embedding.append(e(cat_c[4,i]))
embedding

[tensor([-0.9741, -0.2391,  0.5787, -0.0100,  0.7568, -1.3769, -1.6004,  0.6238,
          0.2791, -0.5105, -0.7860,  0.1719], grad_fn=<EmbeddingBackward>),
 tensor([0.5953], grad_fn=<EmbeddingBackward>),
 tensor([ 0.9992, -0.6034, -0.7628,  0.9993], grad_fn=<EmbeddingBackward>)]

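# From the experiment above: the embedding for pickup_hour code 2 is the same vector in both lookups (test_cat_c and cat_c[4]), because an embedding layer is a fixed lookup table until its weights are updated by training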

In [124]:
torch.cat(embedding)

tensor([-0.9741, -0.2391,  0.5787, -0.0100,  0.7568, -1.3769, -1.6004,  0.6238,
         0.2791, -0.5105, -0.7860,  0.1719,  0.5953,  0.9992, -0.6034, -0.7628,
         0.9993], grad_fn=<CatBackward>)

# Setting up Tabular model

In [125]:
import torch.nn as nn

In [None]:
class TabularModel(nn.Module):
    """Feed-forward network for tabular data with categorical and continuous inputs.

    Each categorical column goes through its own ``nn.Embedding``. The embedded
    vectors are concatenated, passed through dropout, joined with the
    batch-normalised continuous features and fed through a stack of
    ``Linear -> ReLU -> BatchNorm1d -> Dropout`` blocks, followed by a final
    ``Linear`` layer with ``n_out`` outputs.

    Args:
        embed_size: sequence of ``(num_categories, embedding_dim)`` pairs, one
            per categorical column, in the column order of ``x_cat``.
        n_cont: number of continuous feature columns.
        n_out: number of outputs of the final linear layer.
        neuron_layer: hidden layer widths, in order; may be empty, in which
            case the inputs feed the output layer directly.
        prob: dropout probability used after the embeddings and after every
            hidden block.
    """

    def __init__(self, embed_size, n_cont, n_out, neuron_layer, prob = 0.5):
        super().__init__()
        self.embed = nn.ModuleList([nn.Embedding(ni, nf) for ni, nf in embed_size])
        self.embed_drop = nn.Dropout(prob)
        self.batchnorm_cont = nn.BatchNorm1d(n_cont)

        # Width of the first Linear layer: all embedding dims plus continuous columns.
        num_embed = sum(nf for _, nf in embed_size)
        n_input = num_embed + n_cont

        layerlist = []
        for width in neuron_layer:
            layerlist.append(nn.Linear(n_input, width))
            layerlist.append(nn.ReLU(inplace=True))
            layerlist.append(nn.BatchNorm1d(width))
            layerlist.append(nn.Dropout(prob))
            n_input = width
        # Output layer. n_input is the last hidden width, or the raw input
        # width when neuron_layer is empty.
        layerlist.append(nn.Linear(n_input, n_out))

        # Register the stack as a submodule so its parameters are tracked.
        self.layers = nn.Sequential(*layerlist)

    def forward(self, x_cat, x_cont):
        """Run the network on one batch.

        Args:
            x_cat: integer tensor of category codes, one column per entry of
                ``embed_size`` (indexed as ``x_cat[:, i]``).
            x_cont: float tensor of continuous features with ``n_cont`` columns.

        Returns:
            Tensor with ``n_out`` columns, one row per input row.
        """
        embedded = [embed(x_cat[:, col]) for col, embed in enumerate(self.embed)]
        x = self.embed_drop(torch.cat(embedded, dim=1))
        x = torch.cat([x, self.batchnorm_cont(x_cont)], dim=1)
        return self.layers(x)

    def forward_pass(self, x_cat, x_cont):
        """Alias kept for the original method name; delegates to ``forward``."""
        return self.forward(x_cat, x_cont)
        