In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# One shared seed for every random number generator used in this notebook,
# so that shuffles, splits and weight initialisation are repeatable.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
print("All imports loaded.")

All imports loaded.


In [5]:
# Read the first 1,000,000 rows, shuffle them reproducibly (frac=1 keeps every
# row), and renumber the index from 0 so it matches the new row order.
df = (
    pd.read_csv('train.csv', nrows=1_000_000)
    .sample(frac=1, random_state=SEED)
    .reset_index(drop=True)
)
print(f"Dataset shape: {df.shape}")
df.head()  # preview the first 5 rows

Dataset shape: (1000000, 11)


Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id3435429,1,2016-02-18 22:34:53,2016-02-18 22:49:37,1,-73.956161,40.6945,-73.987869,40.720985,N,884
1,id2267606,1,2016-05-14 14:37:43,2016-05-14 14:52:09,1,-73.975922,40.757133,-73.950813,40.770882,N,866
2,id3771460,1,2016-06-15 01:42:25,2016-06-15 01:52:02,1,-73.982391,40.762222,-73.952019,40.777706,N,577
3,id2766058,2,2016-03-21 22:37:26,2016-03-21 22:42:10,2,-73.998482,40.740463,-74.004646,40.722782,N,284
4,id2834780,2,2016-06-07 21:33:57,2016-06-07 21:36:06,1,-73.998169,40.73555,-73.991913,40.744041,N,129


In [9]:
# Parse the pickup timestamp strings into real datetime values so the .dt
# accessor can be used below.
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])

# Calendar features derived from the pickup time; traffic depends on when the
# trip starts (e.g. 5pm rush hour vs. late night, weekday vs. weekend).
df['hour'] = df['pickup_datetime'].dt.hour           # 0-23
df['day_of_week'] = df['pickup_datetime'].dt.dayofweek  # 0 = Monday ... 6 = Sunday
df['month'] = df['pickup_datetime'].dt.month         # 1-12
df['is_weekend'] = df['day_of_week'].ge(5).astype(int)  # 1 for Sat/Sun, else 0
# Haversine distance (spatial feature)
#math formula that calculates the straight-line distance between two points on Earth
def haversine(lat1, lon1, lat2, lon2):
    """Great-circle distance in kilometres between two points on Earth.

    All four arguments are coordinates in decimal degrees. They may be scalars
    or array-likes (NumPy arrays, pandas Series); array inputs are handled
    element-wise through NumPy broadcasting.

    Returns the distance(s) in km, computed on a sphere of radius 6371 km.
    """
    R = 6371  # earth radius in km
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Mathematically 0 <= a <= 1, but floating-point rounding can push it a hair
    # above 1 for (near-)antipodal points, which would make arcsin return NaN.
    a = np.clip(a, 0.0, 1.0)
    return 2 * R * np.arcsin(np.sqrt(a))
# Call haversine with the pickup and dropoff coordinates from the dataset and save the result as a new column.
# Straight-line trip length in km, from pickup point to dropoff point
# (argument order is lat1, lon1, lat2, lon2).
df['distance_km'] = haversine(
    *(df[col] for col in ('pickup_latitude', 'pickup_longitude',
                          'dropoff_latitude', 'dropoff_longitude'))
)
print("Features created.")
# Summary statistics for the engineered features (plus passenger_count).
df.loc[:, ['hour', 'day_of_week', 'month', 'is_weekend', 'distance_km', 'passenger_count']].describe()


Features created.


Unnamed: 0,hour,day_of_week,month,is_weekend,distance_km,passenger_count
count,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0
mean,13.608696,3.05153,3.515646,0.285446,3.443271,1.664537
std,6.3999,1.953945,1.680567,0.451627,4.381411,1.314415
min,0.0,0.0,1.0,0.0,0.0,0.0
25%,9.0,1.0,2.0,0.0,1.232458,1.0
50%,14.0,3.0,4.0,0.0,2.094754,1.0
75%,19.0,5.0,5.0,1.0,3.877582,2.0
max,23.0,6.0,6.0,1.0,1240.908677,9.0


In [11]:
# Model inputs: calendar features, trip distance, passenger count and the raw
# pickup/dropoff coordinates. The prediction target is trip_duration.
feature_cols = [
    'hour', 'day_of_week', 'month', 'is_weekend',
    'distance_km', 'passenger_count',
    'pickup_latitude', 'pickup_longitude',
    'dropoff_latitude', 'dropoff_longitude',
]
target = 'trip_duration'

X = df[feature_cols].to_numpy()
y = df[target].to_numpy()

# Stage 1: set aside half of the data as the final holdout test set
# (not to be touched until the final evaluation).
X_dev, X_test, y_dev, y_test = train_test_split(
    X, y, random_state=SEED, test_size=0.50
)
# Stage 2: split the remaining development half into training (2/3)
# and validation (1/3).
X_train, X_val, y_train, y_val = train_test_split(
    X_dev, y_dev, random_state=SEED, test_size=1/3
)
print(f"Train: {X_train.shape[0]:,}")
print(f"Val:   {X_val.shape[0]:,}")
print(f"Test:  {X_test.shape[0]:,} (holdout - do not use until final eval)")


Train: 333,333
Val:   166,667
Test:  500,000 (holdout - do not use until final eval)


In [None]:
# Standardize every feature column to zero mean and unit variance so that no
# column dominates purely because of its units.
# The scaler's mean/std are learned from the training split ONLY (fit_transform);
# validation and test are then transformed with those same statistics, which
# keeps information from the held-out data from leaking into preprocessing.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)
print("Features scaled.")