## Introduction

This is a kernel with starter code demonstrating how to read in the data and begin exploring. Click the blue "Edit Notebook" or "Fork Notebook" button at the top of this kernel to begin editing.

## Exploratory Analysis

To begin this exploratory analysis, first use `matplotlib` to import libraries and define functions for plotting the data. Depending on the data, not all plots will be made. (Hey, I'm just a kerneling bot, not a Kaggle Competitions Grandmaster!)

In [1]:
%pip install matplotlib numpy pandas scikit-learn dask "dask[dataframe]" seaborn

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [3]:
%pip install torch

Defaulting to user installation because normal site-packages is not writeable
Collecting torch
  Downloading torch-2.5.1-cp39-none-macosx_11_0_arm64.whl.metadata (28 kB)
Collecting filelock (from torch)
  Downloading filelock-3.16.1-py3-none-any.whl.metadata (2.9 kB)
Collecting networkx (from torch)
  Downloading networkx-3.2.1-py3-none-any.whl.metadata (5.2 kB)
Collecting sympy==1.13.1 (from torch)
  Downloading sympy-1.13.1-py3-none-any.whl.metadata (12 kB)
Collecting mpmath<1.4,>=1.1.0 (from sympy==1.13.1->torch)
  Downloading mpmath-1.3.0-py3-none-any.whl.metadata (8.6 kB)
Downloading torch-2.5.1-cp39-none-macosx_11_0_arm64.whl (63.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.9/63.9 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[?25hDownloading sympy-1.13.1-py3-none-any.whl (6.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.2/6.2 MB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0mm
[?25hDownloading fil

In [4]:
import os # accessing directory structure
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from torch.optim.lr_scheduler import ReduceLROnPlateau
from tqdm import tqdm
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler

Check the data in the 2 created datasets (2019newBig.csv: 12M rows, 2019new.csv: 1.2M rows)

Load the DB and cleanup

In [None]:
dataset = '../Datasets/Small_datasetPreprocessed.parquet'

if os.path.exists(dataset):
    df = pd.read_parquet(dataset)
    df = df.dropna() 
    print(df.head(1))
    print(df.shape)
else:
    print("Dataset not found")

   vendorid  passenger_count  trip_distance  ratecodeid  fare_amount  \
0       2.0              1.0           0.52         1.0          4.5   

   total_amount  total_amount_new  pickup_hour  time_in_taxi  tavg  \
0          6.36               5.3            9      3.616667  71.5   

   precipitation  new_snow  snow_depth  day_type  service_zone_pulocation  \
0            0.0       0.0         0.0         1                      3.0   

   service_zone_dolocation  
0                      3.0  
(1110879, 16)


Start models training with different NN and parameter to see the best ones

In [11]:
'''
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import torch.optim as optim
from tqdm import tqdm

# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Define the neural network model
def NN(input_size):
    class Net(nn.Module):
        def __init__(self):
            super(Net, self).__init__()
            self.fc1 = nn.Linear(input_size, 128)
            self.fc2 = nn.Linear(128, 64)
            self.fc3 = nn.Linear(64, 1)

        def forward(self, x):
            x = F.relu(self.fc1(x))
            x = F.relu(self.fc2(x))
            x = self.fc3(x)
            return x

    return Net()

# Custom dataset class
class NYCTaxiDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# Preprocess the data
input_size = df.shape[1] - 1
X = df.drop(['fare_amount'], axis=1).values
y = df['fare_amount'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

X_train = torch.FloatTensor(X_train)
y_train = torch.FloatTensor(y_train).view(-1, 1)
X_test = torch.FloatTensor(X_test)
y_test = torch.FloatTensor(y_test).view(-1, 1)

train_dataset = NYCTaxiDataset(X_train, y_train)
test_dataset = NYCTaxiDataset(X_test, y_test)

batch_size = 8196
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=24, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=24, pin_memory=True)

# Initialize the model and wrap it with DataParallel
model = NN(input_size).to(device)
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs with DataParallel")
    model = nn.DataParallel(model)

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Training loop with progress bar
def train_model(model, train_loader, num_epochs=10):
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        with tqdm(total=len(train_loader), desc=f'Epoch [{epoch+1}/{num_epochs}]', unit='batch') as pbar:
            for inputs, labels in train_loader:
                inputs, labels = inputs.to(device, non_blocking=True), labels.to(device, non_blocking=True)

                optimizer.zero_grad()
                outputs = model(inputs)
                loss = criterion(outputs, labels)

                loss.backward()
                optimizer.step()

                running_loss += loss.item()
                pbar.update(1)

        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss / len(train_loader):.4f}")

# Evaluation function for regression
def evaluate_model(model, test_loader):
    model.eval()
    predictions = []
    actuals = []
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device, non_blocking=True), labels.to(device, non_blocking=True)
            outputs = model(inputs)
            predictions.extend(outputs.cpu().numpy())
            actuals.extend(labels.cpu().numpy())

    mse = mean_squared_error(actuals, predictions)
    print(f"Test MSE: {mse:.2f}")
    print(f"Test RMSE: {mse ** 0.5:.2f}")

# Train the model
train_model(model, train_loader)

# Evaluate the model
evaluate_model(model, test_loader)

# Save the trained model
torch.save(model.state_dict(), 'model_Bignew.pth')

print("Training and evaluation completed!")
'''

Epoch [1/10]:   0%|                                  | 0/116 [00:00<?, ?batch/s]Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'NYCTaxiDataset' on <module '__main__' (built-in)>
Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Ve

RuntimeError: DataLoader worker (pid(s) 18984) exited unexpectedly

Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'NYCTaxiDataset' on <module '__main__' (built-in)>
Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/multiprocessing/spawn.py", line 126, in _main
    self 

In [10]:


# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


class Net(nn.Module):
    def __init__(self,input_size):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(input_size, 512)  # Larger layer with 512 neurons
        self.bn1 = nn.BatchNorm1d(512)
        self.fc2 = nn.Linear(512, 256)         # Another large layer with 256 neurons            
        self.bn2 = nn.BatchNorm1d(256)
        self.fc3 = nn.Linear(256, 128)         # 128 neurons
        self.bn3 = nn.BatchNorm1d(128)
        self.fc4 = nn.Linear(128, 64)          # 64 neurons
        self.bn4 = nn.BatchNorm1d(64)
        self.fc5 = nn.Linear(64, 32)           # 32 neurons
        self.bn5 = nn.BatchNorm1d(32)
        self.fc6 = nn.Linear(32, 1)            # Output layer

        self.dropout = nn.Dropout(0.4)         # Increased dropout rate to combat overfitting

    def forward(self, x):
        x = F.relu(self.bn1(self.fc1(x)))
        x = self.dropout(x)
        x = F.relu(self.bn2(self.fc2(x)))
        x = self.dropout(x)
        x = F.relu(self.bn3(self.fc3(x)))
        x = self.dropout(x)
        x = F.relu(self.bn4(self.fc4(x)))
        x = self.dropout(x)
        x = F.relu(self.bn5(self.fc5(x)))
        x = self.fc6(x)
        return x


# Custom dataset class
class NYCTaxiDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# Preprocess the data
input_size = df.shape[1] - 1
X = df.drop(['fare_amount'], axis=1).values
y = df['fare_amount'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

X_train = torch.FloatTensor(X_train)
y_train = torch.FloatTensor(y_train).view(-1, 1)
X_test = torch.FloatTensor(X_test)
y_test = torch.FloatTensor(y_test).view(-1, 1)

train_dataset = NYCTaxiDataset(X_train, y_train)
test_dataset = NYCTaxiDataset(X_test, y_test)

batch_size = 512
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=16, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=16, pin_memory=True)

# Initialize the model and wrap it with DataParallel if multiple GPUs are available
model = Net(input_size)
model = model.to(device)
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs with DataParallel")
    model = nn.DataParallel(model)

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)

# Training loop with TF32 enabled
def train_model(model, train_loader, num_epochs=100):
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        with tqdm(total=len(train_loader), desc=f'Epoch [{epoch+1}/{num_epochs}]', unit='batch') as pbar:
            for inputs, labels in train_loader:
                inputs, labels = inputs.to(device, non_blocking=True), labels.to(device, non_blocking=True)

                optimizer.zero_grad()
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

                running_loss += loss.item()
                pbar.update(1)

        epoch_loss = running_loss / len(train_loader)
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}")
        scheduler.step(epoch_loss)
        if(epoch%10==0):
            name = f'Models/model_LargeNN_{epoch // 10}.pth'
            torch.save(model, name)
            evaluate_model(model, test_loader)

# Evaluation function for regression
def evaluate_model(model, test_loader):
    model.eval()
    predictions = []
    actuals = []
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device, non_blocking=True), labels.to(device, non_blocking=True)
            outputs = model(inputs)
            predictions.extend(outputs.cpu().numpy())
            actuals.extend(labels.cpu().numpy())

    mse = mean_squared_error(actuals, predictions)
    print(f"Test MSE: {mse:.2f}")
    print(f"Test RMSE: {mse ** 0.5:.2f}")

# Train the model
train_model(model, train_loader)

# Evaluate the model
evaluate_model(model, test_loader)

# Save the trained model
torch.save(model, 'model_LargeNN.pth')

print("Training and evaluation completed!")


Epoch [1/100]:   0%|                                | 0/1845 [00:00<?, ?batch/s]Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'NYCTaxiDataset' on <module '__main__' (built-in)>
Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Ve

RuntimeError: DataLoader worker (pid(s) 18905) exited unexpectedly

    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'NYCTaxiDataset' on <module '__main__' (built-in)>


In [8]:


# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class Net(nn.Module):
    def __init__(self,input_size):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(input_size, 512)  # Larger layer with 512 neurons
        self.bn1 = nn.BatchNorm1d(512)
        self.fc2 = nn.Linear(512, 256)         # Another large layer with 256 neurons            
        self.bn2 = nn.BatchNorm1d(256)
        self.fc3 = nn.Linear(256, 128)         # 128 neurons
        self.bn3 = nn.BatchNorm1d(128)
        self.fc4 = nn.Linear(128, 64)          # 64 neurons
        self.bn4 = nn.BatchNorm1d(64)
        self.fc5 = nn.Linear(64, 32)           # 32 neurons
        self.bn5 = nn.BatchNorm1d(32)
        self.fc6 = nn.Linear(32, 1)            # Output layer

        self.dropout = nn.Dropout(0.4)         # Increased dropout rate to combat overfitting

    def forward(self, x):
        x = F.relu(self.bn1(self.fc1(x)))
        x = self.dropout(x)
        x = F.relu(self.bn2(self.fc2(x)))
        x = self.dropout(x)
        x = F.relu(self.bn3(self.fc3(x)))
        x = self.dropout(x)
        x = F.relu(self.bn4(self.fc4(x)))
        x = self.dropout(x)
        x = F.relu(self.bn5(self.fc5(x)))
        x = self.fc6(x)
        return x

# Custom dataset class
class NYCTaxiDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# Preprocess the data
input_size = df.shape[1] - 1
X = df.drop(['fare_amount'], axis=1).values
y = df['fare_amount'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

X_train = torch.FloatTensor(X_train)
y_train = torch.FloatTensor(y_train).view(-1, 1)
X_test = torch.FloatTensor(X_test)
y_test = torch.FloatTensor(y_test).view(-1, 1)

train_dataset = NYCTaxiDataset(X_train, y_train)
test_dataset = NYCTaxiDataset(X_test, y_test)

batch_size = 512
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=16, pin_memory=True)
# Function to load the entire model
def load_entire_model(filepath):
    model = torch.load(filepath, map_location=device)  # Load the entire model
    model.to(device)
    model.eval()  # Set the model to evaluation mode
    return model

# Evaluation function for regression with R² score, MAE, and MSE
def evaluate_model(model, test_loader):
    model.eval()
    predictions = []
    actuals = []
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device, non_blocking=True), labels.to(device, non_blocking=True)
            outputs = model(inputs)
            predictions.extend(outputs.cpu().numpy())
            actuals.extend(labels.cpu().numpy())

    mse = mean_squared_error(actuals, predictions)
    mae = mean_absolute_error(actuals, predictions)  # Calculate MAE
    r2 = r2_score(actuals, predictions)  # Calculate R² score
    
    print(f"Test MSE: {mse:.2f}")
    print(f"Test RMSE: {mse ** 0.5:.2f}")
    print(f"Test MAE: {mae:.2f}")
    print(f"Test R² Score: {r2:.4f}")

# Load the entire model and evaluate it
model_filepath = 'model_LargeNN.pth'  # Replace with your .pth file path
loaded_model = load_entire_model(model_filepath)

# Assume test_loader is defined as before
evaluate_model(loaded_model, test_loader)


  model = torch.load(filepath, map_location=device)  # Load the entire model


FileNotFoundError: [Errno 2] No such file or directory: 'model_LargeNN.pth'