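This notebook expects a previous data processing pipeline that defines four variables: `X_train`, `X_test`, `y_train`, `y_test`. The cells below read the data from CSV files under `data_path` (`X_train.csv`, `y_train.csv`, and the matching validation and test files).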



In [1]:
import os

import torch
import torch.nn as nn
from torch.nn import init
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import MinMaxScaler
#import tensorflow as tf

from sklearn.preprocessing import StandardScaler, MinMaxScaler


In [2]:
# Colab-only helper used to expose Google Drive inside the runtime.
from google.colab import drive
# Mount Drive at /content/drive so its files can be opened as ordinary paths.
drive.mount('/content/drive')
# Base folder for the CSV inputs and saved checkpoints. The trailing slash
# matters: later cells build file paths by plain string concatenation.
data_path = '/content/drive/MyDrive/MC906/Filtered_data/'

Mounted at /content/drive


In [3]:
class LazyCSVLoader(Dataset):
    """Map-style dataset that reads one sample at a time from two CSV files.

    Both files are expected to have a header line and a leading index column
    (the layout written by ``DataFrame.to_csv`` with default arguments); the
    index column is dropped from every row that is read.

    Nothing is cached: each lookup re-opens both files and skips ahead to the
    requested row, so memory use stays small at the cost of slow item access.

    Args:
        feature_csv: Path to the CSV holding the feature columns.
        target_csv: Path to the CSV holding the target column.

    Attributes:
        num_features: Number of feature columns (index column excluded).
        length: Number of data rows in ``feature_csv``.
    """

    def __init__(self, feature_csv, target_csv):
        self.feature_csv = feature_csv
        self.target_csv = target_csv
        # Parse just the header and first row to learn the column count.
        self.num_features = pd.read_csv(feature_csv, nrows=1).shape[1] - 1  # Exclude the index column
        self.length = self._get_len()

    def _get_len(self):
        """Return the number of data rows: physical lines minus the header."""
        with open(self.feature_csv) as f:
            line_count = sum(1 for _ in f)
        # max() keeps an empty file at length 0 instead of -1.
        return max(line_count - 1, 0)

    def __len__(self):
        return self.length

    @staticmethod
    def _read_row(csv_path, idx):
        """Read data row ``idx`` (0-based) from ``csv_path`` without the index column."""
        # Skipping idx + 1 lines drops the header plus the idx preceding rows.
        # header=None is essential: without it pandas would consume the
        # requested row as column names and hand back the *following* row.
        row = pd.read_csv(csv_path, skiprows=idx + 1, nrows=1, header=None)
        return row.iloc[:, 1:].values.flatten()

    def __getitem__(self, idx):
        """Return ``(features, target)`` float tensors for sample ``idx``.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Follow the usual sequence protocol: negative indices count from the end.
        if idx < 0:
            idx += self.length
        if not 0 <= idx < self.length:
            raise IndexError(f'index {idx} is out of range for dataset of length {self.length}')

        features = self._read_row(self.feature_csv, idx)
        target = self._read_row(self.target_csv, idx)[0]

        data = torch.tensor(features, dtype=torch.float)
        target = torch.tensor(target, dtype=torch.float)

        return data, target

In [4]:
class PolynomialRegression(nn.Module):
    """Regression model made of a single fully connected layer.

    The module applies one ``nn.Linear`` to its input and nothing else, so any
    polynomial terms have to be present in the input features already.

    Args:
        input_dim: Number of input features per sample.
        output_dim: Number of values predicted per sample.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        # The attribute name is part of the checkpoint format ("poly.weight", "poly.bias").
        self.poly = nn.Linear(in_features=input_dim, out_features=output_dim)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result."""
        return self.poly(x)

In [5]:
def train(model, dataloader, criterion, optimizer, device):
    """Run one optimisation pass over ``dataloader``.

    Args:
        model: Module to optimise; switched to training mode here.
        dataloader: Iterable of ``(inputs, targets)`` batches that supports ``len``.
        criterion: Loss callable taking ``(outputs, targets)``.
        optimizer: Optimizer bound to the model's parameters.
        device: Device every batch is moved to before the forward pass.

    Returns:
        Sum of the per-batch loss values divided by the number of batches.
    """
    model.train()
    loss_total = 0.0
    for batch_inputs, batch_targets in dataloader:
        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device)

        # Clear gradients left over from the previous step before accumulating new ones.
        optimizer.zero_grad()
        predictions = model(batch_inputs)
        # unsqueeze(1) inserts a dimension at position 1 of the targets,
        # presumably so (batch,) targets line up with (batch, 1) predictions.
        batch_loss = criterion(predictions, batch_targets.unsqueeze(1))
        batch_loss.backward()
        optimizer.step()

        loss_total += batch_loss.item()

    return loss_total / len(dataloader)

In [6]:
def validate(model, dataloader, criterion, device):
    """Measure the loss over ``dataloader`` without updating the model.

    Args:
        model: Module to evaluate; switched to evaluation mode here.
        dataloader: Iterable of ``(inputs, targets)`` batches that supports ``len``.
        criterion: Loss callable taking ``(outputs, targets)``.
        device: Device every batch is moved to before the forward pass.

    Returns:
        Sum of the per-batch loss values divided by the number of batches.
    """
    model.eval()
    loss_total = 0.0
    # Gradient tracking is switched off for the whole pass.
    with torch.no_grad():
        for batch in dataloader:
            features, labels = (tensor.to(device) for tensor in batch)
            predictions = model(features)
            loss_total += criterion(predictions, labels.unsqueeze(1)).item()
    return loss_total / len(dataloader)

In [7]:
def test(model, dataloader, device):
    model.eval()
    predictions, actuals = [], []
    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            predictions.extend(outputs.cpu().numpy())
            actuals.extend(targets.cpu().numpy())
    return predictions, actuals

In [8]:
# Use the GPU whenever CUDA is usable; otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print(f'Using device: {device}')

# Load datasets
# One lazy dataset per split, built in train / validation / test order.
# Despite the `_df` suffix these are LazyCSVLoader objects, not DataFrames.
X_train_df, X_val_df, X_test_df = (
    LazyCSVLoader(data_path + features_file, data_path + targets_file)
    for features_file, targets_file in (
        ('X_train.csv', 'y_train.csv'),
        ('X_val.csv', 'y_val.csv'),
        ('X_test.csv', 'y_test.csv'),
    )
)


Using device: cuda


In [13]:
# Create DataLoaders
# All three splits share one configuration, so it is written down once.
# NOTE(review): shuffle=False also applies to the training split; confirm a
# fixed sample order is intended for SGD training.
loader_options = dict(batch_size=64, shuffle=False, num_workers=4, pin_memory=True)
train_loader = DataLoader(X_train_df, **loader_options)
val_loader = DataLoader(X_val_df, **loader_options)
test_loader = DataLoader(X_test_df, **loader_options)



In [11]:
# Define model, criterion, and optimizer
# The input width comes from the training CSV; a single value is predicted per sample.
input_dim, output_dim = X_train_df.num_features, 1
model = PolynomialRegression(input_dim, output_dim)

# Wrap the model for multi-GPU execution only when more than one GPU is visible.
multi_gpu = torch.cuda.device_count() > 1
if multi_gpu:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(model)
model = model.to(device)

# Mean-squared-error objective optimised with plain SGD.
criterion = nn.MSELoss()
optimizer = optim.SGD(params=model.parameters(), lr=0.01)


In [None]:
# Training loop
num_epochs = 100
best_val_loss = float('inf')
best_epoch = -1
train_losses = []
val_losses = []

# Make sure the checkpoint folder exists before any training time is spent;
# torch.save does not create missing directories and would otherwise fail
# after the first epoch has already been computed.
best_model_path = data_path+'Best/PolyReg.pth'
os.makedirs(os.path.dirname(best_model_path), exist_ok=True)

for epoch in range(num_epochs):
    train_loss = train(model, train_loader, criterion, optimizer, device)
    val_loss = validate(model, val_loader, criterion, device)
    train_losses.append(train_loss)
    val_losses.append(val_loss)

    # Keep only the weights of the epoch with the lowest validation loss so far.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_epoch = epoch
        torch.save(model.state_dict(), best_model_path)

    print(f'Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')


In [None]:
# Plot train vs validation loss
# The x-axis follows the recorded history rather than num_epochs, so the plot
# still works when training was interrupted before the final epoch.
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(train_losses) + 1), train_losses, label='Train Loss')
plt.plot(range(1, len(val_losses) + 1), val_losses, label='Validation Loss')
# best_epoch stays at -1 when no epoch improved on the initial loss; skip the marker then.
if best_epoch >= 0:
    plt.axvline(x=best_epoch + 1, color='r', linestyle='--', label='Best Epoch')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.title('Train vs Validation Loss')
plt.show()

if best_epoch >= 0:
    print(f'Best Epoch: {best_epoch + 1}, Best Validation Loss: {best_val_loss:.4f}')
else:
    print('No epoch improved the validation loss; no best epoch to report.')


In [None]:
# Load the best model for testing
# map_location remaps the stored tensors onto the active device, so a
# checkpoint written on a GPU runtime still loads on a CPU-only one.
model.load_state_dict(torch.load(data_path+'Best/PolyReg.pth', map_location=device))
predictions, actuals = test(model, test_loader, device)
print(f'Test Predictions: {predictions}')
print(f'Test Actuals: {actuals}')