In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

In [2]:
# Load dataset and drop any rows with missing values
data = pd.read_csv('HousingData.csv')
data.dropna(inplace=True)



In [3]:
data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
5,0.02985,0.0,2.18,0.0,0.458,6.43,58.7,6.0622,3,222,18.7,394.12,5.21,28.7


In [4]:
# 'MEDV' is the target column, and the rest are features
X = data.drop(columns=['MEDV'])  # Replace 'MEDV' with the actual target column
y = data['MEDV']

In [5]:
# Split data into training (80%) and testing (20%) sets
X_train_np, X_test_np, y_train_np, y_test_np = train_test_split(X, y, test_size=0.2, random_state=42)



In [6]:
# Convert numpy arrays to PyTorch tensors of type float32
X_train = torch.tensor(X_train_np, dtype=torch.float32)
y_train = torch.tensor(y_train_np, dtype=torch.float32).view(-1, 1)
X_test = torch.tensor(X_test_np, dtype=torch.float32)
y_test = torch.tensor(y_test_np, dtype=torch.float32).view(-1, 1)

ValueError: could not determine the shape of object type 'DataFrame'

In [None]:
# Create a TensorDataset and DataLoader for batching training data
train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)

In [18]:
class LinearRegressionModel:
    """
    A class for performing linear regression using PyTorch.
    
    Attributes:
        model (nn.Module): The linear layer model.
        criterion (nn.Module): Mean Squared Error loss function.
        optimizer (optim.Optimizer): Optimizer for model parameters.
        epochs (int): Number of training epochs.
    """
    def __init__(self, input_dim: int, output_dim: int, lr: float = 0.01, epochs: int = 100) -> None:
        """
        Initialize the linear regression model with error checking.
        
        :param input_dim: Number of input features.
        :param output_dim: Number of outputs.
        :param lr: Learning rate.
        :param epochs: Number of training epochs.
        :raises ValueError: If input or output dimensions are not positive.
        """
        if input_dim <= 0 or output_dim <= 0:
            raise ValueError("Input and output dimensions must be positive")
        self.epochs = epochs
        self.model = nn.Linear(input_dim, output_dim)  # linear mapping layer
        self.criterion = nn.MSELoss()  # mean squared error loss
        self.optimizer = optim.SGD(self.model.parameters(), lr=lr)  # stochastic gradient descent optimizer

    def _train_step(self, features: torch.Tensor, targets: torch.Tensor) -> float:
        """
        Perform a single training step and return the loss value.
        
        :param features: Batch of input features.
        :param targets: Batch of target values.
        :return: Loss value as a float.
        """
        self.optimizer.zero_grad()             # Clear gradients from previous step
        preds = self.model(features)           # Forward pass: compute predictions
        loss = self.criterion(preds, targets)   # Compute loss between predictions and targets
        loss.backward()                        # Backward propagation: compute gradients based on loss
        self.optimizer.step()                  # Update model parameters using computed gradients
        return loss.item()

    def train_epoch(self, train_loader: DataLoader) -> float:
        """
        Train the model for one epoch.
        
        :param train_loader: DataLoader with training data batches.
        :return: Average loss over the epoch.
        """
        losses = []
        for features, targets in train_loader:
            losses.append(self._train_step(features, targets))
        return sum(losses) / len(losses) if losses else 0.0

    def train_model(self, train_loader: DataLoader, X_train: torch.Tensor, y_train: torch.Tensor) -> None:
        """
        Train the model over the specified number of epochs and print MSE and R2 score each epoch.
        
        :param train_loader: DataLoader with training data.
        :param X_train: All training features.
        :param y_train: All training target values.
        """
        for e in range(self.epochs):
            self.train_epoch(train_loader)  # Run one full training epoch
            preds = self.model(X_train)       # Predict on entire training set
            mse_val = self.criterion(preds, y_train).item()  # Calculate MSE on training set
            r2_val = r2_score(y_train.numpy(), preds.detach().numpy())  # Calculate R2 score
            print(f"Epoch {e+1}/{self.epochs}: MSE={mse_val:.4f}, R2={r2_val:.4f}")

    def evaluate_model(self, X: torch.Tensor, y: torch.Tensor) -> tuple[float, float]:
        """
        Evaluate the model on provided data.
        
        :param X: Input features tensor.
        :param y: True target values tensor.
        :return: Tuple containing MSE and R2 score.
        """
        self.model.eval()  # Set model to evaluation mode (disables dropout, etc.)
        with torch.no_grad():
            preds = self.model(X)  # Compute predictions without gradient tracking
            mse_val = self.criterion(preds, y).item()  # Calculate Mean Squared Error
            r2_val = r2_score(y.numpy(), preds.detach().numpy())  # Calculate R2 score
        self.model.train()  # Return to training mode
        return mse_val, r2_val

In [19]:
# Instantiate the model with appropriate input dimension (number of features) and output dimension 1
input_dim = X_train.shape[1]
model = LinearRegressionModel(input_dim=input_dim, output_dim=1, lr=0.01, epochs=100)

TypeError: expected np.ndarray (got DataFrame)

In [None]:
# Train the model while printing training MSE and R2 score at each epoch
model.train_model(train_loader, X_train, y_train)

In [None]:
# Evaluate the trained model on both training and testing data
train_mse, train_r2 = model.evaluate_model(X_train, y_train)
test_mse, test_r2 = model.evaluate_model(X_test, y_test)
print(f"Training Data - MSE: {train_mse:.4f}, R2: {train_r2:.4f}")
print(f"Testing Data  - MSE: {test_mse:.4f}, R2: {test_r2:.4f}")

In [7]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from typing import Union, Tuple

def to_numpy(data: Union[pd.DataFrame, np.ndarray]) -> np.ndarray:
    """
    Convert input data to a numpy array.

    :param data: DataFrame or numpy array.
    :return: numpy array version of data.
    """
    return data.to_numpy() if isinstance(data, pd.DataFrame) else data

# Load dataset and drop rows with missing values
data = pd.read_csv('HousingData.csv')
data.dropna(inplace=True)

# Assume the last column is the target and the rest are features
X = to_numpy(data.iloc[:, :-1])
y = to_numpy(data.iloc[:, -1])

# Split data into training (80%) and testing (20%) sets
X_train_np, X_test_np, y_train_np, y_test_np = train_test_split(X, y, test_size=0.2, random_state=42)

# Ensure that the split data are numpy arrays
X_train_np = to_numpy(X_train_np)
X_test_np = to_numpy(X_test_np)
y_train_np = to_numpy(y_train_np)
y_test_np = to_numpy(y_test_np)

# Convert numpy arrays to PyTorch tensors of type float32
X_train = torch.tensor(X_train_np, dtype=torch.float32)
y_train = torch.tensor(y_train_np, dtype=torch.float32).view(-1, 1)
X_test = torch.tensor(X_test_np, dtype=torch.float32)
y_test = torch.tensor(y_test_np, dtype=torch.float32).view(-1, 1)

# Create a TensorDataset and DataLoader for batching training data
train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)

class LinearRegressionModel:
    """
    A linear regression model using PyTorch.
    """
    def __init__(self, input_dim: int, output_dim: int, lr: float = 0.01, epochs: int = 100) -> None:
        """
        Initialize the linear regression model.

        :param input_dim: Number of input features.
        :param output_dim: Number of outputs.
        :param lr: Learning rate.
        :param epochs: Number of training epochs.
        :raises ValueError: If input or output dimensions are not positive.
        """
        if input_dim <= 0 or output_dim <= 0:
            raise ValueError("Input and output dimensions must be positive")
        self.epochs = epochs
        self.model = nn.Linear(input_dim, output_dim)
        self.criterion = nn.MSELoss()
        self.optimizer = optim.SGD(self.model.parameters(), lr=lr)

    def _train_step(self, features: torch.Tensor, targets: torch.Tensor) -> float:
        """
        Execute a single training step.

        :param features: Batch of input features.
        :param targets: Batch of target values.
        :return: Loss value as a float.
        """
        self.optimizer.zero_grad()
        preds = self.model(features)
        loss = self.criterion(preds, targets)
        loss.backward()
        self.optimizer.step()
        return loss.item()

    def train_epoch(self, train_loader: DataLoader) -> float:
        """
        Train the model for one epoch.

        :param train_loader: DataLoader with training data batches.
        :return: Average loss over the epoch.
        """
        losses = [self._train_step(features, targets) for features, targets in train_loader]
        return sum(losses) / len(losses) if losses else 0.0

    def train_model(self, train_loader: DataLoader, X_train: torch.Tensor, y_train: torch.Tensor) -> None:
        """
        Train the model over all epochs and print metrics for each epoch.

        :param train_loader: DataLoader with training data.
        :param X_train: Training features tensor.
        :param y_train: Training target tensor.
        """
        for e in range(self.epochs):
            self.train_epoch(train_loader)
            preds = self.model(X_train)
            mse_val = self.criterion(preds, y_train).item()
            r2_val = r2_score(y_train.numpy(), preds.detach().numpy())
            print(f"Epoch {e+1}/{self.epochs}: MSE={mse_val:.4f}, R2={r2_val:.4f}")

    def evaluate_model(self, X: torch.Tensor, y: torch.Tensor) -> Tuple[float, float]:
        """
        Evaluate model performance on the provided data.

        :param X: Input features tensor.
        :param y: True target tensor.
        :return: Tuple containing Mean Squared Error and R2 score.
        """
        self.model.eval()
        with torch.no_grad():
            preds = self.model(X)
            mse_val = self.criterion(preds, y).item()
            r2_val = r2_score(y.numpy(), preds.detach().numpy())
        self.model.train()
        return mse_val, r2_val

# Instantiate and train the model
input_dim = X_train.shape[1]
model = LinearRegressionModel(input_dim=input_dim, output_dim=1, lr=0.01, epochs=100)
model.train_model(train_loader, X_train, y_train)

# Evaluate the model on training and testing data
train_mse, train_r2 = model.evaluate_model(X_train, y_train)
test_mse, test_r2 = model.evaluate_model(X_test, y_test)
print(f"Training Data - MSE: {train_mse:.4f}, R2: {train_r2:.4f}")
print(f"Testing Data  - MSE: {test_mse:.4f}, R2: {test_r2:.4f}")


ValueError: could not determine the shape of object type 'Series'