In [25]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import torch
import random

In [2]:
def set_seed(seed=42):
    """Set all random seeds for reproducibility"""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


set_seed(42)  

Loading Dataset

In [3]:
dataset = pd.read_csv('dataset.csv')
dataset.head()

Unnamed: 0,X,Y,Z,A0,A1,A2,A5,A6,A7
0,0.0,-0.759297,0.419074,3.31,2.01,0.0,1.94,1.58,1.65
1,0.058824,-0.759205,0.418863,3.39,2.23,0.0,2.09,1.73,1.8
2,0.117647,-0.759242,0.419042,3.46,2.16,0.0,2.09,1.8,1.87
3,0.176471,-0.759302,0.419248,3.67,2.3,0.0,2.23,1.87,1.94
4,0.235294,-0.759177,0.41897,3.82,2.3,0.0,2.16,1.87,1.94


In [4]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10708 entries, 0 to 10707
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   X       10708 non-null  float64
 1   Y       10708 non-null  float64
 2   Z       10708 non-null  float64
 3   A0      10708 non-null  float64
 4   A1      10708 non-null  float64
 5   A2      10708 non-null  float64
 6   A5      10708 non-null  float64
 7   A6      10708 non-null  float64
 8   A7      10708 non-null  float64
dtypes: float64(9)
memory usage: 753.0 KB


In [5]:
X, y = dataset[["X", "Y", "Z"]], dataset[['A0', 'A1', 'A2', 'A5', 'A6', 'A7']]

x_mean = X.iloc[:, 0].mean()

x_std = X.iloc[:, 0].std()
y_mean = X.iloc[:, 1].mean()
y_std = X.iloc[:, 1].std()
z_mean = X.iloc[:, 2].mean()
z_std = X.iloc[:, 2].std()

# Normalize each column
X_norm = np.copy(X)
X_norm[:, 0] = (X.iloc[:, 0] - x_mean) / x_std
X_norm[:, 1] = (X.iloc[:, 1] - y_mean) / y_std
X_norm[:, 2] = (X.iloc[:, 2] - z_mean) / z_std

y_norm = y / 73.8

In [6]:
X_norm = torch.tensor(pd.DataFrame(X_norm).values, dtype=torch.float32)
y_norm = torch.tensor(pd.DataFrame(y_norm).values, dtype=torch.float32)

X_train, X_temp, y_train, y_temp = train_test_split(X_norm, y_norm, test_size=0.2, shuffle=True, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [7]:
X_train

tensor([[-0.9425,  0.6090,  1.6928],
        [-0.7765,  1.6047,  0.8203],
        [ 1.5648, -0.5537,  0.8535],
        ...,
        [-0.1568, -0.2147, -1.0535],
        [-1.2321, -1.6867,  0.6647],
        [ 0.6153,  0.3961,  0.7643]])

Starting with Linear Regression

In [10]:
linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)

In [21]:
y_test_scaled = y_test * 73.8

y_pred_linear = linear_reg.predict(X_test)
y_pred_scaled_linear = y_pred_linear * 73.8

y_pred_scaled_linear = torch.tensor(y_pred_scaled_linear, dtype=torch.float32)

print(y_pred_scaled_linear)

tensor([[47.8568, 45.7159, 42.9496, 31.8243, 32.8398, 33.1130],
        [38.4212, 44.2116, 58.8328, 17.9499, 30.9960, 22.2296],
        [53.7603, 42.7434, 21.6352, 33.0638, 40.4904, 28.9547],
        ...,
        [46.6435, 46.4505, 43.4394, 29.8451, 35.7085, 31.0743],
        [45.4639, 62.9228, 59.8097, 44.0903, 45.0396, 50.4497],
        [54.5052, 61.1049, 16.3584, 42.5400, 72.7219, 37.4462]])


In [22]:
print(y_test_scaled)

tensor([[73.4300, 73.3600, 73.2900,  0.0000,  0.0000, 69.4700],
        [ 0.0000,  0.0000, 57.5600,  0.0000, 48.9800,  0.0000],
        [49.2700, 28.4200,  0.0000, 16.8800, 24.0200, 23.3700],
        ...,
        [73.8000, 73.7200, 73.8000, 54.3900,  0.0000,  0.0000],
        [73.8000, 73.8000, 73.8000,  0.0000, 73.8000, 54.6100],
        [73.5800, 71.1300,  0.0000, 44.6500, 67.7400, 56.7000]])


In [24]:
# Computing MSE for the linear regression model

MSE = torch.nn.MSELoss()

mse_linear = MSE(y_pred_scaled_linear, y_test_scaled)

print(f'Mean Squared Error for Linear Regression: {mse_linear.item()}')

Mean Squared Error for Linear Regression: 661.6538696289062


Polynomial Regression, different Degrees

In [41]:
for i in range(2, 15):
    poly = PolynomialFeatures(degree=i, include_bias=False)
    X_poly = poly.fit_transform(X_train)

    # if i > 3:
    #     poly_reg = Ridge(alpha=1.0)
    #     print("Ridge Regularization is applied for degree > 3 (Alpha = 1.0)")
    # else:
    #     poly_reg = LinearRegression()

    poly_reg = LinearRegression()

    poly_reg.fit(X_poly, y_train)

    y_pred_poly = poly_reg.predict(poly.transform(X_test))
    y_pred_scaled_poly = y_pred_poly * 73.8

    y_pred_scaled_poly = torch.tensor(y_pred_scaled_poly, dtype=torch.float32)

    mse_poly = MSE(y_pred_scaled_poly, y_test_scaled)
    mse_ploy_train = MSE(torch.tensor(poly_reg.predict(poly.transform(X_train)) * 73.8), y_train * 73.8)

    print("-------------------------------------------------------------------------------------------------")
    print(f'Test Mean Squared Error for Polynomial Regression (degree {i}): {mse_poly.item()}')
    print(f'Train Mean Squared Error for Polynomial Regression (degree {i}): {mse_ploy_train.item()}')
    print("-------------------------------------------------------------------------------------------------")


-------------------------------------------------------------------------------------------------
Test Mean Squared Error for Polynomial Regression (degree 2): 340.8730163574219
Train Mean Squared Error for Polynomial Regression (degree 2): 337.2205965953453
-------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------
Test Mean Squared Error for Polynomial Regression (degree 3): 162.36212158203125
Train Mean Squared Error for Polynomial Regression (degree 3): 160.45550992532296
-------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------
Test Mean Squared Error for Polynomial Regression (degree 4): 71.62824249267578
Train Mean Squared Error for Polynomial Regression (degree 4): 72.08568448648559
-------------------------

Degree 12 seems to be the best fit. Before 12, it is underfit, after 12 it is overfitting

In [47]:


poly = PolynomialFeatures(degree=12, include_bias=False)
X_poly = poly.fit_transform(X_train)

poly_reg = LinearRegression()

poly_reg.fit(X_poly, y_train)

y_pred_poly = poly_reg.predict(poly.transform(X_test))
y_pred_scaled_poly = y_pred_poly * 73.8
y_pred_scaled_poly = np.clip(y_pred_scaled_poly, 0, 73.8)


y_pred_scaled_poly = torch.tensor(y_pred_scaled_poly, dtype=torch.float32)

mse_poly = MSE(y_pred_scaled_poly, y_test_scaled)
mse_ploy_train = MSE(torch.tensor(poly_reg.predict(poly.transform(X_train)) * 73.8), y_train * 73.8)

print(f'Train Mean Squared Error for Polynomial Regression (degree {i}): {mse_ploy_train.item()}')

Train Mean Squared Error for Polynomial Regression (degree 14): 0.5668640836338803


In [48]:
y_pred_scaled_poly = torch.round(y_pred_scaled_poly, decimals=1)
y_pred_scaled_poly 

tensor([[73.6000, 73.8000, 73.7000,  0.0000,  0.0000, 70.8000],
        [ 0.2000,  0.4000, 58.5000,  0.0000, 49.8000,  0.1000],
        [48.9000, 29.4000,  0.0000, 18.1000, 24.9000, 23.4000],
        ...,
        [73.5000, 73.6000, 73.7000, 54.4000,  0.1000,  0.1000],
        [73.7000, 73.8000, 73.7000,  0.2000, 73.8000, 53.6000],
        [73.7000, 70.1000,  0.1000, 43.7000, 66.4000, 56.0000]])

In [46]:
y_test_scaled

tensor([[73.4300, 73.3600, 73.2900,  0.0000,  0.0000, 69.4700],
        [ 0.0000,  0.0000, 57.5600,  0.0000, 48.9800,  0.0000],
        [49.2700, 28.4200,  0.0000, 16.8800, 24.0200, 23.3700],
        ...,
        [73.8000, 73.7200, 73.8000, 54.3900,  0.0000,  0.0000],
        [73.8000, 73.8000, 73.8000,  0.0000, 73.8000, 54.6100],
        [73.5800, 71.1300,  0.0000, 44.6500, 67.7400, 56.7000]])