In [1]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import torch
import random

In [2]:
def set_seed(seed=42):
    """Set all random seeds for reproducibility"""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


set_seed(42)  

Loading Dataset

In [4]:
dataset = pd.read_csv('CurrentData/19JuneDataset.csv')
dataset.head()

Unnamed: 0,time,A0,A1,A2,A5,A6,A7,X,Y,Z
0,0.0,3.02,1.87,0.28,2.45,0.14,0.0,-0.629333,0.844733,1.169218
1,0.046863,2.95,1.8,0.0,1.87,0.0,0.0,-0.629225,0.844531,1.169302
2,0.078058,2.95,1.8,0.07,2.16,0.0,0.0,-0.629148,0.844372,1.169372
3,0.109227,3.02,1.65,0.0,2.01,0.0,0.0,-0.629085,0.844232,1.169426
4,0.140454,3.17,1.8,0.07,2.3,0.07,0.0,-0.629038,0.844166,1.169454


In [5]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18165 entries, 0 to 18164
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   time    18165 non-null  float64
 1   A0      18165 non-null  float64
 2   A1      18165 non-null  float64
 3   A2      18165 non-null  float64
 4   A5      18165 non-null  float64
 5   A6      18165 non-null  float64
 6   A7      18165 non-null  float64
 7   X       18165 non-null  float64
 8   Y       18165 non-null  float64
 9   Z       18165 non-null  float64
dtypes: float64(10)
memory usage: 1.4 MB


In [6]:
X, y = dataset[["X", "Y", "Z"]], dataset[['A0', 'A1', 'A2', 'A5', 'A6', 'A7']]

x_mean = X.iloc[:, 0].mean()

x_std = X.iloc[:, 0].std()
y_mean = X.iloc[:, 1].mean()
y_std = X.iloc[:, 1].std()
z_mean = X.iloc[:, 2].mean()
z_std = X.iloc[:, 2].std()

# Normalize each column
X_norm = np.copy(X)
X_norm[:, 0] = (X.iloc[:, 0] - x_mean) / x_std
X_norm[:, 1] = (X.iloc[:, 1] - y_mean) / y_std
X_norm[:, 2] = (X.iloc[:, 2] - z_mean) / z_std

y_norm = y / 73.8

In [7]:
X_norm = torch.tensor(pd.DataFrame(X_norm).values, dtype=torch.float32)
y_norm = torch.tensor(pd.DataFrame(y_norm).values, dtype=torch.float32)

X_train, X_temp, y_train, y_temp = train_test_split(X_norm, y_norm, test_size=0.2, shuffle=True, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [8]:
X_train

tensor([[ 0.2064,  0.8040,  0.0440],
        [-2.1598,  1.2789,  0.7274],
        [ 0.4554,  0.7665,  0.8691],
        ...,
        [ 0.3993, -0.9969,  0.6394],
        [-1.5094,  0.7090, -0.9503],
        [-0.1380,  0.6910,  0.3788]])

Starting with Linear Regression

In [9]:
linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)

In [10]:
y_test_scaled = y_test * 73.8

y_pred_linear = linear_reg.predict(X_test)
y_pred_scaled_linear = y_pred_linear * 73.8

y_pred_scaled_linear = torch.tensor(y_pred_scaled_linear, dtype=torch.float32)

print(y_pred_scaled_linear)

tensor([[62.4317, 67.7364, 78.9666, 45.8097, 48.5814, 49.1037],
        [40.8730, 77.1022, 93.2547, 34.8903, 38.2980, 77.8452],
        [83.8819, 60.3892, 73.8587, 60.2002, 66.0791, 24.9511],
        ...,
        [61.2099, 58.0434, 50.2515, 28.8831, 19.9661, 27.2629],
        [73.3403, 57.8299, 53.6551, 42.7396, 37.8951, 22.7737],
        [69.2717, 58.3482, 50.7292, 38.4611, 31.4314, 25.3781]])


In [11]:
print(y_test_scaled)

tensor([[73.8000, 73.8000, 73.8000,  0.1400, 73.2900, 55.1800],
        [13.4900, 73.8000, 73.8000, 73.2200, 73.8000, 73.8000],
        [73.8000, 73.8000, 73.8000, 73.2200, 67.5900,  0.0000],
        ...,
        [73.8000, 73.8000, 73.8000, 36.3500,  0.0000,  0.0000],
        [73.5800, 73.5100, 73.6500, 72.5700,  4.3200,  0.0000],
        [73.8000, 73.8000, 73.8000, 63.2600,  0.0000,  0.0000]])


In [12]:
# Computing MAE for the linear regression model

MAE = torch.nn.L1Loss()

mae_linear = MAE(y_pred_scaled_linear, y_test_scaled)

print(f'Mean Absolute Error for Linear Regression: {mae_linear.item()}')

Mean Absolute Error for Linear Regression: 16.742263793945312


Polynomial Regression, different Degrees

In [14]:
for i in range(2, 15):
    poly = PolynomialFeatures(degree=i, include_bias=False)
    X_poly = poly.fit_transform(X_train)

    # if i > 3:
    #     poly_reg = Ridge(alpha=1.0)
    #     print("Ridge Regularization is applied for degree > 3 (Alpha = 1.0)")
    # else:
    #     poly_reg = LinearRegression()

    poly_reg = LinearRegression()

    poly_reg.fit(X_poly, y_train)

    y_pred_poly = poly_reg.predict(poly.transform(X_test))
    y_pred_scaled_poly = y_pred_poly * 73.8

    y_pred_scaled_poly = torch.tensor(y_pred_scaled_poly, dtype=torch.float32)

    mae_poly = MAE(y_pred_scaled_poly, y_test_scaled)
    mae_ploy_train = MAE(torch.tensor(poly_reg.predict(poly.transform(X_train)) * 73.8), y_train * 73.8)

    print("-------------------------------------------------------------------------------------------------")
    print(f'Test Mean Absolute Error for Polynomial Regression (degree {i}): {mae_poly.item()}')
    print(f'Train Mean Absolute Error for Polynomial Regression (degree {i}): {mae_ploy_train.item()}')
    print("-------------------------------------------------------------------------------------------------")


-------------------------------------------------------------------------------------------------
Test Mean Absolute Error for Polynomial Regression (degree 2): 8.989516258239746
Train Mean Absolute Error for Polynomial Regression (degree 2): 9.073997026810849
-------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------
Test Mean Absolute Error for Polynomial Regression (degree 3): 4.985935211181641
Train Mean Absolute Error for Polynomial Regression (degree 3): 5.107845266954478
-------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------
Test Mean Absolute Error for Polynomial Regression (degree 4): 3.556711196899414
Train Mean Absolute Error for Polynomial Regression (degree 4): 3.6517424974939985
--------------------

Degree 13 seems to be the best fit. Before 13, it is underfit, after 13 it is overfitting

In [13]:


poly = PolynomialFeatures(degree=13, include_bias=False)
X_poly = poly.fit_transform(X_train)

poly_reg = LinearRegression()

poly_reg.fit(X_poly, y_train)

y_pred_poly = poly_reg.predict(poly.transform(X_test))
y_pred_scaled_poly = y_pred_poly * 73.8
y_pred_scaled_poly = np.clip(y_pred_scaled_poly, 0, 73.8)


y_pred_scaled_poly = torch.tensor(y_pred_scaled_poly, dtype=torch.float32)

mae_poly = MAE(y_pred_scaled_poly, y_test_scaled)
mae_ploy_train = MAE(torch.tensor(poly_reg.predict(poly.transform(X_train)) * 73.8), y_train * 73.8)

print(f'Test Mean Absolute Error for Polynomial Regression (degree 13): {mae_poly.item()}')

Test Mean Absolute Error for Polynomial Regression (degree 13): 0.29602572321891785


In [16]:
y_pred_scaled_poly = torch.round(y_pred_scaled_poly, decimals=1)
y_pred_scaled_poly 

tensor([[73.7000, 73.7000, 73.7000,  0.0000, 73.6000, 56.0000],
        [13.7000, 73.6000, 73.4000, 72.7000, 73.2000, 73.6000],
        [73.5000, 73.5000, 73.6000, 73.4000, 67.7000,  0.0000],
        ...,
        [73.4000, 73.6000, 73.7000, 36.1000,  0.0000,  0.0000],
        [73.5000, 73.5000, 73.7000, 72.8000,  4.7000,  0.0000],
        [73.3000, 73.5000, 73.6000, 63.2000,  0.0000,  0.0000]])

In [17]:
y_test_scaled

tensor([[73.8000, 73.8000, 73.8000,  0.1400, 73.2900, 55.1800],
        [13.4900, 73.8000, 73.8000, 73.2200, 73.8000, 73.8000],
        [73.8000, 73.8000, 73.8000, 73.2200, 67.5900,  0.0000],
        ...,
        [73.8000, 73.8000, 73.8000, 36.3500,  0.0000,  0.0000],
        [73.5800, 73.5100, 73.6500, 72.5700,  4.3200,  0.0000],
        [73.8000, 73.8000, 73.8000, 63.2600,  0.0000,  0.0000]])