**Importing Libraries**

In [167]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import warnings
from scipy.stats import boxcox #for gausian transformation using boxcox method

# Disable all runtime warnings
warnings.filterwarnings("ignore")

**Reading Data**

In [168]:
df = pd.read_csv("po2_data.csv")
df

Unnamed: 0,subject#,age,sex,test_time,motor_updrs,total_updrs,jitter(%),jitter(abs),jitter(rap),jitter(ppq5),...,shimmer(abs),shimmer(apq3),shimmer(apq5),shimmer(apq11),shimmer(dda),nhr,hnr,rpde,dfa,ppe
0,1,72,0,5.6431,28.199,34.398,0.00662,0.000034,0.00401,0.00317,...,0.230,0.01438,0.01309,0.01662,0.04314,0.014290,21.640,0.41888,0.54842,0.16006
1,1,72,0,12.6660,28.447,34.894,0.00300,0.000017,0.00132,0.00150,...,0.179,0.00994,0.01072,0.01689,0.02982,0.011112,27.183,0.43493,0.56477,0.10810
2,1,72,0,19.6810,28.695,35.389,0.00481,0.000025,0.00205,0.00208,...,0.181,0.00734,0.00844,0.01458,0.02202,0.020220,23.047,0.46222,0.54405,0.21014
3,1,72,0,25.6470,28.905,35.810,0.00528,0.000027,0.00191,0.00264,...,0.327,0.01106,0.01265,0.01963,0.03317,0.027837,24.445,0.48730,0.57794,0.33277
4,1,72,0,33.6420,29.187,36.375,0.00335,0.000020,0.00093,0.00130,...,0.176,0.00679,0.00929,0.01819,0.02036,0.011625,26.126,0.47188,0.56122,0.19361
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5870,42,61,0,142.7900,22.485,33.485,0.00406,0.000031,0.00167,0.00168,...,0.160,0.00973,0.01133,0.01549,0.02920,0.025137,22.369,0.64215,0.55314,0.21367
5871,42,61,0,149.8400,21.988,32.988,0.00297,0.000025,0.00119,0.00147,...,0.215,0.01052,0.01277,0.01904,0.03157,0.011927,22.886,0.52598,0.56518,0.12621
5872,42,61,0,156.8200,21.495,32.495,0.00349,0.000025,0.00152,0.00187,...,0.244,0.01371,0.01456,0.01877,0.04112,0.017701,25.065,0.47792,0.57888,0.14157
5873,42,61,0,163.7300,21.007,32.007,0.00281,0.000020,0.00128,0.00151,...,0.131,0.00693,0.00870,0.01307,0.02078,0.007984,24.422,0.56865,0.56327,0.14204


**Dividing into X and Y Columns**

In [169]:
X = df.drop(columns=["subject#","motor_updrs","total_updrs"])
Y_mobile_UPDRS = df["motor_updrs"]
Y_totol_UPDRS = df["total_updrs"]
X

Unnamed: 0,age,sex,test_time,jitter(%),jitter(abs),jitter(rap),jitter(ppq5),jitter(ddp),shimmer(%),shimmer(abs),shimmer(apq3),shimmer(apq5),shimmer(apq11),shimmer(dda),nhr,hnr,rpde,dfa,ppe
0,72,0,5.6431,0.00662,0.000034,0.00401,0.00317,0.01204,0.02565,0.230,0.01438,0.01309,0.01662,0.04314,0.014290,21.640,0.41888,0.54842,0.16006
1,72,0,12.6660,0.00300,0.000017,0.00132,0.00150,0.00395,0.02024,0.179,0.00994,0.01072,0.01689,0.02982,0.011112,27.183,0.43493,0.56477,0.10810
2,72,0,19.6810,0.00481,0.000025,0.00205,0.00208,0.00616,0.01675,0.181,0.00734,0.00844,0.01458,0.02202,0.020220,23.047,0.46222,0.54405,0.21014
3,72,0,25.6470,0.00528,0.000027,0.00191,0.00264,0.00573,0.02309,0.327,0.01106,0.01265,0.01963,0.03317,0.027837,24.445,0.48730,0.57794,0.33277
4,72,0,33.6420,0.00335,0.000020,0.00093,0.00130,0.00278,0.01703,0.176,0.00679,0.00929,0.01819,0.02036,0.011625,26.126,0.47188,0.56122,0.19361
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5870,61,0,142.7900,0.00406,0.000031,0.00167,0.00168,0.00500,0.01896,0.160,0.00973,0.01133,0.01549,0.02920,0.025137,22.369,0.64215,0.55314,0.21367
5871,61,0,149.8400,0.00297,0.000025,0.00119,0.00147,0.00358,0.02315,0.215,0.01052,0.01277,0.01904,0.03157,0.011927,22.886,0.52598,0.56518,0.12621
5872,61,0,156.8200,0.00349,0.000025,0.00152,0.00187,0.00456,0.02499,0.244,0.01371,0.01456,0.01877,0.04112,0.017701,25.065,0.47792,0.57888,0.14157
5873,61,0,163.7300,0.00281,0.000020,0.00128,0.00151,0.00383,0.01484,0.131,0.00693,0.00870,0.01307,0.02078,0.007984,24.422,0.56865,0.56327,0.14204


**Train Test Split using Random Funtion and 0.4 test Ratio**

In [170]:
def calculating_train_test_split(X,Y,train_test_ratio):
    return  train_test_split(X, Y, test_size=train_test_ratio/10, random_state=42)

**Train Models for X_Train and Y_Trains**

In [171]:
def create_Model(X_train,y_train):
    X_train_model = sm.add_constant(X_train)
    return sm.OLS(y_train, X_train_model).fit()

**Test the Models**

In [172]:
def test_Model(X_test,model):
    X_test_model = sm.add_constant(X_test)
    return model.predict(X_test_model)

**Calculating Matrics**

In [173]:
def calculating_matrics(y_test,y_pred,X_test):
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    nrmse = rmse / np.mean(y_test)
    r2 = r2_score(y_test, y_pred)

    #for Adjusted R2
    n = len(y_test)
    p = X_test.shape[1] - 1 

    #ar2
    adjusted_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)

    return mae,mse,rmse,nrmse,r2,adjusted_r2

**TASK 2**

**Calling Task 1 Functions With respective data**

In [174]:
def task1_funcs(X,Y,train_test_ratio):
    X_train, X_test, y_train, y_test = calculating_train_test_split(X,Y,train_test_ratio)
    model = create_Model(X_train,y_train)
    y_pred = test_Model(X_test,model)
    mae,mse,rmse,nrmse,r2,adjusted_r2 = calculating_matrics(y_test,y_pred,X_test)

    print("MAE : ",mae)
    print("MSE : ",mse)
    print("RMSE : ",rmse)
    print("NRMSE : ",nrmse)
    print("R2 : ",r2)
    print("Adjusted R2 : ",adjusted_r2)

In [175]:
def task2_func(train_test_ratio):
    print("For Mobile UPDRS with train test ratio of ",train_test_ratio/10)
    task1_funcs(X,Y_mobile_UPDRS,train_test_ratio)

    print("For Total UPDRS with train test ratio of ",train_test_ratio/10)
    task1_funcs(X,Y_totol_UPDRS,train_test_ratio)

In [176]:
for i in range(2,6):
    task2_func(i)

For Mobile UPDRS with train test ratio of  0.2
MAE :  6.353395591926878
MSE :  56.01419722157606
RMSE :  7.484263305200857
NRMSE :  0.3538555426893264
R2 :  0.1224365257144312
Adjusted R2 :  0.10877204255081507
For Total UPDRS with train test ratio of  0.2
MAE :  8.053791050254512
MSE :  93.30672070272995
RMSE :  9.659540398110561
NRMSE :  0.335530993246988
R2 :  0.1579807041982827
Adjusted R2 :  0.14486967710102416
For Mobile UPDRS with train test ratio of  0.3
MAE :  6.298602796710583
MSE :  55.4961331661009
RMSE :  7.449572683456474
NRMSE :  0.3485495157906662
R2 :  0.1484862995835896
Adjusted R2 :  0.13969774074901653
For Total UPDRS with train test ratio of  0.3
MAE :  7.97027058153572
MSE :  92.22609413538447
RMSE :  9.603441785911157
NRMSE :  0.33054388124748074
R2 :  0.18006873390798417
Adjusted R2 :  0.17160614056529144
For Mobile UPDRS with train test ratio of  0.4
MAE :  6.2093856438989485
MSE :  54.237390996171
RMSE :  7.364603926632511
NRMSE :  0.345299526977132
R2 :  0.15

**Task 3**

In [177]:
# log the X for calulation
X_log = np.log1p(X)

# Impute missing values with the mean
imputer = SimpleImputer(strategy='mean')
X_log_imputed = imputer.fit_transform(X_log)

# Calculate VIF for each feature manually
vif_data = pd.DataFrame()
vif_data["Variable"] = X_log.columns
vif_data["VIF"] = [variance_inflation_factor(X_log_imputed, i) for i in range(X_log_imputed.shape[1])]

# Display the VIF values
print(vif_data)

          Variable           VIF
0              age    714.957668
1              sex      2.191358
2        test_time     28.038790
3        jitter(%)    203.394849
4      jitter(abs)     25.547043
5      jitter(rap)  16826.151716
6     jitter(ppq5)     57.012622
7      jitter(ddp)  17409.436504
8       shimmer(%)    450.473109
9     shimmer(abs)    250.072298
10   shimmer(apq3)   7934.824156
11   shimmer(apq5)    143.074941
12  shimmer(apq11)     52.868698
13    shimmer(dda)   8669.839003
14             nhr     12.359359
15             hnr    595.040411
16            rpde     73.822746
17             dfa    189.695975
18             ppe     37.227438


This indicates that majority of columns provide little to no effect. The higher the VIF the lesser the predictibilty using that columns

In [178]:
def task3_func(train_test_ratio):
    print("For Mobile UPDRS with Log Transformation and with train test ratio of ",train_test_ratio/10)
    task1_funcs(X_log_imputed,Y_mobile_UPDRS,train_test_ratio)

    print("For Total UPDRS with Log Transformation and with train test ratio of ",train_test_ratio/10)
    task1_funcs(X_log_imputed,Y_totol_UPDRS,train_test_ratio)

In [179]:
for i in range(2,6):
    task3_func(i)

For Mobile UPDRS with Log Transformation and with train test ratio of  0.2
MAE :  6.304429026463704
MSE :  55.11051385972673
RMSE :  7.423645590929482
NRMSE :  0.3509895406119839
R2 :  0.13659435622910454
Adjusted R2 :  0.12315032371364076
For Total UPDRS with Log Transformation and with train test ratio of  0.2
MAE :  7.9822295942862285
MSE :  92.73492754596286
RMSE :  9.629897587511659
NRMSE :  0.33450132917675673
R2 :  0.1631406848253928
Adjusted R2 :  0.15011000344724146
For Mobile UPDRS with Log Transformation and with train test ratio of  0.3
MAE :  6.272706442779477
MSE :  54.92588912207354
RMSE :  7.411200248412773
NRMSE :  0.3467541519191394
R2 :  0.15723592930313834
Adjusted R2 :  0.1485376762798909
For Total UPDRS with Log Transformation and with train test ratio of  0.3
MAE :  7.934544964252033
MSE :  92.14342615259058
RMSE :  9.599136739967328
NRMSE :  0.33039570451803885
R2 :  0.18080368928512458
Adjusted R2 :  0.17234868149104898
For Mobile UPDRS with Log Transformation 

There is not much difference between the performance matrix of linear and log transmission Lower MAE, MSE, RMSE, and higher R-squared values indicate better performance.
for 0.2 mobile 
MAE :  6.304429026463704
MSE :  55.11051385972673

MAE :  6.353395591926878
MSE :  56.01419722157606

There is a slight difference at other train test ratios too but the gain or loss is not enough to hit a dent

**TASK 4**

In [183]:
# Creating standard scaller
scaler = StandardScaler()

# standardizing the x columns
X_standardized = scaler.fit_transform(X)

#for gausian transformation using boxcox method
Y_totol_UPDRS_Gau,_ =  boxcox(Y_totol_UPDRS)
Y_mobile_UPDRS_Gau,_ =  boxcox(Y_mobile_UPDRS)


In [184]:
def task4_func(train_test_ratio):
    print("For Mobile UPDRS with Scalar and Gaussian Transformation on target and with train test ratio of ",train_test_ratio/10)
    task1_funcs(X_standardized,Y_mobile_UPDRS_Gau,train_test_ratio)

    print("For Total UPDRS with scalar and Gaussian Transformation on target and with train test ratio of ",train_test_ratio/10)
    task1_funcs(X_standardized,Y_totol_UPDRS_Gau,train_test_ratio)

In [185]:
for i in range(2,6):
    task4_func(i)

For Mobile UPDRS with Scalar and Gaussian Transformation on target and with train test ratio of  0.2
MAE :  2.9518393729989603
MSE :  12.18937450612094
RMSE :  3.491328472962826
NRMSE :  0.3025138073399999
R2 :  0.1264265721794614
Adjusted R2 :  0.11282421776703078
For Total UPDRS with scalar and Gaussian Transformation on target and with train test ratio of  0.2
MAE :  2.813235475734836
MSE :  11.453020079972502
RMSE :  3.3842310913961686
NRMSE :  0.2614906440437709
R2 :  0.16653216508520186
Adjusted R2 :  0.1535542922232067
For Mobile UPDRS with Scalar and Gaussian Transformation on target and with train test ratio of  0.3
MAE :  2.9234352095345852
MSE :  12.038556942707968
RMSE :  3.4696623672495814
NRMSE :  0.2980513710343266
R2 :  0.15291992229576534
Adjusted R2 :  0.14417712332863453
For Total UPDRS with scalar and Gaussian Transformation on target and with train test ratio of  0.3
MAE :  2.777656114617121
MSE :  11.269177435002897
RMSE :  3.3569595521845206
NRMSE :  0.2575802434

As stated in the conclussion of task 3 Lower MAE, MSE, RMSE, and higher R-squared values indicate better performance.
At 0.2 train test ratio
For Mobile UPDRS with train test ratio of  0.2
MAE :  6.353395591926878
MSE :  56.01419722157606
RMSE :  7.484263305200857
For Mobile UPDRS with Scalar and Gaussian Transformation on target and with train test ratio of  0.2
MAE :  2.9518393729989603
MSE :  12.18937450612094
RMSE :  3.491328472962826

It can be seen there is a significant performance gain. when standardize the X and gausian transform the target(y_mobile_updrs and y_total_updrs)