# Experimentation of Linear Regression Models

In [1]:
# importing required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge, Lasso, SGDRegressor
from sklearn.metrics import mean_squared_error as mse
import os
import joblib

### Loading train and test sets of synthetic data and concrete compressive strength

In [2]:
cwd = os.getcwd()
data_dir = os.path.join(os.path.dirname(cwd), "data/processed")
model_dir = os.path.join(os.path.dirname(cwd), "models")
# listing all the files
data_files = os.listdir(data_dir)
model_files = os.listdir(model_dir)
print("Data Files : \n", data_files)
print("\nModel Files : \n", model_files)

Data Files : 
 ['ccs_test_scaled_x.csv', 'ccs_test_scaled_y.csv', 'ccs_train_scaled_x.csv', 'ccs_train_scaled_y.csv', 'synthetic_test_scaled_x.csv', 'synthetic_test_scaled_y.csv', 'synthetic_train_scaled_x.csv', 'synthetic_train_scaled_y.csv', 'synthetic_true_beta.csv']

Model Files : 
 ['ccs_scaler_x.pkl', 'ccs_scaler_y.pkl', 'synthetic_scaler_x.pkl', 'synthetic_scaler_y.pkl']


In [3]:
# loading all the synthetic data
syn_files = ['synthetic_test_scaled_x.csv', 'synthetic_test_scaled_y.csv', 'synthetic_train_scaled_x.csv', 
             'synthetic_train_scaled_y.csv', 'synthetic_true_beta.csv']

syn_test_scaled_x, syn_test_scaled_y, syn_train_scaled_x, syn_train_scaled_y, syn_true_beta = [np.array(pd.read_csv(os.path.join(data_dir, syn_files[i])))
                                                                                                                             for i in range(5)]

# loading all the concrete compressive strength data
ccs_files = ['ccs_test_scaled_x.csv', 'ccs_test_scaled_y.csv', 'ccs_train_scaled_x.csv', 'ccs_train_scaled_y.csv']

ccs_test_scaled_x, ccs_test_scaled_y, ccs_train_scaled_x, ccs_train_scaled_y= [np.array(pd.read_csv(os.path.join(data_dir, ccs_files[i]))) for i in range(4)]

In [4]:
# loading all the scalers
ccs_scaler_x, ccs_scaler_y, synthetic_scaler_x, synthetic_scaler_y = [joblib.load(os.path.join(model_dir, model_files[i])) for i in range(4)]

In [5]:
# helper function to store all the metrics

def save_metrics(labels, coefficients, intercept, test_mse, test_r2, custom = False):
    
    if custom:
        models = ['Linear','Linear_GD',  'Ridge', 'Ridge_GD','Lasso_CD']
    else:
        models = ['Linear', 'Ridge', 'Lasso']
    
    
    df_coefficients = pd.DataFrame()
    df_intercept = pd.DataFrame()
    df_mse = pd.DataFrame()
    df_r2 = pd.DataFrame()
    for i in range(len(models)):
        df_coefficients[models[i]] = coefficients[i]
        df_intercept[models[i]] = intercept[i]
        df_mse[models[i]] = test_mse[i]
        df_r2[models[i]] = test_r2[i]
    
    df = pd.concat([df_coefficients, df_intercept, df_mse, df_r2], ignore_index = True)
    #print(df_coefficients, df_intercept, df_mse, df_r2, sep = "\n")
    df["Labels"] = labels
    models.insert(0, "Labels")
    df = df[models]

    return df



## Ordinary Least Squares Experimentation

### Synthetic data

In [6]:
syn_ols = LinearRegression()
syn_ols.fit(syn_train_scaled_x, syn_train_scaled_y)
syn_ols_test_pred = syn_ols.predict(syn_test_scaled_x)
syn_ols_mse = mse(syn_test_scaled_y, syn_ols_test_pred)
syn_ols_r2_score = syn_ols.score(syn_test_scaled_x, syn_test_scaled_y)
print(f"MSE : {syn_ols_mse} \nR2 Score : {syn_ols_r2_score}")
print(f"Actual Coefficients: \n{np.array(syn_true_beta)}")
print(f"Estimated Intercept: \n{syn_ols.intercept_}")
print(f"Estimated Coefficients: \n{syn_ols.coef_}")

MSE : 0.0019372109275486045 
R2 Score : 0.9978326567085775
Actual Coefficients: 
[[-3.21747533]
 [ 0.        ]
 [-3.26519926]
 [ 0.        ]
 [ 1.37553264]
 [-0.03833616]
 [ 0.        ]
 [-2.0055649 ]
 [ 0.        ]
 [ 3.01366916]
 [ 2.66588524]
 [ 2.3361583 ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]]
Estimated Intercept: 
[6.42431289e-17]
Estimated Coefficients: 
[[-1.27425733e-02 -5.97751745e-01  1.49047600e-03  2.25873114e-01
  -1.23712085e-02 -5.82427479e-03 -3.67146299e-01  5.70314152e-04
   4.68363900e-01  4.53725264e-01  3.78986376e-01 -2.82841677e-03
  -1.64020631e-02  2.60845216e-03  4.54631503e-03]]


### Concrete Compression Strength data

In [7]:
ccs_ols = LinearRegression()
ccs_ols.fit(ccs_train_scaled_x, ccs_train_scaled_y)
ccs_ols_test_pred = ccs_ols.predict(ccs_test_scaled_x)
ccs_ols_mse = mse(ccs_test_scaled_y, ccs_ols_test_pred)
ccs_ols_r2_score = ccs_ols.score(ccs_test_scaled_x, ccs_test_scaled_y)
print(f"MSE : {ccs_ols_mse} \nR2 Score : {ccs_ols_r2_score}")
print(f"Estimated Intercept: \n{ccs_ols.intercept_}")
print(f"Estimated Coefficients: \n{ccs_ols.coef_}")

MSE : 0.4444234800359259 
R2 Score : 0.5732161868625566
Estimated Intercept: 
[2.27395861e-16]
Estimated Coefficients: 
[[ 0.72887201  0.5038337   0.31776511 -0.19967827  0.11567052  0.08493404
   0.06673234  0.44385624]]


# Ridge Regression Experimentation

### Synthetic data

In [8]:
syn_ridge = Ridge(alpha = 0.01)
syn_ridge.fit(syn_train_scaled_x, syn_train_scaled_y)
syn_ridge_test_pred = syn_ridge.predict(syn_test_scaled_x)
syn_ridge_mse = mse(syn_test_scaled_y, syn_ridge_test_pred)
syn_ridge_r2_score = syn_ridge.score(syn_test_scaled_x, syn_test_scaled_y)
print(f"MSE : {syn_ridge_mse} \nR2 Score : {syn_ridge_r2_score}")
print(f"Actual Coefficients: \n{np.array(syn_true_beta)}")
print(f"Estimated Intercept: \n{syn_ridge.intercept_}")
print(f"Estimated Coefficients: \n{syn_ridge.coef_}")

MSE : 0.0019480844470239824 
R2 Score : 0.9978204914615443
Actual Coefficients: 
[[-3.21747533]
 [ 0.        ]
 [-3.26519926]
 [ 0.        ]
 [ 1.37553264]
 [-0.03833616]
 [ 0.        ]
 [-2.0055649 ]
 [ 0.        ]
 [ 3.01366916]
 [ 2.66588524]
 [ 2.3361583 ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]]
Estimated Intercept: 
[6.42185766e-17]
Estimated Coefficients: 
[-1.26891265e-02 -5.97625381e-01  1.55518920e-03  2.25618563e-01
 -1.23849491e-02 -5.73434115e-03 -3.67033894e-01  5.65432805e-04
  4.68252311e-01  4.53666036e-01  3.78895495e-01 -2.79153066e-03
 -1.64654397e-02  2.68136090e-03  4.60316376e-03]


### Concrete Compressive Strength Data

In [9]:
ccs_ridge = Ridge(alpha = 0.01)
ccs_ridge.fit(ccs_train_scaled_x, ccs_train_scaled_y)
ccs_ridge_test_pred = ccs_ridge.predict(ccs_test_scaled_x)
ccs_ridge_mse = mse(ccs_test_scaled_y, ccs_ridge_test_pred)
ccs_ridge_r2_score = ccs_ridge.score(ccs_test_scaled_x, ccs_test_scaled_y)
print(f"MSE : {ccs_ridge_mse} \nR2 Score : {ccs_ridge_r2_score}")
print(f"Estimated Intercept: \n{ccs_ridge.intercept_}")
print(f"Estimated Coefficients: \n{ccs_ridge.coef_}")

MSE : 0.44442573529864327 
R2 Score : 0.5732140211137486
Estimated Intercept: 
[2.27462741e-16]
Estimated Coefficients: 
[ 0.72873647  0.50370319  0.31764889 -0.19977065  0.11566863  0.0848393
  0.06661372  0.4438441 ]


# Lasso Regression Experimentation

### Synthetic data

In [10]:
syn_lasso = Lasso(alpha = 0.01)
syn_lasso.fit(syn_train_scaled_x, syn_train_scaled_y)
syn_lasso_test_pred = syn_lasso.predict(syn_test_scaled_x)
syn_lasso_mse = mse(syn_test_scaled_y, syn_lasso_test_pred)
syn_lasso_r2_score = syn_lasso.score(syn_test_scaled_x, syn_test_scaled_y)
print(f"MSE : {syn_lasso_mse} \nR2 Score : {syn_lasso_r2_score}")
print(f"Actual Coefficients: \n{np.array(syn_true_beta)}")
print(f"Estimated Intercept: \n{syn_lasso.intercept_}")
print(f"Estimated Coefficients: \n{syn_lasso.coef_}")

MSE : 0.004034555718858743 
R2 Score : 0.9954861563360041
Actual Coefficients: 
[[-3.21747533]
 [ 0.        ]
 [-3.26519926]
 [ 0.        ]
 [ 1.37553264]
 [-0.03833616]
 [ 0.        ]
 [-2.0055649 ]
 [ 0.        ]
 [ 3.01366916]
 [ 2.66588524]
 [ 2.3361583 ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]]
Estimated Intercept: 
[6.3138487e-17]
Estimated Coefficients: 
[ 0.         -0.58365055  0.          0.21107639 -0.00202828  0.
 -0.35331373 -0.          0.45264696  0.4468358   0.36351117  0.
 -0.01145968  0.00178369  0.00181749]


### Concrete Compression Strength data

In [11]:
ccs_lasso = Lasso(alpha = 0.01)
ccs_lasso.fit(ccs_train_scaled_x, ccs_train_scaled_y)
ccs_lasso_test_pred = ccs_lasso.predict(ccs_test_scaled_x)
ccs_lasso_mse = mse(ccs_test_scaled_y, ccs_lasso_test_pred)
ccs_lasso_r2_score = ccs_lasso.score(ccs_test_scaled_x, ccs_test_scaled_y)
print(f"MSE : {ccs_lasso_mse} \nR2 Score : {ccs_lasso_r2_score}")
print(f"Estimated Intercept: \n{ccs_lasso.intercept_}")
print(f"Estimated Coefficients: \n{ccs_lasso.coef_}")

MSE : 0.4461974040383291 
R2 Score : 0.5715126718954795
Estimated Intercept: 
[2.75589147e-16]
Estimated Coefficients: 
[ 0.60727329  0.38277518  0.20760533 -0.26725136  0.11259232  0.
 -0.02387945  0.42496707]


## Storing all the results as pandas dataframe

In [12]:
# Synthetic Data
coefficients = [pd.Series(model.coef_.reshape(-1, )) for model in (syn_ols , syn_ridge, syn_lasso)]
intercept = [model.intercept_ for model in (syn_ols , syn_ridge, syn_lasso)]
test_mse = [np.array([mse]) for mse in (syn_ols_mse, syn_ridge_mse, syn_lasso_mse)]
test_r2 = [np.array([r2]) for r2 in (syn_ols_r2_score, syn_ridge_r2_score, syn_lasso_r2_score)]
labels = ['Intercept', 'Feature 1', 'Feature 2', 'Feature 3', 'Feature 4',
           'Feature 5', 'Feature 6', 'Feature 7', 'Feature 8', 'Feature 9',
           'Feature 10', 'Feature 11', 'Feature 12', 'Feature 13', 'Feature 14',
           'Feature 15', 'Test MSE', 'Test R2 Score']
syn_df_1 = save_metrics(labels, coefficients, intercept, test_mse, test_r2)
syn_df_1.to_csv(os.path.join(os.path.dirname(cwd), r"results/metrics", "Group1_Metrics_Synthetic.csv"), index=False)

# Concrete Compressive Strength
coefficients = [pd.Series(model.coef_.reshape(-1, )) for model in (ccs_ols , ccs_ridge, ccs_lasso)]
intercept = [model.intercept_ for model in (ccs_ols , ccs_ridge, ccs_lasso)]
test_mse = [np.array([mse]) for mse in (ccs_ols_mse,  ccs_ridge_mse, ccs_lasso_mse)]
test_r2 = [np.array([r2]) for r2 in (ccs_ols_r2_score, ccs_ridge_r2_score, ccs_lasso_r2_score)]
labels = [ 'Intercept', 'Cement', 'Blast Furnace Slag', 'Fly Ash', 'Water', 'Superplasticizer',
       'Coarse Aggregate', 'Fine Aggregate', 'Age (day)', 'Test MSE', 'Test R2 Score']
ccs_df_1 = save_metrics(labels, coefficients, intercept, test_mse, test_r2)
ccs_df_1.to_csv(os.path.join(os.path.dirname(cwd), r"results/metrics", "Group1_Metrics_CCS.csv"), index=False)

# SGDRegressor Experimentation

## Linear Regression  => penalty = None

### Synthetic data

In [13]:
syn_sgd_lr = SGDRegressor(penalty = None)
syn_sgd_lr.fit(syn_train_scaled_x, syn_train_scaled_y.ravel())
syn_sgd_lr_test_pred = syn_sgd_lr.predict(syn_test_scaled_x)
syn_sgd_lr_mse = mse(syn_test_scaled_y, syn_sgd_lr_test_pred)
syn_sgd_lr_r2_score = syn_sgd_lr.score(syn_test_scaled_x, syn_test_scaled_y)
print(f"MSE : {syn_sgd_lr_mse} \nR2 Score : {syn_sgd_lr_r2_score}")
print(f"Actual Coefficients: \n{np.array(syn_true_beta)}")
print(f"Estimated Intercept: \n{syn_sgd_lr.intercept_}")
print(f"Estimated Coefficients: \n{syn_sgd_lr.coef_}")

MSE : 0.03522444920902095 
R2 Score : 0.9605910370411594
Actual Coefficients: 
[[-3.21747533]
 [ 0.        ]
 [-3.26519926]
 [ 0.        ]
 [ 1.37553264]
 [-0.03833616]
 [ 0.        ]
 [-2.0055649 ]
 [ 0.        ]
 [ 3.01366916]
 [ 2.66588524]
 [ 2.3361583 ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]]
Estimated Intercept: 
[-4.94636471e-05]
Estimated Coefficients: 
[ 1.62289419e-02 -5.34622852e-01  4.07239693e-02  4.97100172e-02
 -2.41134260e-02  6.08804767e-02 -2.97297558e-01  5.17084701e-04
  4.11228786e-01  4.31519792e-01  3.33011594e-01  1.61339448e-02
 -6.28571622e-02  6.77302668e-02  4.95981434e-02]


### Concrete Compressive Strength

In [14]:
ccs_sgd_lr = SGDRegressor(penalty = None)
ccs_sgd_lr.fit(ccs_train_scaled_x, ccs_train_scaled_y.ravel())
ccs_sgd_lr_test_pred = ccs_sgd_lr.predict(ccs_test_scaled_x)
ccs_sgd_lr_mse = mse(ccs_test_scaled_y, ccs_sgd_lr_test_pred)
ccs_sgd_lr_r2_score = ccs_sgd_lr.score(ccs_test_scaled_x, ccs_test_scaled_y)
print(f"MSE : {ccs_sgd_lr_mse} \nR2 Score : {ccs_sgd_lr_r2_score}")
print(f"Estimated Coefficients: \n{ccs_sgd_lr.coef_}")

MSE : 0.4586469900985949 
R2 Score : 0.5595572238837014
Estimated Coefficients: 
[ 0.50994317  0.29209185  0.12781442 -0.35608679  0.11036394 -0.0680883
 -0.12984413  0.4298447 ]


## Ridge Regression => penalty = "l2" 

### Synthetic data

In [15]:
syn_sgd_ridge = SGDRegressor(alpha = 0.01)
syn_sgd_ridge.fit(syn_train_scaled_x, syn_train_scaled_y.ravel())
syn_sgd_ridge_test_pred = syn_sgd_ridge.predict(syn_test_scaled_x)
syn_sgd_ridge_mse = mse(syn_test_scaled_y, syn_sgd_ridge_test_pred)
syn_sgd_ridge_r2_score = syn_sgd_ridge.score(syn_test_scaled_x, syn_test_scaled_y)
print(f"MSE : {syn_sgd_ridge_mse} \nR2 Score : {syn_sgd_ridge_r2_score}")
print(f"Actual Coefficients: \n{np.array(syn_true_beta)}")
print(f"Estimated Intercept: \n{syn_sgd_ridge.intercept_}")
print(f"Estimated Coefficients: \n{syn_sgd_ridge.coef_}")

MSE : 0.03731259404567314 
R2 Score : 0.9582548295384672
Actual Coefficients: 
[[-3.21747533]
 [ 0.        ]
 [-3.26519926]
 [ 0.        ]
 [ 1.37553264]
 [-0.03833616]
 [ 0.        ]
 [-2.0055649 ]
 [ 0.        ]
 [ 3.01366916]
 [ 2.66588524]
 [ 2.3361583 ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]]
Estimated Intercept: 
[0.00064318]
Estimated Coefficients: 
[ 0.01783479 -0.5283063   0.03818177  0.05315024 -0.02234145  0.06237736
 -0.29336922 -0.00127717  0.40643644  0.42931718  0.33015073  0.01487585
 -0.05860289  0.06705793  0.04795302]


### Concrete Compressive Strength

In [16]:
ccs_sgd_ridge =  SGDRegressor(alpha = 0.01)
ccs_sgd_ridge.fit(ccs_train_scaled_x, ccs_train_scaled_y.reshape(-1, ))
ccs_sgd_ridge_test_pred = ccs_sgd_ridge.predict(ccs_test_scaled_x)
ccs_sgd_ridge_mse = mse(ccs_test_scaled_y, ccs_sgd_ridge_test_pred)
ccs_sgd_ridge_r2_score = ccs_sgd_ridge.score(ccs_test_scaled_x, ccs_test_scaled_y)
print(f"MSE : {ccs_sgd_ridge_mse} \nR2 Score : {ccs_sgd_ridge_r2_score}")
print(f"Estimated Coefficients: \n{ccs_sgd_ridge.coef_}")

MSE : 0.46389514091923445 
R2 Score : 0.5545173780614874
Estimated Coefficients: 
[ 0.5150133   0.29085439  0.12842919 -0.34489073  0.11513234 -0.06294959
 -0.13010718  0.43772172]


## Lasso Regression => penalty = "l1"

### Synthetic data

In [17]:
syn_sgd_lasso = SGDRegressor(penalty = "l1", alpha = 0.01, l1_ratio = 0.01)
syn_sgd_lasso.fit(syn_train_scaled_x, syn_train_scaled_y.reshape(-1, ))
syn_sgd_lasso_test_pred = syn_sgd_lasso.predict(syn_test_scaled_x)
syn_sgd_lasso_mse = mse(syn_test_scaled_y, syn_sgd_lasso_test_pred)
syn_sgd_lasso_r2_score = syn_sgd_lasso.score(syn_test_scaled_x, syn_test_scaled_y)
print(f"MSE : {syn_sgd_lasso_mse} \nR2 Score : {syn_sgd_lasso_r2_score}")
print(f"Actual Coefficients: \n{np.array(syn_true_beta)}")
print(f"Estimated Coefficients: \n{syn_sgd_lasso.coef_}")

MSE : 0.039761497742818994 
R2 Score : 0.9555150065672715
Actual Coefficients: 
[[-3.21747533]
 [ 0.        ]
 [-3.26519926]
 [ 0.        ]
 [ 1.37553264]
 [-0.03833616]
 [ 0.        ]
 [-2.0055649 ]
 [ 0.        ]
 [ 3.01366916]
 [ 2.66588524]
 [ 2.3361583 ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]]
Estimated Coefficients: 
[ 0.01504125 -0.52405     0.03493701  0.05380214 -0.00948238  0.06072722
 -0.2850615   0.          0.39908313  0.42388835  0.32244381  0.00821771
 -0.05618042  0.06619704  0.04770913]


### Concrete Compressive Strength

In [18]:
ccs_sgd_lasso =  SGDRegressor(penalty = "l1", alpha = 0.01, l1_ratio = 0.01)
ccs_sgd_lasso.fit(ccs_train_scaled_x, ccs_train_scaled_y.reshape(-1, ))
ccs_sgd_lasso_test_pred = ccs_sgd_lasso.predict(ccs_test_scaled_x)
ccs_sgd_lasso_mse = mse(ccs_test_scaled_y, ccs_sgd_lasso_test_pred)
ccs_sgd_lasso_r2_score = ccs_sgd_lasso.score(ccs_test_scaled_x, ccs_test_scaled_y)
print(f"MSE : {ccs_sgd_lasso_mse} \nR2 Score : {ccs_sgd_lasso_r2_score}")
print(f"Estimated Coefficients: \n{ccs_sgd_lasso.coef_}")

MSE : 0.4598860999617885 
R2 Score : 0.558367295682187
Estimated Coefficients: 
[ 0.48952206  0.2745733   0.09243868 -0.32517566  0.13147812 -0.06180302
 -0.12946188  0.41431469]


## Storing the metrics of group 2

In [19]:
# Synthetic Data
coefficients = [pd.Series(model.coef_.reshape(-1, )) for model in (syn_sgd_lr , syn_sgd_ridge, syn_sgd_lasso)]
intercept = [model.intercept_ for model in (syn_sgd_lr , syn_sgd_ridge, syn_sgd_lasso)]
test_mse = [np.array([mse]) for mse in (syn_sgd_lr_mse, syn_sgd_ridge_mse, syn_sgd_lasso_mse)]
test_r2 = [np.array([r2]) for r2 in (syn_sgd_lr_r2_score, syn_sgd_ridge_r2_score, syn_sgd_lasso_r2_score)]
labels = ['Intercept', 'Feature 1', 'Feature 2', 'Feature 3', 'Feature 4',
           'Feature 5', 'Feature 6', 'Feature 7', 'Feature 8', 'Feature 9',
           'Feature 10', 'Feature 11', 'Feature 12', 'Feature 13', 'Feature 14',
           'Feature 15', 'Test MSE', 'Test R2 Score']
df = save_metrics(labels, coefficients, intercept, test_mse, test_r2)
df.to_csv(os.path.join(os.path.dirname(cwd), r"results/metrics", "Group2_Metrics_Synthetic.csv"), index=False)

# Concrete Compressive Strength
coefficients = [pd.Series(model.coef_.reshape(-1, )) for model in (ccs_sgd_lr , ccs_sgd_ridge, ccs_sgd_lasso)]
intercept = [model.intercept_ for model in (ccs_sgd_lr , ccs_sgd_ridge, ccs_sgd_lasso)]
test_mse = [np.array([mse]) for mse in (ccs_sgd_lr_mse,  ccs_sgd_ridge_mse, ccs_sgd_lasso_mse)]
test_r2 = [np.array([r2]) for r2 in (ccs_sgd_lr_r2_score, ccs_sgd_ridge_r2_score, ccs_sgd_lasso_r2_score)]
labels = [ 'Intercept', 'Cement', 'Blast Furnace Slag', 'Fly Ash', 'Water', 'Superplasticizer',
       'Coarse Aggregate', 'Fine Aggregate', 'Age (day)', 'Test MSE', 'Test R2 Score']
df = save_metrics(labels, coefficients, intercept, test_mse, test_r2)
df.to_csv(os.path.join(os.path.dirname(cwd), r"results/metrics", "Group2_Metrics_CCS.csv"), index=False)

# Custom Linear Regression Implementations

In [20]:
# This allows .py files defined in src to be used in notebooks
import os
import sys

src_path = os.path.abspath(os.path.join('..', 'src'))

if src_path not in sys.path:
    sys.path.append(src_path)

from models import OLS, OLS_GD, Ridge, Ridge_GD, Lasso_CD
    

## Linear Regression -> Closed form

### Synthetic Data

In [21]:
from sklearn.metrics import mean_squared_error as mse, r2_score

In [22]:
syn_ols_custom = OLS()
syn_ols_custom.fit(syn_train_scaled_x, syn_train_scaled_y.reshape(-1, ))
syn_ols_custom_test_pred = syn_ols_custom.predict(syn_test_scaled_x)
syn_ols_custom_mse = mse(syn_test_scaled_y, syn_ols_custom_test_pred)
syn_ols_custom_r2_score = r2_score(syn_test_scaled_y, syn_ols_custom_test_pred)
print(f"MSE : {syn_ols_custom_mse} \nR2 Score : {syn_ols_custom_r2_score}")
print(f"Actual Coefficients: \n{np.array(syn_true_beta)}")
print(f"Estimated Coefficients: \n{syn_ols_custom.coefficients}")
print(f"Estimated Intercept: \n{syn_ols_custom.intercept}")

Final Train Loss: 0.0009
MSE : 0.0019372109275486425 
R2 Score : 0.9978326567085775
Actual Coefficients: 
[[-3.21747533]
 [ 0.        ]
 [-3.26519926]
 [ 0.        ]
 [ 1.37553264]
 [-0.03833616]
 [ 0.        ]
 [-2.0055649 ]
 [ 0.        ]
 [ 3.01366916]
 [ 2.66588524]
 [ 2.3361583 ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]]
Estimated Coefficients: 
[-1.27425733e-02 -5.97751745e-01  1.49047600e-03  2.25873114e-01
 -1.23712085e-02 -5.82427479e-03 -3.67146299e-01  5.70314152e-04
  4.68363900e-01  4.53725264e-01  3.78986376e-01 -2.82841677e-03
 -1.64020631e-02  2.60845216e-03  4.54631503e-03]
Estimated Intercept: 
9.040730017461695e-17


### Concrete Compression Strength Data

In [23]:
ccs_ols_custom = OLS()
ccs_ols_custom.fit(ccs_train_scaled_x, ccs_train_scaled_y.reshape(-1, ))
ccs_ols_custom_test_pred = ccs_ols_custom.predict(ccs_test_scaled_x)
ccs_ols_custom_mse = mse(ccs_test_scaled_y, ccs_ols_custom_test_pred)
ccs_ols_custom_r2_score = r2_score(ccs_test_scaled_y, ccs_ols_custom_test_pred)
print(f"MSE : {ccs_ols_custom_mse} \nR2 Score : {ccs_ols_custom_r2_score}")
print(f"Estimated Coefficients: \n{ccs_ols_custom.coefficients}")
print(f"Estimated Intercept: \n{ccs_ols_custom.intercept}")

Final Train Loss: 0.3760
MSE : 0.444423480035926 
R2 Score : 0.5732161868625565
Estimated Coefficients: 
[ 0.72887201  0.5038337   0.31776511 -0.19967827  0.11567052  0.08493404
  0.06673234  0.44385624]
Estimated Intercept: 
2.484598912291135e-16


## Linear Regression -> Gradient Descent

### Synthetic Data

In [24]:
syn_ols_gd_custom = OLS_GD()
syn_ols_gd_custom.fit(syn_train_scaled_x, syn_train_scaled_y.reshape(-1, ), 10000, 0.01)
syn_ols_gd_custom_test_pred = syn_ols_gd_custom.predict(syn_test_scaled_x)
syn_ols_gd_custom_mse = mse(syn_test_scaled_y, syn_ols_gd_custom_test_pred)
syn_ols_gd_custom_r2_score = r2_score(syn_test_scaled_y, syn_ols_gd_custom_test_pred)
print(f"MSE : {syn_ols_gd_custom_mse} \nR2 Score : {syn_ols_gd_custom_r2_score}")
print(f"Actual Coefficients: \n{np.array(syn_true_beta)}")
print(f"Estimated Coefficients: \n{syn_ols_gd_custom.coefficients}")
print(f"Estimated Intercept: \n{syn_ols_gd_custom.intercept}")

Final train loss = 0.0009
MSE : 0.0019372109275554172 
R2 Score : 0.9978326567085699
Actual Coefficients: 
[[-3.21747533]
 [ 0.        ]
 [-3.26519926]
 [ 0.        ]
 [ 1.37553264]
 [-0.03833616]
 [ 0.        ]
 [-2.0055649 ]
 [ 0.        ]
 [ 3.01366916]
 [ 2.66588524]
 [ 2.3361583 ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]]
Estimated Coefficients: 
[-1.27425733e-02 -5.97751745e-01  1.49047600e-03  2.25873114e-01
 -1.23712085e-02 -5.82427479e-03 -3.67146299e-01  5.70314152e-04
  4.68363900e-01  4.53725264e-01  3.78986376e-01 -2.82841677e-03
 -1.64020631e-02  2.60845216e-03  4.54631503e-03]
Estimated Intercept: 
5.115352585960411e-17


### Concrete Compression Strength Data

In [25]:
ccs_ols_gd_custom = OLS_GD()
ccs_ols_gd_custom.fit(ccs_train_scaled_x, ccs_train_scaled_y.reshape(-1, ), lr = 0.01)
ccs_ols_gd_custom_test_pred = ccs_ols_gd_custom.predict(ccs_test_scaled_x)
ccs_ols_gd_custom_mse = mse(ccs_test_scaled_y, ccs_ols_gd_custom_test_pred)
ccs_ols_gd_custom_r2_score = r2_score(ccs_test_scaled_y, ccs_ols_gd_custom_test_pred)
print(f"MSE : {ccs_ols_gd_custom_mse} \nR2 Score : {ccs_ols_gd_custom_r2_score}")
print(f"Estimated Coefficients: \n{ccs_ols_gd_custom.coefficients}")
print(f"Estimated Intercept: \n{ccs_ols_gd_custom.intercept}")

Final train loss = 0.3800
MSE : 0.4548013923152599 
R2 Score : 0.5632501855733762
Estimated Coefficients: 
[ 0.55925745  0.33982546  0.17283808 -0.33983302  0.10066343 -0.0459934
 -0.09451954  0.43740919]
Estimated Intercept: 
3.245745096639888e-16


## Ridge Regression -> Closed form

### Synthetic Data

In [26]:
syn_ridge_custom = Ridge()
syn_ridge_custom.fit(syn_train_scaled_x, syn_train_scaled_y.reshape(-1, ))
syn_ridge_custom_test_pred = syn_ridge_custom.predict(syn_test_scaled_x)
syn_ridge_custom_mse = mse(syn_test_scaled_y, syn_ridge_custom_test_pred)
syn_ridge_custom_r2_score = r2_score(syn_test_scaled_y, syn_ridge_custom_test_pred)
print(f"MSE : {syn_ridge_custom_mse} \nR2 Score : {syn_ridge_custom_r2_score}")
print(f"Actual Coefficients: \n{np.array(syn_true_beta)}")
print(f"Estimated Coefficients: \n{syn_ridge_custom.coefficients}")
print(f"Estimated Intercept: \n{syn_ridge_custom.intercept}")

Final Train Loss: 0.0009
MSE : 0.0019480844470239754 
R2 Score : 0.9978204914615443
Actual Coefficients: 
[[-3.21747533]
 [ 0.        ]
 [-3.26519926]
 [ 0.        ]
 [ 1.37553264]
 [-0.03833616]
 [ 0.        ]
 [-2.0055649 ]
 [ 0.        ]
 [ 3.01366916]
 [ 2.66588524]
 [ 2.3361583 ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]]
Estimated Coefficients: 
[-1.26891265e-02 -5.97625381e-01  1.55518920e-03  2.25618563e-01
 -1.23849491e-02 -5.73434115e-03 -3.67033894e-01  5.65432805e-04
  4.68252311e-01  4.53666036e-01  3.78895495e-01 -2.79153066e-03
 -1.64654397e-02  2.68136090e-03  4.60316376e-03]
Estimated Intercept: 
9.038054996216637e-17


### Concrete Compression Strength Data

In [27]:
ccs_ridge_custom = Ridge()
ccs_ridge_custom.fit(ccs_train_scaled_x, ccs_train_scaled_y.reshape(-1, ))
ccs_ridge_custom_test_pred = ccs_ridge_custom.predict(ccs_test_scaled_x)
ccs_ridge_custom_mse = mse(ccs_test_scaled_y, ccs_ridge_custom_test_pred)
ccs_ridge_custom_r2_score = r2_score(ccs_test_scaled_y, ccs_ridge_custom_test_pred)
print(f"MSE : {ccs_ridge_custom_mse} \nR2 Score : {ccs_ridge_custom_r2_score}")
print(f"Estimated Coefficients: \n{ccs_ridge_custom.coefficients}")
print(f"Estimated Intercept: \n{ccs_ridge_custom.intercept}")

Final Train Loss: 0.3760
MSE : 0.44442573529864343 
R2 Score : 0.5732140211137484
Estimated Coefficients: 
[ 0.72873647  0.50370319  0.31764889 -0.19977065  0.11566863  0.0848393
  0.06661372  0.4438441 ]
Estimated Intercept: 
2.4853192336701973e-16


## Ridge Regression -> Gradient descent

### Synthetic Data

In [28]:
syn_ridge_gd_custom = Ridge_GD()
syn_ridge_gd_custom.fit(syn_train_scaled_x, syn_train_scaled_y.reshape(-1, ))
syn_ridge_gd_custom_test_pred = syn_ridge_gd_custom.predict(syn_test_scaled_x)
syn_ridge_gd_custom_mse = mse(syn_test_scaled_y, syn_ridge_gd_custom_test_pred)
syn_ridge_gd_custom_r2_score = r2_score(syn_test_scaled_y, syn_ridge_gd_custom_test_pred)
print(f"MSE : {syn_ridge_gd_custom_mse} \nR2 Score : {syn_ridge_gd_custom_r2_score}")
print(f"Actual Coefficients: \n{np.array(syn_true_beta)}")
print(f"Estimated Coefficients: \n{syn_ridge_gd_custom.coefficients}")
print(f"Estimated Intercept: \n{syn_ridge_gd_custom.intercept}")

Final train loss = 0.0010
MSE : 0.002411533772892586 
R2 Score : 0.9973019863400567
Actual Coefficients: 
[[-3.21747533]
 [ 0.        ]
 [-3.26519926]
 [ 0.        ]
 [ 1.37553264]
 [-0.03833616]
 [ 0.        ]
 [-2.0055649 ]
 [ 0.        ]
 [ 3.01366916]
 [ 2.66588524]
 [ 2.3361583 ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]]
Estimated Coefficients: 
[-8.42232765e-03 -5.94848467e-01  7.73771877e-03  2.05854858e-01
 -1.43157094e-02 -2.67046406e-04 -3.64979941e-01 -1.43166868e-04
  4.66247561e-01  4.53813525e-01  3.76397332e-01  1.22062416e-03
 -2.18584834e-02  4.79244716e-03  7.34021291e-03]
Estimated Intercept: 
5.2100684877487496e-17


### Concrete Compression Strength Data

In [29]:
ccs_ridge_gd_custom = Ridge_GD()
ccs_ridge_gd_custom.fit(ccs_train_scaled_x, ccs_train_scaled_y.reshape(-1, ), lr = 0.01)
ccs_ridge_gd_custom_test_pred = ccs_ridge_gd_custom.predict(ccs_test_scaled_x)
ccs_ridge_gd_custom_mse = mse(ccs_test_scaled_y, ccs_ridge_gd_custom_test_pred)
ccs_ridge_gd_custom_r2_score = r2_score(ccs_test_scaled_y, ccs_ridge_gd_custom_test_pred)
print(f"MSE : {ccs_ridge_gd_custom_mse} \nR2 Score : {ccs_ridge_gd_custom_r2_score}")
print(f"Estimated Coefficients: \n{ccs_ridge_gd_custom.coefficients}")
print(f"Estimated Intercept: \n{ccs_ridge_gd_custom.intercept}")

Final train loss = 0.3800
MSE : 0.45480139231526 
R2 Score : 0.5632501855733761
Estimated Coefficients: 
[ 0.55925745  0.33982546  0.17283808 -0.33983302  0.10066343 -0.0459934
 -0.09451954  0.43740919]
Estimated Intercept: 
3.23312035180889e-16


## Lasso Regression -> Coordinate Descent

### Synthetic Data

In [30]:
syn_lasso_cd_custom = Lasso_CD()
syn_lasso_cd_custom.fit(syn_train_scaled_x, syn_train_scaled_y.reshape(-1, ))
syn_lasso_cd_custom_test_pred = syn_lasso_cd_custom.predict(syn_test_scaled_x)
syn_lasso_cd_custom_mse = mse(syn_test_scaled_y, syn_lasso_cd_custom_test_pred)
syn_lasso_cd_custom_r2_score = r2_score(syn_test_scaled_y, syn_lasso_cd_custom_test_pred)
print(f"MSE : {syn_sgd_lasso_mse} \nR2 Score : {syn_lasso_cd_custom_r2_score}")
print(f"Actual Coefficients: \n{np.array(syn_true_beta)}")
print(f"Estimated Coefficients: \n{syn_lasso_cd_custom.coefficients}")
print(f"Estimated Intercept: \n{syn_lasso_cd_custom.intercept}")

Final Train Loss : 0.0009
MSE : 0.039761497742818994 
R2 Score : 0.9978113211018478
Actual Coefficients: 
[[-3.21747533]
 [ 0.        ]
 [-3.26519926]
 [ 0.        ]
 [ 1.37553264]
 [-0.03833616]
 [ 0.        ]
 [-2.0055649 ]
 [ 0.        ]
 [ 3.01366916]
 [ 2.66588524]
 [ 2.3361583 ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]]
Estimated Coefficients: 
[-1.22521019e-02 -5.97432336e-01  1.49383078e-03  2.25067797e-01
 -1.22426467e-02 -5.34481028e-03 -3.66962619e-01  2.60277725e-04
  4.68111793e-01  4.53726599e-01  3.78729136e-01 -2.58320924e-03
 -1.62928708e-02  2.57082660e-03  4.47030067e-03]
Estimated Intercept: 
-4.874572967494828e-16


### Concrete Compression Strength Data

In [31]:
ccs_lasso_cd_custom = Lasso_CD()
ccs_lasso_cd_custom.fit(ccs_train_scaled_x, ccs_train_scaled_y.reshape(-1, ))
ccs_lasso_cd_custom_test_pred = ccs_lasso_cd_custom.predict(ccs_test_scaled_x)
ccs_lasso_cd_custom_mse = mse(ccs_test_scaled_y, ccs_lasso_cd_custom_test_pred)
ccs_lasso_cd_custom_r2_score = r2_score(ccs_test_scaled_y, ccs_lasso_cd_custom_test_pred)
print(f"MSE : {ccs_lasso_cd_custom_mse} \nR2 Score : {ccs_lasso_cd_custom_r2_score}")
print(f"Estimated Coefficients: \n{ccs_lasso_cd_custom.coefficients}")
print(f"Estimated Intercept: \n{ccs_lasso_cd_custom.intercept}")

Final Train Loss : 0.3760
MSE : 0.4444285862610935 
R2 Score : 0.5732112833091989
Estimated Coefficients: 
[ 0.72853284  0.5035003   0.31747429 -0.19993779  0.11563616  0.08466813
  0.0664118   0.44382526]
Estimated Intercept: 
9.90712366513201e-16


## Storing Metrics of the trained & tested models

In [34]:
# Synthetic Data
coefficients = [pd.Series(model.coefficients.reshape(-1, )) for model in (syn_ols_custom , syn_ols_gd_custom, syn_ridge_custom, syn_ridge_gd_custom, syn_lasso_cd_custom)]
intercept = [np.array([model.intercept]) for model in (syn_ols_custom , syn_ols_gd_custom, syn_ridge_custom, syn_ridge_gd_custom, syn_lasso_cd_custom)]
test_mse = [np.array([mse]) for mse in (syn_ols_custom_mse , syn_ols_gd_custom_mse, syn_ridge_custom_mse, syn_ridge_gd_custom_mse, syn_lasso_cd_custom_mse)]
test_r2 = [np.array([r2]) for r2 in (syn_ols_custom_r2_score , syn_ols_gd_custom_r2_score, syn_ridge_custom_r2_score, syn_ridge_gd_custom_r2_score, syn_lasso_cd_custom_r2_score)]
labels = ['Intercept', 'Feature 1', 'Feature 2', 'Feature 3', 'Feature 4',
           'Feature 5', 'Feature 6', 'Feature 7', 'Feature 8', 'Feature 9',
           'Feature 10', 'Feature 11', 'Feature 12', 'Feature 13', 'Feature 14',
           'Feature 15', 'Test MSE', 'Test R2 Score']
df1 = save_metrics(labels, coefficients, intercept, test_mse, test_r2, True)
df1.to_csv(os.path.join(os.path.dirname(cwd), r"results/metrics", "Group3_Metrics_Synthetic.csv"), index=False)

# Concrete Compressive Strength
coefficients = [pd.Series(model.coefficients.reshape(-1, )) for model in (ccs_ols_custom , ccs_ols_gd_custom, ccs_ridge_custom, ccs_ridge_gd_custom, ccs_lasso_cd_custom)]
intercept = [np.array([model.intercept]) for model in (ccs_ols_custom , ccs_ols_gd_custom, ccs_ridge_custom, ccs_ridge_gd_custom, ccs_lasso_cd_custom)]
test_mse = [np.array([mse]) for mse in (ccs_ols_custom_mse , ccs_ols_gd_custom_mse, ccs_ridge_custom_mse, ccs_ridge_gd_custom_mse, ccs_lasso_cd_custom_mse)]
test_r2 = [np.array([r2]) for r2 in (ccs_ols_custom_r2_score , ccs_ols_gd_custom_r2_score, ccs_ridge_custom_r2_score, ccs_ridge_gd_custom_r2_score, ccs_lasso_cd_custom_r2_score)]
labels = [ 'Intercept', 'Cement', 'Blast Furnace Slag', 'Fly Ash', 'Water', 'Superplasticizer',
       'Coarse Aggregate', 'Fine Aggregate', 'Age (day)', 'Test MSE', 'Test R2 Score']
df = save_metrics(labels, coefficients, intercept, test_mse, test_r2, True)
df.to_csv(os.path.join(os.path.dirname(cwd), r"results/metrics", "Group3_Metrics_CCS.csv"), index=False)

In [37]:
# losses from the custom functions using gradient descent & coordinate descent is also saved

# synthetic data
losses = [pd.Series(model.losses) for model in ( syn_ols_gd_custom, syn_ridge_gd_custom, syn_lasso_cd_custom)]
labels = [f"Epoch {i+1}" for i in range(1001)]

custom_gd_losses_df = pd.DataFrame(losses).T
custom_gd_losses_df.columns = ["Linear_GD", "Ridge_GD", "Lasso_CD"]
custom_gd_losses_df.to_csv(os.path.join(os.path.dirname(cwd), r"results/metrics", "Group3_Losses_Synthetic.csv"), index=False)

# concrete compressive strength
losses = [pd.Series(model.losses) for model in ( ccs_ols_gd_custom, ccs_ridge_gd_custom, ccs_lasso_cd_custom)]
labels = [f"Epoch {i+1}" for i in range(1001)]

custom_gd_losses_df = pd.DataFrame(losses).T
custom_gd_losses_df.columns = ["Linear_GD", "Ridge_GD", "Lasso_CD"]
custom_gd_losses_df.to_csv(os.path.join(os.path.dirname(cwd), r"results/metrics", "Group3_Losses_CCS.csv"), index=False)