Q1: K-Fold Cross Validation for Multiple Linear Regression (Least Square Error Fit)

a) Divide the dataset into input features (all columns except price) and output variable
(price)

In [57]:
import pandas as pd
df = pd.read_csv("USA_Housing.csv")
x = df.drop(columns=["Price"])
columns = x.columns
y = df["Price"]
x.head()


Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population
0,79545.45857,5.682861,7.009188,4.09,23086.8005
1,79248.64245,6.0029,6.730821,3.09,40173.07217
2,61287.06718,5.86589,8.512727,5.13,36882.1594
3,63345.24005,7.188236,5.586729,3.26,34310.24283
4,59982.19723,5.040555,7.839388,4.23,26354.10947


b) Scale the values of input features.

In [58]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_scaled = sc.fit_transform(x)
x = pd.DataFrame(x_scaled, columns=columns)
x.head()

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population
0,1.02866,-0.296927,0.021274,0.088062,-1.317599
1,1.000808,0.025902,-0.255506,-0.722301,0.403999
2,-0.684629,-0.112303,1.516243,0.93084,0.07241
3,-0.491499,1.221572,-1.393077,-0.58454,-0.186734
4,-0.807073,-0.944834,0.846742,0.201513,-0.988387


c) Divide input and output features into five folds.

d) Run five iterations, in each iteration consider one-fold as test set and remaining
four sets as training set. Find the beta (β) matrix, predicted values, and R2_score
for each iteration using least square error fit.

In [59]:
from sklearn.model_selection import KFold
import numpy as np
from sklearn.metrics import r2_score
kf = KFold(n_splits=5, shuffle=True, random_state=42)

best_r2 = -np.inf
best_beta = None
r2_scores = []

for train_idx, test_idx in kf.split(x):
    x_train, x_test = x.iloc[train_idx], x.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    
    x_train_new = np.hstack([np.ones((x_train.shape[0],1)), x_train])
    x_test_new = np.hstack([np.ones((x_test.shape[0],1)), x_test])
    
    #β = (XᵀX)^(-1) Xᵀ y
    beta = np.linalg.pinv(x_train_new.T @ x_train_new)@x_train_new.T @ y_train
    y_pred = x_test_new @ beta
    
    r2 = r2_score(y_test, y_pred)
    r2_scores.append(r2)
    if r2>best_r2:
        best_r2 = r2
        best_beta = beta
    
    

e) Use the best value of (β) matrix (for which R2_score is maximum), to train the
regressor for 70% of data and test the performance for remaining 30% data.

In [60]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.3, random_state=42)

X_train_final = np.hstack([np.ones((X_train.shape[0], 1)), X_train])
X_test_final = np.hstack([np.ones((X_test.shape[0], 1)), X_test])

beta_final = np.linalg.pinv(X_train_final.T @ X_train_final) @ X_train_final.T @ Y_train
Y_pred = X_test_final @ beta_final
r2_final = r2_score(Y_test, Y_pred)

r2_final

0.9146818498916267

Q2 Concept of Validation set for Multiple Linear Regression (Gradient Descent
Optimization)

a) Consider the same dataset of Q1, rather than dividing the dataset into five folds, divide the
dataset into training set (56%), validation set (14%), and test set (30%).

In [61]:
x_aug = np.hstack([np.ones((x.shape[0], 1)), x])
x_temp, x_test, y_temp, y_test = train_test_split(
    x_aug, y, test_size=0.30, random_state=42
)
x_train, x_val, y_train, y_val = train_test_split(
    x_temp, y_temp, test_size=0.20, random_state=42
)

b) Consider four different values of learning rate i.e. {0.001,0.01,0.1,1}. Compute the values of
regression coefficients for each value of learning rate after 1000 iterations.

c) For each set of regression coefficients, compute R2_score for validation and test set and find
the best value of regression coefficients.

In [62]:
def gradient_descent(X, Y, lr, num_iters):
    m, n = X.shape
    beta = np.zeros(n)
    for i in range(num_iters):
        y_pred = X @ beta
        grad = (2/m) * (X.T @ (y_pred - Y))
        beta -= lr * grad
    return beta

learning_rates = [0.001, 0.01, 0.1, 1]
results = []

for lr in learning_rates:
    beta_gd = gradient_descent(x_train, y_train, lr, 1000)
    y_val_pred = x_val @ beta_gd
    r2_val = r2_score(y_val, y_val_pred)
    y_test_pred = x_test @ beta_gd
    r2_test = r2_score(y_test, y_test_pred)
    
    results.append({'lr': lr, 'beta': beta_gd, 'r2_val': r2_val, 'r2_test': r2_test})

results

  numerator = xp.sum(weight * (y_true - y_pred) ** 2, axis=0)
  numerator = xp.sum(weight * (y_true - y_pred) ** 2, axis=0)


[{'lr': 0.001,
  'beta': array([1065444.29940522,  198687.06076461,  140098.69079504,
          103050.27041846,   25950.4759087 ,  125942.09158332]),
  'r2_val': 0.6820187423659023,
  'r2_test': 0.6490453443347968},
 {'lr': 0.01,
  'beta': array([1232618.31836202,  230067.95333238,  163710.26584918,
          121680.22876975,    2833.37135223,  150657.57448494]),
  'r2_val': 0.909799626728122,
  'r2_test': 0.9147569598865972},
 {'lr': 0.1,
  'beta': array([1232618.32011841,  230067.9889464 ,  163710.33259401,
          121681.42752284,    2832.15066521,  150657.52262836]),
  'r2_val': 0.9097995626742027,
  'r2_test': 0.9147570103083724},
 {'lr': 1,
  'beta': array([-1.27984491e+284, -7.93780641e+283, -1.89878064e+283,
         -1.21190221e+285, -1.19690539e+285,  7.62932836e+283]),
  'r2_val': -inf,
  'r2_test': -inf}]

Q3 Pre-processing and Multiple Linear Regression

1. Load the dataset with following column names ["symboling", "normalized_losses",
"make", "fuel_type", "aspiration","num_doors", "body_style", "drive_wheels",
"engine_location", "wheel_base", "length", "width", "height", "curb_weight",
"engine_type", "num_cylinders", "engine_size", "fuel_system", "bore", "stroke",
"compression_ratio", "horsepower", "peak_rpm", "city_mpg", "highway_mpg", "price"]
and replace all ? values with NaN

In [63]:
import pandas as pd
import numpy as np

columns = ["symboling", "normalized_losses",
"make", "fuel_type", "aspiration","num_doors", "body_style", "drive_wheels",
"engine_location", "wheel_base", "length", "width", "height", "curb_weight",
"engine_type", "num_cylinders", "engine_size", "fuel_system", "bore", "stroke",
"compression_ratio", "horsepower", "peak_rpm", "city_mpg", "highway_mpg", "price"]

df = pd.read_csv("CarPrice.csv", header=None, names=columns)
df.replace('?', np.nan, inplace=True)
df.head()


Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


2. Replace all NaN values with central tendency imputation. Drop the rows with NaN
values in price column

In [64]:
df.dropna(subset=["price"], inplace=True)
df = df.apply(pd.to_numeric, errors='ignore')
numeric_cols = df.select_dtypes(include=['number']).columns.tolist()
non_numeric_cols = df.select_dtypes(exclude=['number']).columns.tolist()

for col in numeric_cols:
    df[col] = df[col].fillna(df[col].mean())
for col in non_numeric_cols:
    df[col] = df[col].fillna(df[col].mode()[0])
    
df.isnull().sum()


  df = df.apply(pd.to_numeric, errors='ignore')


symboling            0
normalized_losses    0
make                 0
fuel_type            0
aspiration           0
num_doors            0
body_style           0
drive_wheels         0
engine_location      0
wheel_base           0
length               0
width                0
height               0
curb_weight          0
engine_type          0
num_cylinders        0
engine_size          0
fuel_system          0
bore                 0
stroke               0
compression_ratio    0
horsepower           0
peak_rpm             0
city_mpg             0
highway_mpg          0
price                0
dtype: int64

3. There are 10 columns in the dataset with non-numeric values. Convert these values to
numeric values using following scheme:

(i) For “num_doors” and “num_cylinders”: convert words (number names) to figures
for e.g., two to 2

In [65]:
word_to_num = {
    'zero':0, 'one':1, 'two':2, 'three':3, 'four':4,
    'five':5, 'six':6, 'seven':7, 'eight':8, 'nine':9,
    'ten':10, 'twelve':12
}

for col in ['num_doors', 'num_cylinders']:
    df[col] = df[col].str.lower().map(word_to_num)

(ii) For "body_style", "drive_wheels": use dummy encoding scheme

In [66]:
df = pd.get_dummies(df, columns=['body_style', 'drive_wheels'], drop_first=True)

(iii) For “make”, “aspiration”, “engine_location”,fuel_type: use label encoding
scheme

In [67]:
from sklearn.preprocessing import LabelEncoder
label_cols = ['make', 'aspiration', 'engine_location', 'fuel_type']
le = LabelEncoder()
for col in label_cols:
    df[col] = le.fit_transform(df[col])

(iv) For fuel_system: replace values containing string pfi to 1 else all values to 0.

In [68]:
df['fuel_system'] = df['fuel_system'].apply(lambda x: 1 if 'pfi' in str(x) else 0)

(v) For engine_type: replace values containing string ohc to 1 else all values to 0.

In [69]:
df['engine_type'] = df['engine_type'].apply(lambda x: 1 if 'ohc' in str(x) else 0)

4. Divide the dataset into input features (all columns except price) and output variable
(price). Scale all input features.

In [70]:
x = df.iloc[:, :-1]
y = df["price"]
cols = x.columns

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x = pd.DataFrame(sc.fit_transform(x), columns=cols)

5. Train a linear regressor on 70% of data (using inbuilt linear regression function of
Python) and test its performance on remaining 30% of data.

In [86]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

lr = LinearRegression()
lr.fit(x_train, y_train)
y_pred = lr.predict(x_test)
lr.score(x_test, y_test)

1.0

6. Reduce the dimensionality of the feature set using inbuilt PCA decomposition and then
again train a linear regressor on 70% of reduced data (using inbuilt linear regression
function of Python). Does it lead to any performance improvement on test set?

In [87]:
from sklearn.decomposition import PCA
pca = PCA(n_components=0.95, random_state=42)
x_train_pca = pca.fit_transform(x_train)
x_test_pca = pca.transform(x_test)

lr_pca = LinearRegression()
lr_pca.fit(x_train_pca, y_train)
y_pred_pca = lr_pca.predict(x_test_pca)
lr_pca.score(x_test_pca, y_test)

0.9019740275906319