In [2]:
from google.colab import files
uploaded=files.upload()

Saving USA_Housing.csv to USA_Housing.csv


**Q1: K-Fold Cross Validation for Multiple Linear Regression (Least Square Error Fit)
Download the dataset regarding USA House Price Prediction from the following link:
https://drive.google.com/file/d/1O_NwpJT-8xGfU_-3llUl2sgPu0xllOrX/view?usp=sharing
Load the dataset and Implement 5- fold cross validation for multiple linear regression
(using least square error fit).
Steps:
a) Divide the dataset into input features (all columns except price) and output variable
(price)
b) Scale the values of input features.
c) Divide input and output features into five folds.
d) Run five iterations, in each iteration consider one-fold as test set and remaining
four sets as training set. Find the beta (𝛽) matrix, predicted values, and R2_score
for each iteration using least square error fit.
Use the best value of (𝛽) matrix (for which R2_score is maximum), to train the
regressor for 70% of data and test the performance for remaining 30% data.**

In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

df = pd.read_csv("USA_Housing.csv")
X = df.drop("Price", axis=1).values
y = df["Price"].values.reshape(-1, 1)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

k = 5
fold_size = len(X_scaled) // k
indices = np.arange(len(X_scaled))

np.random.shuffle(indices)
folds = [indices[i*fold_size:(i+1)*fold_size] for i in range(k)]

best_beta = None
best_r2 = -np.inf

for i in range(k):
    test_idx = folds[i]
    train_idx = np.hstack([folds[j] for j in range(k) if j != i])

    X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    X_train_bias = np.c_[np.ones((X_train.shape[0], 1)), X_train]
    X_test_bias = np.c_[np.ones((X_test.shape[0], 1)), X_test]

    beta = np.linalg.inv(X_train_bias.T @ X_train_bias) @ X_train_bias.T @ y_train

    y_pred = X_test_bias @ beta

    r2 = r2_score(y_test, y_pred)
    print(f"Fold {i+1}: R² = {r2}")

    if r2 > best_r2:
        best_r2 = r2
        best_beta = beta

print("\nBest β (from fold with max R²):\n", best_beta)
print("Best R² from CV:", best_r2)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

X_train_bias = np.c_[np.ones((X_train.shape[0], 1)), X_train]
X_test_bias = np.c_[np.ones((X_test.shape[0], 1)), X_test]

beta_final = np.linalg.inv(X_train_bias.T @ X_train_bias) @ X_train_bias.T @ y_train
y_pred_final = X_test_bias @ beta_final

final_r2 = r2_score(y_test, y_pred_final)
print("\nFinal R² on 70-30 split:", final_r2)


Fold 1: R² = 0.9158787836618586
Fold 2: R² = 0.9192065033914482
Fold 3: R² = 0.9186578729667091
Fold 4: R² = 0.9171728127932868
Fold 5: R² = 0.9175992617261742

Best β (from fold with max R²):
 [[1232493.52470754]
 [ 229056.71020855]
 [ 163613.8285993 ]
 [ 121709.96247152]
 [   1602.29101596]
 [ 151975.82131891]]
Best R² from CV: 0.9192065033914482

Final R² on 70-30 split: 0.9146818498916266


**Q2: Concept of Validation set for Multiple Linear Regression (Gradient Descent
Optimization)
Consider the same dataset of Q1, rather than dividing the dataset into five folds, divide the
dataset into training set (56%), validation set (14%), and test set (30%).
Consider four different values of learning rate i.e. {0.001,0.01,0.1,1}. Compute the values of
regression coefficients for each value of learning rate after 1000 iterations.
For each set of regression coefficients, compute R2_score for validation and test set and find
the best value of regression coefficients.  **

In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler

df = pd.read_csv("USA_Housing.csv")
X = df.drop("Price", axis=1).values
y = df["Price"].values.reshape(-1, 1)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_temp, y_train, y_temp = train_test_split(X_scaled, y, test_size=0.44, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.682, random_state=42)

X_train_bias = np.c_[np.ones((X_train.shape[0], 1)), X_train]
X_val_bias = np.c_[np.ones((X_val.shape[0], 1)), X_val]
X_test_bias = np.c_[np.ones((X_test.shape[0], 1)), X_test]

def gradient_descent(X, y, lr, iterations):
    m = len(y)
    beta = np.zeros((X.shape[1], 1))
    for _ in range(iterations):
        prediction = X @ beta
        error = prediction - y
        gradient = (2/m) * X.T @ error
        beta -= lr * gradient
    return beta

learning_rates = [0.001, 0.01, 0.1, 1]
iterations = 1000

best_beta = None
best_r2_val = -np.inf
best_r2_test = -np.inf
best_lr = None

for lr in learning_rates:
    beta = gradient_descent(X_train_bias, y_train, lr, iterations)
    y_val_pred = X_val_bias @ beta
    r2_val = r2_score(y_val, y_val_pred)
    y_test_pred = X_test_bias @ beta
    r2_test = r2_score(y_test, y_test_pred)

    if r2_val > best_r2_val:
        best_r2_val = r2_val
        best_r2_test = r2_test
        best_beta = beta
        best_lr = lr

print("\nBest Learning Rate:", best_lr)
print("Best Beta (coefficients):\n", best_beta)
print("Best Validation R²:", best_r2_val)
print("Best Test R²:", best_r2_test)



Best Learning Rate: 0.1
Best Beta (coefficients):
 [[1232180.27200919]
 [ 230645.88389435]
 [ 165328.94019375]
 [ 120045.00851908]
 [   2945.02108903]
 [ 151375.22971285]]
Best Validation R²: 0.9200080603669905
Best Test R²: 0.9134325765538891


  numerator = xp.sum(weight * (y_true - y_pred) ** 2, axis=0)
  numerator = xp.sum(weight * (y_true - y_pred) ** 2, axis=0)


**Q3:Pre-processing and Multiple Linear Regression Download the dataset regarding Car Price Prediction from the following link: https:// archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data 1. Load the dataset with following column names ["symboling", "normalized_losses", "make", "fuel_type", "aspiration","num_doors", "body_style", "drive_wheels", "engine_location", "wheel_base", "length", "width", "height", "curb_weight", "engine_type", "num_cylinders", "engine_size", "fuel_system", "bore", "stroke", "compression_ratio", "horsepower", "peak_rpm", "city_mpg", "highway_mpg", "price"] and replace all ? values with NaN 2. Replace all NaN values with central tendency imputation. Drop the rows with NaN values in price column 3. There are 10 columns in the dataset with non-numeric values. Convert these values to numeric values using following scheme: (i) For “num_doors” and “num_cylinders”: convert words (number names) to figures for e.g., two to 2 (ii) For "body_style", "drive_wheels": use dummy encoding scheme (iii) For “make”, “aspiration”, “engine_location”,fuel_type: use label encoding scheme (iv) For fuel_system: replace values containing string pfi to 1 else all values to 0. (v) For engine_type: replace values containing string ohc to 1 else all values to 0. 4. Divide the dataset into input features (all columns except price) and output variable (price). Scale all input features. 5. Train a linear regressor on 70% of data (using inbuilt linear regression function of Python) and test its performance on remaining 30% of data. 6. Reduce the dimensionality of the feature set using inbuilt PCA decomposition and then again train a linear regressor on 70% of reduced data (using inbuilt linear regression function of Python). Does it lead to any performance improvement on test set?**

In [11]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"
column_names = ["symboling", "normalized_losses", "make", "fuel_type", "aspiration",
                "num_doors", "body_style", "drive_wheels", "engine_location", "wheel_base",
                "length", "width", "height", "curb_weight", "engine_type", "num_cylinders",
                "engine_size", "fuel_system", "bore", "stroke", "compression_ratio", "horsepower",
                "peak_rpm", "city_mpg", "highway_mpg", "price"]

df = pd.read_csv(url, names=column_names)
df.replace('?', np.nan, inplace=True)

df = df.apply(pd.to_numeric, errors='ignore')
for column in df.columns:
    if df[column].dtype == 'float64' or df[column].dtype == 'int64':
        df[column].fillna(df[column].mean(), inplace=True)
    else:
        df[column].fillna(df[column].mode()[0], inplace=True)

df.dropna(subset=['price'], inplace=True)

num_doors_mapping = {'two': 2, 'four': 4}
df['num_doors'] = df['num_doors'].map(num_doors_mapping)

df['num_cylinders'] = df['num_cylinders'].replace(
    {'two': 2, 'three': 3, 'four': 4, 'five': 5, 'six': 6, 'eight': 8, 'twelve': 12}
)

df = pd.get_dummies(df, columns=["body_style", "drive_wheels"], drop_first=True)

label_columns = ["make", "aspiration", "engine_location", "fuel_type"]
encoder = LabelEncoder()
for col in label_columns:
    df[col] = encoder.fit_transform(df[col])

df['fuel_system'] = df['fuel_system'].apply(lambda x: 1 if 'pfi' in x else 0)

df['engine_type'] = df['engine_type'].apply(lambda x: 1 if 'ohc' in x else 0)

X = df.drop('price', axis=1)
y = df['price']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error without PCA: {mse}')

pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

model.fit(X_train_pca, y_train)
y_pred_pca = model.predict(X_test_pca)
mse_pca = mean_squared_error(y_test, y_pred_pca)
print(f'Mean Squared Error with PCA: {mse_pca}')


Mean Squared Error without PCA: 13422229.591732549
Mean Squared Error with PCA: 16625134.07007222


  df = df.apply(pd.to_numeric, errors='ignore')
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(df[column].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(df[column].mode()[0], inplace=True)
  df['num_cylinders'] = df['num_cylinders'].replace(
