<a href="https://colab.research.google.com/github/Bhavya728/bhavya_machine_learning_UML501/blob/main/ML_Assignment3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Q.1

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import r2_score

# Load the dataset
data = pd.read_csv('/content/USA_Housing.csv')
# Separate features and target variable
X = data.drop(['Price'], axis=1).values
y = data['Price'].values
# Normalize the feature set
normalizer = StandardScaler()
X_scaled = normalizer.fit_transform(X)
# Initialize K-Fold cross validation
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
optimal_beta = None
max_r2 = -np.inf
for fold, (train_idx, test_idx) in enumerate(kfold.split(X_scaled), start=1):
    X_train, X_valid = X_scaled[train_idx], X_scaled[test_idx]
    y_train, y_valid = y[train_idx], y[test_idx]
    # Add intercept term
    X_train_aug = np.c_[np.ones(X_train.shape[0]), X_train]
    X_valid_aug = np.c_[np.ones(X_valid.shape[0]), X_valid]
    # Closed-form solution: β = (XᵀX)^(-1) Xᵀy
    beta_hat = np.linalg.inv(X_train_aug.T @ X_train_aug) @ (X_train_aug.T @ y_train)
    # Predictions and R² score
    y_pred = X_valid_aug @ beta_hat
    score = r2_score(y_valid, y_pred)
    # Track best model
    if score > max_r2:
        max_r2 = score
        optimal_beta = beta_hat

# Final evaluation on a 70/30 split
X_train70, X_test30, y_train70, y_test30 = train_test_split(X_scaled, y, test_size=0.3, random_state=42)
X_test30_aug = np.c_[np.ones(X_test30.shape[0]), X_test30]
y_pred30 = X_test30_aug @ optimal_beta

final_r2 = r2_score(y_test30, y_pred30)

print("\nPerformance Evaluation on 70/30 split")
print("R² Score on test data:", final_r2)
print("Optimal Coefficients:\n", optimal_beta)



Performance Evaluation on 70/30 split
R² Score on test data: 0.9147458156636434
Optimal Coefficients:
 [1.23161736e+06 2.30225051e+05 1.63956839e+05 1.21115120e+05
 7.83467170e+02 1.50662447e+05]


Q.2

In [3]:
from sklearn.model_selection import train_test_split


# Load dataset
housing = pd.read_csv('/content/USA_Housing.csv')

# Features and target separation
features = housing.drop("Price", axis=1).values
target = housing["Price"].values.reshape(-1, 1)

# Standardization
std = StandardScaler()
features_scaled = std.fit_transform(features)

# Add intercept column
features_scaled = np.c_[np.ones((features_scaled.shape[0], 1)), features_scaled]

# Split into train, validation, and test
X_train, X_tmp, y_train, y_tmp = train_test_split(features_scaled, target, test_size=0.44, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_tmp, y_tmp, test_size=30/44, random_state=42)

# Gradient descent function
def run_gradient_descent(X, y, lr, steps):
    m, n = X.shape
    coeffs = np.zeros((n, 1))
    for _ in range(steps):
        preds = X @ coeffs
        error = preds - y
        for j in range(n):
            grad = (1 / m) * np.sum(error * X[:, [j]])
            coeffs[j, 0] -= lr * grad
    return coeffs

# Try different learning rates
candidates = [0.001, 0.01, 0.1, 1]
top_coeffs = None
top_lr = None
top_val_r2 = -np.inf

for lr in candidates:
    coeffs = run_gradient_descent(X_train, y_train, lr, steps=1000)

    val_preds  = X_val  @ coeffs
    test_preds = X_test @ coeffs

    r2_val  = r2_score(y_val, val_preds)
    r2_test = r2_score(y_test, test_preds)

    if r2_val > top_val_r2:
        top_val_r2 = r2_val
        top_coeffs = coeffs
        top_lr = lr

print("\nChosen learning rate (based on validation R²):", top_lr)
print("Validation R² of chosen model:", top_val_r2)



Chosen learning rate (based on validation R²): 0.1
Validation R² of chosen model: 0.9199649194854793


Q.3

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.decomposition import PCA

columns = [
    "symboling", "normalized_losses", "make", "fuel_type", "aspiration",
    "num_doors", "body_style", "drive_wheels", "engine_location",
    "wheel_base", "length", "width", "height", "curb_weight",
    "engine_type", "num_cylinders", "engine_size", "fuel_system", "bore",
    "stroke", "compression_ratio", "horsepower", "peak_rpm",
    "city_mpg", "highway_mpg", "price"
]

cars = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data",
    names=columns
)

# Replace missing values marked as "?"
cars.replace("?", np.nan, inplace=True)

# Fill categorical with mode, numeric with mean
for col in cars.columns:
    if cars[col].dtype == "object":
        cars[col].fillna(cars[col].mode()[0], inplace=True)
    else:
        cars[col].fillna(cars[col].astype(float).mean(), inplace=True)

# Drop rows with no price
cars = cars[cars["price"].notna()]

# Convert numeric columns
num_cols = [
    "symboling", "normalized_losses", "wheel_base", "length", "width", "height",
    "curb_weight", "engine_size", "bore", "stroke", "compression_ratio",
    "horsepower", "peak_rpm", "city_mpg", "highway_mpg", "price"
]
cars[num_cols] = cars[num_cols].astype(float)

# Map text numbers to integers
text_to_num = {"two": 2, "three": 3, "four": 4, "five": 5, "six": 6, "eight": 8, "twelve": 12}
cars["num_doors"] = cars["num_doors"].replace(text_to_num)
cars["num_cylinders"] = cars["num_cylinders"].replace(text_to_num)

# One-hot encode body_style and drive_wheels
cars = pd.get_dummies(cars, columns=["body_style", "drive_wheels"], drop_first=True)

# Label encode selected categorical columns
for col in ["make", "aspiration", "engine_location", "fuel_type"]:
    cars[col] = LabelEncoder().fit_transform(cars[col])

# Binary encoding for engine_type and fuel_system
cars["fuel_system"] = cars["fuel_system"].apply(lambda x: 1 if "pfi" in x.lower() else 0)
cars["engine_type"] = cars["engine_type"].apply(lambda x: 1 if "ohc" in x.lower() else 0)

X_all = cars.drop("price", axis=1).values
y_all = cars["price"].values

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_all)

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y_all, test_size=0.3, random_state=42
)

model_lr = LinearRegression()
model_lr.fit(X_train, y_train)
preds = model_lr.predict(X_test)

print("Linear Regression without PCA:")
print("R² Score:", r2_score(y_test, preds))
print("MSE:", mean_squared_error(y_test, preds))

pca = PCA(n_components=0.95)  # Retain 95% variance
X_reduced = pca.fit_transform(X_scaled)

Xp_train, Xp_test, yp_train, yp_test = train_test_split(
    X_reduced, y_all, test_size=0.3, random_state=42
)

model_lr_pca = LinearRegression()
model_lr_pca.fit(Xp_train, yp_train)
preds_pca = model_lr_pca.predict(Xp_test)

print("\nLinear Regression with PCA:")
print("R² Score:", r2_score(yp_test, preds_pca))
print("MSE:", mean_squared_error(yp_test, preds_pca))


Linear Regression without PCA:
R² Score: 0.7895045576733848
MSE: 14448999.011837844

Linear Regression with PCA:
R² Score: 0.7478420860380317
MSE: 17308828.207359694


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  cars[col].fillna(cars[col].astype(float).mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  cars[col].fillna(cars[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which 