In [2]:
# Step 1: Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

# Step 2: Load the dataset
file_path = 'USA_Housing.csv'
df = pd.read_csv(file_path)


# Step 3: Divide dataset into input features (X) and output (y)
X = df.drop(columns=['Price'])
y = df['Price'].values.reshape(-1, 1)

# Step 4: Scale the input features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 5: Initialize 5-Fold Cross Validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

best_r2 = -np.inf
best_beta = None

# Step 6: Run 5 iterations
fold = 1
for train_index, test_index in kf.split(X_scaled):
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Compute beta using Least Squares: beta = (X^T X)^(-1) X^T y
    X_train_bias = np.hstack([np.ones((X_train.shape[0], 1)), X_train])  # Add bias column
    X_test_bias = np.hstack([np.ones((X_test.shape[0], 1)), X_test])

    beta = np.linalg.inv(X_train_bias.T @ X_train_bias) @ X_train_bias.T @ y_train
    y_pred = X_test_bias @ beta

    r2 = r2_score(y_test, y_pred)
    print(f"Fold {fold} R2 Score: {r2}")

    if r2 > best_r2:
        best_r2 = r2
        best_beta = beta

    fold += 1

print("\nBest R2 Score:", best_r2)
print("Best Beta Coefficients:\n", best_beta)


Fold 1 R2 Score: 0.9179971706985147
Fold 2 R2 Score: 0.9145677884802818
Fold 3 R2 Score: 0.9116116385364478
Fold 4 R2 Score: 0.9193091764960816
Fold 5 R2 Score: 0.9243869413350316

Best R2 Score: 0.9243869413350316
Best Beta Coefficients:
 [[1.23161736e+06]
 [2.30225051e+05]
 [1.63956839e+05]
 [1.21115120e+05]
 [7.83467170e+02]
 [1.50662447e+05]]


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Load dataset
file_path = 'USA_Housing.csv'
df = pd.read_csv(file_path)

# Features and target
X = df.drop('Price', axis=1).values  # Replace 'Price' with your target column
y = df['Price'].values

# Split into training+validation and test (70%-30%)
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

# Further split training+validation into training (56%) and validation (14%)
# Since 70% of data is train+val, training set = 56/70 ≈ 0.8, validation = 0.2 of X_train_val
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42)

print(f"Train: {X_train.shape}, Validation: {X_val.shape}, Test: {X_test.shape}")


Train: (2800, 5), Validation: (700, 5), Test: (1500, 5)


In [4]:
import pandas as pd
import numpy as np

# Column names
columns = ["symboling", "normalized_losses", "make", "fuel_type", "aspiration",
           "num_doors", "body_style", "drive_wheels", "engine_location", "wheel_base",
           "length", "width", "height", "curb_weight", "engine_type", "num_cylinders",
           "engine_size", "fuel_system", "bore", "stroke", "compression_ratio",
           "horsepower", "peak_rpm", "city_mpg", "highway_mpg", "price"]

# Load dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"
df = pd.read_csv(url, names=columns, na_values='?')

# Quick check
df.head()


Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0


In [5]:
# Separate numeric and categorical columns
numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
categorical_cols = df.select_dtypes(include='object').columns.tolist()

# Impute numeric columns with mean
for col in numeric_cols:
    df[col].fillna(df[col].mean(), inplace=True)

# Impute categorical columns with mode
for col in categorical_cols:
    df[col].fillna(df[col].mode()[0], inplace=True)

# Drop rows where price is NaN
df.dropna(subset=['price'], inplace=True)
df['price'] = df['price'].astype(float)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


In [6]:
num_map = {'two': 2, 'four': 4, 'six': 6, 'five': 5, 'eight': 8, 'three': 3, 'twelve':12}
df['num_doors'] = df['num_doors'].map(num_map)
df['num_cylinders'] = df['num_cylinders'].map(num_map)


In [7]:
df = pd.get_dummies(df, columns=['body_style', 'drive_wheels'], drop_first=True)


In [8]:
from sklearn.preprocessing import LabelEncoder

label_cols = ['make', 'aspiration', 'engine_location', 'fuel_type']
le = LabelEncoder()
for col in label_cols:
    df[col] = le.fit_transform(df[col])


In [9]:
df['fuel_system'] = df['fuel_system'].apply(lambda x: 1 if 'pfi' in x.lower() else 0)


In [10]:
df['engine_type'] = df['engine_type'].apply(lambda x: 1 if 'ohc' in x.lower() else 0)


In [11]:
from sklearn.preprocessing import StandardScaler

X = df.drop('price', axis=1)
y = df['price']

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# Train Linear Regression
lr = LinearRegression()
lr.fit(X_train, y_train)

# Predict and evaluate
y_pred = lr.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Linear Regression - MSE: {mse:.2f}, R2 Score: {r2:.2f}")


Linear Regression - MSE: 13422229.59, R2 Score: 0.80


In [13]:
from sklearn.decomposition import PCA

# Reduce dimensionality (retain 95% variance)
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X_scaled)

# Split reduced dataset
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(X_pca, y, test_size=0.3, random_state=42)

# Train Linear Regression on PCA data
lr_pca = LinearRegression()
lr_pca.fit(X_train_pca, y_train_pca)

# Predict and evaluate
y_pred_pca = lr_pca.predict(X_test_pca)
mse_pca = mean_squared_error(y_test_pca, y_pred_pca)
r2_pca = r2_score(y_test_pca, y_pred_pca)

print(f"PCA + Linear Regression - MSE: {mse_pca:.2f}, R2 Score: {r2_pca:.2f}")


PCA + Linear Regression - MSE: 17154268.25, R2 Score: 0.75
