In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:
# 1. Data Preprocessing
df = pd.read_csv("C:\\Users\\chall\\OneDrive\\Desktop\\MLR\\MLR\\ToyotaCorolla - MLR.csv")
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

print("Missing values per column:\n", df.isnull().sum())
df = df.dropna().reset_index(drop=True)

# Encode categorical variable Fuel_Type
df = pd.get_dummies(df, columns=["Fuel_Type"], drop_first=True)

# Features and target
X = df.drop(columns=['Price'])
y = df['Price']

# Feature scaling
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
print("Features used:", X_scaled.columns.tolist())

Missing values per column:
 Price        0
Age_08_04    0
KM           0
Fuel_Type    0
HP           0
Automatic    0
cc           0
Doors        0
Cylinders    0
Gears        0
Weight       0
dtype: int64
Features used: ['Age_08_04', 'KM', 'HP', 'Automatic', 'cc', 'Doors', 'Cylinders', 'Gears', 'Weight', 'Fuel_Type_Diesel', 'Fuel_Type_Petrol']


In [3]:
# 2. Multicollinearity Check
def calculate_vif(X):
    vif_data = pd.DataFrame()
    vif_data['feature'] = X.columns
    vif_data['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    return vif_data

vif_df = calculate_vif(X_scaled)
print(vif_df)

             feature        VIF
0          Age_08_04   1.920520
1                 KM   2.001790
2                 HP   2.299766
3          Automatic   1.094550
4                 cc   1.223892
5              Doors   1.217898
6          Cylinders        NaN
7              Gears   1.117413
8             Weight   3.297145
9   Fuel_Type_Diesel  11.317251
10  Fuel_Type_Petrol   9.702194


  return 1 - self.ssr/self.uncentered_tss


In [4]:
# 3. Model Building

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Model 1: Standard Multiple Linear Regression
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

# Model 2: Linear Regression with Interaction Terms (Age * KM, HP * Weight)
X_int = X_scaled.copy()
X_int['Age_KM'] = X_int['Age_08_04'] * X_int['KM']
X_int['HP_Weight'] = X_int['HP'] * X_int['Weight']
X_train2, X_test2, y_train2, y_test2 = train_test_split(X_int, y, test_size=0.2, random_state=42)
lr2 = LinearRegression()
lr2.fit(X_train2, y_train2)
y_pred_lr2 = lr2.predict(X_test2)

# Model 3: Polynomial Features (degree=2)
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X_scaled)
X_train3, X_test3, y_train3, y_test3 = train_test_split(X_poly, y, test_size=0.2, random_state=42)
lr3 = LinearRegression()
lr3.fit(X_train3, y_train3)
y_pred_lr3 = lr3.predict(X_test3)

In [5]:
# 4. Model Evaluation
def eval_model(y_test, y_pred, name="Model"):
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"{name}:\n  RMSE: {rmse:.2f}\n  MAE: {mae:.2f}\n  R2: {r2:.3f}\n")

eval_model(y_test, y_pred_lr, "Linear Regression")
eval_model(y_test2, y_pred_lr2, "Linear Regression + Interactions")
eval_model(y_test3, y_pred_lr3, "Polynomial Regression (deg=2)")


Linear Regression:
  RMSE: 1484.27
  MAE: 990.89
  R2: 0.835

Linear Regression + Interactions:
  RMSE: 1368.30
  MAE: 891.42
  R2: 0.860

Polynomial Regression (deg=2):
  RMSE: 1786.79
  MAE: 908.95
  R2: 0.761



In [6]:
# 5. Lasso and Ridge Regression
ridge = Ridge(alpha=1.0)
ridge.fit(X_train, y_train)
y_pred_ridge = ridge.predict(X_test)

lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)
y_pred_lasso = lasso.predict(X_test)

eval_model(y_test, y_pred_ridge, "Ridge Regression")
eval_model(y_test, y_pred_lasso, "Lasso Regression")

# Compare Coefficients: Standard vs Ridge vs Lasso
coef_df = pd.DataFrame({
    'Linear': lr.coef_,
    'Ridge': ridge.coef_,
    'Lasso': lasso.coef_
}, index=X_scaled.columns)
print(coef_df)

Ridge Regression:
  RMSE: 1483.47
  MAE: 990.87
  R2: 0.835

Lasso Regression:
  RMSE: 1484.16
  MAE: 990.90
  R2: 0.835

                        Linear        Ridge        Lasso
Age_08_04        -2.246662e+03 -2244.540333 -2246.643042
KM               -6.085706e+02  -610.243379  -608.624409
HP                2.102533e+02   211.552393   210.364888
Automatic         3.413608e+01    34.349741    34.078222
cc               -1.288507e+01   -12.749655   -12.786570
Doors            -5.743684e+01   -56.548223   -57.254803
Cylinders         3.410605e-13     0.000000     0.000000
Gears             1.039463e+02   104.054836   103.883104
Weight            1.362139e+03  1359.382176  1361.705593
Fuel_Type_Diesel -2.127088e+01   -21.825934   -21.029790
Fuel_Type_Petrol  4.451027e+02   441.206721   444.955639


In [7]:
# 6. Addressing Multicollinearity
print("Features with VIF > 10 (potential multicollinearity):")
print(vif_df[vif_df['VIF'] > 10])
# To drop high VIF features and rebuild model:
# X_reduced = X_scaled.drop(columns=vif_df[vif_df['VIF'] > 10]['feature'])

Features with VIF > 10 (potential multicollinearity):
            feature        VIF
9  Fuel_Type_Diesel  11.317251


In [None]:
### Interview Questions & Answers

1. What is Normalization & Standardization and how is it helpful?  

- Normalization: Scales feature values into a fixed range, usually [0,1].  
  Formula:  
  \[
  x' = \frac{x - x_{min}}{x_{max} - x_{min}}
  \]  
  Useful when features have different ranges or when algorithms rely on distance (e.g., KNN, K-means).  

- Standardization: Transforms features to have zero mean and unit variance (standard normal distribution).  
  Formula:  
  \[
  x' = \frac{x - \mu}{\sigma}
  \]  
  Helpful for algorithms sensitive to scale (e.g., Linear Regression, Logistic Regression, PCA).  

Why it helps in regression:  
- Prevents features with large scales from dominating coefficients.  
- Improves numerical stability and interpretability.  
- Helps optimization converge faster. 

In [None]:
# Explanation
### Interpretation of Coefficients

Each coefficient in a multiple linear regression model represents the expected change in the target variable (Price) 
for a one-unit increase in the predictor variable, holding all other predictors constant.

For example:
- If the coefficient of `Age` is `-120`, it means that for every additional year of the car's age, the price is expected to decrease by 120 units, assuming other factors remain constant.
- If the coefficient of `KM` is `-0.02`, then each additional kilometer driven reduces the car's price by 0.02 units, keeping other features constant.
- Positive coefficients (e.g., `HP`) indicate that higher horsepower increases the predicted car price.*/