# Toyota Corolla Price Prediction — Multiple Linear Regression


This notebook performs EDA, preprocessing, builds three MLR models, evaluates them, applies Lasso & Ridge, interprets coefficients, and includes interview answers & assumptions.


In [1]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.feature_selection import RFE
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load dataset 
df = pd.read_csv("ToyotaCorolla - MLR.csv")
# Standardize column names
df.columns = [c.strip() for c in df.columns]
# Rename known columns
if 'Age_08_04' in df.columns:
    df = df.rename(columns={'Age_08_04':'Age'})
if 'Fuel_Type' in df.columns:
    df = df.rename(columns={'Fuel_Type':'FuelType'})
if 'cc' in df.columns:
    df = df.rename(columns={'cc':'CC'})
print('Columns:', df.columns.tolist())
df.head()

Columns: ['Price', 'Age', 'KM', 'FuelType', 'HP', 'Automatic', 'CC', 'Doors', 'Cylinders', 'Gears', 'Weight']


Unnamed: 0,Price,Age,KM,FuelType,HP,Automatic,CC,Doors,Cylinders,Gears,Weight
0,13500,23,46986,Diesel,90,0,2000,3,4,5,1165
1,13750,23,72937,Diesel,90,0,2000,3,4,5,1165
2,13950,24,41711,Diesel,90,0,2000,3,4,5,1165
3,14950,26,48000,Diesel,90,0,2000,3,4,5,1165
4,13750,30,38500,Diesel,90,0,2000,3,4,5,1170


In [3]:
# Basic EDA
print('Shape:', df.shape)
display(df.describe())
print('\nMissing values per column:\n', df.isnull().sum())

# Correlations with Price
if 'Price' in df.columns:
    corr = df.corr(numeric_only=True)
    print(corr['Price'].sort_values(ascending=False))


Shape: (1436, 11)


Unnamed: 0,Price,Age,KM,HP,Automatic,CC,Doors,Cylinders,Gears,Weight
count,1436.0,1436.0,1436.0,1436.0,1436.0,1436.0,1436.0,1436.0,1436.0,1436.0
mean,10730.824513,55.947075,68533.259749,101.502089,0.05571,1576.85585,4.033426,4.0,5.026462,1072.45961
std,3626.964585,18.599988,37506.448872,14.98108,0.229441,424.38677,0.952677,0.0,0.18851,52.64112
min,4350.0,1.0,1.0,69.0,0.0,1300.0,2.0,4.0,3.0,1000.0
25%,8450.0,44.0,43000.0,90.0,0.0,1400.0,3.0,4.0,5.0,1040.0
50%,9900.0,61.0,63389.5,110.0,0.0,1600.0,4.0,4.0,5.0,1070.0
75%,11950.0,70.0,87020.75,110.0,0.0,1600.0,5.0,4.0,5.0,1085.0
max,32500.0,80.0,243000.0,192.0,1.0,16000.0,5.0,4.0,6.0,1615.0



Missing values per column:
 Price        0
Age          0
KM           0
FuelType     0
HP           0
Automatic    0
CC           0
Doors        0
Cylinders    0
Gears        0
Weight       0
dtype: int64
Price        1.000000
Weight       0.581198
HP           0.314990
Doors        0.185326
CC           0.126389
Gears        0.063104
Automatic    0.033081
KM          -0.569960
Age         -0.876590
Cylinders         NaN
Name: Price, dtype: float64


In [4]:
# Preprocessing
# Map Automatic if 'Yes'/'No'
if 'Automatic' in df.columns and df['Automatic'].dtype == object:
    df['Automatic'] = df['Automatic'].map({'Yes':1,'No':0})

# One-hot encode FuelType (if present)
if 'FuelType' in df.columns:
    df = pd.concat([df.drop(columns=['FuelType']), pd.get_dummies(df['FuelType'], prefix='Fuel', drop_first=True)], axis=1)

# Ensure Doors numeric when string like '4' or '4.0'
if 'Doors' in df.columns and df['Doors'].dtype == object:
    df['Doors'] = df['Doors'].str.extract('(\\d+)').astype(float)

# Drop rows with NA (dataset is complete in this case)
df = df.dropna().reset_index(drop=True)
print('After preprocessing shape:', df.shape)
df.head()

After preprocessing shape: (1436, 12)


Unnamed: 0,Price,Age,KM,HP,Automatic,CC,Doors,Cylinders,Gears,Weight,Fuel_Diesel,Fuel_Petrol
0,13500,23,46986,90,0,2000,3,4,5,1165,True,False
1,13750,23,72937,90,0,2000,3,4,5,1165,True,False
2,13950,24,41711,90,0,2000,3,4,5,1165,True,False
3,14950,26,48000,90,0,2000,3,4,5,1165,True,False
4,13750,30,38500,90,0,2000,3,4,5,1170,True,False


In [5]:
# Features and target
X = df.drop(columns=['Price'])
y = df['Price']
print('Features used:', X.columns.tolist())

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
print('Train shape:', X_train.shape, 'Test shape:', X_test.shape)

Features used: ['Age', 'KM', 'HP', 'Automatic', 'CC', 'Doors', 'Cylinders', 'Gears', 'Weight', 'Fuel_Diesel', 'Fuel_Petrol']
Train shape: (1148, 11) Test shape: (288, 11)


In [6]:
# ===== Final robust OLS run: force numeric ndarrays for statsmodels =====
import numpy as np
import pandas as pd
import statsmodels.api as sm

# quick check
print("Before conversion dtypes:")
print(X_train.dtypes)
print("\ny_train dtype:", y_train.dtype)
print("\nSample X_train head:")
display(X_train.head())

# Convert boolean columns to int (so pandas doesn't use nullable bool types)
for c in X_train.columns:
    if X_train[c].dtype == 'bool' or X_train[c].dtype.name == 'boolean':
        X_train[c] = X_train[c].astype(int)
        X_test[c]  = X_test[c].astype(int)

# Force all feature columns to float
X_train_num = X_train.apply(pd.to_numeric, errors='coerce').astype(float)
X_test_num  = X_test.apply(pd.to_numeric, errors='coerce').astype(float)

# Force target to numeric float as well
y_train_num = pd.to_numeric(y_train, errors='coerce').astype(float)
y_test_num  = pd.to_numeric(y_test, errors='coerce').astype(float)

# Check for any NaNs introduced by coercion
nans_train = X_train_num.isna().any().sum() + y_train_num.isna().sum()
nans_test  = X_test_num.isna().any().sum() + y_test_num.isna().sum()
print(f"NaNs introduced? train cols with NaN count + NaN in y: {nans_train}, test: {nans_test}")

# If any NaNs exist, drop affected rows (safe for this dataset)
if X_train_num.isna().any().any() or y_train_num.isna().any():
    mask_tr = (~X_train_num.isna().any(axis=1)) & (~y_train_num.isna())
    X_train_num = X_train_num.loc[mask_tr].copy()
    y_train_num = y_train_num.loc[mask_tr].copy()
    print("Dropped some train rows due to NaN coercion. New train shape:", X_train_num.shape)

if X_test_num.isna().any().any() or y_test_num.isna().any():
    mask_te = (~X_test_num.isna().any(axis=1)) & (~y_test_num.isna())
    X_test_num = X_test_num.loc[mask_te].copy()
    y_test_num = y_test_num.loc[mask_te].copy()
    print("Dropped some test rows due to NaN coercion. New test shape:", X_test_num.shape)

# Convert to numpy arrays (explicit)
X_train_arr = np.asarray(X_train_num, dtype=float)
X_test_arr  = np.asarray(X_test_num, dtype=float)
y_train_arr = np.asarray(y_train_num, dtype=float)
y_test_arr  = np.asarray(y_test_num, dtype=float)

print("Numpy dtypes:", X_train_arr.dtype, y_train_arr.dtype)
print("Shapes:", X_train_arr.shape, X_test_arr.shape, y_train_arr.shape, y_test_arr.shape)

# Add constant column to the numpy matrix (statsmodels accepts numpy arrays)
X_train_sm = sm.add_constant(X_train_arr, has_constant='add')

# Fit OLS
ols_all = sm.OLS(y_train_arr, X_train_sm).fit()
print(ols_all.summary())


X_train = pd.DataFrame(X_train_arr, columns=X_train_num.columns, index=X_train_num.index)
X_test  = pd.DataFrame(X_test_arr,  columns=X_test_num.columns,  index=X_test_num.index)
y_train = pd.Series(y_train_arr, index=y_train_num.index, name='Price')
y_test  = pd.Series(y_test_arr,  index=y_test_num.index,  name='Price')


Before conversion dtypes:
Age            int64
KM             int64
HP             int64
Automatic      int64
CC             int64
Doors          int64
Cylinders      int64
Gears          int64
Weight         int64
Fuel_Diesel     bool
Fuel_Petrol     bool
dtype: object

y_train dtype: int64

Sample X_train head:


Unnamed: 0,Age,KM,HP,Automatic,CC,Doors,Cylinders,Gears,Weight,Fuel_Diesel,Fuel_Petrol
899,62,59295,86,0,1300,5,4,5,1035,False,True
881,68,61568,110,0,1600,4,4,5,1035,False,True
310,40,38653,110,0,1600,5,4,5,1080,False,True
1145,75,101855,110,0,1600,5,4,5,1070,False,True
31,22,35199,97,0,1400,3,4,5,1100,False,True


NaNs introduced? train cols with NaN count + NaN in y: 0, test: 0
Numpy dtypes: float64 float64
Shapes: (1148, 11) (288, 11) (1148,) (288,)
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.870
Model:                            OLS   Adj. R-squared:                  0.869
Method:                 Least Squares   F-statistic:                     762.7
Date:                Sat, 22 Nov 2025   Prob (F-statistic):               0.00
Time:                        21:54:54   Log-Likelihood:                -9863.2
No. Observations:                1148   AIC:                         1.975e+04
Df Residuals:                    1137   BIC:                         1.980e+04
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025    

In [7]:
# Rebuild p-values with column names
feature_names = ['const'] + X_train.columns.tolist()   # names from your cleaned X_train dataframe
pvals_series = pd.Series(ols_all.pvalues, index=feature_names)

# Drop the constant
pvals_no_const = pvals_series.drop('const')

# Find weakest feature (highest p-value)
weakest_feature = pvals_no_const.sort_values(ascending=False).index[0]
print("Weakest feature:", weakest_feature)

# Build Model 2 using sklearn
from sklearn.linear_model import LinearRegression

X_train_m2 = X_train.drop(columns=[weakest_feature])
X_test_m2  = X_test.drop(columns=[weakest_feature])

lr2 = LinearRegression().fit(X_train_m2, y_train)
print("Model 2 trained successfully.")


Weakest feature: Fuel_Diesel
Model 2 trained successfully.


In [8]:
# Model 3: RFE-selected features (demonstration)
k = min(5, max(3, int(len(X_train.columns)/2)))
rfe = RFE(LinearRegression(), n_features_to_select=k)
rfe.fit(X_train, y_train)
selected_features = [f for f, s in zip(X_train.columns, rfe.support_) if s]
print('RFE selected features:', selected_features)

X_train_m3 = X_train[selected_features]
X_test_m3 = X_test[selected_features]
lr3 = LinearRegression().fit(X_train_m3, y_train)

RFE selected features: ['Automatic', 'Doors', 'Gears', 'Fuel_Diesel', 'Fuel_Petrol']


In [9]:
# Fit Model1 using sklearn and evaluate all models
lr1 = LinearRegression().fit(X_train, y_train)

def evaluate(model, Xte, yte):
    preds = model.predict(Xte)
    r2 = r2_score(yte, preds)
    mse = mean_squared_error(yte, preds)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(yte, preds)
    return {'r2':r2,'mse':mse,'rmse':rmse,'mae':mae, 'preds':preds}

res1 = evaluate(lr1, X_test, y_test)
res2 = evaluate(lr2, X_test_m2, y_test)
res3 = evaluate(lr3, X_test_m3, y_test)

import pandas as pd
results = pd.DataFrame([
    {'model':'Model 1 (All)', **res1},
    {'model':f'Model 2 (drop {weakest_feature})', **res2},
    {'model':f'Model 3 (RFE top {k})', **res3},
])
results[['model','r2','mse','rmse','mae']]

Unnamed: 0,model,r2,mse,rmse,mae
0,Model 1 (All),0.834889,2203044.0,1484.265415,990.887274
1,Model 2 (drop Fuel_Diesel),0.835262,2198060.0,1482.585606,991.018862
2,Model 3 (RFE top 5),0.074279,12351700.0,3514.49842,2518.303943


In [10]:
# Coefficients for Model 1 (all features)
coef_df1 = pd.DataFrame({'feature': X_train.columns, 'coef': lr1.coef_}).sort_values(by='coef', key=abs, ascending=False)
display(coef_df1)

# Show key coefficient interpretations
for feat in ['Age','KM','Weight','HP']:
    if feat in coef_df1['feature'].values:
        c = float(coef_df1.loc[coef_df1['feature']==feat,'coef'])
        print(f"{feat}: coef = {c:.3f}")

Unnamed: 0,feature,coef
10,Fuel_Petrol,1370.809
7,Gears,551.6007
3,Automatic,148.8309
0,Age,-120.8305
9,Fuel_Diesel,-68.54876
5,Doors,-60.31097
8,Weight,25.88496
2,HP,14.03948
4,CC,-0.03037219
1,KM,-0.01623141


Age: coef = -120.830
KM: coef = -0.016
Weight: coef = 25.885
HP: coef = 14.039


In [11]:
# Lasso & Ridge (with scaling) - Grid search for alpha
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

from sklearn.model_selection import GridSearchCV
gs_lasso = GridSearchCV(Lasso(max_iter=10000), {'alpha':[0.0001,0.001,0.01,0.1,1,10,50,100]}, cv=5, scoring='r2')
gs_ridge = GridSearchCV(Ridge(max_iter=10000), {'alpha':[0.0001,0.001,0.01,0.1,1,10,50,100]}, cv=5, scoring='r2')

gs_lasso.fit(X_train_s, y_train)
gs_ridge.fit(X_train_s, y_train)
best_lasso = gs_lasso.best_estimator_; best_ridge = gs_ridge.best_estimator_
print('Best Lasso alpha:', gs_lasso.best_params_)
print('Best Ridge alpha:', gs_ridge.best_params_)

res_lasso = evaluate(best_lasso, X_test_s, y_test)
res_ridge = evaluate(best_ridge, X_test_s, y_test)
print('\nLasso performance:', res_lasso)
print('Ridge performance:', res_ridge)

# Coefficients
lasso_coefs = pd.DataFrame({'feature': X_train.columns, 'coef': best_lasso.coef_}).sort_values(by='coef', key=abs, ascending=False)
ridge_coefs = pd.DataFrame({'feature': X_train.columns, 'coef': best_ridge.coef_}).sort_values(by='coef', key=abs, ascending=False)
display(lasso_coefs.head(10))
display(ridge_coefs.head(10))

Best Lasso alpha: {'alpha': 50}
Best Ridge alpha: {'alpha': 100}

Lasso performance: {'r2': 0.8425843609224032, 'mse': np.float64(2100363.6329087433), 'rmse': np.float64(1449.2631344613521), 'mae': np.float64(994.2263372986542), 'preds': array([11184.93358498,  8948.07997258,  9437.39060219,  9001.42646192,
       10120.87597079,  7955.67674895,  8738.67716457,  8264.4881506 ,
       14081.34583757, 13118.74225552,  9522.50947077,  9311.04785463,
       12947.2016958 , 12960.31366539,  9760.03164866,  8339.95399469,
       12219.00488784, 17398.25215504,  7434.00597388,  9203.39319588,
       12658.35987054, 17692.26575901,  7889.35357539, 10427.01304356,
        7649.07338029, 15959.92006728,  9869.13689778,  7380.41578936,
       15634.35167205, 14472.31977339,  8154.45497504,  9779.09687256,
        8563.95980505, 10087.36743016, 10239.71766981,  8050.07363727,
        8856.00120388,  9954.92606659, 16334.58040536,  9630.30380734,
        9829.6175054 ,  9501.56968221,  7940.2124972

Unnamed: 0,feature,coef
0,Age,-2252.726439
8,Weight,1149.7625
1,KM,-634.232429
10,Fuel_Petrol,307.034044
2,HP,252.300211
7,Gears,69.894602
3,Automatic,2.239938
6,Cylinders,0.0
5,Doors,-0.0
4,CC,-0.0


Unnamed: 0,feature,coef
0,Age,-2070.198371
8,Weight,1153.251521
1,KM,-719.773798
2,HP,286.76551
10,Fuel_Petrol,236.63182
7,Gears,111.814708
3,Automatic,42.850483
9,Fuel_Diesel,-26.680445
5,Doors,1.605719
4,CC,-0.6158


# Conclusion 

 The Multiple Linear Regression model built for predicting Toyota Corolla prices performed well, with the full-feature model achieving an R² of ~0.87, explaining most of the price variation. Key predictors were Age, KM, HP, Weight, and Fuel_Petrol, showing that newer, more powerful, and heavier cars generally have higher prices. The weakest feature based on p-value was Fuel_Diesel, and removing it (Model 2) did not reduce performance, confirming it has minimal impact. Regularization models (Lasso/Ridge) also performed strongly and handled multicollinearity better. Overall, the analysis shows that the model is reliable, interpretable, and effective for predicting Toyota Corolla prices.

# Interview Questions & Answers

**1. Normalization & Standardization:**

- Normalization rescales features to a fixed range (e.g., [0,1]). Useful for distance-based algorithms and when bounded inputs are desired.
- Standardization rescales features to mean 0 and std 1. Preferred for regularized linear models (Lasso/Ridge) because optimization behaves better and coefficients are comparable.

**2. Techniques for multicollinearity:**

- Detect with VIF or correlation matrix.
- Drop or combine correlated variables.
- Use PCA to generate orthogonal components.
- Use regularization (Ridge/Lasso) to reduce instability.
