## Import Required Libraries

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Set visualization style
sns.set(style="whitegrid")


## Load the Dataset

In [6]:
# Load the dataset
file_path = "ToyotaCorolla - MLR.csv"  # Update with the correct file path if needed
data = pd.read_csv(file_path)

# Display dataset overview
print("Dataset Overview:")
data.info()

# Display the first 5 rows
print("\nFirst 5 Rows of the Dataset:")
data.head()


Dataset Overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1436 entries, 0 to 1435
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Price      1436 non-null   int64 
 1   Age_08_04  1436 non-null   int64 
 2   KM         1436 non-null   int64 
 3   Fuel_Type  1436 non-null   object
 4   HP         1436 non-null   int64 
 5   Automatic  1436 non-null   int64 
 6   cc         1436 non-null   int64 
 7   Doors      1436 non-null   int64 
 8   Cylinders  1436 non-null   int64 
 9   Gears      1436 non-null   int64 
 10  Weight     1436 non-null   int64 
dtypes: int64(10), object(1)
memory usage: 123.5+ KB

First 5 Rows of the Dataset:


Unnamed: 0,Price,Age_08_04,KM,Fuel_Type,HP,Automatic,cc,Doors,Cylinders,Gears,Weight
0,13500,23,46986,Diesel,90,0,2000,3,4,5,1165
1,13750,23,72937,Diesel,90,0,2000,3,4,5,1165
2,13950,24,41711,Diesel,90,0,2000,3,4,5,1165
3,14950,26,48000,Diesel,90,0,2000,3,4,5,1165
4,13750,30,38500,Diesel,90,0,2000,3,4,5,1170


## Exploratory Data Analysis (EDA)

In [10]:
# Statistical summary of the dataset
print("Statistical Summary:")
print(data.describe())

# Check for missing values
print("\nMissing Values:")
print(data.isnull().sum())

# Visualizing correlation heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(data.corr(), annot=True, cmap="coolwarm")
plt.title("Correlation Heatmap")
plt.show()


Statistical Summary:
              Price    Age_08_04             KM           HP    Automatic  \
count   1436.000000  1436.000000    1436.000000  1436.000000  1436.000000   
mean   10730.824513    55.947075   68533.259749   101.502089     0.055710   
std     3626.964585    18.599988   37506.448872    14.981080     0.229441   
min     4350.000000     1.000000       1.000000    69.000000     0.000000   
25%     8450.000000    44.000000   43000.000000    90.000000     0.000000   
50%     9900.000000    61.000000   63389.500000   110.000000     0.000000   
75%    11950.000000    70.000000   87020.750000   110.000000     0.000000   
max    32500.000000    80.000000  243000.000000   192.000000     1.000000   

                cc        Doors  Cylinders        Gears      Weight  
count   1436.00000  1436.000000     1436.0  1436.000000  1436.00000  
mean    1576.85585     4.033426        4.0     5.026462  1072.45961  
std      424.38677     0.952677        0.0     0.188510    52.64112  
min  

ValueError: could not convert string to float: 'Diesel'

<Figure size 1200x800 with 0 Axes>

## Visualize Distributions of Key Variables

In [None]:
# Distribution plots for key columns
key_columns = ['Price', 'Age_08_04', 'KM', 'HP', 'CC', 'Doors', 'Gears', 'Quarterly_Tax', 'Weight']
plt.figure(figsize=(16, 12))
for i, col in enumerate(key_columns, 1):
    plt.subplot(3, 3, i)
    sns.histplot(data[col], kde=True, color="skyblue", bins=30)
    plt.title(f"Distribution of {col}")
plt.tight_layout()
plt.show()


## Data Preprocessing

In [14]:
# Drop unnecessary columns (e.g., 'Id')
data_cleaned = data.drop(['Id'], axis=1, errors='ignore')

# Encode categorical variables (if present)
data_cleaned = pd.get_dummies(data_cleaned, drop_first=True)

# Define Features (X) and Target (Y)
X = data_cleaned.drop('Price', axis=1)
Y = data_cleaned['Price']


## Split the Dataset

In [17]:
# Split data into training (80%) and testing (20%) sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Print the shape of training and testing datasets
print(f"Training Data Shape: {X_train.shape}")
print(f"Testing Data Shape: {X_test.shape}")


Training Data Shape: (1148, 11)
Testing Data Shape: (288, 11)


## Build and Evaluate Linear Regression Model

In [20]:
# Initialize and train Linear Regression model
lr_model = LinearRegression()
lr_model.fit(X_train, Y_train)

# Predict for both training and testing datasets
Y_pred_train = lr_model.predict(X_train)
Y_pred_test = lr_model.predict(X_test)

# Evaluate the model
print("Linear Regression Training R2 Score:", r2_score(Y_train, Y_pred_train))
print("Linear Regression Testing R2 Score:", r2_score(Y_test, Y_pred_test))
print("Linear Regression Mean Squared Error:", mean_squared_error(Y_test, Y_pred_test))

# Display coefficients of the model
coefficients = pd.DataFrame({"Feature": X.columns, "Coefficient": lr_model.coef_})
print("\nCoefficients of Linear Regression Model:")
print(coefficients)


Linear Regression Training R2 Score: 0.8702643169608926
Linear Regression Testing R2 Score: 0.8348888040611082
Linear Regression Mean Squared Error: 2203043.8231437025

Coefficients of Linear Regression Model:
             Feature   Coefficient
0          Age_08_04 -1.208305e+02
1                 KM -1.623141e-02
2                 HP  1.403948e+01
3          Automatic  1.488309e+02
4                 cc -3.037219e-02
5              Doors -6.031097e+01
6          Cylinders -1.108447e-12
7              Gears  5.516007e+02
8             Weight  2.588496e+01
9   Fuel_Type_Diesel -6.854876e+01
10  Fuel_Type_Petrol  1.370809e+03


##  Build and Evaluate Ridge Regression

In [23]:
# Initialize and train Ridge Regression model
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_train, Y_train)

# Predict and evaluate the model
ridge_pred_test = ridge_model.predict(X_test)
print("Ridge Regression Testing R2 Score:", r2_score(Y_test, ridge_pred_test))
print("Ridge Regression Mean Squared Error:", mean_squared_error(Y_test, ridge_pred_test))

# Display coefficients of the Ridge Regression model
ridge_coefficients = pd.DataFrame({"Feature": X.columns, "Coefficient": ridge_model.coef_})
print("\nCoefficients of Ridge Regression Model:")
print(ridge_coefficients)


Ridge Regression Testing R2 Score: 0.8351359377712344
Ridge Regression Mean Squared Error: 2199746.3702333285

Coefficients of Ridge Regression Model:
             Feature  Coefficient
0          Age_08_04  -120.779659
1                 KM    -0.016324
2                 HP    14.141834
3          Automatic   146.907470
4                 cc    -0.030509
5              Doors   -59.905791
6          Cylinders     0.000000
7              Gears   542.272328
8             Weight    25.821019
9   Fuel_Type_Diesel  -128.813072
10  Fuel_Type_Petrol  1294.949826
