In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# ---------------------------------------------------------------------------
# Car price regression: plain OLS vs. Ridge / Lasso / ElasticNet.
#
# Pipeline: load CSV -> quick EDA -> drop missing rows -> one-hot encode ->
# train/test split -> fit four linear models -> compare them on the test-set
# Sum of Squared Errors (SSE).
# ---------------------------------------------------------------------------

DATA_PATH = 'car data (2).csv'  # Adjust the filename as per your dataset
TARGET_COLUMN = 'Selling_Price'
CATEGORICAL_COLUMNS = ['Fuel_Type', 'Seller_Type', 'Transmission']
TEST_SIZE = 0.2
RANDOM_STATE = 42


def sum_squared_error(y_true, y_pred):
    """Return the Sum of Squared Errors, sum((y_true - y_pred) ** 2).

    This is deliberately NOT sklearn's ``mean_squared_error``: that function
    returns the mean of the squared residuals, i.e. SSE divided by the number
    of samples.
    """
    residuals = np.asarray(y_true, dtype=float) - np.asarray(y_pred, dtype=float)
    return float(np.sum(residuals ** 2))


def print_model_results(title, model, feature_names):
    """Print the intercept and the per-feature coefficients of a fitted linear model.

    In a linear model the coefficients double as a (signed) feature-importance
    measure; they are only comparable across features when the features have
    been standardized.
    """
    print(title)
    print(f'Intercept: {model.intercept_}')
    print('Coefficients:')
    for feature, coefficient in zip(feature_names, model.coef_):
        print(f'{feature}: {coefficient}')


# Step 1: Read the cars dataset
cars_raw = pd.read_csv(DATA_PATH)

# Step 2: Exploratory Data Analysis (EDA) / data quality check.
# DataFrame.info() writes its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
cars_raw.info()
print(cars_raw.describe())

# Step 3: Treat missing values and transform categorical data.
# Rows with missing values are dropped (a more sophisticated imputation could
# be used instead). The raw frame is left untouched so this cell can be re-run.
# dtype=float keeps the dummy columns numeric regardless of the pandas version
# (older versions emit uint8, newer ones bool).
cars_df = pd.get_dummies(cars_raw.dropna(), columns=CATEGORICAL_COLUMNS, dtype=float)

# Step 4: Build the design matrix and split into training and testing sets.
# 'Car_Name' is a free-text identifier and is not used as a predictor.
X = cars_df.drop(['Car_Name', TARGET_COLUMN], axis=1)
y = cars_df[TARGET_COLUMN]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# Step 5: Multiple linear regression without regularization (unscaled features).
model_lr = LinearRegression()
model_lr.fit(X_train, y_train)
print_model_results("Results without Regularization:", model_lr, X.columns)

y_pred_lr = model_lr.predict(X_test)
sse_lr = sum_squared_error(y_test, y_pred_lr)

# Step 6: Standardize the features for the regularized models.
# The scaler is fitted on the TRAINING rows only; fitting it on the full
# dataset would leak test-set statistics into training. The same split as
# above is reused so every model is evaluated on identical test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_scaled = scaler.transform(X)  # full matrix, scaled with training statistics

# The target is not scaled; these aliases only keep the original variable names.
y_train_scaled, y_test_scaled = y_train, y_test

# Step 7: Ridge (L2) regularization
model_ridge = Ridge(alpha=1.0)  # Adjust alpha as needed
model_ridge.fit(X_train_scaled, y_train_scaled)
print_model_results("\nResults with Ridge Regularization:", model_ridge, X.columns)

y_pred_ridge = model_ridge.predict(X_test_scaled)
sse_ridge = sum_squared_error(y_test_scaled, y_pred_ridge)

# Step 8: Lasso (L1) regularization
model_lasso = Lasso(alpha=1.0)  # Adjust alpha as needed
model_lasso.fit(X_train_scaled, y_train_scaled)
print_model_results("\nResults with Lasso Regularization:", model_lasso, X.columns)

y_pred_lasso = model_lasso.predict(X_test_scaled)
sse_lasso = sum_squared_error(y_test_scaled, y_pred_lasso)

# Step 9: ElasticNet (L1 + L2) regularization
model_elasticnet = ElasticNet(alpha=1.0, l1_ratio=0.5)  # Adjust alpha and l1_ratio as needed
model_elasticnet.fit(X_train_scaled, y_train_scaled)
print_model_results("\nResults with ElasticNet Regularization:", model_elasticnet, X.columns)

y_pred_elasticnet = model_elasticnet.predict(X_test_scaled)
sse_elasticnet = sum_squared_error(y_test_scaled, y_pred_elasticnet)

# Step 10: Compare the models based on test-set SSE (lower is better)
print("\nComparison of Models based on Sum of Squared Error:")
print(f'Linear Regression SSE: {sse_lr}')
print(f'Ridge Regression SSE: {sse_ridge}')
print(f'Lasso Regression SSE: {sse_lasso}')
print(f'ElasticNet Regression SSE: {sse_elasticnet}')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Car_Name       301 non-null    object 
 1   Year           301 non-null    int64  
 2   Selling_Price  301 non-null    float64
 3   Present_Price  301 non-null    float64
 4   Kms_Driven     301 non-null    int64  
 5   Fuel_Type      301 non-null    object 
 6   Seller_Type    301 non-null    object 
 7   Transmission   301 non-null    object 
 8   Owner          301 non-null    int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 21.3+ KB
None
              Year  Selling_Price  Present_Price     Kms_Driven       Owner
count   301.000000     301.000000     301.000000     301.000000  301.000000
mean   2013.627907       4.661296       7.628472   36947.205980    0.043189
std       2.891554       5.082812       8.644115   38886.883882    0.247915
min    2003.000000       0.100000       