##  Importing Necessary Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.graph_objects as g
import plotly.subplots as sp
from plotly.subplots import make_subplots
import plotly.io as pio
import plotly.express as px
import matplotlib.cm as cm
from scipy.stats import gaussian_kde
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
%matplotlib inline
sns.set()

# Building a Linear Regression Model

In [None]:


# Load the cleaned dataset and discard the two columns excluded from modelling.
file_path = r"cleaned_car_prices.csv"
df = pd.read_csv(file_path).drop(columns=['saledate', 'seller'])

# Integer-encode every object-dtype column, using a fresh LabelEncoder per column.
# NOTE(review): the encoders are fitted on the full frame, i.e. before the
# train/test split below.
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

# `sell_price` is the regression target; all remaining columns are features.
y = df['sell_price']
X = df.drop(columns=['sell_price'])

# 80/20 hold-out split with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardise the features; the scaler's statistics come from the training
# rows only and are then applied unchanged to the test rows.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Fit ordinary least squares on the training rows and predict the held-out rows.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Score the held-out predictions.
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Absolute Error (MAE):", mae)
print("R² Score:", r2)

- **Mean Absolute Error (MAE):** `764.23`  
  This means that, on average, the model's predictions are off by approximately **$764.23** from the actual car prices. A lower MAE indicates better accuracy.

- **R² Score:** `0.9831`  
  This indicates that the model explains **98.31%** of the variance in the car prices. An R² score close to 1 suggests that the model fits the data very well.

In [None]:
# 2x2 grid of model diagnostics, drawn through explicit Axes handles.
fig, axes = plt.subplots(2, 2, figsize=(12, 6))
(ax_prices, ax_identity), (ax_fit, ax_resid) = axes

# Prediction errors on the held-out rows (actual minus predicted).
residuals = y_test - y_pred

# Top-left: histogram (with KDE overlay) of the full target column.
sns.histplot(y, bins=30, kde=True, color="blue", ax=ax_prices)
ax_prices.set_title("Distribution of Sell Prices")
ax_prices.set_xlabel("Sell Price")
ax_prices.set_ylabel("Frequency")

# Top-right: predictions against actual values, with a dashed y = x reference
# line spanning the range of the actual test prices.
actual_span = [y_test.min(), y_test.max()]
ax_identity.scatter(y_test, y_pred, alpha=0.5, color="red")
ax_identity.plot(actual_span, actual_span, "--", color="black")
ax_identity.set_title("Actual vs. Predicted Sell Prices")
ax_identity.set_xlabel("Actual Sell Price")
ax_identity.set_ylabel("Predicted Sell Price")

# Bottom-left: the same pairs again, this time with seaborn's fitted
# regression line drawn in black.
sns.regplot(
    x=y_test,
    y=y_pred,
    scatter_kws={"alpha": 0.5},
    line_kws={"color": "black"},
    ax=ax_fit,
)
ax_fit.set_title("Scatter Plot: Actual vs. Predicted Prices")
ax_fit.set_xlabel("Actual Sell Price")
ax_fit.set_ylabel("Predicted Sell Price")

# Bottom-right: histogram (with KDE overlay) of the residuals.
sns.histplot(residuals, bins=30, kde=True, color="green", ax=ax_resid)
ax_resid.set_title("Residuals Distribution")
ax_resid.set_xlabel("Prediction Errors (Residuals)")
ax_resid.set_ylabel("Frequency")

fig.tight_layout()
plt.show()

### Distribution of Sell Prices

The figure above shows the distribution of actual sell prices, two views of actual versus predicted prices, and the distribution of the residuals (prediction errors). Here's a breakdown:

1. **Actual Sell Prices:**
   - The distribution shows the frequency of actual car prices, with most prices concentrated between **$0 and $35,000**.

2. **Actual vs. Predicted Sell Prices:**
   - The predicted prices closely follow the actual prices, staying near the dashed line that marks predicted = actual, indicating that the model performs well in predicting car prices.

3. **Residuals Distribution:**
   - The residuals (differences between actual and predicted prices) are centered around **0**, with most errors falling within the range of **-$4,000 to $4,000**. This suggests that the model's predictions are generally accurate.

4. **Scatter Plot: Actual vs. Predicted Prices:**
   - The scatter plot shows a strong linear relationship between actual and predicted prices, further confirming the model's accuracy.

### Key Insights:
- The model effectively predicts car prices, with most predictions being close to the actual values.
- The residuals are symmetrically distributed around zero, indicating no significant bias in the predictions.