In [6]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pickle

warnings.filterwarnings('ignore')


In [7]:
# Load Dataset and run basic quality checks
df = pd.read_csv('CAR DETAILS FROM CAR DEKHO.csv')
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
df.info()
print(df.isnull().sum())  # Null count per column
print("Duplicate entries:", df.duplicated().sum())

# Remove exact duplicate rows (the first occurrence of each is kept)
df = df.drop_duplicates()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4340 entries, 0 to 4339
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           4340 non-null   object
 1   year           4340 non-null   int64 
 2   selling_price  4340 non-null   int64 
 3   km_driven      4340 non-null   int64 
 4   fuel           4340 non-null   object
 5   seller_type    4340 non-null   object
 6   transmission   4340 non-null   object
 7   owner          4340 non-null   object
dtypes: int64(3), object(5)
memory usage: 271.4+ KB
None
name             0
year             0
selling_price    0
km_driven        0
fuel             0
seller_type      0
transmission     0
owner            0
dtype: int64
Duplicate entries: 763


In [8]:
# Derive the brand from the first whitespace-separated word of 'name';
# the two-word make "Land Rover" is restored from its truncated token "Land".
first_word = df['name'].str.split().str[0]
df['brand_name'] = first_word.replace({'Land': 'Land Rover'})

In [9]:
# Remove the columns that are not carried forward as model inputs
unused_columns = ['fuel', 'seller_type', 'name']
df = df.drop(columns=unused_columns)


In [10]:
# Expand the categorical columns into indicator columns; drop_first omits the
# first level of each so the indicators are not perfectly collinear.
categorical_columns = ['transmission', 'brand_name']
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

In [11]:
# Integer-encode the 'owner' column.
# NOTE(review): LabelEncoder numbers classes in sorted label order, which
# presumably does not follow the real ownership ranking, and scikit-learn
# documents it for targets rather than features - consider an explicit
# ordinal mapping once the category values are confirmed.
le = LabelEncoder()
le.fit(df['owner'])
df['owner'] = le.transform(df['owner'])

In [12]:
# Replace the model year with the vehicle's age relative to a fixed reference year
current_year = 2025
df['car_age'] = current_year - df['year']
df = df.drop(columns=['year'])


In [13]:
# Scaling 'km_driven' column
scaler = StandardScaler()
df[['km_driven']] = scaler.fit_transform(df[['km_driven']])

In [14]:
# Outlier handling using log transformation
df['selling_price'] = np.log1p(df['selling_price'])
df['km_driven'] = np.log1p(df['km_driven'])

In [15]:
# Handling any remaining NaN values
df['km_driven'].fillna(df['km_driven'].median(), inplace=True)

In [16]:
# Separate the target from the predictors, then hold out 20% of the rows
# for evaluation (fixed seed so the split is repeatable).
y = df['selling_price']
X = df.drop(columns=['selling_price'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

In [17]:
# Fit a linear regression on the training split and predict the held-out rows
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Score the predictions. The target was log1p-transformed earlier, so MAE and
# MSE are expressed in log units rather than in the original price units.
mae, mse, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"R² Score: {r2}")

Mean Absolute Error (MAE): 0.3416063645866975
Mean Squared Error (MSE): 0.18833405561953828
R² Score: 0.7253465642019441


In [18]:
# Save Model and Column Names
# Context managers make sure each file is flushed and closed even if pickling
# fails; the bare open() calls left both handles open.
# NOTE(review): the fitted `scaler` and `le` are not persisted here - confirm
# the inference code can reproduce the same preprocessing.
with open('model_lr.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)
with open('model_columns.pkl', 'wb') as columns_file:
    pickle.dump(X_train.columns, columns_file)