## Car Selling Price Prediction


In [None]:
# Import the required libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error  
from sklearn.metrics import r2_score
from sklearn.metrics import f1_score
from sklearn.metrics import mean_absolute_error
import xgboost as xgb
import lightgbm as lgb
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Data collection and preprocessing: read the raw car listings into a DataFrame.
car_dataset = pd.read_csv(filepath_or_buffer='car.csv')

In [3]:
# Separate the regression target from the predictor columns.
y = car_dataset['selling_price']
X = car_dataset.drop(columns=['selling_price']).copy()

In [4]:
# Sanity check: print the (rows, columns) shape, then preview the first five rows.
print(car_dataset.shape)
car_dataset.head(n=5)

(8128, 12)


Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage(km/ltr/kg),engine,max_power,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4,1248.0,74.0,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14,1498.0,103.52,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7,1497.0,78.0,5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0,1396.0,90.0,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1,1298.0,88.2,5.0


In [8]:
# Remove rows with missing target values (mask computed once and reused).
has_target = y.notnull()
X = X[has_target]
y = y[has_target]

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

# Drop rows whose feature values repeat an earlier row of the same split (the
# first occurrence is kept) and keep the targets aligned by reusing the mask.
# The explicit .copy() makes each frame independent of the split output, so
# later column assignments do not trigger pandas' SettingWithCopyWarning on
# pandas versions without Copy-on-Write.
# NOTE(review): duplicates are removed within each split only, so an identical
# listing can still appear in both X_train and X_val; consider de-duplicating
# before the split if that leakage matters for the evaluation.
train_keep = ~X_train.duplicated()
X_train, y_train = X_train.loc[train_keep].copy(), y_train.loc[train_keep]

val_keep = ~X_val.duplicated()
X_val, y_val = X_val.loc[val_keep].copy(), y_val.loc[val_keep]

In [9]:
# Partition the feature columns by dtype, using the training frame as reference.
num_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
cat_cols = X_train.select_dtypes(include=['object', 'category']).columns
# NOTE(review): a numeric-looking feature stored with object dtype ends up in
# cat_cols and is filled with its most frequent value — confirm this is intended.

# Mean-fill numeric columns; fill categorical columns with the most frequent value.
num_imputer = SimpleImputer(strategy='mean')
cat_imputer = SimpleImputer(strategy='most_frequent')

# Each imputer is fitted on the training rows only and then applied unchanged
# to the validation rows, so no validation statistics are used for fitting.
for imputer, cols in ((num_imputer, num_cols), (cat_imputer, cat_cols)):
    train_filled = imputer.fit_transform(X_train[cols])
    # Re-wrap the returned array so the original row index and column names are kept.
    X_train[cols] = pd.DataFrame(train_filled, index=X_train.index, columns=cols)

    val_filled = imputer.transform(X_val[cols])
    X_val[cols] = pd.DataFrame(val_filled, index=X_val.index, columns=cols)

In [10]:
# Count fully repeated feature rows remaining in each split and report them.
train_dupes = X_train.duplicated().sum()
val_dupes = X_val.duplicated().sum()
print("Duplicate rows in X_train:", train_dupes)
print("Duplicate rows in X_val:", val_dupes)


Duplicate rows in X_train: 0
Duplicate rows in X_val: 0


In [16]:
# Summarise column dtypes and non-null counts for the training and validation frames.
for frame in (X_train, X_val):
    frame.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5506 entries, 3042 to 2732
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   name                5506 non-null   object 
 1   year                5506 non-null   float64
 2   km_driven           5506 non-null   float64
 3   fuel                5506 non-null   object 
 4   seller_type         5506 non-null   object 
 5   transmission        5506 non-null   object 
 6   owner               5506 non-null   object 
 7   mileage(km/ltr/kg)  5506 non-null   float64
 8   engine              5506 non-null   float64
 9   max_power           5506 non-null   object 
 10  seats               5506 non-null   float64
dtypes: float64(5), object(6)
memory usage: 516.2+ KB
<class 'pandas.core.frame.DataFrame'>
Index: 1491 entries, 3558 to 5934
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 