In [42]:
import pandas as pd

# Read the dataset CSV (path is relative to the notebook's working directory).
DATASET_PATH = 'cardekho_dataset.csv'
data = pd.read_csv(DATASET_PATH)

# Preview the first five rows to confirm the file parsed as expected.
data.head(n=5)


Unnamed: 0.1,Unnamed: 0,car_name,brand,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,0,Maruti Alto,Maruti,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,1,Hyundai Grand,Hyundai,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,2,Hyundai i20,Hyundai,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,3,Maruti Alto,Maruti,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,4,Ford Ecosport,Ford,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


In [43]:
# Drop 'car_name'; in the preview rows it is just brand + model joined together,
# and both of those are kept as separate columns.
# errors='ignore' keeps this cell idempotent: because `data` is reassigned from
# itself, a second execution would otherwise raise KeyError once the column is gone.
data = data.drop(columns=['car_name'], errors='ignore')

# Show remaining columns
print(data.columns)



Index(['Unnamed: 0', 'brand', 'model', 'vehicle_age', 'km_driven',
       'seller_type', 'fuel_type', 'transmission_type', 'mileage', 'engine',
       'max_power', 'seats', 'selling_price'],
      dtype='object')


In [44]:
# Drop the 'Unnamed: 0' column (a leftover row counter; it mirrors the index
# in the preview rows and carries no information about the car).
# errors='ignore' keeps this cell idempotent: because `data` is reassigned from
# itself, a second execution would otherwise raise KeyError once the column is gone.
data = data.drop(columns=['Unnamed: 0'], errors='ignore')

# Show the updated columns
print(data.columns)


Index(['brand', 'model', 'vehicle_age', 'km_driven', 'seller_type',
       'fuel_type', 'transmission_type', 'mileage', 'engine', 'max_power',
       'seats', 'selling_price'],
      dtype='object')


In [45]:
# Tally the missing entries in every column and print the per-column counts.
missing_per_column = data.isna().sum()
print(missing_per_column)


brand                0
model                0
vehicle_age          0
km_driven            0
seller_type          0
fuel_type            0
transmission_type    0
mileage              0
engine               0
max_power            0
seats                0
selling_price        0
dtype: int64


In [46]:
# One-hot encode the categorical columns. drop_first=True omits the first
# level of each encoded column, so that level is represented by all of the
# column's dummies being False.
data = data.pipe(pd.get_dummies, drop_first=True)

# Report the resulting dimensions as (rows, columns).
n_rows, n_cols = data.shape
print((n_rows, n_cols))

# Preview the first five rows of the encoded frame.
data.head(n=5)


(15411, 164)


Unnamed: 0,vehicle_age,km_driven,mileage,engine,max_power,seats,selling_price,brand_BMW,brand_Bentley,brand_Datsun,...,model_i10,model_i20,model_redi-GO,seller_type_Individual,seller_type_Trustmark Dealer,fuel_type_Diesel,fuel_type_Electric,fuel_type_LPG,fuel_type_Petrol,transmission_type_Manual
0,9,120000,19.7,796,46.3,5,120000,False,False,False,...,False,False,False,True,False,False,False,False,True,True
1,5,20000,18.9,1197,82.0,5,550000,False,False,False,...,False,False,False,True,False,False,False,False,True,True
2,11,60000,17.0,1197,80.0,5,215000,False,False,False,...,False,True,False,True,False,False,False,False,True,True
3,9,37000,20.92,998,67.1,5,226000,False,False,False,...,False,False,False,True,False,False,False,False,True,True
4,6,30000,22.77,1498,98.59,5,570000,False,False,False,...,False,False,False,False,False,True,False,False,False,True


In [47]:
# Name of the column the model is meant to predict.
target_column = 'selling_price'

# Features (X): every column except the target.
X = data.drop(columns=target_column)

# Target (y): the target column on its own.
y = data[target_column]

# Sanity-check the dimensions of both pieces.
print("Features shape:", X.shape)
print("Target shape:", y.shape)

Features shape: (15411, 163)
Target shape: (15411,)


In [48]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Linear Regression model and fit it on the training portion only.
model = LinearRegression()
model.fit(X_train, y_train)

# Predict selling prices for the held-out rows.
predictions = model.predict(X_test)

# Evaluate on the held-out rows. MSE is expressed in squared price units, so
# the raw number is hard to judge on its own; also report RMSE (same units as
# selling_price) and R^2 (scale-free; 1.0 is a perfect fit, 0.0 matches
# always predicting the mean of y_test).
mse = mean_squared_error(y_test, predictions)
rmse = mse ** 0.5
r2 = model.score(X_test, y_test)  # for regressors, score() returns R^2
print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"R^2 Score: {r2}")

Mean Squared Error: 150539582029.7094
