#Load and Inspect the dataset

In [10]:
import pandas as pd
file_path = '/content/car data.csv'
df = pd.read_csv(file_path)
df.head()


Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Driven_kms,Fuel_Type,Selling_type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


#Check for Missing Values and Data Types

In [11]:
missing_values = df.isnull().sum()
data_types = df.dtypes
missing_values, data_types


(Car_Name         0
 Year             0
 Selling_Price    0
 Present_Price    0
 Driven_kms       0
 Fuel_Type        0
 Selling_type     0
 Transmission     0
 Owner            0
 dtype: int64,
 Car_Name          object
 Year               int64
 Selling_Price    float64
 Present_Price    float64
 Driven_kms         int64
 Fuel_Type         object
 Selling_type      object
 Transmission      object
 Owner              int64
 dtype: object)

#Data Preprocessing

##Handling Missing Values

In [12]:
df = df.dropna()
df.isnull().sum().sum()


0

##Encoding Categorical Variables

In [13]:
from sklearn.preprocessing import OneHotEncoder
categorical_cols = df.select_dtypes(include=['object']).columns
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
df_encoded.head()


Unnamed: 0,Year,Selling_Price,Present_Price,Driven_kms,Owner,Car_Name_Activa 3g,Car_Name_Activa 4g,Car_Name_Bajaj ct 100,Car_Name_Bajaj Avenger 150,Car_Name_Bajaj Avenger 150 street,...,Car_Name_swift,Car_Name_sx4,Car_Name_verna,Car_Name_vitara brezza,Car_Name_wagon r,Car_Name_xcent,Fuel_Type_Diesel,Fuel_Type_Petrol,Selling_type_Individual,Transmission_Manual
0,2014,3.35,5.59,27000,0,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,True
1,2013,4.75,9.54,43000,0,False,False,False,False,False,...,False,True,False,False,False,False,True,False,False,True
2,2017,7.25,9.85,6900,0,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,True
3,2011,2.85,4.15,5200,0,False,False,False,False,False,...,False,False,False,False,True,False,False,True,False,True
4,2014,4.6,6.87,42450,0,False,False,False,False,False,...,True,False,False,False,False,False,True,False,False,True


##Feature Scaling

In [14]:
from sklearn.preprocessing import StandardScaler
numerical_cols = df_encoded.select_dtypes(include=['int64', 'float64']).columns
scaler = StandardScaler()
df_encoded[numerical_cols] = scaler.fit_transform(df_encoded[numerical_cols])
df_encoded.head()

Unnamed: 0,Year,Selling_Price,Present_Price,Driven_kms,Owner,Car_Name_Activa 3g,Car_Name_Activa 4g,Car_Name_Bajaj ct 100,Car_Name_Bajaj Avenger 150,Car_Name_Bajaj Avenger 150 street,...,Car_Name_swift,Car_Name_sx4,Car_Name_verna,Car_Name_vitara brezza,Car_Name_wagon r,Car_Name_xcent,Fuel_Type_Diesel,Fuel_Type_Petrol,Selling_type_Individual,Transmission_Manual
0,0.128897,-0.258416,-0.236256,-0.256224,-0.174501,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,True
1,-0.217514,0.017481,0.221544,0.155911,-0.174501,False,False,False,False,False,...,False,True,False,False,False,False,True,False,False,True
2,1.168129,0.510154,0.257472,-0.773969,-0.174501,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,True
3,-0.910335,-0.35695,-0.403151,-0.817758,-0.174501,False,False,False,False,False,...,False,False,False,False,True,False,False,True,False,True
4,0.128897,-0.012079,-0.087906,0.141743,-0.174501,False,False,False,False,False,...,True,False,False,False,False,False,True,False,False,True


#Splitting the Data into Training and Test Sets

In [15]:
from sklearn.model_selection import train_test_split
X = df_encoded.drop(columns=['Selling_Price'])
y = df_encoded['Selling_Price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape

((240, 105), (61, 105))

#Model Training

In [16]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train, y_train)

#Model Evaluation

In [17]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)

mae, rmse, r2


(937301197547.0785, 3783812651268.394, -1.600378799315433e+25)