In [182]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
#from sklearn.metrics import mean_squared_error, r2_score

In [183]:
df = pd.read_csv('C:/Users/Antonia Reyes/Downloads/Assignment 2/cardekho.csv')
df.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage(km/ltr/kg),engine,max_power,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4,1248.0,74.0,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14,1498.0,103.52,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7,1497.0,78.0,5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0,1396.0,90.0,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1,1298.0,88.2,5.0


In [184]:
# Show the shape of the dataset
df.shape

(8128, 12)

In [185]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8128 entries, 0 to 8127
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   name                8128 non-null   object 
 1   year                8128 non-null   int64  
 2   selling_price       8128 non-null   int64  
 3   km_driven           8128 non-null   int64  
 4   fuel                8128 non-null   object 
 5   seller_type         8128 non-null   object 
 6   transmission        8128 non-null   object 
 7   owner               8128 non-null   object 
 8   mileage(km/ltr/kg)  7907 non-null   float64
 9   engine              7907 non-null   float64
 10  max_power           7913 non-null   object 
 11  seats               7907 non-null   float64
dtypes: float64(3), int64(3), object(6)
memory usage: 762.1+ KB


In [186]:
df.columns = df.columns.str.replace('mileage(km/ltr/kg)', 'mileage')
print(df.columns)

Index(['name', 'year', 'selling_price', 'km_driven', 'fuel', 'seller_type',
       'transmission', 'owner', 'mileage', 'engine', 'max_power', 'seats'],
      dtype='object')


In [187]:
# check dataset has duplicate values or not
df.duplicated().sum()

np.int64(1202)

In [188]:
# Erase duplicates
df = df.drop_duplicates()

In [189]:
# check dataset has missing values or not
df.isnull().sum().sort_values(ascending=False)

seats            208
mileage          208
engine           208
max_power        205
km_driven          0
selling_price      0
year               0
name               0
owner              0
transmission       0
seller_type        0
fuel               0
dtype: int64

In [190]:
df = df.dropna()  # Drop missing values

In [191]:
# check dataset has missing values or not
df.isnull().sum().sort_values(ascending=False)

name             0
year             0
selling_price    0
km_driven        0
fuel             0
seller_type      0
transmission     0
owner            0
mileage          0
engine           0
max_power        0
seats            0
dtype: int64

In [192]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6718 entries, 0 to 8125
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           6718 non-null   object 
 1   year           6718 non-null   int64  
 2   selling_price  6718 non-null   int64  
 3   km_driven      6718 non-null   int64  
 4   fuel           6718 non-null   object 
 5   seller_type    6718 non-null   object 
 6   transmission   6718 non-null   object 
 7   owner          6718 non-null   object 
 8   mileage        6718 non-null   float64
 9   engine         6718 non-null   float64
 10  max_power      6718 non-null   object 
 11  seats          6718 non-null   float64
dtypes: float64(3), int64(3), object(6)
memory usage: 682.3+ KB


In [193]:
# this command is to see the descriptive summary of the dataset
df.describe() # only for num cols

Unnamed: 0,year,selling_price,km_driven,mileage,engine,seats
count,6718.0,6718.0,6718.0,6718.0,6718.0,6718.0
mean,2013.60911,526319.6,73402.3,19.46531,1430.891337,5.434653
std,3.900648,523539.8,58699.8,4.04915,493.493277,0.98423
min,1994.0,29999.0,1.0,0.0,624.0,2.0
25%,2011.0,250000.0,38000.0,16.8,1197.0,5.0
50%,2014.0,420000.0,68221.5,19.44,1248.0,5.0
75%,2017.0,650000.0,100000.0,22.5,1498.0,5.0
max,2020.0,10000000.0,2360457.0,42.0,3604.0,14.0


In [205]:
# split the data into input and output variables
X = df.drop(columns=['selling_price', 'name'])  
y = df['selling_price']

In [206]:
# Identify categorical and numerical columns
categorical_features = X.select_dtypes(include=['object']).columns
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns

In [207]:
# Create transformer for preprocessing
preprocessor = ColumnTransformer([
    ("num", StandardScaler(), numerical_features),
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
])

In [208]:
# split this X and y into Train_data and test_data
X_train, X_test,y_train, y_test=train_test_split(X, y, test_size=0.3, random_state=1 )

In [209]:
# Apply transformations before training the model
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

In [210]:
# intialize the model
model=LinearRegression()

# train the model
model.fit(X_train_transformed, y_train)

# test the model
y_pred = model.predict(X_test_transformed)

In [211]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

print('Mean Squared Error: ', mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error: ', np.sqrt(mean_squared_error(y_test, y_pred)))
print('Mean Absolute Error: ', mean_absolute_error(y_test, y_pred))
print('R2 Score: ', r2_score(y_test, y_pred))

Mean Squared Error:  59111271226.40497
Root Mean Squared Error:  243128.096332787
Mean Absolute Error:  111625.17611128683
R2 Score:  0.8090217254885621


In [216]:
import warnings
warnings.filterwarnings("ignore", message="missing ScriptRunContext!")

In [212]:
import pickle

pickle.dump(model, open('model_lr_car.pkl', 'wb'))

In [217]:
# Streamlit UI
import streamlit as st
st.title("Car Price Prediction using Linear Regression 🚗")
st.write("### Dataset Preview")
st.write(df.head())



In [218]:
# Visualization: Actual vs Predicted Prices
fig, ax = plt.subplots()
sns.scatterplot(x=y_test, y=y_pred, ax=ax)
ax.set_xlabel("Actual Sell Prices")
ax.set_ylabel("Predicted Sell Prices")
ax.set_title("Actual vs Predicted Sell Prices")
st.pyplot(fig)



DeltaGenerator()

In [219]:
# Prediction on user input
st.write("## Predict Your Car Price")
input_data = {}
for feature in X.columns:
    if feature in categorical_features:
        input_data[feature] = st.selectbox(f"{feature}", df[feature].unique())
    else:
        input_data[feature] = st.number_input(f"{feature}", float(df[feature].min()), float(df[feature].max()))

# Convert input to dataframe
input_df = pd.DataFrame([input_data])

# Preprocess input and make prediction
input_pred = model.predict(input_df)
st.write(f"### Predicted Price: ${input_pred[0]:,.2f}")



ValueError: could not convert string to float: 'Diesel'