In [25]:
import pandas as pd
import math
import numpy as np
import torch
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
!ls archive

CAR DETAILS FROM CAR DEKHO.csv car data.csv
Car details v3.csv             car details v4.csv


In [3]:
def getPath(path):
    BASE_PATH = "archive/"
    return BASE_PATH + path

In [4]:
orig_data = pd.read_csv(getPath("CAR DETAILS FROM CAR DEKHO.csv"))
data = orig_data

In [5]:
data.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Manual,Second Owner


In [6]:
def min_max_scaling(value, min_value, max_value):
    return (value - min_value) / (max_value - min_value)

def reverse_min_max_scaling(scaled_value, min_value, max_value):
    return (scaled_value * (max_value - min_value)) + min_value

### Feature Engineering

In [7]:

## name - the variable name is not meaningful, excluded from training
data = data.drop(['name'], axis=1)

## year - Scaled using the min-max scaling. To preserve the relationship in the data.
data.year = min_max_scaling(data["year"], data["year"].min(), data["year"].max())

## selling_price - Scaling using min-max scaling
data.selling_price = min_max_scaling(data.selling_price, data.selling_price.min(), data.selling_price.max())


## km_driven - Scaling using z-score normalization
data.km_driven = min_max_scaling(data.km_driven, data.km_driven.min(), data.km_driven.max())

## fuel - One hot Encoding
data = pd.get_dummies(data, columns=['fuel'], dtype=int)


# seller_type - Randomly chosen One hot Encoding
data = pd.get_dummies(data, columns=['seller_type'], dtype=int)

## transmission - binary normalization
data.transmission = data.transmission.map({'Manual': 0, 'Automatic': 1})


## owner - ordinal relationship
data.owner = data.owner.map({'Test Drive Car': 0, 'First Owner': 1, 'Second Owner': 2, 'Third Owner': 3, 'Fourth & Above Owner': 4})

In [8]:
data

Unnamed: 0,year,selling_price,km_driven,transmission,owner,fuel_CNG,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,seller_type_Dealer,seller_type_Individual,seller_type_Trustmark Dealer
0,0.535714,0.004505,0.086783,0,1,0,0,0,0,1,0,1,0
1,0.535714,0.012950,0.061988,0,1,0,0,0,0,1,0,1,0
2,0.714286,0.065315,0.123976,0,1,0,1,0,0,0,0,1,0
3,0.892857,0.025901,0.057028,0,1,0,0,0,0,1,0,1,0
4,0.785714,0.048423,0.174807,0,2,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4335,0.785714,0.043919,0.099181,0,2,0,1,0,0,0,0,1,0
4336,0.785714,0.043919,0.099181,0,2,0,1,0,0,0,0,1,0
4337,0.607143,0.010135,0.102900,0,2,0,0,0,0,1,0,1,0
4338,0.857143,0.095158,0.111579,0,1,0,1,0,0,0,0,1,0


### Splitting the dataset

In [45]:
X = data.drop(columns=['selling_price'])
y = data['selling_price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [46]:
model = LinearRegression()

model.fit(X_train, y_train)

In [47]:
#### TODO: Test everyting below


# Predict the target variable (y) using the test set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)

Mean Squared Error: 0.0023150281055642906
R-squared: 0.4018088897526899


In [76]:
y_pred_r = reverse_min_max_scaling(y_pred, data.selling_price.min(), data.selling_price.max())
# y_pred_r = y_pred

new_df = pd.DataFrame({ 'pred': y_pred_r, 'ground_truth': y_test, 'diff': y_pred_r - y_test })
new_df
print(X_test.loc[3978])
print(reverse_min_max_scaling(y_test.loc[3978], data.selling_price.min(), data.selling_price.max()))

year                            0.642857
km_driven                       0.099181
transmission                    0.000000
owner                           1.000000
fuel_CNG                        0.000000
fuel_Diesel                     1.000000
fuel_Electric                   0.000000
fuel_LPG                        0.000000
fuel_Petrol                     0.000000
seller_type_Dealer              0.000000
seller_type_Individual          1.000000
seller_type_Trustmark Dealer    0.000000
Name: 3978, dtype: float64
0.01632882882882883


### Train Model

### Draw Useful Charts