In [1]:
# LIBRARIES
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load used_car_clean.csv into a DataFrame; the path is relative to the
# notebook's working directory.
df = pd.read_csv('used_car_clean.csv')

In [3]:
# Preview the first rows to check the column names and sample values.
df.head()

Unnamed: 0,brand,year,km_driven,fuel,seller_type,transmission,owner,mileage(kmpl),engine(cc),max_power(bhp),seats,selling_price
0,Maruti,2014,145500,Diesel,Individual,Manual,First Owner,23.4,1248.0,74.0,5.0,450000
1,Skoda,2014,120000,Diesel,Individual,Manual,Second Owner,21.14,1498.0,103.52,5.0,370000
2,Honda,2006,140000,Petrol,Individual,Manual,Third Owner,17.7,1497.0,78.0,5.0,158000
3,Hyundai,2010,127000,Diesel,Individual,Manual,First Owner,23.0,1396.0,90.0,5.0,225000
4,Maruti,2007,120000,Petrol,Individual,Manual,First Owner,16.1,1298.0,88.2,5.0,130000


In [4]:
# Cardinality of brand, followed by the distinct labels of every other
# categorical column (these drive the encoding choices further down).
print('Brand count: ', df['brand'].nunique())
for label, column in (
    ('Fuel: ', 'fuel'),
    ('Seller: ', 'seller_type'),
    ('Transmission: ', 'transmission'),
    ('Owner: ', 'owner'),
):
    print(label, df[column].unique())

Brand count:  31
Fuel:  ['Diesel' 'Petrol' 'LPG' 'CNG']
Seller:  ['Individual' 'Dealer' 'Trustmark Dealer']
Transmission:  ['Manual' 'Automatic']
Owner:  ['First Owner' 'Second Owner' 'Third Owner' 'Fourth & Above Owner'
 'Test Drive Car']


In [5]:
# Count (frequency) encoding for brand, used instead of one-hot encoding:
# each brand label is replaced by the number of rows carrying that brand.
# brand_mapping is kept so an encoded frequency can be traced back to its brand.
# NOTE(review): the counts are taken over the whole frame, i.e. before the
# train/test split performed later in the notebook.
brand_counts = df['brand'].value_counts()
brand_mapping = brand_counts.to_dict()
df['brand_frequency'] = df['brand'].map(brand_mapping)

# The raw label column is no longer needed once brand_frequency exists.
df = df.drop(columns=['brand'])

In [6]:
# Ordinal encoding of owner, seller_type and transmission.
# The integer ranks are the author's choice; per the original note they follow
# selling_price (e.g. first-owner cars have the second-highest selling_price).
# NOTE(review): confirm that the seller_type ranks follow the same rationale.
ordinal_codes = {
    'owner': {
        'First Owner': 1,
        'Second Owner': 2,
        'Third Owner': 3,
        'Fourth & Above Owner': 4,
        'Test Drive Car': 0,
    },
    'seller_type': {
        'Dealer': 2,
        'Individual': 0,
        'Trustmark Dealer': 1,
    },
    'transmission': {
        'Manual': 0,
        'Automatic': 1,
    },
}

# Assign the encoded column back to the frame instead of calling
# `df[col].replace(..., inplace=True)`: that form mutates a Series selected
# out of the frame, which pandas warns about and which no longer updates `df`
# once Copy-on-Write is active. Labels missing from a mapping are passed
# through unchanged (same as `replace`), so re-running this cell is harmless.
for column, codes in ordinal_codes.items():
    df[column] = df[column].map(lambda label, codes=codes: codes.get(label, label))


In [7]:
# fuel is treated as nominal (no obvious ordering with respect to
# selling_price), so it is one-hot encoded; revisit if that proves untrue.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Encode only 'fuel'; every other column is passed through unchanged.
fuel_step = ('encoder', OneHotEncoder(), ['fuel'])
ct = ColumnTransformer(transformers=[fuel_step], remainder='passthrough')

# After this line df holds the transformer's output, not the labelled
# DataFrame; it is wrapped back into a DataFrame in the next cell.
df = ct.fit_transform(df)

In [12]:
# Wrap the transformer output back into a DataFrame. No column names are
# supplied, so the columns are labelled by integer position.
df = pd.DataFrame(df)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,0.0,1.0,0.0,0.0,2014.0,145500.0,0.0,0.0,1.0,23.4,1248.0,74.0,5.0,450000.0,2448.0
1,0.0,1.0,0.0,0.0,2014.0,120000.0,0.0,0.0,2.0,21.14,1498.0,103.52,5.0,370000.0,105.0
2,0.0,0.0,0.0,1.0,2006.0,140000.0,0.0,0.0,3.0,17.7,1497.0,78.0,5.0,158000.0,467.0
3,0.0,1.0,0.0,0.0,2010.0,127000.0,0.0,0.0,1.0,23.0,1396.0,90.0,5.0,225000.0,1415.0
4,0.0,0.0,0.0,1.0,2007.0,120000.0,0.0,0.0,1.0,16.1,1298.0,88.2,5.0,130000.0,2448.0


In [13]:
# Drop column 0 (one of the fuel dummies) to avoid the dummy variable trap.
df = df.drop(columns=[0])

# Keep columns 1..12 in place and swap the last two so that selling_price
# (column 13) ends up as the final column, after column 14.
new_columns = [*range(1, 13), 14, 13]
df = df[new_columns]
df.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,14,13
0,1.0,0.0,0.0,2014.0,145500.0,0.0,0.0,1.0,23.4,1248.0,74.0,5.0,2448.0,450000.0
1,1.0,0.0,0.0,2014.0,120000.0,0.0,0.0,2.0,21.14,1498.0,103.52,5.0,105.0,370000.0
2,0.0,0.0,1.0,2006.0,140000.0,0.0,0.0,3.0,17.7,1497.0,78.0,5.0,467.0,158000.0
3,1.0,0.0,0.0,2010.0,127000.0,0.0,0.0,1.0,23.0,1396.0,90.0,5.0,1415.0,225000.0
4,0.0,0.0,1.0,2007.0,120000.0,0.0,0.0,1.0,16.1,1298.0,88.2,5.0,2448.0,130000.0


In [14]:
# Preprocessing before prediction: the last column is the target and
# everything before it is a feature.
n_features = df.shape[1] - 1
x = df.iloc[:, :n_features].values  # independent variables
y = df.iloc[:, n_features].values   # dependent variable

In [15]:
# Split into training and test sets: 20% is held out for testing, with a
# fixed seed so the split is reproducible.
from sklearn.model_selection import train_test_split

split_options = {'test_size': 0.2, 'random_state': 1}
x_train, x_test, y_train, y_test = train_test_split(x, y, **split_options)

In [16]:
# Standardise the features. The scaler is fitted on the training rows only
# and then applied, with those same statistics, to both splits.
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
sc.fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

In [17]:
# Linear regression: fit on the training split, then report the score
# returned by .score() on the held-out test split.
from sklearn.linear_model import LinearRegression

regressor = LinearRegression().fit(x_train, y_train)
regressor.score(x_test, y_test)

0.6498252517558168

In [18]:
# Decision tree regressor with a fixed seed, fitted and scored on the same
# train/test split as the linear model.
from sklearn.tree import DecisionTreeRegressor

dt_regressor = DecisionTreeRegressor(random_state=0).fit(x_train, y_train)
dt_regressor.score(x_test, y_test)

0.9462146068079729

In [19]:
# Random forest regressor: a small ensemble of 5 trees with a fixed seed,
# fitted and scored on the same train/test split as the other models.
from sklearn.ensemble import RandomForestRegressor

rf_regressor = RandomForestRegressor(n_estimators=5, random_state=69).fit(x_train, y_train)
rf_regressor.score(x_test, y_test)

0.9545612075671815