In [20]:
# LIBRARIES
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [21]:
# Load the pre-cleaned car listings; the path is relative to the notebook's working directory.
DATA_PATH = 'cleaned_car_details_v3.csv'
df = pd.read_csv(DATA_PATH)

In [22]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,brand,year,km_driven,fuel,seller_type,transmission,owner,mileage(kmpl),engine(cc),max_power(bhp),seats,selling_price
0,Maruti,2014,145500,Diesel,Individual,Manual,First Owner,23.4,1248.0,74.0,5.0,450000
1,Skoda,2014,120000,Diesel,Individual,Manual,Second Owner,21.14,1498.0,103.52,5.0,370000
2,Honda,2006,140000,Petrol,Individual,Manual,Third Owner,17.7,1497.0,78.0,5.0,158000
3,Hyundai,2010,127000,Diesel,Individual,Manual,First Owner,23.0,1396.0,90.0,5.0,225000
4,Maruti,2007,120000,Petrol,Individual,Manual,First Owner,16.1,1298.0,88.2,5.0,130000


In [23]:
# Summarise column dtypes and non-null counts for the loaded frame.
df.info()
# Every column reports the same non-null count as the number of rows,
# i.e. there are no missing values left to handle.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8127 entries, 0 to 8126
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   brand           8127 non-null   object 
 1   year            8127 non-null   int64  
 2   km_driven       8127 non-null   int64  
 3   fuel            8127 non-null   object 
 4   seller_type     8127 non-null   object 
 5   transmission    8127 non-null   object 
 6   owner           8127 non-null   object 
 7   mileage(kmpl)   8127 non-null   float64
 8   engine(cc)      8127 non-null   float64
 9   max_power(bhp)  8127 non-null   float64
 10  seats           8127 non-null   float64
 11  selling_price   8127 non-null   int64  
dtypes: float64(4), int64(3), object(5)
memory usage: 762.0+ KB


#### Inspect the categorical (non-numerical) features with unique() to decide how to encode them

In [24]:
# Brand is high-cardinality, so also report how many distinct brands there are.
brand_series = df['brand']
print('Brands: ', brand_series.unique(), brand_series.nunique())

# The remaining categoricals only need their distinct levels listed.
level_reports = (
    ('Fuel: ', 'fuel'),
    ('Seller: ', 'seller_type'),
    ('Transmission: ', 'transmission'),
    ('Owner(ordinal variable): ', 'owner'),
)
for label, column in level_reports:
    print(label, df[column].unique())

Brands:  ['Maruti' 'Skoda' 'Honda' 'Hyundai' 'Toyota' 'Ford' 'Renault' 'Mahindra'
 'Tata' 'Chevrolet' 'Fiat' 'Datsun' 'Jeep' 'Mercedes-Benz' 'Mitsubishi'
 'Audi' 'Volkswagen' 'BMW' 'Nissan' 'Lexus' 'Jaguar' 'Land' 'MG' 'Volvo'
 'Daewoo' 'Kia' 'Force' 'Ambassador' 'Ashok' 'Isuzu' 'Opel'] 31
Fuel:  ['Diesel' 'Petrol' 'LPG' 'CNG']
Seller:  ['Individual' 'Dealer' 'Trustmark Dealer']
Transmission:  ['Manual' 'Automatic']
Owner(ordinal variable):  ['First Owner' 'Second Owner' 'Third Owner' 'Fourth & Above Owner'
 'Test Drive Car']


In [25]:
# Frequency-encode `brand`: each row gets the number of listings sharing its brand.
# NOTE(review): the counts are computed on the full dataset, i.e. before the
# train/test split further down — confirm this is acceptable for the evaluation.
brand_counts = df['brand'].value_counts()
brand_mapping = brand_counts.to_dict()
df['brand_frequency'] = df['brand'].map(brand_mapping)

In [26]:
# Integer codes for the low-cardinality categorical columns.
# The numeric values are kept exactly as originally chosen.
category_codes = {
    'owner': {
        'First Owner': 1,
        'Second Owner': 2,
        'Third Owner': 3,
        'Fourth & Above Owner': 4,
        'Test Drive Car': 0,
    },
    'seller_type': {
        'Dealer': 2,
        'Individual': 0,
        'Trustmark Dealer': 1,
    },
    'transmission': {
        'Manual': 0,
        'Automatic': 1,
    },
}

for column, codes in category_codes.items():
    # Assign the encoded column back to the frame instead of calling
    # `df[column].replace(..., inplace=True)`: that form mutates a temporary
    # Series, warns in pandas 2.x and leaves `df` unchanged under Copy-on-Write.
    # Values that are not keys of `codes` (e.g. integers from a previous run of
    # this cell) pass through untouched, so the cell can be re-run safely.
    encoded = df[column].map(lambda value: codes.get(value, value))
    # Explicit cast: guarantees an integer column and fails loudly if an
    # unexpected category label is still present.
    df[column] = encoded.astype('int64')


In [27]:
# Check the first five rows after the integer encodings were applied.
df.head(n=5)

Unnamed: 0,brand,year,km_driven,fuel,seller_type,transmission,owner,mileage(kmpl),engine(cc),max_power(bhp),seats,selling_price,brand_frequency
0,Maruti,2014,145500,Diesel,0,0,1,23.4,1248.0,74.0,5.0,450000,2448
1,Skoda,2014,120000,Diesel,0,0,2,21.14,1498.0,103.52,5.0,370000,105
2,Honda,2006,140000,Petrol,0,0,3,17.7,1497.0,78.0,5.0,158000,467
3,Hyundai,2010,127000,Diesel,0,0,1,23.0,1396.0,90.0,5.0,225000,1415
4,Maruti,2007,120000,Petrol,0,0,1,16.1,1298.0,88.2,5.0,130000,2448


In [28]:
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the nominal `fuel` column; all other columns are passed through as-is.
fuel_step = ('encoder', OneHotEncoder(), ['fuel'])
ct = ColumnTransformer(
    transformers=[fuel_step],
    remainder='passthrough',
)

# The transformer output replaces `df`; it is re-wrapped in a DataFrame in the next cell,
# where the one-hot columns come first and the column labels are positional.
df = ct.fit_transform(df)

In [29]:
# Re-wrap the encoder output in a DataFrame; columns are now labelled by position.
# The transformed array holds brand strings next to numbers, so every column would
# otherwise be stored with dtype `object`. `infer_objects()` restores a proper
# numeric dtype for each column that contains only numbers (values are unchanged).
df = pd.DataFrame(data=df).infer_objects()
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,0.0,1.0,0.0,0.0,Maruti,2014,145500,0,0,1,23.4,1248.0,74.0,5.0,450000,2448
1,0.0,1.0,0.0,0.0,Skoda,2014,120000,0,0,2,21.14,1498.0,103.52,5.0,370000,105
2,0.0,0.0,0.0,1.0,Honda,2006,140000,0,0,3,17.7,1497.0,78.0,5.0,158000,467
3,0.0,1.0,0.0,0.0,Hyundai,2010,127000,0,0,1,23.0,1396.0,90.0,5.0,225000,1415
4,0.0,0.0,0.0,1.0,Maruti,2007,120000,0,0,1,16.1,1298.0,88.2,5.0,130000,2448


In [30]:
# Column 0: first fuel dummy, removed to avoid the dummy-variable trap.
# Column 4: raw brand name, superseded by the brand frequency stored in column 15.
df.drop(columns=[0, 4], inplace=True)

In [31]:
# Keep the feature columns in their current order and move column 14 (selling price,
# the target) to the last position so it can be sliced off with iloc later.
feature_columns = [1, 2, 3, *range(5, 14), 15]
new_columns = feature_columns + [14]
df = df[new_columns]
df

Unnamed: 0,1,2,3,5,6,7,8,9,10,11,12,13,15,14
0,1.0,0.0,0.0,2014,145500,0,0,1,23.4,1248.0,74.0,5.0,2448,450000
1,1.0,0.0,0.0,2014,120000,0,0,2,21.14,1498.0,103.52,5.0,105,370000
2,0.0,0.0,1.0,2006,140000,0,0,3,17.7,1497.0,78.0,5.0,467,158000
3,1.0,0.0,0.0,2010,127000,0,0,1,23.0,1396.0,90.0,5.0,1415,225000
4,0.0,0.0,1.0,2007,120000,0,0,1,16.1,1298.0,88.2,5.0,2448,130000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8122,0.0,0.0,1.0,2013,110000,0,0,1,18.5,1197.0,82.85,5.0,1415,320000
8123,1.0,0.0,0.0,2007,119000,0,0,4,16.8,1493.0,110.0,5.0,1415,135000
8124,1.0,0.0,0.0,2009,120000,0,0,1,19.3,1248.0,73.9,5.0,2448,382000
8125,1.0,0.0,0.0,2013,25000,0,0,1,23.57,1396.0,70.0,5.0,734,290000


##### Before model training: split the dataset and apply feature scaling

In [32]:
# Every column except the last is a feature; the last column is the target.
feature_frame = df.iloc[:, :-1]
target_series = df.iloc[:, -1]

x = feature_frame.to_numpy()   # independent variables
y = target_series.to_numpy()   # dependent variable

In [33]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
from sklearn.model_selection import train_test_split

split_parts = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)
x_train, x_test, y_train, y_test = split_parts

In [34]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training rows only, then apply the same
# transformation to both the training and the test features.
sc = StandardScaler()
sc.fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

#### Predictions

In [43]:
# Baseline model: ordinary least-squares linear regression.
from sklearn.linear_model import LinearRegression

regressor = LinearRegression().fit(x_train, y_train)
# `score` reports the coefficient of determination (R^2) on the held-out test set.
regressor.score(x_test, y_test)

0.6498252517558166

In [46]:
# Single decision tree; the seed fixes tie-breaking so the fit is reproducible.
from sklearn.tree import DecisionTreeRegressor

dt_regressor = DecisionTreeRegressor(random_state=0).fit(x_train, y_train)
# R^2 on the held-out test set.
dt_regressor.score(x_test, y_test)

0.9462146068079729

In [48]:
# Small random forest (5 trees) with a fixed seed for reproducibility.
from sklearn.ensemble import RandomForestRegressor

rf_regressor = RandomForestRegressor(n_estimators=5, random_state=69).fit(x_train, y_train)
# R^2 on the held-out test set.
rf_regressor.score(x_test, y_test)

0.9545612075671815

In [None]:
# Disabled export of the processed frame to CSV; uncomment the line below to write the file.
# NOTE(review): the file name says "classif" although this notebook fits regressors —
# confirm the intended name before enabling.
#df.to_csv('very_clean_data_for_classif.csv', index = False)