In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
cars = pd.read_csv('car-sales-extended.csv')
cars.head(10)

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043
5,Honda,Red,42652,4,23883
6,Toyota,Blue,163453,4,8473
7,Honda,White,43120,4,20306
8,Nissan,White,130538,4,9374
9,Honda,Blue,51029,4,26683


In [3]:
len(cars)

1000

In [4]:
cars.dtypes

Make             object
Colour           object
Odometer (KM)     int64
Doors             int64
Price             int64
dtype: object

In [9]:
## Splitting the data
X = cars.drop('Price',axis=1)

Y = cars['Price']

In [10]:
X

Unnamed: 0,Make,Colour,Odometer (KM),Doors
0,Honda,White,35431,4
1,BMW,Blue,192714,5
2,Honda,White,84714,4
3,Toyota,White,154365,4
4,Nissan,Blue,181577,3
...,...,...,...,...
995,Toyota,Black,35820,4
996,Nissan,White,155144,3
997,Nissan,Blue,66604,4
998,Honda,White,215883,4


In [11]:
Y

0      15323
1      19943
2      28343
3      13434
4      14043
       ...  
995    32042
996     5716
997    31570
998     4001
999    12732
Name: Price, Length: 1000, dtype: int64

In [12]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size = 0.2)

In [13]:
x_train.shape,x_test.shape,y_train.shape,y_test.shape

((800, 4), (200, 4), (800,), (200,))

In [14]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()
model.fit(x_train,y_train)
model.score(x_test,y_test)



ValueError: could not convert string to float: 'Toyota'

In [16]:
cars['Make'].value_counts()

Toyota    398
Honda     304
Nissan    198
BMW       100
Name: Make, dtype: int64

In [17]:
cars['Colour'].value_counts()

White    407
Blue     321
Black     99
Red       94
Green     79
Name: Colour, dtype: int64

In [18]:
cars['Doors'].value_counts()

4    856
5     79
3     65
Name: Doors, dtype: int64

In [20]:
## we are going to use one hot encoding for above columns especially the first two cols because they are not numerical so
## our model cannot fit into the data

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_cols = ['Make','Colour','Doors']
one_hot = OneHotEncoder()
transformer = ColumnTransformer([
    ("one_hot",
     one_hot,
     categorical_cols
    )],
    remainder = 'passthrough'
)

transformed_X = transformer.fit_transform(X)
transformed_X

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 3.54310e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        1.00000e+00, 1.92714e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 8.47140e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 6.66040e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.15883e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.48360e+05]])

In [22]:
tr_X = pd.DataFrame(transformed_X)
tr_X


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,35431.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,192714.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,84714.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,154365.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,181577.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,35820.0
996,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,155144.0
997,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,66604.0
998,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,215883.0


In [24]:
## we have 12 cols for encoded cols because : 4(Make types) + 5(Colour types) + 3(Doors types)

dummies = pd.get_dummies(cars[categorical_cols])
dummies

Unnamed: 0,Doors,Make_BMW,Make_Honda,Make_Nissan,Make_Toyota,Colour_Black,Colour_Blue,Colour_Green,Colour_Red,Colour_White
0,4,0,1,0,0,0,0,0,0,1
1,5,1,0,0,0,0,1,0,0,0
2,4,0,1,0,0,0,0,0,0,1
3,4,0,0,0,1,0,0,0,0,1
4,3,0,0,1,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...
995,4,0,0,0,1,1,0,0,0,0
996,3,0,0,1,0,0,0,0,0,1
997,4,0,0,1,0,0,1,0,0,0
998,4,0,1,0,0,0,0,0,0,1


In [25]:
## fit the model with new form of data

x_train,x_test,y_train,y_test = train_test_split(tr_X,Y,test_size=0.2)
model.fit(x_train,y_train)

RandomForestRegressor()

In [26]:
model.score(x_test,y_test)

0.2956125506860936

In [28]:
y_pred = model.predict(x_test)
y_pred

array([16674.71, 30572.4 ,  8130.26, 13888.99, 11666.21, 10199.78,
       10663.34, 20570.65, 14130.84, 16461.64, 15023.59, 11999.32,
       14699.62, 10852.74, 25702.95,  6194.38, 19416.56, 14690.86,
       10124.27, 14861.86, 11899.63, 32260.66, 39041.46, 16639.5 ,
       25854.77, 10443.62, 10971.96, 21614.56, 14734.19, 23215.1 ,
        7527.64, 12323.08,  9695.97,  9854.92, 23626.34,  9433.57,
       38077.83, 19775.41, 31675.06,  8020.26, 19676.74, 12265.87,
       20031.63, 11654.24, 15536.55, 11291.86,  7284.44, 10406.07,
       14090.19, 16744.53, 22390.6 , 22199.78, 12634.14, 14369.43,
       10741.42, 18272.36, 14933.82, 22414.19, 18620.12, 12418.37,
       16409.59, 22557.57,  6245.98, 16418.66, 19548.14, 21459.27,
       12370.29, 13178.47, 10155.25, 12911.  , 28872.25, 12915.27,
        9405.3 , 19224.65, 10026.7 , 13513.32, 18795.06, 11311.69,
       19836.68, 22145.88, 20993.37, 10799.96,  9448.73,  9818.87,
       13751.66, 22157.23, 10734.01, 17425.36, 17954.89, 11915

In [30]:
y_test

997    31570
294    33353
372    15047
876    19832
517    10663
       ...  
121     3944
616    16320
620    19408
880     4606
779    19545
Name: Price, Length: 200, dtype: int64

In [31]:
x_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
997,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,66604.0
294,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,20985.0
372,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,178774.0
876,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,124924.0
517,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,207048.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
121,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,206073.0
616,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,76610.0
620,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,26531.0
880,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,200219.0
