In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
car_sales=pd.read_csv("data/car-sales-extended.csv")
car_sales.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043


In [4]:
X=car_sales.drop("Price",axis=1)
X.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors
0,Honda,White,35431,4
1,BMW,Blue,192714,5
2,Honda,White,84714,4
3,Toyota,White,154365,4
4,Nissan,Blue,181577,3


In [5]:
y=car_sales["Price"]
y.head()

0    15323
1    19943
2    28343
3    13434
4    14043
Name: Price, dtype: int64

In [7]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features=["Make","Colour","Doors"]
one_hot=OneHotEncoder()
transformer=ColumnTransformer([("one_hot",
                               one_hot,
                               categorical_features)],
                               remainder="passthrough")
transformed_X=transformer.fit_transform(X)
transformed_X

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 3.54310e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        1.00000e+00, 1.92714e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 8.47140e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 6.66040e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.15883e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.48360e+05]])

In [9]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(transformed_X,y,test_size=0.2)

In [10]:
from sklearn.ensemble import RandomForestRegressor
model=RandomForestRegressor()

model.fit(X_train,y_train)

RandomForestRegressor()

In [11]:
model.score(X_test,y_test)

0.39746111602854983

In [2]:
# Working with missing data
car_sales_missing = pd.read_csv("data/car-sales-extended-missing-data.csv")
X=car_sales_missing.drop("Price", axis=1)
y=car_sales_missing["Price"]
car_sales_missing.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [13]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features=["Make","Colour","Doors"]
one_hot=OneHotEncoder()
transformer=ColumnTransformer([("one_hot",
                               one_hot,
                               categorical_features)],
                               remainder="passthrough")
transformed_X=transformer.fit_transform(X)
transformed_X

<1000x16 sparse matrix of type '<class 'numpy.float64'>'
	with 4000 stored elements in Compressed Sparse Row format>

In [14]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(transformed_X,y,test_size=0.3)

In [42]:
# Working with missing data - Sklearn way
car_sales_missing = pd.read_csv("data/car-sales-extended-missing-data.csv")
X=car_sales_missing.drop("Price", axis=1)
y=car_sales_missing["Price"]
car_sales_missing.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [43]:
car_sales_missing.dropna(subset=["Price"],inplace=True)
car_sales_missing.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [44]:
X=car_sales_missing.drop("Price",axis=1)
y=car_sales_missing["Price"]

In [45]:
# Best practise - Always split training and test data and work on them individually
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)
X_train.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors
521,Nissan,Red,196130.0,3.0
394,BMW,Blue,85320.0,3.0
47,Toyota,Blue,243969.0,
686,Toyota,,228619.0,4.0
61,Honda,Black,16933.0,4.0


In [46]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

cat_imputer=SimpleImputer(strategy="constant", fill_value="missing")
door_imputer=SimpleImputer(strategy="constant", fill_value=4)
num_imputer=SimpleImputer(strategy="mean")

cat_features=["Make","Colour"]
door_features=["Doors"]
num_features=["Odometer (KM)"]

imputer=ColumnTransformer([
    ("cat_imputer",cat_imputer,cat_features),
    ("door_imputer",door_imputer,door_features),
    ("num_imputer",num_imputer, num_features)
])

filled_X_train=imputer.fit_transform(X_train)
filled_X_test=imputer.fit_transform(X_test)

In [47]:
car_sales_filled_train=pd.DataFrame(filled_X_train,columns=["Make","Colour","Doors","Odometer (KM)"])
car_sales_filled_test=pd.DataFrame(filled_X_test,columns=["Make","Colour","Doors","Odometer (KM)"])
car_sales_filled_train.head(),car_sales_filled_test.head()                                   

(     Make   Colour Doors Odometer (KM)
 0  Nissan      Red   3.0      196130.0
 1     BMW     Blue   3.0       85320.0
 2  Toyota     Blue   4.0      243969.0
 3  Toyota  missing   4.0      228619.0
 4   Honda    Black   4.0       16933.0,
      Make   Colour Doors Odometer (KM)
 0   Honda  missing   4.0      178774.0
 1  Toyota    Black   4.0      132103.0
 2   Honda     Blue   4.0      197664.0
 3  Toyota     Blue   4.0      128016.0
 4  Nissan     Blue   3.0       57558.0)

In [48]:
car_sales_filled_train.isna().sum(), car_sales_filled_test.isna().sum()

(Make             0
 Colour           0
 Doors            0
 Odometer (KM)    0
 dtype: int64,
 Make             0
 Colour           0
 Doors            0
 Odometer (KM)    0
 dtype: int64)

In [49]:
len(car_sales_filled_train), len(car_sales_filled_test)

(760, 190)

In [50]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features=["Make","Colour","Doors"]
one_hot=OneHotEncoder()
transformer=ColumnTransformer([("one_hot",
                               one_hot,
                               categorical_features)],
                               remainder="passthrough")
transformed_X_train=transformer.fit_transform(car_sales_filled_train)
transformed_X_test=transformer.fit_transform(car_sales_filled_test)

In [51]:
from sklearn.ensemble import RandomForestRegressor
model=RandomForestRegressor()

model.fit(transformed_X_train,y_train)

RandomForestRegressor()

In [52]:
model.score(transformed_X_test,y_test)

0.19513083754975025