In [47]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [48]:
# Read the car price dataset from the CSV file (path is relative to the working directory).
car_price = pd.read_csv(filepath_or_buffer='carprices.csv')

In [49]:
# Preview the first four rows.
car_price.head(n=4)

Unnamed: 0,Car Model,Mileage,Sell Price($),Age(yrs)
0,BMW X5,69000,18000,6
1,BMW X5,35000,34000,3
2,BMW X5,57000,26100,5
3,BMW X5,22500,40000,2


In [50]:
# Preview the last four rows.
car_price.tail(n=4)

Unnamed: 0,Car Model,Mileage,Sell Price($),Age(yrs)
9,Mercedez Benz C class,67000,22000,6
10,Mercedez Benz C class,83000,20000,7
11,Mercedez Benz C class,79000,21000,7
12,Mercedez Benz C class,59000,33000,5


## Data Overview

In [51]:
# Summarise the frame: index range, column names, non-null counts and dtypes.
car_price.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13 entries, 0 to 12
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Car Model      13 non-null     object
 1   Mileage        13 non-null     int64 
 2   Sell Price($)  13 non-null     int64 
 3   Age(yrs)       13 non-null     int64 
dtypes: int64(3), object(1)
memory usage: 544.0+ bytes


In [52]:
# List the column labels of the dataset.
car_price.columns

Index(['Car Model', 'Mileage', 'Sell Price($)', 'Age(yrs)'], dtype='object')

In [53]:
# Dataset dimensions as (number of rows, number of columns).
car_price.shape

(13, 4)

In [54]:
# Count the missing values in each column (isna is the canonical name of isnull).
car_price.isna().sum()

Car Model        0
Mileage          0
Sell Price($)    0
Age(yrs)         0
dtype: int64

In [55]:
# Count the rows that are exact duplicates of an earlier row.
car_price.duplicated().sum()

0

In [56]:
# Count the distinct values in each column.
car_price.nunique()

Car Model         3
Mileage          12
Sell Price($)    13
Age(yrs)          7
dtype: int64

 ## Create Dummy Variable
 This converts the strings in the Car Model column into numeric values, because machine learning models only accept numeric inputs during training.

In [57]:
# Build one indicator (dummy) column per distinct value of 'Car Model'.
car_price_dummies = pd.get_dummies(data=car_price['Car Model'])
car_price_dummies

Unnamed: 0,Audi A5,BMW X5,Mercedez Benz C class
0,0,1,0
1,0,1,0
2,0,1,0
3,0,1,0
4,0,1,0
5,1,0,0
6,1,0,0
7,1,0,0
8,1,0,0
9,0,0,1


## Merge The Dummy Variables With The Original Dataset

In [59]:
# Append the dummy columns to the right of the original columns (rows are matched on the index).
merged = pd.concat(objs=[car_price, car_price_dummies], axis=1)
merged

Unnamed: 0,Car Model,Mileage,Sell Price($),Age(yrs),Audi A5,BMW X5,Mercedez Benz C class
0,BMW X5,69000,18000,6,0,1,0
1,BMW X5,35000,34000,3,0,1,0
2,BMW X5,57000,26100,5,0,1,0
3,BMW X5,22500,40000,2,0,1,0
4,BMW X5,46000,31500,4,0,1,0
5,Audi A5,59000,29400,5,1,0,0
6,Audi A5,52000,32000,5,1,0,0
7,Audi A5,72000,19300,6,1,0,0
8,Audi A5,91000,12000,8,1,0,0
9,Mercedez Benz C class,67000,22000,6,0,0,1


Drop the original Car Model column and the Mercedez Benz C class column. Dropping one of the dummy columns avoids the dummy variable trap: the three dummies always sum to 1, so any one of them is fully determined by the other two.

In [13]:
# Remove the text column and one of the three dummy columns; a Mercedez row is then
# the one where both remaining dummies ('Audi A5', 'BMW X5') are 0.
new_car_price = merged.drop(columns=['Car Model', 'Mercedez Benz C class'])
new_car_price.head(n=4)

Unnamed: 0,Mileage,Sell Price($),Age(yrs),Audi A5,BMW X5
0,69000,18000,6,0,1
1,35000,34000,3,0,1
2,57000,26100,5,0,1
3,22500,40000,2,0,1


In [14]:
# Features (independent variables): every column except the target price.
x = new_car_price.drop(columns='Sell Price($)')
x

Unnamed: 0,Mileage,Age(yrs),Audi A5,BMW X5
0,69000,6,0,1
1,35000,3,0,1
2,57000,5,0,1
3,22500,2,0,1
4,46000,4,0,1
5,59000,5,1,0
6,52000,5,1,0
7,72000,6,1,0
8,91000,8,1,0
9,67000,6,0,0


In [15]:
# Target (dependent variable): the selling price column.
y = new_car_price.loc[:, 'Sell Price($)']
y

0     18000
1     34000
2     26100
3     40000
4     31500
5     29400
6     32000
7     19300
8     12000
9     22000
10    20000
11    21000
12    33000
Name: Sell Price($), dtype: int64

In [16]:
# Instantiate an (as yet unfitted) LinearRegression estimator and display it.
model = LinearRegression()
model

LinearRegression()

In [17]:
# Train the model on the underlying NumPy array rather than the DataFrame,
# so the estimator is fitted without feature names.
model.fit(x.to_numpy(), y)

LinearRegression()

In [60]:
# Predicted price of a Mercedez Benz C class that is 4 years old with 45,000 mileage.
# A Mercedez row has both dummy columns set to 0.
model.predict([
    [45000, 4, 0, 0],  # Mileage, Age(yrs), Audi A5, BMW X5
])

array([-1.92743659e+08])

In [61]:
# Re-display the merged frame (original columns plus the three dummy columns) for reference.
merged

Unnamed: 0,Car Model,Mileage,Sell Price($),Age(yrs),Audi A5,BMW X5,Mercedez Benz C class
0,BMW X5,69000,18000,6,0,1,0
1,BMW X5,35000,34000,3,0,1,0
2,BMW X5,57000,26100,5,0,1,0
3,BMW X5,22500,40000,2,0,1,0
4,BMW X5,46000,31500,4,0,1,0
5,Audi A5,59000,29400,5,1,0,0
6,Audi A5,52000,32000,5,1,0,0
7,Audi A5,72000,19300,6,1,0,0
8,Audi A5,91000,12000,8,1,0,0
9,Mercedez Benz C class,67000,22000,6,0,0,1


In [62]:
# Predicted price of a BMW X5 that is 7 years old with 86,000 mileage.
model.predict([
    [86000, 7, 0, 1],  # Mileage, Age(yrs), Audi A5, BMW X5
])

array([-3.68408961e+08])

In [64]:
# Goodness of fit on the training data. For a regressor, score() returns the
# coefficient of determination (R^2), not a classification accuracy.
# The model was fitted on x.values (no feature names), so it is scored on the same
# array form to avoid scikit-learn's feature-name mismatch warning.
model.score(x.values, y)

0.9417050937281083

## Using One-Hot Encoder 
First, use Label Encoder to convert the Car Model strings into numbers, because machine learning models only accept numeric inputs.

In [65]:
# Instantiate a LabelEncoder (used in the next cell to turn the car model strings into integer codes).
LE = LabelEncoder()
LE

LabelEncoder()

In [66]:
# Fit the label encoder and replace the 'Car Model' strings with integer codes.
# Work on a copy: a plain assignment would only alias `car_price`, so the encoding
# would overwrite the original strings and break any earlier cell (e.g. the
# get_dummies cell) that is re-run afterwards.
car_price_LE = car_price.copy()
car_price_LE['Car Model'] = LE.fit_transform(car_price_LE['Car Model'])
car_price_LE

Unnamed: 0,Car Model,Mileage,Sell Price($),Age(yrs)
0,1,69000,18000,6
1,1,35000,34000,3
2,1,57000,26100,5
3,1,22500,40000,2
4,1,46000,31500,4
5,0,59000,29400,5
6,0,52000,32000,5
7,0,72000,19300,6
8,0,91000,12000,8
9,2,67000,22000,6


In [67]:
# Features (independent variables) as a NumPy array, with the label-encoded
# car model in column 0 followed by mileage and age.
x = car_price_LE.loc[:, ['Car Model', 'Mileage', 'Age(yrs)']].to_numpy()
x

array([[    1, 69000,     6],
       [    1, 35000,     3],
       [    1, 57000,     5],
       [    1, 22500,     2],
       [    1, 46000,     4],
       [    0, 59000,     5],
       [    0, 52000,     5],
       [    0, 72000,     6],
       [    0, 91000,     8],
       [    2, 67000,     6],
       [    2, 83000,     7],
       [    2, 79000,     7],
       [    2, 59000,     5]], dtype=int64)

In [69]:
# Target (dependent variable): the selling price column.
y = car_price_LE.loc[:, 'Sell Price($)']
y

0     18000
1     34000
2     26100
3     40000
4     31500
5     29400
6     32000
7     19300
8     12000
9     22000
10    20000
11    21000
12    33000
Name: Sell Price($), dtype: int64

In [70]:
# One-hot encode column 0 (the label-encoded car model); every other column is
# passed through unchanged and placed after the encoded columns.
# OneHotEncoder and ColumnTransformer are imported in the notebook's first cell.
CT = ColumnTransformer([('Car Model', OneHotEncoder(), [0])], remainder = 'passthrough')

In [71]:
# Build the encoded feature matrix from the source frame instead of from `x` itself.
# `x = CT.fit_transform(x)` is not idempotent: re-running it would one-hot encode
# column 0 of the already-encoded matrix. Resulting column order:
# [Car Model code 0, code 1, code 2, Mileage, Age(yrs)].
x = CT.fit_transform(car_price_LE[['Car Model', 'Mileage', 'Age(yrs)']].values)
x

array([[0.00e+00, 1.00e+00, 0.00e+00, 6.90e+04, 6.00e+00],
       [0.00e+00, 1.00e+00, 0.00e+00, 3.50e+04, 3.00e+00],
       [0.00e+00, 1.00e+00, 0.00e+00, 5.70e+04, 5.00e+00],
       [0.00e+00, 1.00e+00, 0.00e+00, 2.25e+04, 2.00e+00],
       [0.00e+00, 1.00e+00, 0.00e+00, 4.60e+04, 4.00e+00],
       [1.00e+00, 0.00e+00, 0.00e+00, 5.90e+04, 5.00e+00],
       [1.00e+00, 0.00e+00, 0.00e+00, 5.20e+04, 5.00e+00],
       [1.00e+00, 0.00e+00, 0.00e+00, 7.20e+04, 6.00e+00],
       [1.00e+00, 0.00e+00, 0.00e+00, 9.10e+04, 8.00e+00],
       [0.00e+00, 0.00e+00, 1.00e+00, 6.70e+04, 6.00e+00],
       [0.00e+00, 0.00e+00, 1.00e+00, 8.30e+04, 7.00e+00],
       [0.00e+00, 0.00e+00, 1.00e+00, 7.90e+04, 7.00e+00],
       [0.00e+00, 0.00e+00, 1.00e+00, 5.90e+04, 5.00e+00]])

In [72]:
# Drop the first one-hot column so only two of the three car model indicators remain.
# NOTE: this reassigns `x` from itself, so each extra execution of this cell strips
# one more leading column - run it exactly once after the encoding cell.
x = x[:, 1:]
x

array([[1.00e+00, 0.00e+00, 6.90e+04, 6.00e+00],
       [1.00e+00, 0.00e+00, 3.50e+04, 3.00e+00],
       [1.00e+00, 0.00e+00, 5.70e+04, 5.00e+00],
       [1.00e+00, 0.00e+00, 2.25e+04, 2.00e+00],
       [1.00e+00, 0.00e+00, 4.60e+04, 4.00e+00],
       [0.00e+00, 0.00e+00, 5.90e+04, 5.00e+00],
       [0.00e+00, 0.00e+00, 5.20e+04, 5.00e+00],
       [0.00e+00, 0.00e+00, 7.20e+04, 6.00e+00],
       [0.00e+00, 0.00e+00, 9.10e+04, 8.00e+00],
       [0.00e+00, 1.00e+00, 6.70e+04, 6.00e+00],
       [0.00e+00, 1.00e+00, 8.30e+04, 7.00e+00],
       [0.00e+00, 1.00e+00, 7.90e+04, 7.00e+00],
       [0.00e+00, 1.00e+00, 5.90e+04, 5.00e+00]])

In [40]:
# Refit the existing `model` object on the one-hot encoded features; this replaces
# the coefficients learned from the earlier fit.
model.fit(X=x, y=y)

LinearRegression()

In [41]:
# Predicted price of a Mercedez Benz C class that is 4 years old with 45,000 mileage.
model.predict([
    [0, 1, 45000, 4],  # BMW X5, Mercedez Benz C class, Mileage, Age(yrs)
])

array([36991.31721063])

In [42]:
# Predicted price of a BMW X5 that is 7 years old with 86,000 mileage.
model.predict([
    [1, 0, 86000, 7],  # BMW X5, Mercedez Benz C class, Mileage, Age(yrs)
])

array([11080.74313217])

In [43]:
# Goodness of fit on the training data: score() returns the coefficient of
# determination (R^2) for a regressor.
model.score(X=x, y=y)

0.9417050937281083