# Using pandas get_dummies() one hot encoding

In [47]:
import pandas as pd

df = pd.read_csv("car_prices.csv")
df.head()

Unnamed: 0,Car Model,Mileage,Sell Price($),Age(yrs)
0,BMW X5,69000,18000,6
1,BMW X5,35000,34000,3
2,BMW X5,57000,26100,5
3,BMW X5,22500,40000,2
4,BMW X5,46000,31500,4


In [7]:
# car model is categorical so we perform one hot encoding on it
dummies = pd.get_dummies(df["Car Model"])
dummies

Unnamed: 0,Audi A5,BMW X5,Mercedez Benz C class
0,0,1,0
1,0,1,0
2,0,1,0
3,0,1,0
4,0,1,0
5,1,0,0
6,1,0,0
7,1,0,0
8,1,0,0
9,0,0,1


In [11]:
merged = pd.concat([df,dummies], axis = "columns")
merged

Unnamed: 0,Car Model,Mileage,Sell Price($),Age(yrs),Audi A5,BMW X5,Mercedez Benz C class
0,BMW X5,69000,18000,6,0,1,0
1,BMW X5,35000,34000,3,0,1,0
2,BMW X5,57000,26100,5,0,1,0
3,BMW X5,22500,40000,2,0,1,0
4,BMW X5,46000,31500,4,0,1,0
5,Audi A5,59000,29400,5,1,0,0
6,Audi A5,52000,32000,5,1,0,0
7,Audi A5,72000,19300,6,1,0,0
8,Audi A5,91000,12000,8,1,0,0
9,Mercedez Benz C class,67000,22000,6,0,0,1


In [12]:
# dropping one of the dummy columns as a rule of the thumb to prevent multicollinearity
# also dropping the Car Model as we no longer need it
final = merged.drop(["Car Model", "Mercedez Benz C class"], axis = "columns")
final

Unnamed: 0,Mileage,Sell Price($),Age(yrs),Audi A5,BMW X5
0,69000,18000,6,0,1
1,35000,34000,3,0,1
2,57000,26100,5,0,1
3,22500,40000,2,0,1
4,46000,31500,4,0,1
5,59000,29400,5,1,0
6,52000,32000,5,1,0
7,72000,19300,6,1,0
8,91000,12000,8,1,0
9,67000,22000,6,0,0


In [13]:
x = final.drop("Sell Price($)", axis = "columns")
x

Unnamed: 0,Mileage,Age(yrs),Audi A5,BMW X5
0,69000,6,0,1
1,35000,3,0,1
2,57000,5,0,1
3,22500,2,0,1
4,46000,4,0,1
5,59000,5,1,0
6,52000,5,1,0
7,72000,6,1,0
8,91000,8,1,0
9,67000,6,0,0


In [16]:
y = final["Sell Price($)"]
y

0     18000
1     34000
2     26100
3     40000
4     31500
5     29400
6     32000
7     19300
8     12000
9     22000
10    20000
11    21000
12    33000
Name: Sell Price($), dtype: int64

In [17]:
# usiing linear regression model
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [18]:
model.fit(x,y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [20]:
y_pred = model.predict(x)
y_pred

array([18705.2723644 , 35286.78445645, 24479.19112468, 41245.76426391,
       29882.98779056, 28023.6135243 , 30614.46818502, 21879.57266964,
       12182.34562104, 26183.72387884, 18929.31674102, 20409.80511857,
       30477.15426156])

In [25]:
model.score(x,y)

0.9417050937281082

In [29]:
model.predict([[56000,3,1,0]])

array([31798.88706269])

# Using sklearn one hot encoding

In [64]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder()

ohe_car_model = ohe.fit_transform(df[["Car Model"]]).toarray() 
# Gives us three dummy variables/columns as there were 3 unique values in our Car Model column
ohe_car_model = pd.DataFrame(ohe_car_model)

# merge with main df bridge_df on key values
merged_df = pd.concat([df,ohe_car_model], axis = "columns")

# again dropping Car Model column and one of our dummy column (to prevent dummy variable trap/multi collinearity)
merged_df.drop(["Car Model", 0], axis = "columns", inplace = True)
merged_df

Unnamed: 0,Mileage,Sell Price($),Age(yrs),1,2
0,69000,18000,6,1.0,0.0
1,35000,34000,3,1.0,0.0
2,57000,26100,5,1.0,0.0
3,22500,40000,2,1.0,0.0
4,46000,31500,4,1.0,0.0
5,59000,29400,5,0.0,0.0
6,52000,32000,5,0.0,0.0
7,72000,19300,6,0.0,0.0
8,91000,12000,8,0.0,0.0
9,67000,22000,6,0.0,1.0


In [66]:
x = merged_df.drop("Sell Price($)", axis = "columns")
x

Unnamed: 0,Mileage,Age(yrs),1,2
0,69000,6,1.0,0.0
1,35000,3,1.0,0.0
2,57000,5,1.0,0.0
3,22500,2,1.0,0.0
4,46000,4,1.0,0.0
5,59000,5,0.0,0.0
6,52000,5,0.0,0.0
7,72000,6,0.0,0.0
8,91000,8,0.0,0.0
9,67000,6,0.0,1.0


In [67]:
y = merged_df["Sell Price($)"]
y

0     18000
1     34000
2     26100
3     40000
4     31500
5     29400
6     32000
7     19300
8     12000
9     22000
10    20000
11    21000
12    33000
Name: Sell Price($), dtype: int64

In [69]:
model.fit(x,y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [70]:
model.score(x,y)

0.9417050937281082

In [74]:
model.predict([[11111,25,0,1]])

array([21552.85868749])