In [37]:
#topics: Categorical, Dummy Variables, One Hot Encoding

import math
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [2]:
# Load the home-price data; the displayed frame has town, area and price columns.
df = pd.read_csv('homeprices3.csv')
df

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000
5,west windsor,2600,585000
6,west windsor,2800,615000
7,west windsor,3300,650000
8,west windsor,3600,710000
9,robinsville,2600,575000


In [7]:
#Categorical Vars: Nominal (like red, green, blue) vs Ordinal (Graduate, Masters, PhD)

#For Nominal, we do one hot encoding: one boolean column per distinct town
dummies = pd.get_dummies(df['town'])
dummies

Unnamed: 0,monroe township,robinsville,west windsor
0,True,False,False
1,True,False,False
2,True,False,False
3,True,False,False
4,True,False,False
5,False,False,True
6,False,False,True
7,False,False,True
8,False,False,True
9,False,True,False


In [8]:
# Place the dummy columns to the right of the original columns (column-wise concat).
merged = pd.concat([df, dummies], axis=1)
merged

Unnamed: 0,town,area,price,monroe township,robinsville,west windsor
0,monroe township,2600,550000,True,False,False
1,monroe township,3000,565000,True,False,False
2,monroe township,3200,610000,True,False,False
3,monroe township,3600,680000,True,False,False
4,monroe township,4000,725000,True,False,False
5,west windsor,2600,585000,False,False,True
6,west windsor,2800,615000,False,False,True
7,west windsor,3300,650000,False,False,True
8,west windsor,3600,710000,False,False,True
9,robinsville,2600,575000,False,True,False


In [12]:
# 'town' is redundant now that the dummy columns carry the same information.
# One dummy column ('west windsor') is dropped too: with three towns, two dummy
# columns are enough, because a row that is False in both remaining columns must
# be west windsor. Keeping all three would make the dummy columns perfectly
# collinear (the "dummy variable trap").
final = merged.drop(columns=['town', 'west windsor'])

final

Unnamed: 0,area,price,monroe township,robinsville
0,2600,550000,True,False
1,3000,565000,True,False
2,3200,610000,True,False
3,3600,680000,True,False
4,4000,725000,True,False
5,2600,585000,False,False
6,2800,615000,False,False
7,3300,650000,False,False
8,3600,710000,False,False
9,2600,575000,False,True


In [13]:
#creating ml model
# Feature matrix for the model: every column except the target 'price'.
X = final.drop(columns='price')
X

Unnamed: 0,area,monroe township,robinsville
0,2600,True,False
1,3000,True,False
2,3200,True,False
3,3600,True,False
4,4000,True,False
5,2600,False,False
6,2800,False,False
7,3300,False,False
8,3600,False,False
9,2600,False,True


In [14]:
# Target: the 'price' column.
y = final['price']
y

0     550000
1     565000
2     610000
3     680000
4     725000
5     585000
6     615000
7     650000
8     710000
9     575000
10    600000
11    620000
12    695000
Name: price, dtype: int64

In [22]:
# Unfitted linear regression estimator; it is fitted in the next cell.
model = LinearRegression()

In [23]:
# Fit on area plus the two remaining town dummy columns.
model.fit(X, y)

In [24]:
# Predict a 3400 sq ft home in west windsor (both remaining town dummies False).
# The model was fitted on a DataFrame, so the query is passed as a DataFrame with
# the same column names: this keeps the feature order (area, monroe township,
# robinsville) explicit and avoids sklearn's "X does not have valid feature
# names" warning that a bare list triggers.
model.predict(pd.DataFrame([[3400, False, False]], columns=X.columns))



array([681241.66845839])

In [25]:
model.score(X, y) #R^2 (coefficient of determination) on the training data, not a held-out accuracy percentage

0.9573929037221873

In [46]:
#Second strategy for one hot encoding: sklearn's LabelEncoder followed by OneHotEncoder
le = LabelEncoder()

In [47]:
# Work on a copy. A plain `dfle = df` only binds a second name to the same
# DataFrame, so encoding the town column would also overwrite the town names in
# `df`, and re-running the earlier get_dummies cells would then break.
dfle = df.copy()
dfle['town'] = le.fit_transform(dfle['town']) #changes that column to numbers for labels
dfle

Unnamed: 0,town,area,price
0,0,2600,550000
1,0,3000,565000
2,0,3200,610000
3,0,3600,680000
4,0,4000,725000
5,2,2600,585000
6,2,2800,615000
7,2,3300,650000
8,2,3600,710000
9,1,2600,575000


In [48]:
# Build the feature array from the label-encoded frame itself. Reading from `df`
# only yields encoded town codes when df and dfle happen to be the same object.
X = dfle[['town', 'area']].values #.values gives a 2-D NumPy array: [town code, area]
X

array([[   0, 2600],
       [   0, 3000],
       [   0, 3200],
       [   0, 3600],
       [   0, 4000],
       [   2, 2600],
       [   2, 2800],
       [   2, 3300],
       [   2, 3600],
       [   1, 2600],
       [   1, 2900],
       [   1, 3100],
       [   1, 3600]], dtype=int64)

In [49]:
# Target for the second approach: the price column of the encoded frame.
y = dfle.price
y

0     550000
1     565000
2     610000
3     680000
4     725000
5     585000
6     615000
7     650000
8     710000
9     575000
10    600000
11    620000
12    695000
Name: price, dtype: int64

In [50]:
#one-hot encode column index 0 (the town code); remainder='passthrough' keeps the other column (area) as is
ct = ColumnTransformer([('town',OneHotEncoder(),[0])], remainder='passthrough')

In [51]:
# NOTE: this overwrites X, so re-running this cell on its own would encode the
# already-encoded array; re-run the cell that builds X first.
X = ct.fit_transform(X)
X #now converted to dummy columns like the get_dummies result earlier

array([[1.0e+00, 0.0e+00, 0.0e+00, 2.6e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.0e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.2e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.6e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 4.0e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 2.6e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 2.8e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.3e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.6e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 2.6e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 2.9e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 3.1e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 3.6e+03]])

In [52]:
# NOTE: re-running this cell on its own would drop one more column each time.
X = X[:, 1:] #keep every row, drop the first column (the dummy for town code 0)
X #same idea as dropping one dummy column in the get_dummies example

array([[0.0e+00, 0.0e+00, 2.6e+03],
       [0.0e+00, 0.0e+00, 3.0e+03],
       [0.0e+00, 0.0e+00, 3.2e+03],
       [0.0e+00, 0.0e+00, 3.6e+03],
       [0.0e+00, 0.0e+00, 4.0e+03],
       [0.0e+00, 1.0e+00, 2.6e+03],
       [0.0e+00, 1.0e+00, 2.8e+03],
       [0.0e+00, 1.0e+00, 3.3e+03],
       [0.0e+00, 1.0e+00, 3.6e+03],
       [1.0e+00, 0.0e+00, 2.6e+03],
       [1.0e+00, 0.0e+00, 2.9e+03],
       [1.0e+00, 0.0e+00, 3.1e+03],
       [1.0e+00, 0.0e+00, 3.6e+03]])

In [53]:
# Refit the same LinearRegression object, now on the one-hot encoded array.
model.fit(X, y)

In [54]:
model.predict([[0, 1, 3000]]) #predict west windsor at 3000; columns are [town code 1 dummy, town code 2 dummy, area]

array([630482.69189422])

In [56]:
model.score(X, y) #R^2 on the training data

0.9573929037221874

In [57]:
#exercise: repeat the one hot encoding workflow on the car price data
df2 = pd.read_csv('carprices.csv')

In [58]:
# Display the raw car data.
df2

Unnamed: 0,Car Model,Mileage,Sell Price($),Age(yrs)
0,BMW X5,69000,18000,6
1,BMW X5,35000,34000,3
2,BMW X5,57000,26100,5
3,BMW X5,22500,40000,2
4,BMW X5,46000,31500,4
5,Audi A5,59000,29400,5
6,Audi A5,52000,32000,5
7,Audi A5,72000,19300,6
8,Audi A5,91000,12000,8
9,Mercedez Benz C class,67000,22000,6


In [60]:
# One boolean dummy column per distinct value of 'Car Model'.
ohe2 = pd.get_dummies(df2['Car Model'])
ohe2

Unnamed: 0,Audi A5,BMW X5,Mercedez Benz C class
0,False,True,False
1,False,True,False
2,False,True,False
3,False,True,False
4,False,True,False
5,True,False,False
6,True,False,False
7,True,False,False
8,True,False,False
9,False,False,True


In [62]:
# Column-wise concat: dummy columns first, then the original columns.
new_df2 = pd.concat([ohe2, df2], axis=1)
new_df2

Unnamed: 0,Audi A5,BMW X5,Mercedez Benz C class,Car Model,Mileage,Sell Price($),Age(yrs)
0,False,True,False,BMW X5,69000,18000,6
1,False,True,False,BMW X5,35000,34000,3
2,False,True,False,BMW X5,57000,26100,5
3,False,True,False,BMW X5,22500,40000,2
4,False,True,False,BMW X5,46000,31500,4
5,True,False,False,Audi A5,59000,29400,5
6,True,False,False,Audi A5,52000,32000,5
7,True,False,False,Audi A5,72000,19300,6
8,True,False,False,Audi A5,91000,12000,8
9,False,False,True,Mercedez Benz C class,67000,22000,6


In [64]:
# Drop the original text column and one dummy column; a row that is False for
# both remaining model dummies is the Mercedez Benz C class.
new_df2 = new_df2.drop(columns=['Car Model', 'Mercedez Benz C class'])
new_df2

Unnamed: 0,Audi A5,BMW X5,Mileage,Sell Price($),Age(yrs)
0,False,True,69000,18000,6
1,False,True,35000,34000,3
2,False,True,57000,26100,5
3,False,True,22500,40000,2
4,False,True,46000,31500,4
5,True,False,59000,29400,5
6,True,False,52000,32000,5
7,True,False,72000,19300,6
8,True,False,91000,12000,8
9,False,False,67000,22000,6


In [71]:
# Split into target (sell price) and features (model dummies, mileage, age).
car_y = new_df2['Sell Price($)']
car_X = new_df2.drop(columns=['Sell Price($)'])
car_X

Unnamed: 0,Audi A5,BMW X5,Mileage,Age(yrs)
0,False,True,69000,6
1,False,True,35000,3
2,False,True,57000,5
3,False,True,22500,2
4,False,True,46000,4
5,True,False,59000,5
6,True,False,52000,5
7,True,False,72000,6
8,True,False,91000,8
9,False,False,67000,6


In [70]:
# Separate linear regression for the car data, fitted on car_X / car_y.
model2 = LinearRegression()
model2.fit(car_X, car_y)

In [73]:
# Predict the sell price of a BMW X5 with Mileage 86000 and Age 7 yrs.
# The query is passed as a DataFrame with the training column names (Audi A5,
# BMW X5, Mileage, Age(yrs)) so the feature order is explicit and sklearn does
# not warn about missing feature names.
model2.predict(pd.DataFrame([[False, True, 86000, 7]], columns=car_X.columns))



array([11080.74313219])

In [75]:
model2.score(car_X, car_y) #R^2 on the training data

0.9417050937281083