In [1]:
import pandas as pd
from sklearn import linear_model
import numpy as np

In [2]:
df = pd.read_csv('/content/homeprices(Categorical Variables and One Hot Encoding).csv')
df

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000
5,west windsor,2600,585000
6,west windsor,2800,615000
7,west windsor,3300,650000
8,west windsor,3600,710000
9,robinsville,2600,575000


#Using pandas to create dummy variables

In [3]:
dummies = pd.get_dummies(df.town)
dummies

Unnamed: 0,monroe township,robinsville,west windsor
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0
5,0,0,1
6,0,0,1
7,0,0,1
8,0,0,1
9,0,1,0


**Now merge both original data with new columns with dummies**

In [4]:
merged = pd.concat([df,dummies],axis=1)
merged

Unnamed: 0,town,area,price,monroe township,robinsville,west windsor
0,monroe township,2600,550000,1,0,0
1,monroe township,3000,565000,1,0,0
2,monroe township,3200,610000,1,0,0
3,monroe township,3600,680000,1,0,0
4,monroe township,4000,725000,1,0,0
5,west windsor,2600,585000,0,0,1
6,west windsor,2800,615000,0,0,1
7,west windsor,3300,650000,0,0,1
8,west windsor,3600,710000,0,0,1
9,robinsville,2600,575000,0,1,0


In [5]:
final = merged.drop(['town'],axis=1)
final

Unnamed: 0,area,price,monroe township,robinsville,west windsor
0,2600,550000,1,0,0
1,3000,565000,1,0,0
2,3200,610000,1,0,0
3,3600,680000,1,0,0
4,4000,725000,1,0,0
5,2600,585000,0,0,1
6,2800,615000,0,0,1
7,3300,650000,0,0,1
8,3600,710000,0,0,1
9,2600,575000,0,1,0


#Dummy Variable Trap
When you can derive one variable from other variables, they are known to be multi-colinear. Here if you know values of california and georgia then you can easily infer value of new jersey state, i.e. california=0 and georgia=0. There for these state variables are called to be multi-colinear. In this situation linear regression won't work as expected. Hence you need to drop one column.

**NOTE: sklearn library takes care of dummy variable trap hence even if you don't drop one of the state columns it is going to work, however we should make a habit of taking care of dummy variable trap ourselves just in case library that you are using is not handling this for you**

In [6]:
final = final.drop(['west windsor'],axis=1)
final

Unnamed: 0,area,price,monroe township,robinsville
0,2600,550000,1,0
1,3000,565000,1,0
2,3200,610000,1,0
3,3600,680000,1,0
4,4000,725000,1,0
5,2600,585000,0,0
6,2800,615000,0,0
7,3300,650000,0,0
8,3600,710000,0,0
9,2600,575000,0,1


In [7]:
reg = linear_model.LinearRegression()
reg.fit(final.drop('price',axis='columns'),df.price)

In [52]:
reg.predict([[3600,0,0]])



array([706621.15674048])

In [8]:
reg.predict([[3400,0,0]])



array([681241.66845839])

In [9]:
reg.predict([[3400,0,1]])



array([666914.10449365])

In [10]:
reg.coef_

array([   126.89744141, -40013.97548914, -14327.56396474])

In [12]:
reg.intercept_

249790.36766292527

In [11]:
#y=m1x1+m2x2+m3x3+c
126.8974414*3400+-40013.97548914*0+-14327.56396474*1+249790.36766292527

666914.1044581853

#or


In [13]:
x = final.drop('price',axis=1)

In [14]:
y = final.price

In [15]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [16]:
model.fit(x,y)

In [17]:
model.predict(x)

array([539709.7398409 , 590468.71640508, 615848.20468716, 666607.18125134,
       717366.15781551, 579723.71533005, 605103.20361213, 668551.92431735,
       706621.15674048, 565396.15136531, 603465.38378844, 628844.87207052,
       692293.59277574])

In [18]:
final.head()

Unnamed: 0,area,price,monroe township,robinsville
0,2600,550000,1,0
1,3000,565000,1,0
2,3200,610000,1,0
3,3600,680000,1,0
4,4000,725000,1,0


In [19]:
model.predict([[3400,0,1]])



array([666914.10449365])

In [20]:
model.predict([[3000,1,0]])



array([590468.71640508])

In [21]:
model.score(x,y)  #reg.score(x,y) -> same answer 0.9573929037221872

0.9573929037221872

**Save Trained Model to file Using joblib**

In [22]:
import joblib

In [23]:
joblib.dump(model,'model_joblib')

['model_joblib']

In [24]:
mj=joblib.load('model_joblib')

In [25]:
mj.predict([[3000,1,0]])



array([590468.71640508])

In [26]:
df

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000
5,west windsor,2600,585000
6,west windsor,2800,615000
7,west windsor,3300,650000
8,west windsor,3600,710000
9,robinsville,2600,575000


#Using sklearn OneHotEncoder

**In order to use OneHotEncoder we need to use it on town column so, for this we are going use LabelEncoder from sklearn preprocessing**

In [27]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

**fit_transform means it takes the label column as an input so, here we have label column ,i.e, town  and it will return the lavels in array form...**

**First step is to use label encoder to convert town names into numbers**

In [28]:
dfle = df
le.fit_transform(dfle.town)

array([0, 0, 0, 0, 0, 2, 2, 2, 2, 1, 1, 1, 1])

#or

In [29]:
dfle = df
dfle.town = le.fit_transform(dfle.town)
dfle

Unnamed: 0,town,area,price
0,0,2600,550000
1,0,3000,565000
2,0,3200,610000
3,0,3600,680000
4,0,4000,725000
5,2,2600,585000
6,2,2800,615000
7,2,3300,650000
8,2,3600,710000
9,1,2600,575000


In [43]:
x = dfle[['town','area']].values
x

array([[   0, 2600],
       [   0, 3000],
       [   0, 3200],
       [   0, 3600],
       [   0, 4000],
       [   2, 2600],
       [   2, 2800],
       [   2, 3300],
       [   2, 3600],
       [   1, 2600],
       [   1, 2900],
       [   1, 3100],
       [   1, 3600]])

In [44]:
y = dfle.price
y

0     550000
1     565000
2     610000
3     680000
4     725000
5     585000
6     615000
7     650000
8     710000
9     575000
10    600000
11    620000
12    695000
Name: price, dtype: int64

**Now use one hot encoder to create dummy variables for each of the town**

In [45]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
ct = ColumnTransformer([('town', OneHotEncoder(), [0])], remainder = 'passthrough')

**[0] here it is specifying that whatever x i am supplying , the 0th column in that x is my categorical feature**

In [55]:
df.head(10)

Unnamed: 0,town,area,price
0,0,2600,550000
1,0,3000,565000
2,0,3200,610000
3,0,3600,680000
4,0,4000,725000
5,2,2600,585000
6,2,2800,615000
7,2,3300,650000
8,2,3600,710000
9,1,2600,575000


In [46]:
x = ct.fit_transform(x)
x

array([[1.0e+00, 0.0e+00, 0.0e+00, 2.6e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.0e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.2e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.6e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 4.0e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 2.6e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 2.8e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.3e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.6e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 2.6e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 2.9e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 3.1e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 3.6e+03]])

**To avoid dummy variable trap I am going to drop one of the column so, here i am droping 1st column.**

In [47]:
x = x[:,1:]

In [:,1.:] I am taking all the rows and for the columns take all the columns from index 1 onwards...This will essentially drop our 0th column

In [48]:
x

array([[0.0e+00, 0.0e+00, 2.6e+03],
       [0.0e+00, 0.0e+00, 3.0e+03],
       [0.0e+00, 0.0e+00, 3.2e+03],
       [0.0e+00, 0.0e+00, 3.6e+03],
       [0.0e+00, 0.0e+00, 4.0e+03],
       [0.0e+00, 1.0e+00, 2.6e+03],
       [0.0e+00, 1.0e+00, 2.8e+03],
       [0.0e+00, 1.0e+00, 3.3e+03],
       [0.0e+00, 1.0e+00, 3.6e+03],
       [1.0e+00, 0.0e+00, 2.6e+03],
       [1.0e+00, 0.0e+00, 2.9e+03],
       [1.0e+00, 0.0e+00, 3.1e+03],
       [1.0e+00, 0.0e+00, 3.6e+03]])

In [49]:
model.fit(x,y)

In [56]:
model.predict([[1,0,3600]])

array([692293.59277574])

In [53]:
model.predict([[0,0,4000]])

array([717366.15781552])

In [50]:
model.predict([[0,1,3400]])   # 3400 sqr ft home in west windsor

array([681241.6684584])

In [51]:
model.predict([[1,0,2800]]) # 2800 sqr ft home in robbinsville

array([590775.63964739])