In [1]:
# Dummy Variables & One Hot Encoding
# Here ewe have to build a model which predict price by using area and town values
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

df = pd.read_csv('DataSets/ML6_DATA.csv')
df

Unnamed: 0,town,area,price
0,pune,2600,550000
1,pune,3000,565000
2,pune,3200,610000
3,pune,3600,680000
4,pune,4000,725000
5,satara,2600,585000
6,satara,2800,615000
7,satara,3300,650000
8,satara,3600,710000
9,mumbai,2600,575000


In [2]:
# 1)This method is used to create dummy values for respective column unique attributes..........................................
# Here we drop the first dummy column using (drop_first = True)to avoid multicollinearity as it affect our prediction model 
dummies = pd.get_dummies(df.town, drop_first = True)
dummies

Unnamed: 0,pune,satara
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0
5,0,1
6,0,1
7,0,1
8,0,1
9,0,0


In [3]:
# now we concatinate the dummies dataframe with oiginal dataframe
merged = pd.concat([df,dummies],axis = 'columns')
merged
# here axis = 'columns' means concatinate along columns

Unnamed: 0,town,area,price,pune,satara
0,pune,2600,550000,1,0
1,pune,3000,565000,1,0
2,pune,3200,610000,1,0
3,pune,3600,680000,1,0
4,pune,4000,725000,1,0
5,satara,2600,585000,0,1
6,satara,2800,615000,0,1
7,satara,3300,650000,0,1
8,satara,3600,710000,0,1
9,mumbai,2600,575000,0,0


In [4]:
# Now we can drop the town column
final = merged.drop(columns = ['town'])
final

Unnamed: 0,area,price,pune,satara
0,2600,550000,1,0
1,3000,565000,1,0
2,3200,610000,1,0
3,3600,680000,1,0
4,4000,725000,1,0
5,2600,585000,0,1
6,2800,615000,0,1
7,3300,650000,0,1
8,3600,710000,0,1
9,2600,575000,0,0


In [5]:
# Now we create a model
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [6]:
# here X is all columns except price column and Y is price
X = final[['area','pune','satara']]
y = final['price']

In [7]:
model.fit(X,y)

LinearRegression()

In [8]:
model.predict([[2800,1,0]])
# Here 1,0 is pune | 0,1 is satara | and 0,0 is mumbai 

array([565089.22812299])

In [9]:
model.predict([[2800,0,1]])

array([605103.20361213])

In [10]:
model.predict([[2800,0,0]])

array([590775.6396474])

In [11]:
# for accuracy of your model
model.score(X,y)
# 95% accurate

0.9573929037221873

In [12]:
# 2) Using One Hot Encoding.....................................................................................................
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [13]:
# Now we create dataframe for lable encoder(dfle)
dfle = df
dfle

Unnamed: 0,town,area,price
0,pune,2600,550000
1,pune,3000,565000
2,pune,3200,610000
3,pune,3600,680000
4,pune,4000,725000
5,satara,2600,585000
6,satara,2800,615000
7,satara,3300,650000
8,satara,3600,710000
9,mumbai,2600,575000


In [14]:
le.fit_transform(dfle.town)
# this method label the unique data from respective column

array([1, 1, 1, 1, 1, 2, 2, 2, 2, 0, 0, 0, 0])

In [15]:
dfle.town = le.fit_transform(dfle.town)
dfle

Unnamed: 0,town,area,price
0,1,2600,550000
1,1,3000,565000
2,1,3200,610000
3,1,3600,680000
4,1,4000,725000
5,2,2600,585000
6,2,2800,615000
7,2,3300,650000
8,2,3600,710000
9,0,2600,575000


In [16]:
X = dfle[['town','area']].values
X
# Here we not use direct datafram in x insted we use array of values

array([[   1, 2600],
       [   1, 3000],
       [   1, 3200],
       [   1, 3600],
       [   1, 4000],
       [   2, 2600],
       [   2, 2800],
       [   2, 3300],
       [   2, 3600],
       [   0, 2600],
       [   0, 2900],
       [   0, 3100],
       [   0, 3600]], dtype=int64)

In [17]:
y = dfle.price.values
y

array([550000, 565000, 610000, 680000, 725000, 585000, 615000, 650000,
       710000, 575000, 600000, 620000, 695000], dtype=int64)

In [18]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
ct = ColumnTransformer([('town', OneHotEncoder(), [0])], remainder = 'passthrough')

In [19]:
X = ct.fit_transform(X)
X

array([[0.0e+00, 1.0e+00, 0.0e+00, 2.6e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 3.0e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 3.2e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 3.6e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 4.0e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 2.6e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 2.8e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.3e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.6e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 2.6e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 2.9e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.1e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.6e+03]])

In [20]:
# Drop first dummy
X = X[:,1:]
X

array([[1.0e+00, 0.0e+00, 2.6e+03],
       [1.0e+00, 0.0e+00, 3.0e+03],
       [1.0e+00, 0.0e+00, 3.2e+03],
       [1.0e+00, 0.0e+00, 3.6e+03],
       [1.0e+00, 0.0e+00, 4.0e+03],
       [0.0e+00, 1.0e+00, 2.6e+03],
       [0.0e+00, 1.0e+00, 2.8e+03],
       [0.0e+00, 1.0e+00, 3.3e+03],
       [0.0e+00, 1.0e+00, 3.6e+03],
       [0.0e+00, 0.0e+00, 2.6e+03],
       [0.0e+00, 0.0e+00, 2.9e+03],
       [0.0e+00, 0.0e+00, 3.1e+03],
       [0.0e+00, 0.0e+00, 3.6e+03]])

In [21]:
model.fit(X,y)

LinearRegression()

In [22]:
model.predict([[1,3400]])

ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 3 is different from 2)

In [None]:
model.predict([[0,2800]]) 

In [None]:
help(ColumnTransformer)