In [135]:
import pandas as pd

In [136]:
# Read the home-price table from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer=r"homeprices (1).csv")
df

Unnamed: 0,town,area,price
0,Pune,2600,550000
1,Pune,3000,565000
2,Pune,3200,610000
3,Pune,3600,680000
4,Pune,4000,725000
5,Jalgaon,2600,585000
6,Jalgaon,2800,615000
7,Jalgaon,3300,650000
8,Jalgaon,3600,710000
9,Nasik,2600,575000


Using pandas to create dummy variables

In [137]:
# One indicator column per distinct value of the town column.
dummies = pd.get_dummies(df["town"])
dummies

Unnamed: 0,Jalgaon,Nasik,Pune
0,False,False,True
1,False,False,True
2,False,False,True
3,False,False,True
4,False,False,True
5,True,False,False
6,True,False,False
7,True,False,False
8,True,False,False
9,False,True,False


In [138]:
# Place the indicator columns side by side with the original columns.
merged = pd.concat([df, dummies], axis=1)
merged

Unnamed: 0,town,area,price,Jalgaon,Nasik,Pune
0,Pune,2600,550000,False,False,True
1,Pune,3000,565000,False,False,True
2,Pune,3200,610000,False,False,True
3,Pune,3600,680000,False,False,True
4,Pune,4000,725000,False,False,True
5,Jalgaon,2600,585000,True,False,False
6,Jalgaon,2800,615000,True,False,False
7,Jalgaon,3300,650000,True,False,False
8,Jalgaon,3600,710000,True,False,False
9,Nasik,2600,575000,False,True,False


Dummy Variable Trap

In [139]:
# Remove the text column now that the town is represented by indicator columns.
# NOTE(review): only 'town' is dropped, so all three town indicators stay in the
# frame and always sum to 1 (perfectly collinear with the intercept). Avoiding
# the dummy variable trap would also require dropping one indicator column and
# shortening the later predict() inputs to match.
final = merged.drop(columns=['town'])
final

Unnamed: 0,area,price,Jalgaon,Nasik,Pune
0,2600,550000,False,False,True
1,3000,565000,False,False,True
2,3200,610000,False,False,True
3,3600,680000,False,False,True
4,4000,725000,False,False,True
5,2600,585000,True,False,False
6,2800,615000,True,False,False
7,3300,650000,True,False,False
8,3600,710000,True,False,False
9,2600,575000,False,True,False


In [140]:
# Feature matrix: every column except the target.
X = final.drop(columns='price')
X

Unnamed: 0,area,Jalgaon,Nasik,Pune
0,2600,False,False,True
1,3000,False,False,True
2,3200,False,False,True
3,3600,False,False,True
4,4000,False,False,True
5,2600,True,False,False
6,2800,True,False,False
7,3300,True,False,False
8,3600,True,False,False
9,2600,False,True,False


In [141]:
# Target vector: the price column.
y = final['price']
y

0     550000
1     565000
2     610000
3     680000
4     725000
5     585000
6     615000
7     650000
8     710000
9     575000
10    600000
11    620000
12    695000
Name: price, dtype: int64

In [142]:
from sklearn.linear_model import LinearRegression
# Linear regression estimator created with no arguments, i.e. library defaults.
model = LinearRegression()

In [143]:
# Fit on X (area plus the town indicator columns) against y (price).
model.fit(X,y)

In [144]:
model.predict(X) # predicted prices for every row of the training features X

array([539709.73984091, 590468.71640508, 615848.20468716, 666607.18125133,
       717366.1578155 , 579723.71533005, 605103.20361214, 668551.92431735,
       706621.15674047, 565396.15136531, 603465.38378844, 628844.87207052,
       692293.59277574])

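The `score` method returns the coefficient of determination (R²) of the model's predictions, which measures goodness of fit (it is not precision).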

In [145]:
# Scored on the same X and y the model was fitted on, so this is a
# training-set R^2 rather than an estimate of performance on unseen data.
model.score(X,y)

0.9573929037221874

In [146]:
# Predict the price of a 3400 sq ft home in Pune.
# The input reuses the training column names (area, then the Jalgaon / Nasik /
# Pune indicators, in that order) because the model was fitted on a DataFrame;
# a bare nested list makes scikit-learn warn that X has no valid feature names.
model.predict(
    pd.DataFrame([[3400, 0, 0, 1]], columns=["area", "Jalgaon", "Nasik", "Pune"])
)



array([641227.69296925])

In [147]:
# Predict the price of a 3400 sq ft home in Pune (indicator order: Jalgaon,
# Nasik, Pune).
# NOTE(review): these inputs are identical to the previous prediction cell; if
# a different home was intended (e.g. another area or town), change the row.
# Column names are supplied so scikit-learn does not warn about missing
# feature names for a model fitted on a DataFrame.
model.predict(
    pd.DataFrame([[3400, 0, 0, 1]], columns=["area", "Jalgaon", "Nasik", "Pune"])
)



array([641227.69296925])

The first step is to use a label encoder to convert the town names into numbers.

In [148]:
from sklearn.preprocessing import LabelEncoder
# Encoder that maps each distinct label to an integer code.
# NOTE(review): scikit-learn documents LabelEncoder as intended for target
# values; it is used on a feature column in this notebook.
le = LabelEncoder()


In [149]:
# Work on a copy: plain assignment would only alias `df`, so encoding the town
# column would also overwrite the town names in the frame earlier cells use.
dfle = df.copy()
# Pass the column as a 1-D Series; LabelEncoder expects 1-D input and the 2-D
# frame `dfle[['town']]` raises a DataConversionWarning.
dfle['town'] = le.fit_transform(dfle['town'])
dfle

  y = column_or_1d(y, warn=True)


Unnamed: 0,town,area,price
0,2,2600,550000
1,2,3000,565000
2,2,3200,610000
3,2,3600,680000
4,2,4000,725000
5,0,2600,585000
6,0,2800,615000
7,0,3300,650000
8,0,3600,710000
9,1,2600,575000


In [150]:
# NumPy feature array with the encoded town in column 0 and area in column 1.
X = dfle.loc[:, ['town', 'area']].values
X

array([[   2, 2600],
       [   2, 3000],
       [   2, 3200],
       [   2, 3600],
       [   2, 4000],
       [   0, 2600],
       [   0, 2800],
       [   0, 3300],
       [   0, 3600],
       [   1, 2600],
       [   1, 2900],
       [   1, 3100],
       [   1, 3600]])

In [151]:
# NumPy target array taken from the price column.
y = dfle['price'].values
y

array([550000, 565000, 610000, 680000, 725000, 585000, 615000, 650000,
       710000, 575000, 600000, 620000, 695000])

Now use a one-hot encoder to create dummy variables for each of the towns.

In [152]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
# One-hot encode column 0 (the town code) and pass the remaining columns
# through unchanged.
ct = ColumnTransformer(
    transformers=[('town', OneHotEncoder(), [0])],
    remainder='passthrough',
)

In [153]:
# Build the transformer input from dfle instead of from the current X, so that
# re-running this cell does not one-hot encode an already-encoded array.
# Column 0 (town code) becomes indicator columns; area is passed through last.
X = ct.fit_transform(dfle[['town', 'area']].values)
X

array([[0.0e+00, 0.0e+00, 1.0e+00, 2.6e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.0e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.2e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.6e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 4.0e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 2.6e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 2.8e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.3e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.6e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 2.6e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 2.9e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 3.1e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 3.6e+03]])