# Linear Regression with multi variables
#### And also learn how to work with missing data

In [1]:
# price = (m1 * area) + (m2 * bedrooms) + (m3 * age) + C
# area, bedrooms, age are independent and are the fetures of the model

In [8]:
import pandas as pd
import math
from sklearn import linear_model

In [9]:
df = pd.read_csv("homeprices.csv")
df

Unnamed: 0,area,bedrooms,age,price
0,2600,3.0,20,550000
1,3000,4.0,15,565000
2,3200,,18,610000
3,3600,3.0,30,595000
4,4000,5.0,8,760000
5,4100,6.0,8,810000


In [10]:
# we need to handle NaN here
# we can take the median of the bedrooms column/feature and put it in place of NaN

In [12]:
median = math.floor(df.bedrooms.median())
median

4

In [14]:
df.bedrooms = df.bedrooms.fillna(median) # fill all NaN with the median
df

Unnamed: 0,area,bedrooms,age,price
0,2600,3.0,20,550000
1,3000,4.0,15,565000
2,3200,4.0,18,610000
3,3600,3.0,30,595000
4,4000,5.0,8,760000
5,4100,6.0,8,810000


In [15]:
# we have solved the missing data issue
# we can now train the model

In [25]:
reg = linear_model.LinearRegression()
reg.fit(df[['area', 'bedrooms', 'age']].values, df['price']) # first the independent variables and then the dependent variables

LinearRegression()

In [26]:
reg.coef_

array([  112.06244194, 23388.88007794, -3231.71790863])

In [27]:
# these values are m1, m2, m3 resp.

In [28]:
reg.intercept_

221323.00186540384

In [29]:
# this is the intercept C

In [30]:
# Now we predict the price for:
# area -> 3000
# bedrooms -> 3
# age -> 40

In [31]:
reg.predict([[3000, 3, 40]])

array([498408.25158031])

In [32]:
# so the price for the house of the requirement is 498408.25

In [33]:
(112.06244194*3000)+(23388.88007794*3)+(-3231.71790863*40)+221323.00186540384

498408.25157402386

In [34]:
reg.predict([[2500, 4, 5]])

array([578876.03748933])