In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [2]:
url = 'https://raw.githubusercontent.com/codebasics/py/master/ML/2_linear_reg_multivariate/homeprices.csv'
df = pd.read_csv(url)

In [3]:
df

Unnamed: 0,area,bedrooms,age,price
0,2600,3.0,20,550000
1,3000,4.0,15,565000
2,3200,,18,610000
3,3600,3.0,30,595000
4,4000,5.0,8,760000
5,4100,6.0,8,810000


In [4]:
# Data Preprocessing: Fill NA values with median value of a column
df.bedrooms.median()

4.0

In [5]:
df.bedrooms = df.bedrooms.fillna(df.bedrooms.median())
df

Unnamed: 0,area,bedrooms,age,price
0,2600,3.0,20,550000
1,3000,4.0,15,565000
2,3200,4.0,18,610000
3,3600,3.0,30,595000
4,4000,5.0,8,760000
5,4100,6.0,8,810000


In [8]:
reg = linear_model.LinearRegression()
reg.fit(df.drop('price', axis = "columns"), df.price)

LinearRegression()

# Find price of home with 3000 sqr ft area, 3 bedrooms, 40 year old

In [10]:
reg.predict([[3000, 3, 40]])

array([498408.25158031])

In [12]:
reg.coef_

array([  112.06244194, 23388.88007794, -3231.71790863])

In [13]:
reg.intercept_

221323.00186540408

In [14]:
112.06244194*3000 + 23388.88007794*3 + -3231.71790863*40 + 221323.00186540384

498408.25157402386

# Find price of home with 2500 sqr ft area, 4 bedrooms, 5 year old

In [15]:
reg.predict([[2500, 4, 5]])

array([578876.03748933])

In [16]:
url = 'https://raw.githubusercontent.com/codebasics/py/master/ML/2_linear_reg_multivariate/Exercise/hiring.csv'
df = pd.read_csv(url)

In [17]:
df

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,,8.0,9,50000
1,,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,,7,72000
7,eleven,7.0,8,80000


In [30]:
df.experience = df.experience.fillna('zero')

In [28]:
pip install word2number

Collecting word2number
  Downloading word2number-1.1.zip (9.7 kB)
Building wheels for collected packages: word2number
  Building wheel for word2number (setup.py): started
  Building wheel for word2number (setup.py): finished with status 'done'
  Created wheel for word2number: filename=word2number-1.1-py3-none-any.whl size=5583 sha256=27a68e18f9099406587c5a704bc2c1730a74d85b9194adb67263ff3c5f5e4506
  Stored in directory: c:\users\user\appdata\local\pip\cache\wheels\a0\4a\5b\d2f2df5c344ddbecb8bea759872c207ea91d93f57fb54e816e
Successfully built word2number
Installing collected packages: word2number
Successfully installed word2number-1.1
Note: you may need to restart the kernel to use updated packages.


In [29]:
from word2number import w2n

In [31]:
df

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,zero,8.0,9,50000
1,zero,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,,7,72000
7,eleven,7.0,8,80000


In [32]:
df.experience = df.experience.apply(w2n.word_to_num)

In [33]:
df

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,0,8.0,9,50000
1,0,8.0,6,45000
2,5,6.0,7,60000
3,2,10.0,10,65000
4,7,9.0,6,70000
5,3,7.0,10,62000
6,10,,7,72000
7,11,7.0,8,80000


In [34]:
import math
median_test_score = math.floor(df['test_score(out of 10)'].mean())
median_test_score

7

In [35]:
df['test_score(out of 10)'] = df['test_score(out of 10)'].fillna(median_test_score)
df

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,0,8.0,9,50000
1,0,8.0,6,45000
2,5,6.0,7,60000
3,2,10.0,10,65000
4,7,9.0,6,70000
5,3,7.0,10,62000
6,10,7.0,7,72000
7,11,7.0,8,80000


In [36]:
reg = linear_model.LinearRegression()

In [38]:
reg.fit(df[['experience', 'test_score(out of 10)', 'interview_score(out of 10)']],df[['salary($)']])

LinearRegression()

In [39]:
reg.predict([[2,9,6]])

array([[53713.86677124]])

In [40]:
reg.predict([[12,10,10]])

array([[93747.79628651]])

In [41]:
reg.coef_

array([[2922.26901502, 2221.30909959, 2147.48256637]])

In [42]:
reg.intercept_

array([14992.65144669])

In [46]:
reg.predict([[11,7,8]])

array([[79866.63483999]])

In [43]:
df

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,0,8.0,9,50000
1,0,8.0,6,45000
2,5,6.0,7,60000
3,2,10.0,10,65000
4,7,9.0,6,70000
5,3,7.0,10,62000
6,10,7.0,7,72000
7,11,7.0,8,80000
