# Multivariate Regression

In [2]:
# Mount Google Drive so files stored there are reachable from the Colab filesystem
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [3]:
!pip install word2number

Collecting word2number
  Downloading word2number-1.1.zip (9.7 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: word2number
  Building wheel for word2number (setup.py) ... [?25l[?25hdone
  Created wheel for word2number: filename=word2number-1.1-py3-none-any.whl size=5568 sha256=5cf55f108b5c908c743e77397c498e33c14c7999f4f050fc004d8adde23e28d3
  Stored in directory: /root/.cache/pip/wheels/5b/79/fb/d25928e599c7e11fe4e00d32048cd74933f34a74c633d2aea6
Successfully built word2number
Installing collected packages: word2number
Successfully installed word2number-1.1


In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model
from word2number import w2n

Step 1: Analyze the data before training, e.g., look for missing values and patterns.

In [5]:
## Reading data
# The CSV lives on the mounted Google Drive; keep its location in one named place.
HIRING_CSV_PATH = '/content/drive/MyDrive/ml_practice/linear_regression/hiring.csv'
df = pd.read_csv(HIRING_CSV_PATH)
df

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,,8.0,9,50000
1,,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,,7,72000
7,eleven,7.0,8,80000


Step 2: Data cleaning and handling NA values

In [6]:
# Assumption: a missing experience value means no experience. The column holds
# number words at this point, so the gap is filled with the word 'zero'.
experience_filled = df['experience'].fillna('zero')
df['experience'] = experience_filled
df

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,zero,8.0,9,50000
1,zero,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,,7,72000
7,eleven,7.0,8,80000


In [7]:
# Convert each spelled-out experience value (e.g. 'five') to a number using word2number
experience_as_number = df['experience'].apply(w2n.word_to_num)
df['experience'] = experience_as_number
df['experience']

Unnamed: 0,experience
0,0
1,0
2,5
3,2
4,7
5,3
6,10
7,11


In [8]:
import math

# Mean of the known test scores, rounded down to a whole number
# (pandas skips the missing entry when averaging).
test_score_col = 'test_score(out of 10)'
mean_test_score = math.floor(df[test_score_col].mean())
mean_test_score

7

In [9]:
# Fill the missing test score with the floored column mean held in mean_test_score
score_col = 'test_score(out of 10)'
df[score_col] = df[score_col].fillna(mean_test_score)
df[score_col]

Unnamed: 0,test_score(out of 10)
0,8.0
1,8.0
2,6.0
3,10.0
4,9.0
5,7.0
6,7.0
7,7.0


Step 3: Training the model

In [10]:
# Create an unfitted scikit-learn linear regression estimator with default settings
reg = linear_model.LinearRegression()

In [11]:
# Fit salary as a linear function of the three candidate features
feature_cols = ['experience', 'test_score(out of 10)', 'interview_score(out of 10)']
reg.fit(df[feature_cols], df['salary($)'])

Checking the coefficients and the intercept

In [12]:
# One learned weight per feature, in the column order passed to fit:
# experience, test score, interview score
reg.coef_ # coefficients

array([2922.26901502, 2221.30909959, 2147.48256637])

In [13]:
# Constant term of the fitted linear model (the prediction when every feature is 0)
reg.intercept_ # intercept

np.float64(14992.65144669314)

Step 4: Predicting new data

In [14]:
# Predict salary for 2 years of experience, test score 9, interview score 6.
# The model was fitted on a DataFrame, so pass the same column names; a bare list
# makes scikit-learn warn that X does not have valid feature names.
candidate = pd.DataFrame(
    [[2, 9, 6]],
    columns=['experience', 'test_score(out of 10)', 'interview_score(out of 10)'],
)
reg.predict(candidate)



array([53713.86677124])

In [15]:
# Predict salary for 12 years of experience, test score 10, interview score 10.
# The model was fitted on a DataFrame, so pass the same column names; a bare list
# makes scikit-learn warn that X does not have valid feature names.
senior_candidate = pd.DataFrame(
    [[12, 10, 10]],
    columns=['experience', 'test_score(out of 10)', 'interview_score(out of 10)'],
)
reg.predict(senior_candidate)



array([93747.79628651])

Saving the model

In [16]:
import pickle

In [17]:
# Serialize the fitted regressor to a file in the current working directory
with open('model_pickel', 'wb') as model_file:
  pickle.dump(reg, model_file)

In [18]:
# Read the saved regressor back into a new object.
# NOTE: unpickling can execute arbitrary code; only load pickle files you created or trust.
with open('model_pickel', 'rb') as model_file:
  mp = pickle.load(model_file)

In [19]:
# Check that the reloaded model gives the same prediction as the original for
# 2 years of experience, test score 9, interview score 6.
# The model was fitted on a DataFrame, so pass the same column names; a bare list
# makes scikit-learn warn that X does not have valid feature names.
reloaded_check = pd.DataFrame(
    [[2, 9, 6]],
    columns=['experience', 'test_score(out of 10)', 'interview_score(out of 10)'],
)
mp.predict(reloaded_check)



array([53713.86677124])