In [56]:
# import packages

import warnings
warnings.simplefilter('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
# pickle is used to save the model created by us

In [57]:
df = pd.read_csv("hiring.csv")
df.head()

Unnamed: 0,experience,test_score,interview_score,salary
0,,8.0,9,50000
1,,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000


In [58]:
df.isna().sum()

experience         2
test_score         1
interview_score    0
salary             0
dtype: int64

In [59]:
# experience
df['experience'].fillna(0,inplace = True)
df.isna().sum()

experience         0
test_score         1
interview_score    0
salary             0
dtype: int64

In [60]:
df['test_score'].fillna(df['test_score'].mean(),inplace = True)
df.isna().sum()

experience         0
test_score         0
interview_score    0
salary             0
dtype: int64

Hence, the dataset is clean now

In [61]:
df.head()

Unnamed: 0,experience,test_score,interview_score,salary
0,0,8.0,9,50000
1,0,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000


In [62]:
X = df.iloc[:,:-1]  #all the columns except -1th column as dependent variables
X.head()

Unnamed: 0,experience,test_score,interview_score
0,0,8.0,9
1,0,8.0,6
2,five,6.0,7
3,two,10.0,10
4,seven,9.0,6


In [63]:
X.shape

(8, 3)

In [64]:
X.experience

0         0
1         0
2      five
3       two
4     seven
5     three
6       ten
7    eleven
Name: experience, dtype: object

In [65]:
# convert text into intergers
def conv(x):
    dict = {'two':2,
            'three':3,
            'five':5,
            'seven':7,
            'ten':10,
            'eleven':11,
            0:0 }
    return dict[x]

In [66]:
X['experience'] = X['experience'].apply(lambda x: conv(x))

In [68]:
X.head()

Unnamed: 0,experience,test_score,interview_score
0,0,8.0,9
1,0,8.0,6
2,5,6.0,7
3,2,10.0,10
4,7,9.0,6


In [69]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   experience       8 non-null      int64  
 1   test_score       8 non-null      float64
 2   interview_score  8 non-null      int64  
dtypes: float64(1), int64(2)
memory usage: 320.0 bytes


In [70]:
Y = df.iloc[:,-1]
Y

0    50000
1    45000
2    60000
3    65000
4    70000
5    62000
6    72000
7    80000
Name: salary, dtype: int64

Not going to train_test_split as the dataset is very small

In [71]:
# Modeling

from sklearn.linear_model import LinearRegression
lr = LinearRegression()

In [72]:
# Fit the model
lr.fit(X, Y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [73]:
# Predict the phase
y_pred = lr.predict(X)
y_pred

array([52313.61238494, 45722.68644263, 58231.95591138, 63991.7318464 ,
       67429.06277517, 61080.55179794, 75922.72532666, 79307.67351488])

In [74]:
Y

0    50000
1    45000
2    60000
3    65000
4    70000
5    62000
6    72000
7    80000
Name: salary, dtype: int64

In [77]:
from sklearn.metrics import r2_score
r2_score(y_pred, Y)

0.9626511210293308

In [79]:
# unseen data
lr.predict([[3,9,7]])

array([58315.5019167])

In [80]:
lr.predict([[10,10,10]])

array([86612.80419155])

In [81]:
lr.predict([[10,2,3]])

array([55930.47274854])

## Model Deployment

In [87]:
# save the lr model to system hdd as 'model.py' using write bytes (wb) mode
import pickle

pickle.dump(lr,open('model.pkl','wb'))

In [88]:
## Client side execution
model = pickle.load(open('model.pkl','rb'))  #read bytes

In [89]:
model.predict([[3,9,7]])

array([58315.5019167])

In [90]:
model.predict([[10,10,10]])

array([86612.80419155])

In [91]:
model.predict([[10,2,3]])

array([55930.47274854])