In [1]:
import warnings
# Suppress every warning category for the remainder of the session.
# NOTE(review): this blanket filter also hides pandas / scikit-learn
# deprecation and chained-assignment warnings that can point at real bugs;
# consider narrowing it to specific categories.
warnings.simplefilter('ignore')

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
# pickle is used to save the model created.

In [3]:
# Location of the hiring dataset (presumably a Colab-mounted Google Drive;
# adjust when running elsewhere).
path = "/content/drive/MyDrive/hiring.csv"

# Read the CSV into a DataFrame and preview its first five rows.
Data = pd.read_csv(filepath_or_buffer=path)
Data.head(n=5)

Unnamed: 0,experience,test_score,interview_score,salary
0,,8.0,9,50000
1,,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000


In [4]:
# Dimensions of the loaded frame as (rows, columns).
Data.shape

(8, 4)

In [5]:
# Count the missing values in each column before any imputation.
Data.isna().sum()

experience         2
test_score         1
interview_score    0
salary             0
dtype: int64

In [6]:
# experience
# A missing experience value is treated as zero years.
# The filled column is assigned back to the frame rather than calling
# fillna(inplace=True) on Data['experience']: an in-place call on a column
# selection is chained assignment, which pandas deprecates and which does not
# update Data at all once Copy-on-Write is enabled.
Data['experience'] = Data['experience'].fillna(0)

In [7]:
# Re-check missing counts: experience should now report 0.
Data.isna().sum()

experience         0
test_score         1
interview_score    0
salary             0
dtype: int64

In [8]:
# Mean of the non-missing test scores (NaN is skipped by default).
Data['test_score'].mean()

7.857142857142857

In [9]:
# Impute the missing test score with the column mean.
# The result is assigned back to the frame instead of using
# fillna(inplace=True) on Data['test_score'], which is chained assignment and
# stops modifying Data once pandas Copy-on-Write is enabled.
# NOTE(review): the mean is computed over all rows, including those that are
# later placed in the test split.
Data['test_score'] = Data['test_score'].fillna(Data['test_score'].mean())

In [10]:
# Final missing-value check: every column should now report 0.
Data.isna().sum()

experience         0
test_score         0
interview_score    0
salary             0
dtype: int64

In [11]:
# Preview the frame after imputation; experience still holds number words.
Data.head()

Unnamed: 0,experience,test_score,interview_score,salary
0,0,8.0,9,50000
1,0,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000


In [12]:
# Feature matrix: every column except the last one (the salary target).
# .copy() makes X an independent frame, so the later in-place conversion of
# X['experience'] neither raises SettingWithCopyWarning nor depends on whether
# the slice happens to be a view of Data.
X = Data.iloc[:, :-1].copy()

In [13]:
# Preview the feature matrix (the salary column is excluded).
X.head()

Unnamed: 0,experience,test_score,interview_score
0,0,8.0,9
1,0,8.0,6
2,five,6.0,7
3,two,10.0,10
4,seven,9.0,6


In [14]:
# Dimensions of the feature matrix as (rows, feature columns).
X.shape

(8, 3)

In [15]:
# Inspect the raw experience values: number words plus the 0 placeholder,
# stored with object dtype.
X.experience

0         0
1         0
2      five
3       two
4     seven
5     three
6       ten
7    eleven
Name: experience, dtype: object

In [16]:
# Convert text in the cols to integer values
def conv(x):
    """Translate a spelled-out experience value into its integer.

    Parameters
    ----------
    x : str or int
        A number word from 'zero' to 'twenty' (letter case and surrounding
        whitespace are ignored), or the integer 0 that is used as the
        placeholder for missing experience.

    Returns
    -------
    int
        The numeric value of ``x``.

    Raises
    ------
    KeyError
        If ``x`` is not one of the recognised values.
    """
    # Named `word_to_number` rather than `dict` so the builtin type is not
    # shadowed inside the function.
    word_to_number = {
        0: 0,
        'zero': 0, 'one': 1, 'two': 2, 'three': 3, 'four': 4,
        'five': 5, 'six': 6, 'seven': 7, 'eight': 8, 'nine': 9,
        'ten': 10, 'eleven': 11, 'twelve': 12, 'thirteen': 13,
        'fourteen': 14, 'fifteen': 15, 'sixteen': 16, 'seventeen': 17,
        'eighteen': 18, 'nineteen': 19, 'twenty': 20,
    }
    # Normalise text so ' Five ' and 'FIVE' resolve like 'five'; non-string
    # inputs (the 0 placeholder) are looked up unchanged.
    key = x.strip().lower() if isinstance(x, str) else x
    return word_to_number[key]

In [17]:
# Replace each experience entry with its integer value; conv is passed
# directly because it already takes a single element.
X['experience'] = X['experience'].apply(conv)

In [18]:
# Preview the features after conversion: experience is now numeric.
X.head()

Unnamed: 0,experience,test_score,interview_score
0,0,8.0,9
1,0,8.0,6
2,5,6.0,7
3,2,10.0,10
4,7,9.0,6


In [19]:
# Confirm column dtypes and non-null counts before modelling.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   experience       8 non-null      int64  
 1   test_score       8 non-null      float64
 2   interview_score  8 non-null      int64  
dtypes: float64(1), int64(2)
memory usage: 320.0 bytes


In [20]:
# Target vector: the last column of Data (salary).
y=Data.iloc[:,-1]
y

0    50000
1    45000
2    60000
3    65000
4    70000
5    62000
6    72000
7    80000
Name: salary, dtype: int64

In [21]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; random_state pins the shuffle so
# the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [22]:
from sklearn.linear_model import LinearRegression
# Linear regression estimator created with its default settings.
Lr=LinearRegression()

In [23]:
# Fit the model on the training split only (X_train, y_train).
Lr.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [24]:
# Prediction Phase
# Predict salaries for the held-out rows and display the resulting array.
y_pred=Lr.predict(X_test)
y_pred

array([47414.37195592, 59888.03016152])

In [25]:
# Actual salaries of the held-out rows, for comparison with y_pred.
y_test

1    45000
5    62000
Name: salary, dtype: int64

In [26]:
from sklearn.metrics import r2_score
# R^2 of the predictions on the held-out rows.
# NOTE(review): y_test holds only two samples here, so this score depends
# heavily on the particular split and is not a reliable quality estimate.
r2_score(y_test,y_pred)

0.9287916364000987

In [27]:
# Ad-hoc prediction; values follow the training column order
# (experience, test_score, interview_score).
Lr.predict([[3,9,7]])

array([59303.53236438])

In [28]:
# Ad-hoc prediction for 10 years of experience with both scores at 10;
# column order is (experience, test_score, interview_score).
Lr.predict([[10,10,10]])

array([85284.19916023])

In [29]:
# Ad-hoc prediction for 10 years of experience with low scores;
# column order is (experience, test_score, interview_score).
Lr.predict([[10,2,3]])

array([56979.37953555])

In [30]:
# Save the fitted `Lr` model to disk as 'model.pkl'.
# The file is opened in binary write mode ('wb') inside a `with` block so the
# handle is flushed and closed as soon as the dump finishes, rather than
# being left open for the rest of the session.
# (`pickle` is already imported in the notebook's import cell.)
with open('model.pkl','wb') as model_file:
    pickle.dump(Lr, model_file)

In [31]:
# Load the saved model back from disk so it can be used for prediction.
# The `with` block closes the file handle once loading completes.
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# files that you created yourself or otherwise trust.
with open("model.pkl","rb") as model_file:
    model_2 = pickle.load(model_file)

In [32]:
# Same input as the earlier Lr.predict([[3,9,7]]) call; an identical result
# shows the model survived the save/load round trip.
model_2.predict([[3,9,7]])

array([59303.53236438])

In [33]:
# Same input as the earlier Lr.predict([[10,10,10]]) call, for comparison.
model_2.predict([[10,10,10]])

array([85284.19916023])

In [34]:
# New input not tried before: 1 year of experience and both scores at 1;
# column order is (experience, test_score, interview_score).
model_2.predict([[1,1,1]])

array([27178.51857898])