In [1]:
#including important libraries 
import numpy as np
import pandas as pd

#model Libraries
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

#libraries for Confusion matrix, precision score, f1 score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt

In [2]:
#selecting csv file
data = pd.read_csv("salary,satisfaction.csv")

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2946 entries, 0 to 2945
Data columns (total 83 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   id                          2946 non-null   int64  
 1   Target_Salary               2946 non-null   float64
 2   Target_Satisfied            2946 non-null   int64  
 3   LOAN_AMT                    2946 non-null   object 
 4   Business Title              2946 non-null   object 
 5   Civil Service Title         2946 non-null   object 
 6   Division/Work Unit          2946 non-null   object 
 7   Job Description             2946 non-null   object 
 8   Minimum Qual Requirements   2926 non-null   object 
 9   Preferred Skills            2553 non-null   object 
 10  Additional Information      1854 non-null   object 
 11  To Apply                    2945 non-null   object 
 12  Hours/Shift                 884 non-null    object 
 13  Residency Requirement       2942 

## Preprocessing the data

In [4]:
#Selecting important features on which model depends
features = ['id', 'Target_Salary', 'Target_Satisfied', 'LOAN_AMT', 'dti','Age', 'loan_status', 'JobSearchStatus', 'FormalEducation',
            'HopeFiveYears','CompanySize',  'YearsCoding', 'term', 'last_pymnt_amnt']

#creating dataframe "db" with above features
db = data[features]

In [5]:
db.head(3)

Unnamed: 0,id,Target_Salary,Target_Satisfied,LOAN_AMT,dti,Age,loan_status,JobSearchStatus,FormalEducation,HopeFiveYears,CompanySize,YearsCoding,term,last_pymnt_amnt
0,68407277,42405.0,0,"$3,600.00",5.91,25 - 34 years old,Fully Paid,"I�m not actively looking, but I am open to new...","Bachelor�s degree (BA, BS, B.Eng., etc.)",Working as a founder or co-founder of my own c...,20 to 99 employees,3-5 years,36 months,122.67
1,68355089,60740.0,1,"$24,700.00",16.06,35 - 44 years old,Fully Paid,I am actively looking for a job,"Bachelor�s degree (BA, BS, B.Eng., etc.)",Working in a different or more specialized tec...,"10,000 or more employees",30 or more years,36 months,926.35
2,68341763,51907.68,0,"$20,000.00",10.78,,Fully Paid,"I�m not actively looking, but I am open to new...",Associate degree,Working as a founder or co-founder of my own c...,20 to 99 employees,24-26 years,60 months,15813.3


In [6]:
#removing "$ and ," symbols from values in  LOAN_AMT feature so that we can easily convert object into float
db['Loan_Amt_$'] = db['LOAN_AMT'].map(lambda x: x.lstrip('$,'))
db['Loan_Amt_$'] = db['Loan_Amt_$'].replace(',','', regex=True)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [7]:
db.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2946 entries, 0 to 2945
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                2946 non-null   int64  
 1   Target_Salary     2946 non-null   float64
 2   Target_Satisfied  2946 non-null   int64  
 3   LOAN_AMT          2946 non-null   object 
 4   dti               2946 non-null   float64
 5   Age               2027 non-null   object 
 6   loan_status       2946 non-null   object 
 7   JobSearchStatus   2511 non-null   object 
 8   FormalEducation   2894 non-null   object 
 9   HopeFiveYears     2477 non-null   object 
 10  CompanySize       2946 non-null   object 
 11  YearsCoding       2943 non-null   object 
 12  term              2946 non-null   object 
 13  last_pymnt_amnt   2946 non-null   float64
 14  Loan_Amt_$        2946 non-null   object 
dtypes: float64(3), int64(2), object(10)
memory usage: 345.4+ KB


In [8]:
db['YearsCoding'].value_counts()

3-5 years           699
6-8 years           630
9-11 years          443
12-14 years         277
0-2 years           248
15-17 years         205
18-20 years         179
30 or more years     93
21-23 years          82
24-26 years          54
27-29 years          33
Name: YearsCoding, dtype: int64

In [9]:
# Now converting string into integer values as catagorical data for diffrent features

age_group = {"Under 18 years old":0, "18 - 24 years old":1, "25 - 34 years old":2,
             "35 - 44 years old":3,"45 - 54 years old":4, "55 - 64 years old":5}

loan_status_groups = {"Fully Paid":0, "Charged Off":1, "Current":2, "Late (31-120 days)":3,
                      "In Grace Period":4, "Late (16-30 days)":5}

JobSearchStatus_groups = {"I�m not actively looking, but I am open to new opportunities":0, "I am not interested in new job opportunities":1,
                          "I am actively looking for a job":2}

FormalEducation_groups = {"Bachelor�s degree (BA, BS, B.Eng., etc.)":0, "Master�s degree (MA, MS, M.Eng., MBA, etc.)":1, "Some college/university study without earning a degree":2,
                          "Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)":3, "Associate degree":4, "Other doctoral degree (Ph.D, Ed.D., etc.)":5,
                          "Professional degree (JD, MD, etc.)":6, "Primary/elementary school":7, "I never completed any formal education":8}

CompanySize_groups = {"Fewer than 10 employees":0, "10 to 19 employees":1, "20 to 99 employees":2, "100 to 499 employees":3, "500 to 999 employees":4,
                      "1,000 to 4,999 employees":4, "5,000 to 9,999 employees":5, "10,000 or more employees":7}

YearsCoding_group = {"0-2 years":0, "3-5 years":1, "6-8 years":2, "9-11 years":3, "12-14 years":4, "15-17 years":5, "18-20 years":6,
                     "21-23 years":7, "24-26 years":8, "27-29 years":9, "30 or more years":10}

hope5years = {"Working in a different or more specialized technical role than the one I'm in now":0, "Working as a founder or co-founder of my own company":1,
              "Doing the same work":2, "Working as an engineering manager or other functional manager":3, "Working as a product manager or project manager":4,
              "Working in a career completely unrelated to software development":5, "Retirement":6}


db.replace({'Age':age_group}, inplace=True)
db.replace({'loan_status':loan_status_groups}, inplace=True)
db.replace({'JobSearchStatus':JobSearchStatus_groups}, inplace=True)
db.replace({'FormalEducation':FormalEducation_groups}, inplace=True)
db.replace({'CompanySize':CompanySize_groups}, inplace=True)
db.replace({'YearsCoding':YearsCoding_group}, inplace=True)

db['term'] = db['term'].replace('36 months','0', regex=True)
db['term'] = db['term'].replace('60 months','1', regex=True)

db.replace({'HopeFiveYears':hope5years}, inplace=True)
#db.replace({'':}, inplace=True)

#removing LOAN_AMT feature because we created a new feature LOAN_AMT_$
db = db.drop(['LOAN_AMT'], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [10]:
db.isnull().sum()

id                    0
Target_Salary         0
Target_Satisfied      0
dti                   0
Age                 919
loan_status           0
JobSearchStatus     435
FormalEducation      52
HopeFiveYears       469
CompanySize           0
YearsCoding           3
term                  0
last_pymnt_amnt       0
Loan_Amt_$            0
dtype: int64

In [11]:
#Now replacing null values with median in 'Age' , 'JobSearchStatus', 'FormalEducation', 'YearsCoding','HopeFiveYears' 
db['Age'].fillna(int(db['Age'].median()), inplace=True)
db['JobSearchStatus'].fillna(int(db['JobSearchStatus'].median()), inplace=True)
db['FormalEducation'].fillna(int(db['FormalEducation'].median()),inplace=True)
db['YearsCoding'].fillna(int(db['YearsCoding'].median()),inplace=True)
db['HopeFiveYears'].fillna(int(db['HopeFiveYears'].median()), inplace=True)


In [12]:
db.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2946 entries, 0 to 2945
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                2946 non-null   int64  
 1   Target_Salary     2946 non-null   float64
 2   Target_Satisfied  2946 non-null   int64  
 3   dti               2946 non-null   float64
 4   Age               2946 non-null   float64
 5   loan_status       2946 non-null   int64  
 6   JobSearchStatus   2946 non-null   float64
 7   FormalEducation   2946 non-null   float64
 8   HopeFiveYears     2946 non-null   float64
 9   CompanySize       2946 non-null   int64  
 10  YearsCoding       2946 non-null   float64
 11  term              2946 non-null   object 
 12  last_pymnt_amnt   2946 non-null   float64
 13  Loan_Amt_$        2946 non-null   object 
dtypes: float64(8), int64(4), object(2)
memory usage: 322.3+ KB


In [13]:
#changing datatype of some features
db = db.astype({"term": int})
db = db.astype({"Loan_Amt_$": float})

In [14]:
#Shuffling data
from sklearn.utils import shuffle
db = shuffle(db)

In [15]:
#checking total employees that are satisfied or not
db["Target_Satisfied"].value_counts()

0    2327
1     619
Name: Target_Satisfied, dtype: int64

In [16]:
db

Unnamed: 0,id,Target_Salary,Target_Satisfied,dti,Age,loan_status,JobSearchStatus,FormalEducation,HopeFiveYears,CompanySize,YearsCoding,term,last_pymnt_amnt,Loan_Amt_$
1165,68596033,57944.0,0,8.20,2.0,0,1.0,1.0,1.0,4,3.0,0,2497.74,9000.0
2457,68614804,58700.0,0,15.90,2.0,1,0.0,1.0,1.0,2,1.0,0,396.75,10825.0
188,66574010,38197.0,0,18.19,2.0,0,0.0,2.0,1.0,4,2.0,0,33804.36,35000.0
1667,68575973,65783.0,0,22.87,3.0,0,0.0,6.0,0.0,5,7.0,1,7464.96,15000.0
1564,68425588,37217.0,0,27.53,2.0,0,0.0,1.0,3.0,0,2.0,0,964.03,15000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2818,68594811,44083.0,0,22.15,2.0,1,1.0,0.0,0.0,7,4.0,0,847.66,25000.0
2489,68585787,36390.0,0,7.86,2.0,1,0.0,4.0,1.0,1,1.0,0,490.62,14000.0
2016,68564455,65783.0,0,18.40,3.0,0,0.0,4.0,1.0,3,8.0,0,489.15,7200.0
1198,68356170,87490.0,0,26.71,2.0,0,2.0,0.0,5.0,4,2.0,0,19.48,8000.0


In [17]:
#spliting the data into train : test as 80 : 20 randomly.
msk = np.random.rand(len(db)) < 0.8
train = db[msk]
test = db[~msk]

In [18]:
train.info()
print("\n ------------------------- \n")
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2373 entries, 1165 to 2834
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                2373 non-null   int64  
 1   Target_Salary     2373 non-null   float64
 2   Target_Satisfied  2373 non-null   int64  
 3   dti               2373 non-null   float64
 4   Age               2373 non-null   float64
 5   loan_status       2373 non-null   int64  
 6   JobSearchStatus   2373 non-null   float64
 7   FormalEducation   2373 non-null   float64
 8   HopeFiveYears     2373 non-null   float64
 9   CompanySize       2373 non-null   int64  
 10  YearsCoding       2373 non-null   float64
 11  term              2373 non-null   int32  
 12  last_pymnt_amnt   2373 non-null   float64
 13  Loan_Amt_$        2373 non-null   float64
dtypes: float64(9), int32(1), int64(4)
memory usage: 268.8 KB

 ------------------------- 

<class 'pandas.core.frame.DataFrame'>
Int64Ind

In [19]:
# removing some features and rearranging data
train_df = train.drop(['id','Target_Salary'], axis=1)
test_df = test.drop(['Target_Satisfied','Target_Salary'],axis=1)
target = test['Target_Satisfied']
combine = [train_df, test_df]
train_df.shape, test_df.shape

((2373, 12), (573, 12))

In [20]:
# spliting train and test data into X_train,Y_train , X_test, Y_test

X_train = train_df.drop("Target_Satisfied", axis=1)    # train data without 'Target_Satisfied' feature
Y_train = train_df["Target_Satisfied"]                 # train data that contains only 'Target_Satisfied'
X_test  = test_df.drop("id", axis=1).copy()            # test data without 'id' and 'Target_Satisfied'
Y_test = target                                        # test data that contains only 'Target_Satisfied'
X_train.shape, Y_train.shape, X_test.shape, Y_test.shape

((2373, 11), (2373,), (573, 11), (573,))

In [21]:
# as we can see above, the data is in the required shape

## Dreating different models

In [22]:
#Random Forest

random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, Y_train)
Y_pred = random_forest.predict(X_test)
random_forest.score(X_train, Y_train)
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
acc_random_forest

100.0

In [23]:
# checking the confusion matrix , precision_score, recall_score and f1_score for random forest
confusion_matrix(target,Y_pred)
print("confusion_matrix :\n",confusion_matrix(target,Y_pred))


print("-------------------------- \n precision_score : ",precision_score(target,Y_pred))
ran_for_prec =  precision_score(target,Y_pred)
print("\n-------------------------- \n recall_score : ",recall_score(target,Y_pred))
ran_for_recall = recall_score(target,Y_pred)

print("\n-------------------------- \n f1_score : ",f1_score(target,Y_pred))
ran_for_f1 = f1_score(target,Y_pred)


confusion_matrix :
 [[447   0]
 [125   1]]
-------------------------- 
 precision_score :  1.0

-------------------------- 
 recall_score :  0.007936507936507936

-------------------------- 
 f1_score :  0.015748031496062992


In [24]:
# Logistic Regression

logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
Y_pred = logreg.predict(X_test)
acc_log = round(logreg.score(X_train, Y_train) * 100, 2)
acc_log

confusion_matrix(target,Y_pred)
log_reg_prec =  precision_score(target,Y_pred)
log_reg_recall = recall_score(target,Y_pred)
log_reg_f1 = f1_score(target,Y_pred)


  _warn_prf(average, modifier, msg_start, len(result))


In [25]:
#correation of features
coeff_df = pd.DataFrame(train_df.columns.delete(0))
coeff_df.columns = ['Feature']
coeff_df["Correlation"] = pd.Series(logreg.coef_[0])

coeff_df.sort_values(by='Correlation', ascending=False)

Unnamed: 0,Feature,Correlation
2,loan_status,0.004916
8,term,0.001812
9,last_pymnt_amnt,-1.2e-05
10,Loan_Amt_$,-2.9e-05
3,JobSearchStatus,-0.007949
0,dti,-0.020844
6,CompanySize,-0.025851
5,HopeFiveYears,-0.027395
4,FormalEducation,-0.031912
1,Age,-0.042886


In [26]:
# Support Vector Machines

svc = SVC()
svc.fit(X_train, Y_train)
Y_pred = svc.predict(X_test)
acc_svc = round(svc.score(X_train, Y_train) * 100, 2)
acc_svc

confusion_matrix(target,Y_pred)
svm_prec =  precision_score(target,Y_pred)
svm_recall = recall_score(target,Y_pred)
svm_f1 = f1_score(target,Y_pred)

  _warn_prf(average, modifier, msg_start, len(result))


In [27]:
#KNN (k-nearest neighbors algorithm)
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(X_train, Y_train)
Y_pred = knn.predict(X_test)
acc_knn = round(knn.score(X_train, Y_train) * 100, 2)
acc_knn

confusion_matrix(target,Y_pred)
knn_prec =  precision_score(target,Y_pred)
knn_recall = recall_score(target,Y_pred)
knn_f1 = f1_score(target,Y_pred)

In [28]:
# Gaussian Naive Bayes

gaussian = GaussianNB()
gaussian.fit(X_train, Y_train)
Y_pred = gaussian.predict(X_test)
acc_gaussian = round(gaussian.score(X_train, Y_train) * 100, 2)
acc_gaussian

confusion_matrix(target,Y_pred)
gnb_prec =  precision_score(target,Y_pred)
gnb_recall = recall_score(target,Y_pred)
gnb_f1 = f1_score(target,Y_pred)

  _warn_prf(average, modifier, msg_start, len(result))


In [29]:
# Perceptron

perceptron = Perceptron()
perceptron.fit(X_train, Y_train)
Y_pred = perceptron.predict(X_test)
acc_perceptron = round(perceptron.score(X_train, Y_train) * 100, 2)
acc_perceptron

confusion_matrix(target,Y_pred)
percept_prec =  precision_score(target,Y_pred)
percept_recall = recall_score(target,Y_pred)
percept_f1 = f1_score(target,Y_pred)

In [30]:
# Linear SVC

linear_svc = LinearSVC()
linear_svc.fit(X_train, Y_train)
Y_pred = linear_svc.predict(X_test)
acc_linear_svc = round(linear_svc.score(X_train, Y_train) * 100, 2)
acc_linear_svc

confusion_matrix(target,Y_pred)
linear_svc_prec =  precision_score(target,Y_pred)
linear_svc_recall = recall_score(target,Y_pred)
linear_svc_f1 = f1_score(target,Y_pred)

  _warn_prf(average, modifier, msg_start, len(result))


In [31]:
# Stochastic Gradient Descent

sgd = SGDClassifier()
sgd.fit(X_train, Y_train)
Y_pred = sgd.predict(X_test)
acc_sgd = round(sgd.score(X_train, Y_train) * 100, 2)
acc_sgd

confusion_matrix(target,Y_pred)
sgd_prec =  precision_score(target,Y_pred)
sgd_recall = recall_score(target,Y_pred)
sgd_f1 = f1_score(target,Y_pred)

  _warn_prf(average, modifier, msg_start, len(result))


In [32]:
# Decision Tree

decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)
acc_decision_tree

confusion_matrix(target,Y_pred)
decision_prec =  precision_score(target,Y_pred)
decision_recall = recall_score(target,Y_pred)
decision_f1 = f1_score(target,Y_pred)

In [33]:
# Result of diffrent models with accuracy(score) ,precision_score, recall_score and f1_score
models = pd.DataFrame({
    'Model': ['Support Vector Machines', 'KNN', 'Logistic Regression', 
              'Random Forest', 'Naive Bayes', 'Perceptron', 
              'Stochastic Gradient Decent', 'Linear SVC', 
              'Decision Tree'],
    'Score': [acc_svc, acc_knn, acc_log, 
              acc_random_forest, acc_gaussian, acc_perceptron, 
              acc_sgd, acc_linear_svc, acc_decision_tree],

    'precision_score': [svm_prec, knn_prec, log_reg_prec, ran_for_prec,
                        gnb_prec, percept_prec, sgd_prec, linear_svc_prec,
                        decision_prec
                       ],
        
        
    'recall_score ': [svm_recall, knn_recall, log_reg_recall, ran_for_recall,
                      gnb_recall, percept_recall, sgd_recall, linear_svc_recall,
                      decision_recall
                     ],
                        
    'f1_score ': [svm_f1, knn_f1, log_reg_f1, ran_for_f1,
                  gnb_f1, percept_f1, sgd_f1, linear_svc_f1,
                  decision_f1
                 ]})
models.sort_values(by='Score', ascending=False)

Unnamed: 0,Model,Score,precision_score,recall_score,f1_score
3,Random Forest,100.0,1.0,0.007937,0.015748
8,Decision Tree,100.0,0.23622,0.238095,0.237154
1,KNN,84.16,0.184211,0.111111,0.138614
0,Support Vector Machines,79.22,0.0,0.0,0.0
2,Logistic Regression,79.22,0.0,0.0,0.0
4,Naive Bayes,79.22,0.0,0.0,0.0
6,Stochastic Gradient Decent,79.22,0.0,0.0,0.0
7,Linear SVC,79.22,0.0,0.0,0.0
5,Perceptron,20.78,0.219895,1.0,0.360515


In [34]:
db.head()

Unnamed: 0,id,Target_Salary,Target_Satisfied,dti,Age,loan_status,JobSearchStatus,FormalEducation,HopeFiveYears,CompanySize,YearsCoding,term,last_pymnt_amnt,Loan_Amt_$
1165,68596033,57944.0,0,8.2,2.0,0,1.0,1.0,1.0,4,3.0,0,2497.74,9000.0
2457,68614804,58700.0,0,15.9,2.0,1,0.0,1.0,1.0,2,1.0,0,396.75,10825.0
188,66574010,38197.0,0,18.19,2.0,0,0.0,2.0,1.0,4,2.0,0,33804.36,35000.0
1667,68575973,65783.0,0,22.87,3.0,0,0.0,6.0,0.0,5,7.0,1,7464.96,15000.0
1564,68425588,37217.0,0,27.53,2.0,0,0.0,1.0,3.0,0,2.0,0,964.03,15000.0


### To find RMSE for 'Target_salery'

In [35]:

x = db.drop(['id','Target_Salary','Target_Satisfied'], axis = 1)
y = db['Target_Salary']

In [36]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.20, random_state = 0)

In [37]:
# Fitting Random Forest Regression to the Training set
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators = 20, random_state = 0)

In [38]:
regressor.fit(X_train, y_train)

RandomForestRegressor(n_estimators=20, random_state=0)

In [39]:
# Predicting the Test set results
y_pred = regressor.predict(X_test)
y_pred

array([55873.75    , 76019.65    , 59169.685   , 72904.45    ,
       58309.678505, 55182.424   , 56882.462   , 61229.35    ,
       70112.4     , 63270.727   , 57522.13744 , 77424.9615  ,
       59072.4725  , 56733.05    , 62211.477   , 59944.4     ,
       73585.691   , 62148.3895  , 60798.5802  , 57126.48162 ,
       62318.59912 , 63549.8     , 66319.144945, 62967.459   ,
       48366.687805, 49104.7455  , 59666.325   , 44734.5395  ,
       59557.175505, 61895.676055, 65325.5725  , 66714.55    ,
       75639.8     , 58501.3     , 69815.118   , 48359.6875  ,
       40493.9432  , 64017.85    , 61600.8275  , 62889.026055,
       55086.5065  , 62481.937   , 54005.985   , 64314.35    ,
       56563.73162 , 51873.88776 , 59332.076055, 59499.8715  ,
       66047.323945, 65334.35    , 65684.935   , 68836.092   ,
       53439.6     , 57859.70632 , 58087.075   , 54338.74898 ,
       66207.05    , 64374.185   , 41821.44251 , 52242.741   ,
       58729.074   , 53041.70611 , 68590.925   , 61642.

In [40]:
# Evaluating the Algorithm
from sklearn import metrics
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error: 20561.223530788135
Mean Squared Error: 871377947.4120413
Root Mean Squared Error: 29519.111562037928
