# Regression Model Evaluation

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVR
from sklearn.metrics import r2_score
from sklearn.decomposition import KernelPCA

## Importing the dataset

In [2]:
dataset = pd.read_csv('Engineering_graduate_salary_processed.csv')
X = dataset.iloc[:, :-1]
y = dataset.iloc[:, -1].values
#X.drop(['ComputerProgramming','ElectronicsAndSemicon','ComputerScience','MechanicalEngg','ElectricalEngg','TelecomEngg','CivilEngg'], axis=1, inplace=True)

In [3]:
X

Unnamed: 0,Gender_f,Gender_m,10board_0,10board_board of secondary education,10board_cbse,10board_icse,10board_matriculation,10board_other,10board_rbse,10board_ssc,...,ComputerScience,MechanicalEngg,ElectricalEngg,TelecomEngg,CivilEngg,conscientiousness,agreeableness,extraversion,nueroticism,openess_to_experience
0,1,0,0,0,1,0,0,0,0,0,...,406.501429,402.026738,423.336066,349.879562,338.807692,-0.1590,0.3789,1.2396,0.14590,0.2889
1,0,1,0,0,1,0,0,0,0,0,...,406.501429,402.026738,423.336066,349.879562,338.807692,1.1336,0.0459,1.2396,0.52620,-0.2859
2,0,1,0,0,0,0,0,1,0,0,...,406.501429,402.026738,423.336066,260.000000,338.807692,0.5100,-0.1232,1.5428,-0.29020,-0.2875
3,0,1,0,0,1,0,0,0,0,0,...,406.501429,402.026738,423.336066,349.879562,338.807692,-0.4463,0.2124,0.3174,0.27270,0.4805
4,1,0,0,0,1,0,0,0,0,0,...,406.501429,402.026738,423.336066,349.879562,338.807692,-1.4992,-0.7473,-1.0697,0.06223,0.1864
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2993,1,0,1,0,0,0,0,0,0,0,...,406.501429,402.026738,423.336066,349.879562,338.807692,-1.1901,0.9688,-1.0697,1.35490,0.0284
2994,1,0,0,0,0,0,0,0,0,0,...,406.501429,402.026738,423.336066,349.879562,338.807692,-0.1082,0.0328,-0.4891,-0.29020,0.5024
2995,0,1,0,0,0,0,0,1,0,0,...,406.501429,402.026738,423.336066,349.879562,338.807692,-0.8810,0.1888,-0.3440,0.06230,0.6603
2996,0,1,0,0,0,0,0,1,0,0,...,406.501429,402.026738,423.336066,349.879562,338.807692,1.4374,1.2808,-0.4891,-1.46537,0.5419


## Splitting the dataset into the Training set and Test set

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
y_train = y_train.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)

In [5]:
scale_cols=['Age','10percentage','12graduation','12percentage','collegeGPA','GraduationYear','English','Logical','Quant','Domain','ComputerProgramming','ElectronicsAndSemicon','ComputerScience','MechanicalEngg','ElectricalEngg','TelecomEngg','CivilEngg']
scx = StandardScaler()
scy = StandardScaler()
pd.DataFrame(X_train,columns = X.columns)
pd.DataFrame(X_test,columns = X.columns)
X_train[scale_cols] = scx.fit_transform(X_train[scale_cols])
y_train = scy.fit_transform(y_train)
X_test[scale_cols] = scx.transform(X_test[scale_cols])
y_test = scy.transform(y_test)
X_train=X_train.to_numpy()
X_test=X_test.to_numpy()
y_train=np.ravel(y_train)
y_test=np.ravel(y_test)

In [6]:
X_train

array([[ 0.    ,  1.    ,  0.    , ...,  0.4711,  1.1601,  0.4805],
       [ 0.    ,  1.    ,  0.    , ...,  0.3817,  0.1798,  0.5024],
       [ 0.    ,  1.    ,  0.    , ...,  1.0348, -1.2303,  0.5024],
       ...,
       [ 0.    ,  1.    ,  0.    , ...,  0.2366, -1.2303, -0.4455],
       [ 0.    ,  1.    ,  0.    , ..., -0.1988, -0.2902,  0.3049],
       [ 1.    ,  0.    ,  0.    , ...,  0.4711, -1.7556, -0.6692]])

In [7]:
X_test

array([[ 0.    ,  1.    ,  0.    , ...,  0.8171, -0.4078, -0.4455],
       [ 0.    ,  1.    ,  0.    , ...,  0.0914, -0.4078,  0.0284],
       [ 1.    ,  0.    ,  0.    , ...,  0.672 , -0.2609, -0.1295],
       ...,
       [ 0.    ,  1.    ,  0.    , ..., -0.7794, -1.7004, -0.5245],
       [ 0.    ,  1.    ,  0.    , ..., -0.4511, -0.995 , -0.0943],
       [ 0.    ,  1.    ,  0.    , ...,  1.3933,  0.653 , -0.6692]])

In [8]:
y_train

array([-0.50158777,  0.77366306, -0.2055474 , ...,  1.13802045,
        2.25386493,  0.11326531])

In [9]:
y_test

array([ 6.59801382e-01, -9.57034501e-01, -8.43172819e-01, -2.33687095e-02,
       -5.69904783e-01, -9.34262165e-01, -4.78815437e-01,  5.00395027e-01,
       -2.33687095e-02, -8.43172819e-01,  2.04354654e-01,  8.64752409e-01,
       -5.24360110e-01,  1.13265309e-01, -2.96636746e-01, -9.34262165e-01,
       -4.10498428e-01, -2.33687095e-02, -3.87726092e-01, -2.96636746e-01,
        2.04354654e-01,  4.54850355e-01, -9.79806838e-01, -4.78815437e-01,
        1.13265309e-01, -5.01587774e-01, -5.69904783e-01,  9.10297082e-01,
        8.87524746e-01,  4.32078018e-01, -2.96636746e-01, -2.33687095e-02,
        4.32078018e-01,  9.04929724e-02, -7.06538801e-01, -2.96636746e-01,
       -4.33270764e-01,  5.23167364e-01,  9.04929724e-02,  1.13265309e-01,
       -5.24360110e-01,  2.72671663e-01, -1.04812385e+00, -8.43172819e-01,
        4.54850355e-01, -6.15449455e-01,  6.77206360e-02,  2.95444000e-01,
        1.13265309e-01,  1.20633746e+00,  4.49482997e-02,  4.49482997e-02,
        4.49482997e-02, -

## Training the models on the whole dataset

In [10]:
RFregressor = RandomForestRegressor(n_estimators = 100)
RFregressor.fit(X_train,y_train)

RandomForestRegressor()

In [11]:
DTregressor = DecisionTreeRegressor(min_samples_leaf=65)
DTregressor.fit(X_train, y_train)

DecisionTreeRegressor(min_samples_leaf=65)

In [12]:
Lregressor = LinearRegression()
Lregressor.fit(X_train,y_train)

LinearRegression()

In [13]:
SVRegressor = SVR()
SVRegressor.fit(X_train, y_train)

SVR()

## Predicting the Test set results

In [14]:
RF_y_pred = RFregressor.predict(X_test)

In [15]:
DT_y_pred = DTregressor.predict(X_test)

In [16]:
L_y_pred = Lregressor.predict(X_test) 

In [17]:
SVR_y_pred = SVRegressor.predict(X_test) 

## Evaluating the Model Performances

In [18]:
r2_score(y_test, RF_y_pred)*100

1.5869492336704538

In [19]:
r2_score(y_test, DT_y_pred)*100

16.745337893437352

In [20]:
r2_score(y_test, L_y_pred)*100

17.557124203038953

In [21]:
r2_score(y_test, SVR_y_pred)*100

20.91677527991441