## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Load the dataset

In [2]:
dataset = pd.read_csv('Salary_Data.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [3]:
X.shape

(30, 1)

## splitting the dataset into the teraining set and test set

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3)

In [5]:
X_train.shape

(20, 1)

In [6]:
X_test.shape

(10, 1)

## Training the simple linear regression model on the training set 

In [9]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

LinearRegression()

## Predicting the test set results

In [10]:
regressor.predict([[3.7]])

array([61058.82022717])

In [11]:
y_pred = regressor.predict(X_test)
y_pred

array([ 53374.54136938, 109085.56308835,  39927.05336825,  61058.82022717,
        64900.95965607,  74506.3082283 ,  38005.98365381,  82190.58708609,
       117730.37680336, 101401.28423056])

## Getting the actual, predicted and error values

In [12]:
y_test, y_pred, y_test-y_pred

(array([ 56642., 109431.,  37731.,  57189.,  57081.,  66029.,  46205.,
         81363., 112635., 101302.]),
 array([ 53374.54136938, 109085.56308835,  39927.05336825,  61058.82022717,
         64900.95965607,  74506.3082283 ,  38005.98365381,  82190.58708609,
        117730.37680336, 101401.28423056]),
 array([ 3267.45863062,   345.43691165, -2196.05336825, -3869.82022717,
        -7819.95965607, -8477.3082283 ,  8199.01634619,  -827.58708609,
        -5095.37680336,   -99.28423056]))

In [18]:
df = pd.DataFrame(data=zip(y_test, y_pred, y_test-y_pred), columns=['actual', 'predicted', 'error'], index=X_test[:, 0])

In [19]:
df

Unnamed: 0,actual,predicted,error
2.9,56642.0,53374.541369,3267.458631
8.7,109431.0,109085.563088,345.436912
1.5,37731.0,39927.053368,-2196.053368
3.7,57189.0,61058.820227,-3869.820227
4.1,57081.0,64900.959656,-7819.959656
5.1,66029.0,74506.308228,-8477.308228
1.3,46205.0,38005.983654,8199.016346
5.9,81363.0,82190.587086,-827.587086
9.6,112635.0,117730.376803,-5095.376803
7.9,101302.0,101401.284231,-99.284231


## y = m * x + c 

In [20]:
m = regressor.coef_
m

array([9605.34857224])

In [22]:
c = regressor.intercept_
c

25519.030509899843

In [32]:
y = m * 3.7 + c # 3.7 coz we are predicting on that values and we are seeing that this formual gives same answer
y

array([61058.82022717])

## Performance Measures 

In [24]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [25]:
print("MAE", mean_absolute_error(y_test, y_pred))

MAE 4019.7301488264334


In [26]:
print("MSE", mean_squared_error(y_test, y_pred))

MSE 25749178.69265653


In [27]:
print("RMSE", np.sqrt(mean_squared_error(y_test, y_pred)))

RMSE 5074.364856083619


In [29]:
print("R2_Score", r2_score(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

R2_Score 0.9606799965862776


In [31]:
# Adjusted R2 Score
1 - ((1 - r2) * (30 - 1) / (30 - 1 - 1))

0.9592757107500732

In [33]:
# https://www.geeksforgeeks.org/python-coefficient-of-determination-r2-score/
# Coefficient of determination also called as R2 score is used to evaluate the performance of a linear regression model. 
# It is the amount of the variation in the output dependent attribute which is predictable from the input independent 
# variable(s). It is used to check how well-observed results are reproduced by the model, depending on the ratio of total 
# deviation of results described by the model.

In [34]:
# https://www.geeksforgeeks.org/ml-adjusted-r-square-in-regression-analysis/
# R-square test is used to determine the goodness of fit in regression analysis. Goodness of fit implies how better regression 
# model is fitted to the data points. More is the value of r-square near to 1, better is the model. But the problem lies in the
# fact that the value of r-square always increases as new variables(attributes) are added to the model, no matter that the newly
# added attributes have a positive impact on the model or not. also, it can lead to overfitting of the model if there are large
# no. of variables.
# Adjusted r-square is a modified form of r-square whose value increases if new predictors tend to improve model’s performance 
# and decreases if new predictors do not improve performance as expected.