In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%notebook inline

In [9]:
# Load the salary dataset from a CSV file given by a relative path.
# NOTE(review): the df.head() preview shows an "Unnamed: 0" column, which suggests
# the file was saved together with its index; passing index_col=0 to read_csv
# would presumably drop it — confirm against the raw file before changing.
df = pd.read_csv("salary_data.csv")

In [10]:
# Preview the first rows to sanity-check the loaded columns and values.
df.head()

Unnamed: 0,YearsExperience,Salary
0,1.1,39343
1,1.3,46205
2,1.5,37731
3,2.0,43525
4,2.2,39891


In [11]:
# Check every column of df for missing values.
#
# The check must look at the column *contents*: df.isnull() flags each missing
# cell, .any() reduces that to one boolean per column, and .sum() counts the
# columns that contain at least one missing value. (Testing the column *names*
# for null, as an earlier version did, can never detect missing data.)

print("Checking for null values in all columns...\n")
count = int(df.isnull().any().sum())
if count > 0:
    print(str(count) + " columns contain null values")
else:
    print("No column contains null values")

Checking for null values in all columns...

No column contains null values


In [13]:
def calcMean(X):
    """Return the arithmetic mean of the values in ``X``.

    ``X`` may be any iterable of numbers; it is traversed exactly once, so
    one-shot iterators work as well. Values are accumulated left to right.
    An empty iterable results in ``ZeroDivisionError`` (0 / 0).
    """
    running_total = 0
    n_values = 0

    # enumerate(..., start=1) leaves n_values equal to the number of items seen.
    for n_values, value in enumerate(X, start=1):
        running_total += value

    return running_total / n_values

In [3]:
def train_test_split(X, Y, train_fraction=0.7):
    """Split two aligned sequences into a leading train part and a trailing test part.

    The split is positional and does no shuffling: the first
    ``int(train_fraction * len(X))`` elements form the training set and the
    remaining elements form the test set.

    Parameters
    ----------
    X, Y : sliceable sequences of equal length (e.g. lists or numpy arrays)
        Features and targets; element ``i`` of ``X`` belongs to element ``i`` of ``Y``.
    train_fraction : float, default 0.7
        Share of the data assigned to the training set; must lie in [0, 1].

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``

    Raises
    ------
    ValueError
        If ``X`` and ``Y`` differ in length or ``train_fraction`` is outside [0, 1].
    """
    if len(X) != len(Y):
        raise ValueError(
            "X and Y must have the same length, got %d and %d" % (len(X), len(Y))
        )
    if not 0 <= train_fraction <= 1:
        raise ValueError("train_fraction must be between 0 and 1")

    # Size the split from the data actually passed in, not from a notebook-level
    # DataFrame, so the function also works for inputs that do not come from df.
    train_size = int(train_fraction * len(X))

    X_train, X_test = X[:train_size], X[train_size:]
    y_train, y_test = Y[:train_size], Y[train_size:]

    return (X_train, X_test, y_train, y_test)
    

In [22]:
def trainModel(X_train, y_train, mean_X, mean_Y):
    """Fit a one-feature least-squares line ``y = m * x + c`` and score it on the training data.

    The slope is ``sum((x - mean_X) * (y - mean_Y)) / sum((x - mean_X) ** 2)``
    and the intercept follows from ``c = mean_Y - m * mean_X``.

    Parameters
    ----------
    X_train, y_train : array-like of numbers, same shape
        Training features and targets (lists and numpy arrays both work).
    mean_X, mean_Y : float
        Means of ``X_train`` and ``y_train``, supplied by the caller.

    Returns
    -------
    tuple
        ``(train_accuracy, m, c)`` where ``train_accuracy`` is
        ``100 - mean absolute percentage error`` of the fitted line on the
        training data, ``m`` is the slope and ``c`` the intercept.
        Targets equal to zero make the percentage error undefined.

    Raises
    ------
    ValueError
        If the inputs are empty, differ in shape, or all ``X_train`` values are
        identical (the slope is then undefined).
    """
    # Convert once so plain Python lists are accepted as well as numpy arrays.
    x = np.asarray(X_train, dtype=float)
    y_true = np.asarray(y_train, dtype=float)

    if x.shape != y_true.shape:
        raise ValueError("X_train and y_train must have the same shape")
    if x.size == 0:
        raise ValueError("cannot train on empty data")

    dx = x - mean_X
    numerator = np.sum(dx * (y_true - mean_Y))
    denominator = np.sum(dx ** 2)

    if denominator == 0:
        raise ValueError("all X_train values are identical; slope is undefined")

    m = numerator / denominator  # slope
    c = mean_Y - (m * mean_X)    # intercept, from y = m*x + c at the mean point

    y_pred = (m * x) + c

    # Divide by |y| so that negative targets cannot produce negative percentage
    # errors (which would push the reported accuracy above 100 %).
    error = np.abs(y_pred - y_true) / np.abs(y_true) * 100
    train_accuracy = 100 - np.mean(error)

    return (train_accuracy, m, c)

In [23]:
def testModel(X_test,y_test, m,c):
    
    y = (m * X_test) + c
    
    error = (abs(y - y_test)/y_test) * 100
    test_accuracy= 100 - sum(error)/ len(error)
    
    return(test_accuracy)

In [56]:
def makePred(m,c):
    """Ask for years of experience and print the salary predicted by ``y = m * x + c``.

    Reads one line from standard input, converts it with ``float`` (so a
    non-numeric entry raises ``ValueError``), and prints the prediction rounded
    with ``np.round``. Returns nothing.
    """
    years = float(input("\nEnter the (years of experience): "))
    predicted_salary = np.round((m * years) + c)

    print("------------------------------")
    print("Predicted Salary: ", predicted_salary)
    

In [52]:
def linearRegression():
    """Run the whole simple-linear-regression workflow on the notebook-level ``df``.

    Steps:
      1. Take ``YearsExperience`` as the feature and ``Salary`` as the target.
      2. Split them with ``train_test_split``.
      3. Fit the line with ``trainModel`` and score it with ``testModel``.
      4. Print both accuracies.
      5. Optionally make an interactive prediction (``makePred``) and, after
         that, optionally draw a plot (``regPlot``).

    The function is interactive: it reads answers from standard input. Answers
    are matched case-insensitively and surrounding whitespace is ignored, so
    ``y``, ``Y`` and ``yes`` all count as a yes.

    NOTE(review): ``regPlot`` is not defined in the visible part of the
    notebook; it is expected to exist elsewhere — confirm.
    """
    print("Running a Linear Regression on the dataset..\n")

    X = df["YearsExperience"].values  # array containing years of experience
    Y = df["Salary"].values           # array containing salaries

    X_train, X_test, y_train, y_test = train_test_split(X, Y)

    # Means come from the training part only, so nothing leaks from the test part.
    mean_X = calcMean(X_train)
    mean_Y = calcMean(y_train)

    print("Training the model..")
    train_accuracy, m, c = trainModel(X_train, y_train, mean_X, mean_Y)
    print("------------------------------")
    print("Training Accuracy: " + str(np.round(train_accuracy, 4)) + " %")
    print("==============================")

    print("Testing the model..")
    test_accuracy = testModel(X_test, y_test, m, c)
    print("------------------------------")
    print("Testing Accuracy: "+ str(np.round(test_accuracy, 4)) + " %")
    print("==============================")

    ch = input("\nWant to make a prediction (Y/N)?: ")

    # Normalise the answer so "y" or " Y " is not silently treated as "no".
    if ch.strip().upper() in ("Y", "YES"):
        makePred(m, c)
        ch = input("Do you want to generate a plot (Y/N)?: ")
        if ch.strip().upper() in ("Y", "YES"):
            regPlot()
    else:
        print("Okay!")

In [59]:
# Run the whole workflow. This cell is interactive: it prompts for keyboard input.
linearRegression()

Running a Linear Regression on the dataset..

Training the model..
------------------------------
Training Accuracy: 91.4202 %
Testing the model..
------------------------------
Testing Accuracy: 96.5703 %

Want to make a prediction (Y/N)?: Y

Enter the (years of experience): 5
------------------------------
Predicted Salary:  72549.0
