In [24]:
import os
from pathlib import Path
import random


class LinearRegression:
    def __init__(self):
        self.b1 = 0
        self.b0 = 0
        self.dataset = []
        self.testset = []
        self.trainset = []

    def sumList(self, dataset):
        total = 0
        for item in dataset:
            total += item
        return total

    def meanList(self, dataset):
        return self.sumList(dataset) / len(dataset)

    def getDataset(self, path, filename):
        os.chdir(path)
        p = Path(filename)
        if p.exists():
            with open(filename) as file:
                self.dataset = file.readlines()

    def splitDataset(self, ratio):
        ratio *= len(self.dataset)
        from math import floor
        ratio = floor(ratio)
        i = 0
        usednums = []
        randomnum = 0
        while i < ratio:
            used = True
            while used:
                randomnum = random.randint(0, len(self.dataset) - 1)
                used = False
                if len(usednums) == 0:
                    usednums.append(randomnum)
                    used = False
                for item in usednums:
                    if randomnum == item:
                        used = True
            self.trainset.append(self.dataset[randomnum])
            usednums.append(randomnum)
            i += 1
        while i < len(self.dataset) - 1:
            used = True
            while used:
                randomnum = random.randint(0, len(self.dataset) - 1)
                used = False
                for item in usednums:
                    if randomnum == item:
                        used = True
            self.testset.append(self.dataset[randomnum])
            usednums.append(randomnum)
            i += 1

    def getlists(self, dataset):
        xlist = []
        ylist = []
        for item in dataset:
            a, b = item.split(',')
            x = float(a.strip())
            y = int(b.strip())
            xlist.append(x)
            ylist.append(y)
        return xlist, ylist

    def variance(self, dataset):
        xlist, ylist = self.getlists(dataset)
        mean = self.meanList(xlist)
        total = 0
        for item in xlist:
            total += (item - mean) ** 2
        return total / (len(xlist) - 1)

    def covariance(self, dataset):
        xlist, ylist = self.getlists(dataset)
        meanx = self.meanList(xlist)
        totalx = []
        for item in xlist:
            totalx.append((item - meanx))
        meany = self.meanList(ylist)
        totaly = []
        for item in ylist:
            totaly.append((item - meany))
        total = 0
        for i in range(len(totalx)):
            total += totalx[i] * totaly[i]
        return total / (len(xlist) - 1)

    def fit(self, dataset):
        self.b1 = self.covariance(dataset) / self.variance(dataset)
        xlist, ylist = self.getlists(dataset)
        self.b0 = self.meanList(ylist) - (self.b1 * self.meanList(xlist))
        return self.b1, self.b0

    def predict(self, dataset):
        xlist, ylist = self.getlists(dataset)
        predictedy = []
        for i in range(len(ylist)):
            predictedy.append(self.b0 + (self.b1 * xlist[i]))
            print('for x value {}, predicted y value is {:.2f}'.format(xlist[i], predictedy[i]))
        return predictedy

    def rmse(self, dataset):
        xlist, ylist = self.getlists(dataset)
        predictedy = self.predict(dataset)
        total = 0
        for i in range(len(ylist)):
            total += (predictedy[i] - ylist[i]) ** 2
        from math import sqrt
        return sqrt(total / len(ylist))



In [26]:
line1 = LinearRegression()
line1.getDataset(".", "salary.csv")
line1.splitDataset(.80)
a, b = line1.fit(line1.trainset)
c = line1.rmse(line1.testset)
x, y = line1.getlists(line1.testset)
print('\nvariance is: {:.2f} \ncovariance is: {:.2f} \nx-list is: {} \ny-list is: {}\n'.format(line1.variance(line1.trainset), line1.covariance(line1.trainset), x, y))
print("b0 is {:.2f}, b1 is {:.2f}, RMSE is {:.2f}".format(b, a, c))


for x value 1.3, predicted y value is 37792.86
for x value 8.7, predicted y value is 108552.90
for x value 7.1, predicted y value is 93253.43
for x value 4.1, predicted y value is 64566.93
for x value 4.5, predicted y value is 68391.80

variance is: 7.90 
covariance is: 75578.37 
x-list is: [1.3, 8.7, 7.1, 4.1, 4.5] 
y-list is: [46205, 109431, 98273, 57081, 61111]

b0 is 25362.04, b1 is 9562.17, RMSE is 6415.30
