In [0]:
# Required Python Packages
import pandas as pd
from math import pow


In [0]:
def get_headers(dataframe):
    """
    Return the column labels of the given dataframe
    :param dataframe: object exposing a ``columns`` attribute (e.g. a pandas DataFrame)
    :return: the ``values`` array of ``dataframe.columns``
    """
    column_index = dataframe.columns
    return column_index.values





In [0]:
def cal_mean(readings):
    """
    Arithmetic mean of the input readings
    :param readings: sized iterable of numbers
    :return: sum of the readings divided by how many there are (as a float division)
    """
    # float() on the count keeps the division a true division on Python 2 as well
    return sum(readings) / float(len(readings))


In [0]:
def cal_variance(readings):
    """
    Sample variance of the readings (divides by n - 1)
    :param readings: sized iterable of numbers
    :return: sum of squared deviations from the mean, divided by (len(readings) - 1)
    """
    # The deviations are measured against the mean computed by cal_mean
    centre = cal_mean(readings)
    squared_deviation_total = sum([pow((value - centre), 2) for value in readings])
    degrees_of_freedom = float(len(readings) - 1)
    return squared_deviation_total / degrees_of_freedom


def cal_covariance(readings_1, readings_2):
    """
    Calculate the sample covariance between two different list of readings
    :param readings_1: sized iterable of numbers
    :param readings_2: sized iterable of numbers, paired by position with readings_1
    :return: sum of the products of the paired deviations from each mean,
             divided by (number of readings - 1)
    :raises ValueError: if the two inputs do not have the same length
    """
    readings_size = len(readings_1)
    if readings_size != len(readings_2):
        raise ValueError("readings_1 and readings_2 must have the same length")

    readings_1_mean = cal_mean(readings_1)
    readings_2_mean = cal_mean(readings_2)

    covariance = 0.0
    # zip pairs the readings by position. This replaces the Python 2 only
    # xrange() index loop and avoids [i] look-ups, which are label based
    # (not positional) when the inputs are pandas Series.
    for reading_1, reading_2 in zip(readings_1, readings_2):
        covariance += (reading_1 - readings_1_mean) * (reading_2 - readings_2_mean)
    return covariance / float(readings_size - 1)


In [0]:
def cal_simple_linear_regression_coefficients(x_readings, y_readings):
    """
    Simple linear regression coefficients (B0, B1) for y against x
    :param x_readings: readings of the independent variable
    :param y_readings: readings of the dependent variable
    :return: tuple (b0, b1) - intercept first, slope second
    """
    # Slope: covariance(x, y) / variance(x), both taken from the helpers in this file
    xy_covariance = cal_covariance(x_readings, y_readings)
    x_variance = cal_variance(x_readings)
    slope = xy_covariance / float(x_variance)

    # Intercept: mean(y) - slope * mean(x)
    intercept = cal_mean(y_readings) - (slope * cal_mean(x_readings))
    return intercept, slope


def predict_target_value(x, b0, b1):
    """
    Evaluate the fitted line at x
    :param x: input value
    :param b0: intercept coefficient
    :param b1: slope coefficient
    :return: b0 + b1 * x
    """
    slope_term = b1 * x
    return b0 + slope_term


def cal_rmse(actual_readings, predicted_readings):
    """
    Calculating the root mean square error
    :param actual_readings: sized iterable of observed values
    :param predicted_readings: sized iterable of predicted values, paired by
                               position with actual_readings
    :return: square root of the mean of the squared (predicted - actual) errors
    :raises ValueError: if the two inputs do not have the same length
    :raises ZeroDivisionError: if the inputs are empty (the mean divides by the count)
    """
    total_readings = len(actual_readings)
    if total_readings != len(predicted_readings):
        raise ValueError("actual_readings and predicted_readings must have the same length")

    square_error_total = 0.0
    # Pair the readings by position instead of [i] look-ups, which are label
    # based (not positional) when the inputs are pandas Series.
    for actual, predicted in zip(actual_readings, predicted_readings):
        error = predicted - actual
        square_error_total += error ** 2

    mean_square_error = square_error_total / float(total_readings)
    # The square root is what turns the mean square error into the RMSE;
    # previously the mean square error itself was returned.
    return mean_square_error ** 0.5



In [0]:
def simple_linear_regression(dataset):
    """
    Implementing the simple linear regression without using any python library

    The first column of the dataset is used as the feature and the second
    column as the target. The function prints the headers, the coefficient
    and the intercept, adds the fitted values to the dataset in place as a
    'Predicted_Salary' column, prints the dataset and finally prints the
    error value returned by cal_rmse.
    :param dataset: pandas DataFrame whose first two columns are numeric
    :return: None
    """

    # Get the dataset header names
    dataset_headers = get_headers(dataset)
    print ("Dataset Headers :: ", dataset_headers)

    # Feature and target are picked by position, so the function is not tied
    # to particular column names
    feature_header = dataset_headers[0]
    target_header = dataset_headers[1]

    # Mean of the feature and target readings, variance of the feature
    feature_mean = cal_mean(dataset[feature_header])
    target_mean = cal_mean(dataset[target_header])
    feature_variance = cal_variance(dataset[feature_header])

    # Slope = covariance(feature, target) / variance(feature).
    # The covariance is computed on just the two columns involved, so any
    # other columns (including 'Predicted_Salary' from an earlier call) are ignored.
    covariance_of_feature_and_target = dataset[feature_header].cov(dataset[target_header])
    w1 = covariance_of_feature_and_target / float(feature_variance)

    # Intercept = mean(target) - slope * mean(feature)
    w0 = target_mean - (w1 * feature_mean)

    print('co efficent :' , w1)
    print('Intercept :' ,   w0)

    # Predictions (stored on the caller's dataframe)
    dataset['Predicted_Salary'] = w0 + w1 * dataset[feature_header]
    print(dataset)

    # Compare against the target column found by position rather than a
    # hard-coded 'Salary' label, which raised KeyError for other datasets
    RMSE = cal_rmse(dataset[target_header], dataset['Predicted_Salary'])
    print('RMSE::', RMSE)




In [0]:
if __name__ == "__main__":

    # Read the CSV into a module-level DataFrame; the later
    # simple_linear_regression(house_price_dataset) call uses this name.
    # NOTE(review): the variable is called "house_price" but the file read is Salary_Data.csv.
    input_path = 'Salary_Data.csv'
    house_price_dataset = pd.read_csv(input_path)
    

In [11]:
# Run the hand-written regression on the loaded data. Besides printing its
# results, the call adds a 'Predicted_Salary' column to house_price_dataset.
simple_linear_regression(house_price_dataset)


Dataset Headers ::  ['YearsExperience' 'Salary']
co efficent : 9449.962321455076
Intercept : 25792.200198668696
    YearsExperience    Salary  Predicted_Salary
0               1.1   39343.0      36187.158752
1               1.3   46205.0      38077.151217
2               1.5   37731.0      39967.143681
3               2.0   43525.0      44692.124842
4               2.2   39891.0      46582.117306
5               2.9   56642.0      53197.090931
6               3.0   60150.0      54142.087163
7               3.2   54445.0      56032.079627
8               3.2   64445.0      56032.079627
9               3.7   57189.0      60757.060788
10              3.9   63218.0      62647.053252
11              4.0   55794.0      63592.049484
12              4.0   56957.0      63592.049484
13              4.1   57081.0      64537.045717
14              4.5   61111.0      68317.030645
15              4.9   67938.0      72097.015574
16              5.1   66029.0      73987.008038
17              5.3   83

In [0]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('Salary_Data.csv')
# Features are every column except the last; the target is the last column.
# Selecting the target with -1 (rather than a fixed position 1) keeps it
# consistent with the ":-1" feature slice, so the target can never be part of X.
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values


In [13]:
# Fitting scikit-learn's LinearRegression on X and y.
# NOTE: X and y are the arrays taken from the whole dataset; no train/test
# split is made anywhere before this cell.
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [0]:
# Predicting on the same X the regressor was fitted on (there is no separate test set)
y_pred = regressor.predict(X)

In [0]:
from sklearn.metrics import mean_squared_error

In [16]:
# Mean squared error between the targets y and the predictions y_pred
mean_squared_error(y, y_pred)

31270951.722280964

In [0]:
def rmse(predictions, targets):
    """
    Root mean squared error between predictions and targets
    :param predictions: array-like supporting element-wise arithmetic (e.g. a numpy array)
    :param targets: array-like of the same kind, paired element-wise with predictions
    :return: square root of the mean of the squared element-wise differences
    """
    residuals = predictions - targets
    squared_residuals = residuals ** 2
    return np.sqrt(squared_residuals.mean())

In [18]:
# Root mean squared error of the scikit-learn predictions against the targets
rmse(y_pred,y)

5592.043608760661