In [None]:
import pandas as pd

# Toy data set: five (X, Y) observations used by the cells below.
# The frame is built directly from the dict. `from_dict` is a classmethod, so
# calling it on a throwaway `pd.DataFrame()` instance only creates and discards
# an empty frame.
df = pd.DataFrame({'X': [1, 2, 3, 4, 5], 'Y': [1, 2, 1.3, 3.75, 2.25]})
df

The formula for a regression line is

Y' = bX + A

Y' = b1X1 + b2X2 + b3X3 + b0    (multiple regression: one coefficient per input variable X1, X2, X3, with b0 as the intercept)

Let's calculate Y'

In [None]:
# First guess for the regression line Y' = b*X + A: slope b and intercept A.
b, A = 0.5, 0

In [None]:
# Predicted value for every row under the current guess: Y' = b*X + A.
df["Y pred"] = b * df["X"] + A
df

In [None]:
# Residual per row: observed Y minus the value predicted by the line.
df["error"] = df["Y"].sub(df["Y pred"])
df

In [None]:
# Square each residual so positive and negative errors cannot cancel out.
df["error sq"] = df["error"].mul(df["error"])
df

What is the sum of the squared errors?

In [None]:
# Total squared error of the current line across all observations.
sum_of_error_sq = df.loc[:, "error sq"].sum()
sum_of_error_sq

In [None]:
import math
def root_mean_sq_error(x,y,b,A):
    """Return the root-mean-square error of the line ``Y' = b*x + A``.

    Args:
        x: Iterable of input values.
        y: Iterable of observed values; must have as many items as ``x``.
        b: Slope of the line.
        A: Intercept of the line.

    Returns:
        ``sqrt(sum((b*x_i + A - y_i)**2) / n)`` where ``n`` is the number of
        observations.

    Raises:
        ValueError: If ``x`` and ``y`` have different lengths, or are empty
            (the mean of zero observations is undefined).
    """
    # Materialise the inputs so generators and pandas Series are handled alike.
    xs = list(x)
    ys = list(y)
    if len(xs) != len(ys):
        raise ValueError(
            "x and y must have the same length (got %d and %d)" % (len(xs), len(ys))
        )
    if not xs:
        raise ValueError("x and y must contain at least one observation")

    sum_error_sq = 0
    for x_i, y_i in zip(xs, ys):
        sum_error_sq += ((b * x_i + A) - y_i) ** 2
    return math.sqrt(sum_error_sq / len(xs))

def sum_error_eq(x,y,b,A):
    """Return the sum of squared errors of the line ``Y' = b*x + A``.

    Args:
        x: Iterable of input values.
        y: Iterable of observed values; must have as many items as ``x``.
        b: Slope of the line.
        A: Intercept of the line.

    Returns:
        ``sum((b*x_i + A - y_i)**2)`` over all observations; ``0`` when the
        inputs are empty.

    Raises:
        ValueError: If ``x`` and ``y`` have different lengths.
    """
    # Materialise the inputs so generators and pandas Series are handled alike.
    xs = list(x)
    ys = list(y)
    if len(xs) != len(ys):
        raise ValueError(
            "x and y must have the same length (got %d and %d)" % (len(xs), len(ys))
        )

    sum_error_sq = 0
    for x_i, y_i in zip(xs, ys):
        sum_error_sq += ((b * x_i + A) - y_i) ** 2
    return sum_error_sq

# Sum of squared errors for the candidate line with slope 1 and intercept 0.
sum_error_eq(df["X"], df["Y"], b=1, A=0)

## Let's visualize it

In [None]:
from bokeh.layouts import column
from bokeh.models import ColumnDataSource, CustomJS, Slider, Div
from bokeh.plotting import figure, output_notebook, show
from bokeh.themes import built_in_themes
from bokeh.io import curdoc

# Render Bokeh output inline in the notebook and switch to a dark theme.
output_notebook()
curdoc().theme = 'night_sky'

# Starting slope and intercept for the interactive line. For the five points
# below these are the least-squares values, so the plot opens on the best fit.
b = 0.425
A = 0.785

# Rebuild the toy data set. 'b' and 'A' are stored as constant columns so the
# JavaScript callbacks can read the current parameters from the data source.
df = pd.DataFrame({'X': [1, 2, 3, 4, 5], 'Y': [1, 2, 1.3, 3.75, 2.25]})
df['b'] = b
df['A'] = A
df['Y pred'] = df['X'] * b + A

source = ColumnDataSource(df)

# `width`/`height` replace `plot_width`/`plot_height`, which were removed in Bokeh 3.0.
plot = figure(width=400, height=400, x_range=[0,6], y_range=[0,6])
plot.line(x='X', y='Y pred', source=source, line_width=3, line_alpha=0.6)
# `scatter` (default marker: circle) replaces `circle(size=...)`, deprecated since Bokeh 3.4.
plot.scatter(x='X', y='Y', source=source, size=10, color="green", alpha=0.5)

# Slider callback for the slope: store the new b in every row and recompute
# the predicted Y values in the browser.
callback_b = CustomJS(args=dict(source=source), code="""
        var data = source.data;
        var b_val = cb_obj.value;
        var b = data['b'];
        var A = data['A'];
        var x = data['X'];
        var y = data['Y pred'];
        for (var i = 0; i < x.length; i++) {
            b[i] = b_val;
            y[i] = b[i] * x[i] + A[i];
        }
        source.change.emit();
    """)

# Slider callback for the intercept: same as above, but updates A.
callback_A = CustomJS(args=dict(source=source), code="""
        var data = source.data;
        var A_val = cb_obj.value;
        var b = data['b'];
        var A = data['A'];
        var x = data['X'];
        var y = data['Y pred'];
        for (var i = 0; i < x.length; i++) {
            A[i] = A_val;
            y[i] = b[i] * x[i] + A[i];
        }
        source.change.emit();
    """)

# Initial error read-out, computed in Python for the starting parameters.
div = Div(text="Root mean square error: "+ str(root_mean_sq_error(source.data['X'],source.data['Y'], b, A)))

# Recompute the root-mean-square error in the browser after a slider has
# updated 'Y pred', and show it in the Div.
change_text = CustomJS(args=dict(div=div, source=source), code="""
        var data = source.data;
        var y_pred = data['Y pred'];
        var y = data['Y'];
        var result = 0;
        for (var i = 0; i < y.length; i++) {
            var diff = y_pred[i] - y[i];
            result = result + diff * diff;
        }
        result = Math.sqrt(result / y.length);
        div.text = "Root mean square error: " + result;
    """)

# Each slider first updates the line, then refreshes the error read-out;
# the callbacks run in the order they are passed to js_on_change.
slider_b = Slider(start=0.1, end=4, value=b, step=.1, title="value of b")
slider_b.js_on_change('value', callback_b, change_text)
slider_A = Slider(start=0.1, end=4, value=A, step=.1, title="value of A")
slider_A.js_on_change('value', callback_A, change_text)

layout = column(slider_b, slider_A, plot, div)

show(layout)

### Finding the best line with scikit-learn
https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html#sklearn.linear_model.LinearRegression

In [None]:
!pip install -U scikit-learn

In [None]:
from sklearn.linear_model import LinearRegression
import numpy as np

In [None]:
# Fit a linear regression to the toy data. X is reshaped into a single-column
# 2-D array; y stays one-dimensional.
X = df['X'].to_numpy().reshape(-1, 1)
y = df['Y'].to_numpy()

line_reg = LinearRegression()
line_reg.fit(X, y)

# Report the model's score on the training data and the fitted parameters.
print("Score: ", line_reg.score(X, y),
      "\nCoef: ", line_reg.coef_,
      "\nIntercept: ", line_reg.intercept_)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

# Load the diabetes dataset as a (features, target) pair of arrays
diabetes_X, diabetes_y = datasets.load_diabetes(return_X_y=True)

# Keep only the feature at column index 2, as a 2-D single-column array
diabetes_X = diabetes_X[:, 2:3]

# Hold out the last 20 samples for testing; everything before them is training data
diabetes_X_train, diabetes_X_test = diabetes_X[:-20], diabetes_X[-20:]
diabetes_y_train, diabetes_y_test = diabetes_y[:-20], diabetes_y[-20:]

# Create the linear regression model and train it on the training split
regr = linear_model.LinearRegression()
regr.fit(diabetes_X_train, diabetes_y_train)

# Predict the targets of the held-out samples
diabetes_y_pred = regr.predict(diabetes_X_test)

# Fitted coefficients
print('Coefficients: \n', regr.coef_)
# Mean squared error on the test split
print('Mean squared error: %.2f' % mean_squared_error(diabetes_y_test, diabetes_y_pred))
# Coefficient of determination on the test split: 1 is perfect prediction
print('Coefficient of determination: %.2f' % r2_score(diabetes_y_test, diabetes_y_pred))

# Plot the test observations (black dots) and the fitted line (blue)
plt.scatter(diabetes_X_test, diabetes_y_test, color='black')
plt.plot(diabetes_X_test, diabetes_y_pred, color='blue', linewidth=3)

# Hide the tick marks on both axes
plt.xticks(())
plt.yticks(())

plt.show()