In [299]:
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [300]:
data = pd.read_csv('train.csv')
data.head()

Unnamed: 0,x,y
0,24.0,21.549452
1,50.0,47.464463
2,15.0,17.218656
3,38.0,36.586398
4,87.0,87.288984


In [301]:
data.dropna(inplace = True)
df = data.copy() # important for de-normalize process

In [302]:
data.describe()

Unnamed: 0,x,y
count,699.0,699.0
mean,50.014306,49.939869
std,28.95456,29.109217
min,0.0,-3.839981
25%,25.0,24.929968
50%,49.0,48.97302
75%,75.0,74.929911
max,100.0,108.871618


In [303]:
data.dtypes

x    float64
y    float64
dtype: object

In [304]:
f = px.scatter(x = data['x'],y = data['y'])
f.show()

# Normalize

In [305]:
def normalize(col):
    
    col = (col - col.min())/(col.max()-col.min())
    return col
    
data['x'] = normalize(data['x'])
data['y'] =  normalize(data['y'])

# Cost function

In [306]:
x0 = [1] * len(data)
x1 = data['x']

X = np.array([x0 , x1]).T
W = np.array([0 , 11 ])

In [307]:
X

array([[1.  , 0.24],
       [1.  , 0.5 ],
       [1.  , 0.15],
       ...,
       [1.  , 0.82],
       [1.  , 0.66],
       [1.  , 0.97]])

In [308]:
def costFunction (X,y,W) :
    h = X.dot(W)
    J = np.sum(((h - y) **2) / (2 * len(y)))
    return J

In [309]:
costWithoutGradient = costFunction(X , data['y'] , W)

In [310]:
costWithoutGradient

16.90303900268054

In [311]:
def gradientFunction(X , Y , W , alpha = 0.01 , iterations = 40000):
        
    costH = []
    length = len(Y)
    
    for i in range (0,iterations):
        if i % 1000== 0 :
            print("Iteration %d: "%i)
            print(costFunction(X,Y,W))
            
            
        h = X.dot(W)
        diff = h - Y
        g = X.T.dot(diff) / length # gradient
        
        W = W - alpha * g
        
        cost = costFunction(X,Y,W)
        costH.append(cost)
        
    return W , costH 
    

In [312]:
w2 , cost = gradientFunction(X , data['y'] , W) 

Iteration 0: 
16.90303900268054
Iteration 1000: 
0.7029020997507346
Iteration 2000: 
0.1877965417604989
Iteration 3000: 
0.05034055758881528
Iteration 4000: 
0.013660410586223993
Iteration 5000: 
0.003872308654981479
Iteration 6000: 
0.0012603523994728443
Iteration 7000: 
0.0005633515377516722
Iteration 8000: 
0.000377356768422945
Iteration 9000: 
0.00032772404037692673
Iteration 10000: 
0.00031447954053329986
Iteration 11000: 
0.0003109452440516476
Iteration 12000: 
0.0003100021165823062
Iteration 13000: 
0.0003097504428961724
Iteration 14000: 
0.0003096832837407547
Iteration 15000: 
0.0003096653623115194
Iteration 16000: 
0.00030966057997582985
Iteration 17000: 
0.0003096593038089379
Iteration 18000: 
0.0003096589632636391
Iteration 19000: 
0.0003096588723890824
Iteration 20000: 
0.00030965884813919536
Iteration 21000: 
0.00030965884166810986
Iteration 22000: 
0.00030965883994129987
Iteration 23000: 
0.0003096588394805004
Iteration 24000: 
0.00030965883935753604
Iteration 25000: 
0.0

In [313]:
w2 # coefficients

array([0.0331174 , 0.88780249])

In [314]:
data['pred'] = X.dot(w2)
data

Unnamed: 0,x,y,pred
0,0.24,0.225260,0.246190
1,0.50,0.455183,0.477019
2,0.15,0.186836,0.166288
3,0.38,0.358671,0.370482
4,0.87,0.808515,0.805506
...,...,...,...
695,0.58,0.553936,0.548043
696,0.93,0.873602,0.858774
697,0.82,0.820180,0.761115
698,0.66,0.598773,0.619067


# Testing

In [315]:
test = pd.read_csv('test.csv')
df_test = test.copy()

In [316]:
test['x'] = normalize(test['x'])
test['y'] = normalize(test['y'])

In [317]:
def testFunction (X) :
    
    predict = w2[0] * 1 + w2[1] * X 
    
    return predict

In [318]:
test['prediction'] = testFunction(test['x'])

In [319]:
fig1 = px.line(test, x="x", y="prediction", title='Line of Linear Regression Model')
fig2 = px.scatter(x = test['x'],y = test['y'])

fig = go.Figure(data = fig1.data + fig2.data)
fig.show()


In [320]:
test

Unnamed: 0,x,y,prediction
0,0.77,0.763279,0.716725
1,0.21,0.244317,0.219556
2,0.22,0.266617,0.228434
3,0.20,0.195538,0.210678
4,0.36,0.415531,0.352726
...,...,...,...
295,0.71,0.660315,0.663457
296,0.46,0.465825,0.441507
297,0.55,0.527771,0.521409
298,0.62,0.612188,0.583555


In [321]:
# for calculating error and r-square score we must do de-normalize
def deNormalize(col,maxi,mini):
    c = col * (maxi-mini) + mini
    
    return c
    
test['x'] = deNormalize(test['x'],df_test['x'].max(),df_test['y'].min())
test['y'] =  deNormalize(test['y'],df_test['x'].max(),df_test['y'].min())
test['prediction'] =  deNormalize(test['prediction'],df_test['x'].max(),df_test['y'].min())
    

In [322]:
test

Unnamed: 0,x,y,prediction
0,76.202387,75.507018,70.690168
1,18.260372,21.811097,19.249103
2,19.295051,24.118384,20.167694
3,17.225693,16.763974,18.330513
4,33.780554,39.526280,33.027960
...,...,...,...
295,69.994314,64.853514,65.178626
296,44.127343,44.730058,42.213864
297,53.439452,51.139429,50.481178
298,60.682204,59.873915,56.911312


In [323]:
def error(Y , Pred) :
    err = abs(Y - Pred)
    return err
    

In [324]:
test['error'] = error(test['y'],test['prediction'])

In [325]:
a = test['error'].sum()

In [326]:
errorr = a / len(test['prediction'])

In [327]:
print(errorr) # mean_absolute_error

2.8356186647913715


In [328]:
def residual_error(col):
    col = col*col
    return col
test['E^2'] = test['error'].apply(residual_error) 

In [329]:
def mean_error(col):
    col = col - col.mean()
    col = col*col
    
    return col
test['mean_error'] = mean_error(test['y'])

In [330]:
r_square_error = 1 - (test['E^2'].sum() / test['mean_error'].sum())

In [331]:
r_square_error

0.9834725922456193

In [332]:
test

Unnamed: 0,x,y,prediction,error,E^2,mean_error
0,76.202387,75.507018,70.690168,4.816850,23.202040,734.693022
1,18.260372,21.811097,19.249103,2.561994,6.563811,707.065302
2,19.295051,24.118384,20.167694,3.950690,15.607955,589.684113
3,17.225693,16.763974,18.330513,1.566539,2.454045,1000.951820
4,33.780554,39.526280,33.027960,6.498320,42.228161,78.774796
...,...,...,...,...,...,...
295,69.994314,64.853514,65.178626,0.325112,0.105698,270.658997
296,44.127343,44.730058,42.213864,2.516193,6.331229,13.481665
297,53.439452,51.139429,50.481178,0.658251,0.433294,7.494632
298,60.682204,59.873915,56.911312,2.962603,8.777017,131.609496


# Model creating with library

In [333]:
from sklearn import preprocessing, svm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics import r2_score

In [334]:
df
df.dropna(inplace = True)

In [335]:
df

Unnamed: 0,x,y
0,24.0,21.549452
1,50.0,47.464463
2,15.0,17.218656
3,38.0,36.586398
4,87.0,87.288984
...,...,...
695,58.0,58.595006
696,93.0,94.625094
697,82.0,88.603770
698,66.0,63.648685


In [336]:
model = LinearRegression()

In [337]:
X = df[['x']]
Y = df[['y']]

In [338]:
model.fit(X,Y)

LinearRegression()

In [339]:
df_test

Unnamed: 0,x,y
0,77,79.775152
1,21,23.177279
2,22,25.609262
3,20,17.857388
4,36,41.849864
...,...,...
295,71,68.545888
296,46,47.334876
297,55,54.090637
298,62,63.297171


In [340]:
# making predictions
predictions = model.predict(df_test[['x']])

# Comparing

In [341]:
# model evaluation
print("Result of library >>> ")
print(
  'mean_absolute_error : ', mean_absolute_error(df_test[['y']], predictions))
print("r-square error : " , r2_score(df_test[['y']] , predictions) )

Result of library >>> 
mean_absolute_error :  2.415771850041258
r-square error :  0.9888014444327563


In [342]:
print("Result of our writing main code >>> ")
print(
  'mean_absolute_error : ', errorr)
print("r-square error : " , r_square_error )

Result of our writing main code >>> 
mean_absolute_error :  2.8356186647913715
r-square error :  0.9834725922456193
