## Read data from file

**Using numpy**

In [1]:
import math
import numpy as np
# Read the raw comma-separated text; the context manager closes the file.
with open('../../data/Prac2/fuel.txt') as f:
    lines = f.readlines()

x_data = []
y_data = []
lines.pop(0)  # discard the first line (treated as a header, never parsed)

for line in lines:
    # Skip blank lines (e.g. a trailing newline at the end of the file);
    # without this guard the field indexing below raises IndexError.
    if not line.strip():
        continue

    splitted = line.replace('\n', '').split(',')
    splitted.pop(0)  # the first field of each row is dropped before float parsing
    splitted = list(map(float, splitted))

    # Derived quantities. Field 5 is the common denominator of both rates
    # (presumably a population count -- confirm against the data dictionary).
    fuel = 1000 * splitted[1] / splitted[5]   # response: field 1 per 1000 of field 5
    dlic = 1000 * splitted[0] / splitted[5]   # predictor: field 0 per 1000 of field 5
    logMiles = math.log2(splitted[3])         # base-2 logarithm of field 3

    y_data.append([fuel])
    # Predictor order matches the labels used later: Tax, Dlic, Income, LogMiles
    # (the last field is used as Tax and field 2 as Income).
    x_data.append([splitted[-1], dlic, splitted[2], logMiles])

# One row per observation: x_data has 4 columns, y_data has 1 column.
x_data = np.asarray(x_data)
y_data = np.asarray(y_data)

In [2]:
def qr_householder(A):
    """Compute the full QR decomposition of A using Householder reflections.

    :param A: 2-D array-like of shape (M, N); it is not modified.
    :return: tuple ``(Q, R)`` where ``Q`` is an (M, M) orthogonal matrix,
             ``R`` is an (M, N) upper-triangular matrix and ``Q @ R``
             reproduces ``A`` up to rounding error.
    """
    A = np.asarray(A)
    M, N = A.shape

    # Q accumulates the product of the reflectors H_n ... H_1 (starts as I).
    Q = np.identity(M)

    # R starts as a floating-point copy of A and is reduced in place.
    # Forcing float avoids silent truncation when A has an integer dtype.
    R = np.array(A, dtype=float)

    # At most min(M, N) columns have a diagonal entry to reduce against.
    for n in range(min(M, N)):
        # The vector to transform must come from the *current* R, not from
        # the original A: earlier reflections have already changed this column.
        x = R[n:, n]
        norm_x = np.linalg.norm(x)
        if norm_x == 0.0:
            # Column is already zero on and below the diagonal.
            continue

        # ro = -sign(x0) * ||x||, with sign(0) taken as +1 so that a zero
        # pivot does not collapse ro to 0 (which would divide by zero).
        ro = -np.copysign(norm_x, x[0])

        # Householder vector v = x - ro * e1. Its scale is irrelevant because
        # the reflector I - 2 v v^T / (v^T v) is invariant under scaling of v.
        v = x.copy()
        v[0] -= ro
        beta = 2.0 / (v @ v)

        # Apply the reflector to all columns at once: H B = B - beta * v (v^T B).
        R[n:, :] -= beta * np.outer(v, v @ R[n:, :])
        Q[n:, :] -= beta * np.outer(v, v @ Q[n:, :])

        # Entries below the diagonal are zero analytically; remove rounding noise.
        R[n + 1:, n] = 0.0

    # Q currently holds H_n ... H_1, i.e. Q @ A == R; its transpose gives A == Q^T @ R.
    return Q.transpose(), R

In [3]:
def linear_regression(x_data, y_data):
    """Estimate least-squares regression coefficients through a QR factorisation.

    A column of ones is prepended to ``x_data`` so that the first returned
    coefficient is the intercept.

    :param x_data: 2-D array of predictors, one row per observation
    :param y_data: responses, one entry (or row) per observation
    :return: w, the regression estimate with the intercept first
    """
    # Design matrix: intercept column followed by the predictors.
    n_rows = x_data.shape[0]
    intercept_column = np.ones((n_rows, 1))
    x_bars = np.concatenate((intercept_column, x_data), axis=1)

    # Factorise the design matrix, then form R^+ Q^T and apply it to y.
    Q, R = qr_householder(x_bars)
    solver = np.linalg.pinv(R).dot(Q.T)

    return solver.dot(y_data)

In [4]:
# Fit the model, then turn the coefficient column into a nested Python list
# whose first (and only) row holds the estimates.
w = linear_regression(x_data, y_data)
w = w.T.tolist()

# Labels follow the column order of the design matrix: intercept first,
# then the predictors in the order they were stacked in x_data.
line = ['Intercept', 'Tax', "Dlic", "Income", 'LogMiles']
res = list(zip(line, w[0]))
for label, estimate in res:
    print("{: >20}: {: >10}".format(label, estimate))
    

           Intercept: 154.19284457730794
                 Tax: -4.227983208329624
                Dlic: 0.4718712134419837
              Income: -0.0061353309704178065
            LogMiles: 18.545274506048024


**Using scikit-learn**

In [6]:
import pandas as pd

# Wrap the predictor matrix in a labelled frame; the labels follow the
# column order in which x_data was assembled.
df1 = pd.DataFrame(data=x_data, columns=['Tax', 'dlic', 'Income', 'logMiles'])
df1.head()

Unnamed: 0,Tax,dlic,Income,logMiles
0,18.0,1031.380067,23471.0,16.52711
1,8.0,1031.641062,30064.0,13.734286
2,18.0,908.597153,25578.0,15.753556
3,21.7,946.570576,22257.0,16.582436
4,18.0,844.703336,32275.0,17.364708


In [7]:
# Single-column frame holding the response values.
df2 = pd.DataFrame(data=y_data, columns=['Fuel'])
df2.head()

Unnamed: 0,Fuel
0,690.264418
1,514.279223
2,621.475071
3,655.292668
4,573.912855


In [8]:
# Place predictors and response side by side (rows are aligned on the index).
df3 = pd.concat((df1, df2), axis=1)
df3.head()

Unnamed: 0,Tax,dlic,Income,logMiles,Fuel
0,18.0,1031.380067,23471.0,16.52711,690.264418
1,8.0,1031.641062,30064.0,13.734286,514.279223
2,18.0,908.597153,25578.0,15.753556,621.475071
3,21.7,946.570576,22257.0,16.582436,655.292668
4,18.0,844.703336,32275.0,17.364708,573.912855


In [9]:
# Separate the response column from the predictors.
y = df3['Fuel']
X = df3.drop('Fuel', axis='columns')

In [None]:
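# Alternative (disabled): a random 80/20 split instead of the fixed positional split used in the next cell.
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)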

In [20]:
# Fixed positional split: the first 40 rows train the model and the
# remaining rows are held out for evaluation (no shuffling).
X_train, X_test = X.iloc[:40], X.iloc[40:]
y_train, y_test = y.iloc[:40], y.iloc[40:]

In [21]:
# Inspect the training responses (the first 40 values of the Fuel column).
y_train

0     690.264418
1     514.279223
2     621.475071
3     655.292668
4     573.912855
5     616.611512
6     549.992608
7     626.023934
8     317.492397
9     586.346096
10    750.907417
11    426.349370
12    628.427947
13    526.237662
14    666.536463
15    647.001625
16    600.902409
17    659.741314
18    633.734764
19    584.092617
20    602.286173
21    543.232068
22    642.970595
23    672.919150
24    683.501955
25    689.366114
26    666.597759
27    617.690540
28    614.893985
29    689.652121
30    597.640261
31    646.527274
32    374.164070
33    645.441826
34    666.188747
35    572.075640
36    657.060518
37    556.345512
38    518.328630
39    482.326937
Name: Fuel, dtype: float64

In [22]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares model from scikit-learn.
regr = LinearRegression()

# Fit on the training rows only; the held-out rows are kept for evaluation.
regr.fit(X_train, y_train)

In [23]:
# Fitted slope for each column of X_train, in column order.
regr.coef_

array([-1.90301193e+00,  4.18592399e-01, -6.22943009e-03,  1.95938559e+01])

In [24]:
# Fitted intercept term of the model.
regr.intercept_

np.float64(135.14202277839848)

In [25]:
# Predict Fuel for a single hand-written observation. The row is wrapped in a
# DataFrame carrying the training column names because the model was fitted
# on a DataFrame; a bare nested list makes scikit-learn warn that
# "X does not have valid feature names".
regr.predict(pd.DataFrame([[18, 1031.38, 23471, 16.52]], columns=X_train.columns))



array([710.09518277])

In [26]:
# Predict Fuel for the held-out rows.
y_pred = regr.predict(X_test)

In [27]:
from sklearn.metrics import r2_score
# Coefficient of determination (R^2) of the predictions on the held-out rows.
# A value below 0 means the fit is worse than always predicting mean(y_test).
r2_score(y_test, y_pred)

-0.02422958696773625

In [28]:
# R^2 on the held-out rows computed by the estimator itself (predicts from
# X_test internally), as a cross-check of the r2_score value.
regr.score(X_test, y_test)

-0.02422958696773625

In [29]:
# Show the held-out predictor rows (positions 40 onwards of X).
X_test

Unnamed: 0,Tax,dlic,Income,logMiles
40,16.0,914.852671,24321.0,16.013824
41,22.0,943.895904,26115.0,16.350525
42,20.0,942.044365,26239.0,16.42236
43,20.0,835.295555,27871.0,18.198287
44,24.5,935.788546,23907.0,15.365229
45,20.0,1075.288202,26901.0,13.802819
46,17.5,889.919514,31162.0,16.109851
47,23.0,930.856246,31528.0,16.305367
48,25.65,904.893601,21915.0,15.175121
49,27.3,882.329081,28232.0,16.781654
