In [1]:
import pandas as pd
import numpy as np

In [2]:
# Relative path to the fuel data set used throughout this notebook.
file_path = '../../data/Prac2/fuel.txt'

# read_csv is called with its defaults, i.e. the file is parsed as
# comma-separated text with a header row.
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,State,Drivers,FuelC,Income,Miles,MPC,Pop,Tax
0,AL,3559897,2382507,23471,94440,12737.0,3451586,18.0
1,AK,472211,235400,30064,13628,7639.16,457728,8.0
2,AZ,3550367,2428430,25578,55245,9411.55,3907526,18.0
3,AR,1961883,1358174,22257,98132,11268.4,2072622,21.7
4,CA,21623793,14691753,32275,168771,8923.89,25599275,18.0


In [3]:
import math  # kept so any later cell that still refers to `math` keeps working

# Rates scaled by 1000: FuelC / Pop and Drivers / Pop.
df['Fuel'] = 1000 * df['FuelC'] / df['Pop']
df['Dlic'] = 1000 * df['Drivers'] / df['Pop']
# Base-2 logarithm of Miles, computed on the whole column at once instead of
# calling math.log2 row by row through .apply.
df['logMiles'] = np.log2(df['Miles'])

df.head()

Unnamed: 0,State,Drivers,FuelC,Income,Miles,MPC,Pop,Tax,Fuel,Dlic,logMiles
0,AL,3559897,2382507,23471,94440,12737.0,3451586,18.0,690.264418,1031.380067,16.52711
1,AK,472211,235400,30064,13628,7639.16,457728,8.0,514.279223,1031.641062,13.734286
2,AZ,3550367,2428430,25578,55245,9411.55,3907526,18.0,621.475071,908.597153,15.753556
3,AR,1961883,1358174,22257,98132,11268.4,2072622,21.7,655.292668,946.570576,16.582436
4,CA,21623793,14691753,32275,168771,8923.89,25599275,18.0,573.912855,844.703336,17.364708


In [4]:
# Discard State and MPC together with the raw columns that Fuel, Dlic and
# logMiles were computed from; what remains is the modelling frame.
df1 = df.drop(
    ['FuelC', 'State', 'Miles', 'MPC', 'Pop', 'Drivers'],
    axis='columns',
)
df1.head()

Unnamed: 0,Income,Tax,Fuel,Dlic,logMiles
0,23471,18.0,690.264418,1031.380067,16.52711
1,30064,8.0,514.279223,1031.641062,13.734286
2,25578,18.0,621.475071,908.597153,15.753556
3,22257,21.7,655.292668,946.570576,16.582436
4,32275,18.0,573.912855,844.703336,17.364708


In [5]:
# Reorder the columns so the four predictors come first and Fuel comes last.
df1 = df1[['Tax', 'Dlic', 'Income', 'logMiles', 'Fuel']]
df1.head()

Unnamed: 0,Tax,Dlic,Income,logMiles,Fuel
0,18.0,1031.380067,23471,16.52711,690.264418
1,8.0,1031.641062,30064,13.734286,514.279223
2,18.0,908.597153,25578,15.753556,621.475071
3,21.7,946.570576,22257,16.582436,655.292668
4,18.0,844.703336,32275,17.364708,573.912855


In [6]:
# Predictor matrix: every column of df1 except Fuel.
X = df1.drop('Fuel', axis='columns')
# Response vector: the Fuel column.
y = df1['Fuel']

In [7]:
# Number of leading rows used for fitting; all remaining rows are held out.
# Defined once so the train and test boundaries cannot drift apart.
N_TRAIN = 40

# Positional split in the frame's existing row order (no shuffling).
# .iloc states the positional intent explicitly instead of relying on how
# plain [] interprets an integer slice.
X_train = X.iloc[:N_TRAIN]
y_train = y.iloc[:N_TRAIN]

X_test = X.iloc[N_TRAIN:]
y_test = y.iloc[N_TRAIN:]

## Using numpy

In [8]:
def linear_regression(X_train, y_train):
    """
    Fit an ordinary least-squares linear model with an intercept term.

    A column of ones is prepended to ``X_train`` to form the design matrix,
    which is QR-decomposed; the coefficients are ``pinv(R) @ Q.T @ y``.

    :param X_train: feature matrix of shape (n_samples, n_features). May be a
        pandas DataFrame, a NumPy array or a nested sequence. A 1-D input is
        treated as a single feature column.
    :param y_train: target values of length n_samples (pandas Series, NumPy
        array or sequence).
    :return: w, an array of shape (n_features + 1, 1): the intercept followed
        by one coefficient per feature column.
    :raises ValueError: if X_train and y_train contain different numbers of
        samples.
    """
    # np.asarray accepts DataFrames/Series as well as arrays and lists, so the
    # function is not tied to pandas inputs.
    features = np.asarray(X_train)
    if features.ndim == 1:
        features = features.reshape(-1, 1)
    targets = np.asarray(y_train).reshape(-1, 1)

    if features.shape[0] != targets.shape[0]:
        raise ValueError(
            "X_train has {} samples but y_train has {}".format(
                features.shape[0], targets.shape[0]
            )
        )

    # add column 1 (intercept term)
    x_bars = np.concatenate((np.ones((features.shape[0], 1)), features), axis=1)

    Q, R = np.linalg.qr(x_bars)  # QR decomposition
    # Pseudo-inverse rather than a plain inverse, so a singular R does not raise.
    R_pinv = np.linalg.pinv(R)
    A = np.dot(R_pinv, Q.T)  # apply formula

    return np.dot(A, targets)

In [9]:
w = linear_regression(X_train, y_train) # column vector of fitted coefficients
w = w.T.tolist() # transpose to one row and convert to a nested list; `w` changes type here

# Display labels, listed in the same order as the coefficients
# (intercept first, then the columns of X_train).
line = ['Intercept', 'Tax', "Dlic", "Income", 'LogMiles']
res = list(zip(line, w[0]))
for o in res:
    # label right-aligned to width 20, value right-aligned to width 10
    print("{: >20}: {: >10}".format(*o))

           Intercept: 135.14202277840923
                 Tax: -1.9030119273939228
                Dlic: 0.41859239924045655
              Income: -0.006229430094339298
            LogMiles: 19.593855914053012


In [10]:
# Show the coefficients in their nested-list form (a single inner row).
w

[[135.14202277840923,
  -1.9030119273939228,
  0.41859239924045655,
  -0.006229430094339298,
  19.593855914053012]]

In [11]:
def predict(weights, values):
    """
    Evaluate a fitted linear model for a single observation.

    :param weights: sequence whose first entry is the intercept, followed by
        one coefficient per feature
    :param values: feature values of the observation, in coefficient order
    :return: weights[0] plus the sum of weights[i + 1] * values[i]
    """
    total = weights[0]
    for position, feature in enumerate(values, start=1):
        # coefficients are offset by one because slot 0 holds the intercept
        total += weights[position] * feature
    return total
        
    

In [12]:
# Sanity check: (number of held-out rows, number of feature columns).
X_test.shape

(11, 4)

In [13]:
# Print every held-out observation as a raw feature vector
# (column order: Tax, Dlic, Income, logMiles).
for val in X_test.values:
    print(val)

[1.60000000e+01 9.14852671e+02 2.43210000e+04 1.60138242e+01]
[2.20000000e+01 9.43895904e+02 2.61150000e+04 1.63505249e+01]
[2.00000000e+01 9.42044365e+02 2.62390000e+04 1.64223605e+01]
[2.00000000e+01 8.35295555e+02 2.78710000e+04 1.81982868e+01]
[2.45000000e+01 9.35788546e+02 2.39070000e+04 1.53652288e+01]
[2.00000000e+01 1.07528820e+03 2.69010000e+04 1.38028193e+01]
[1.75000000e+01 8.89919514e+02 3.11620000e+04 1.61098511e+01]
[2.30000000e+01 9.30856246e+02 3.15280000e+04 1.63053671e+01]
[2.56500000e+01 9.04893601e+02 2.19150000e+04 1.51751207e+01]
[2.73000000e+01 8.82329081e+02 2.82320000e+04 1.67816543e+01]
[1.40000000e+01 9.70752746e+02 2.72300000e+04 1.47361905e+01]


In [14]:
# w is a one-row nested list; take that row as the flat coefficient list.
weights = w[0]
# Score the held-out rows one at a time with predict().
y_pred = np.array([predict(weights, value) for value in X_test.values])

y_pred

array([649.91080195, 646.07167308, 649.73774387, 629.6843171 ,
       632.36929804, 650.06180511, 595.88545846, 604.10563913,
       625.93251769, 605.47408949, 633.96098858])

In [15]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Held-out error metrics, printed unlabeled in this order: MSE, MAE, R^2.
print(mean_squared_error(y_test, y_pred))
print(mean_absolute_error(y_test, y_pred))
# NOTE(review): the recorded R^2 is slightly negative (about -0.024), i.e. on
# this split the model's squared error is no better than always predicting
# the mean of y_test. The split takes rows in file order without shuffling;
# worth confirming whether that explains the poor score.
print(r2_score(y_test, y_pred))

6253.362638613635
60.61549039710437
-0.02422958696755284
