### Regression Diagnostics | Traditional Techniques

In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import scipy as sp

from scipy.optimize import curve_fit
from sklearn.linear_model import LinearRegression

# Regressor column names; their order fixes the column order of the design
# matrix and therefore the order of every fitted exponent in this notebook.
coeffs = [
    'IP', 'BT', 'NEL', 'PLTH',
    'RGEO', 'KAREA', 'EPS', 'MEFF',
]

In [2]:
# Load the two databases from CSV.
DB2P8 = pd.read_csv("data/DB2P8.csv")
DB5 = pd.read_csv("data/DB5.csv")

# Keep only the columns that DB5 has, in DB5's column order
# (pandas raises KeyError if DB5 has a column that DB2P8 lacks).
DB2P8 = DB2P8[DB5.columns]

# TODO(open question): how were these point ids chosen? Is the selection a way
# of removing outliers or noise from the new regression? Why not simply use
# the whole of DB5?
new_ids = pd.read_csv("data/new_point_ids.csv")

# NOTE(review): `data` is rebound in a later cell to the log-transformed DB2P8
# subset used for curve_fit -- confirm this load is still needed.
data = pd.read_csv("data/data.csv")

# Alternative kept for reference: r = DB5[DB5.id.isin(new_ids.id.values)]
# (original note: "reintroduce dataset").
# NOTE(review): presumably data/R.csv holds that same subset -- confirm.
r = pd.read_csv("data/R.csv")

In [3]:
# Work in log space so the power-law scaling becomes a linear model.
# The target is logged as-is; the regressors are made non-negative first
# (np.log would return NaN for negative entries).
y = np.log(DB2P8[["TAUTH"]])
x = np.log(DB2P8[coeffs].abs())

x

Unnamed: 0,IP,BT,NEL,PLTH,RGEO,KAREA,EPS,MEFF
0,-1.217734,0.790728,1.332102,0.625938,0.527093,-0.025523,-1.418447,0.405465
1,-1.220102,0.790728,1.317480,0.705076,0.521172,-0.027988,-1.409604,0.405465
2,-1.213686,0.790728,1.226712,0.123986,0.526502,-0.010556,-1.432100,0.405465
3,-1.217734,0.790728,1.328400,0.319181,0.526502,-0.025933,-1.417369,0.405465
4,-1.223495,0.790274,1.347294,0.709513,0.521766,-0.031387,-1.406800,0.405465
...,...,...,...,...,...,...,...,...
1305,-0.973390,0.098940,1.370165,0.506215,0.336472,0.014889,-1.252763,0.693147
1306,-1.037047,0.051643,1.484328,0.536493,0.336472,0.014889,-1.203973,0.693147
1307,-1.284821,0.049742,1.436987,0.366031,0.336472,0.012916,-1.203973,0.693147
1308,-1.369241,0.049742,1.301009,0.123986,0.336472,0.012916,-1.203973,0.693147


In [4]:
# Plain NumPy views of the log-space frames: X is the design matrix
# (one column per entry of `coeffs`), Y is the single-column target.
X, Y = (frame.to_numpy() for frame in (x, y))

X

array([[-1.21773372,  0.79072751,  1.33210213, ..., -0.02552296,
        -1.41844684,  0.40546511],
       [-1.22010219,  0.79072751,  1.31748005, ..., -0.02798804,
        -1.40960432,  0.40546511],
       [-1.2136865 ,  0.79072751,  1.22671229, ..., -0.01055551,
        -1.43210021,  0.40546511],
       ...,
       [-1.28482139,  0.04974209,  1.43698748, ...,  0.01291623,
        -1.2039728 ,  0.69314718],
       [-1.36924061,  0.04974209,  1.30100877, ...,  0.01291623,
        -1.2039728 ,  0.69314718],
       [-1.33902917,  0.05069311,  1.38779324, ...,  0.01291623,
        -1.2039728 ,  0.69314718]])

$\hat{\beta} = (X^TX)^{-1}X^TY$

In [5]:
# Using NumPy: solve the normal equations (X^T X) beta = X^T Y.
# np.linalg.solve factorises X^T X instead of forming its explicit inverse,
# which is cheaper and numerically more accurate than inv(...) followed by a
# matrix product; the solution is the same up to floating-point rounding.
# NOTE(review): X has no column of ones, so this fit has no intercept term.
# That is presumably why these exponents differ from the LinearRegression and
# curve_fit results elsewhere in this notebook -- confirm that is intended.
np.linalg.solve(X.T @ X, X.T @ Y)

array([[ 1.03959183],
       [ 0.07364086],
       [-0.07179592],
       [-0.68552627],
       [ 1.1544161 ],
       [ 0.31242674],
       [ 1.33504963],
       [-0.38127031]])

In [6]:
# Ordinary least squares via scikit-learn. LinearRegression fits an intercept
# by default, so the constant term is kept separately in `intercept_` and is
# not part of `coef_`.
regressor = LinearRegression().fit(X, Y)

# Exponents in the order of `coeffs`; the 5th entry (index 4, RGEO) is alpha_R.
list(regressor.coef_[0])

[0.7811880648207693,
 0.3200766625186886,
 0.4351609022028758,
 -0.6681861059730426,
 2.22296166452915,
 0.39007375568487695,
 0.5756125462728638,
 0.17980017582925945]

---

[`sp.linalg.lstsq(a, b)`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.linalg.lstsq.html)

Computes the least-squares solution of $Ax = b$, i.e. the $x$ that minimizes $\lVert b - Ax \rVert_2$.

In [7]:
# Using SciPy: hand the normal equations (X^T X) beta = X^T Y to the
# least-squares solver. A is the square Gram matrix, B the right-hand side.
A = X.T @ X
B = X.T @ Y
sp.linalg.lstsq(A, B)

(array([[ 1.03959183],
        [ 0.07364086],
        [-0.07179592],
        [-0.68552627],
        [ 1.1544161 ],
        [ 0.31242674],
        [ 1.33504963],
        [-0.38127031]]),
 array([], dtype=float64),
 8,
 array([9.18546354e+03, 1.68208059e+03, 2.64371947e+02, 1.29019887e+02,
        7.79171184e+01, 2.86886140e+01, 1.78106475e+01, 4.15422579e+00]))

In [8]:
# Least squares on the design matrix itself (no normal equations needed).
# The result tuple is (solution, residues, rank, singular values of X).
sp.linalg.lstsq(a=X, b=Y)

(array([[ 1.03959183],
        [ 0.07364086],
        [-0.07179592],
        [-0.68552627],
        [ 1.1544161 ],
        [ 0.31242674],
        [ 1.33504963],
        [-0.38127031]]),
 array([127.56550985]),
 8,
 array([95.84082396, 41.01317582, 16.25951866, 11.35869213,  8.82706737,
         5.35617532,  4.22026628,  2.03819179]))

---

[`curve_fit`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.curve_fit.html)

[`source`](https://github.com/scipy/scipy/blob/v1.10.0/scipy/optimize/_minpack_py.py#L549-L912)

Use non-linear least squares to fit a function, f, to data.

In [9]:
def τ_lth(X, lα0, αI, αB, αn, αP, αR, αk, αε, αM):
    """Linear model evaluated by curve_fit: a constant plus eight weighted regressors.

    Parameters
    ----------
    X : sequence of exactly eight items
        The regressors, unpacked in the order I, B, n, P, R, k, ε, M.
        Unpacking raises ValueError if the length is not eight.
    lα0 : constant (intercept) term.
    αI, αB, αn, αP, αR, αk, αε, αM : weights multiplying the corresponding
        regressor.

    Returns
    -------
    lα0 + I*αI + B*αB + n*αn + P*αP + R*αR + k*αk + ε*αε + M*αM,
    accumulated left to right.
    """
    I, B, n, P, R, k, ε, M = X
    weighted_terms = (I*αI, B*αB, n*αn, P*αP, R*αR, k*αk, ε*αε, M*αM)

    total = lα0
    for term in weighted_terms:
        total = total + term
    return total

# Log-transform the target and the regressors together (absolute value first).
# NOTE(review): this rebinds `data`, which an earlier cell loaded from
# data/data.csv.
data = DB2P8[["TAUTH"] + coeffs].apply(np.abs).apply(np.log)

Y_ = data["TAUTH"].values
# One 1-D array per regressor, in the order of `coeffs`
# (IP, BT, NEL, PLTH, RGEO, KAREA, EPS, MEFF), which is the order in which
# τ_lth unpacks its first argument.
X_ = tuple(data[column].values for column in coeffs)

popt, pcov = curve_fit(τ_lth, X_, Y_)

In [10]:
# Fitted parameters in τ_lth's argument order: the constant lα0 first,
# then the eight exponents αI, αB, αn, αP, αR, αk, αε, αM.
list(popt)

[-3.0042715614095066,
 0.781188099527849,
 0.32007661931102366,
 0.4351608937185124,
 -0.6681861004577895,
 2.222961611203896,
 0.39007370662740415,
 0.5756124820859031,
 0.17980019309583797]

---

Using `statsmodels`

In [12]:
import statsmodels.api as sm

# sm.OLS does not add a constant column on its own, so with X as the design
# matrix this is a fit without an intercept (sm.add_constant(X) would add one).
model = sm.OLS(endog=Y, exog=X)
regression = model.fit()

# Fitted exponents, one per column of X (i.e. in the order of `coeffs`).
list(regression.params)

[1.039591834019291,
 0.07364086377111047,
 -0.07179592098552395,
 -0.6855262744983007,
 1.154416098024112,
 0.3124267421485935,
 1.3350496323035508,
 -0.38127031013927654]