In [1]:
%load_ext autoreload
%autoreload 2

# Assessing Multicollinearity

[Notes](https://eviews.com/help/helpintro.html#page/content%2Ftesting-Coefficient_Diagnostics.html%23ww182212)

In [2]:
import sys
sys.path.append('../')
import tokamakTK

import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import scipy as sp
import statsmodels.api as sm
import matplotlib.patches as mpatches
import matplotlib.colors as mcolors
import seaborn as sns
import plotly.express as px
import plotly.subplots as plsp
import plotly.graph_objects as go

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from plotly.subplots import make_subplots
from tokamakTK import MyCounter, HUEOrder
from collections import Counter
from statsmodels.stats.outliers_influence import variance_inflation_factor


pd.set_option('display.max_columns', None)
plt.rc('font',family = 'serif')
TD_colors = plt.cm.get_cmap('flag', 20) #Paired, flag
colors_ = sns.color_palette('viridis', 20)
CSS_colors = [mcolors.to_hex(TD_colors(i)) for i in range(20)]
path = "../data/"

  TD_colors = plt.cm.get_cmap('flag', 20) #Paired, flag


In [3]:
# Obtained from Optimization
min_subset_ids = pd.read_csv(path+"R_ids_alpha_0.6357.csv")

DB2 = pd.read_csv(path+"DB2P8.csv")
DB5 = pd.read_csv(path+"SELDB5_SVD.csv", low_memory=False) 

# Setting ELMy Dataset
DB5 = DB5[DB5["PHASE"].isin(['HGELM', 'HSELM', 'HGELMH', 'HSELMH'])]

# REMOVING SPHERICAL TOKAMAKS
#DB5 = DB5[~DB5.TOK.isin(['MAST', 'NSTX', 'START'])]

# There is two shots from DB2P8 missing in DB5
missing_shots = DB2[~DB2.id.isin( DB5.id.values )].reset_index(drop=True)
DB5 = pd.concat([DB5, missing_shots], axis=0, ignore_index=True)

# Labeling shots that had great impact in decreasing alpha_R
DB5.insert(loc=2,column="label",value=[0]*len(DB5))
DB5.insert(loc=2,column="label_str",value=["Unaffected"]*len(DB5))
DB5.loc[(DB5[DB5.id.isin(min_subset_ids.id)].index), "label"] = 1
DB5.loc[(DB5[DB5.id.isin(min_subset_ids.id)].index), "label_str"] = "Decreasing"

print(
    f"{ round( (len(min_subset_ids)/len(DB5))*100     ,2)  }% of the data decreased alpha_R\n" + 
    f"{ round( (1 - len(min_subset_ids)/len(DB5))*100 ,2)  }% of the data did not decrease alpha_R"
)

23.45% of the data decreased alpha_R
76.55% of the data did not decrease alpha_R


In [4]:
features = ['IP', 'BT', 'NEL', 'PLTH', 'RGEO', 'KAREA', 'EPS', 'MEFF']

###  `log(abs(X))` | With Spherical DS | complete DB5

In [5]:
X = DB5[features].apply(np.abs).apply(np.log)
X.insert(0,"intercept", np.ones(len(X)))
X = X.to_numpy()
y = DB5[["TAUTH"]].apply(np.log).to_numpy()

β = np.matmul( np.linalg.inv( np.matmul(X.T,X) ) ,  np.matmul(X.T,y))

In [6]:
pd.DataFrame(β, index=["intercept"] + features, columns=["alpha_x"])

Unnamed: 0,alpha_x
intercept,-2.518531
IP,1.13433
BT,0.072389
NEL,0.174897
PLTH,-0.682701
RGEO,1.447909
KAREA,0.280227
EPS,0.063554
MEFF,0.214018


In [7]:
np.var(β)

1.1502860169217888

In [8]:
X.shape

(6252, 9)

In [9]:
# REMOVING INTERCEPT

X = X[:,1:]

In [10]:
X = StandardScaler().fit_transform(X) # Q???????????
A = np.matmul(X.T, X)

# Correlation Matrix
R = 1/A

In [11]:
R

array([[ 0.00015995,  0.00046729,  0.00190505,  0.00019492,  0.00023336,
         0.0003168 ,  0.00053049,  0.00048386],
       [ 0.00046729,  0.00015995,  0.00058293,  0.00046705,  0.0004286 ,
        -0.00076201, -0.00029473,  0.00089822],
       [ 0.00190505,  0.00058293,  0.00015995,  0.00080755, -0.00044862,
         0.00065211,  0.00225102,  0.00049322],
       [ 0.00019492,  0.00046705,  0.00080755,  0.00015995,  0.00025956,
         0.00035022,  0.0008825 ,  0.00059571],
       [ 0.00023336,  0.0004286 , -0.00044862,  0.00025956,  0.00015995,
         0.00278908, -0.0005991 ,  0.01271353],
       [ 0.0003168 , -0.00076201,  0.00065211,  0.00035022,  0.00278908,
         0.00015995,  0.00032034,  0.00049209],
       [ 0.00053049, -0.00029473,  0.00225102,  0.0008825 , -0.0005991 ,
         0.00032034,  0.00015995,  0.0007273 ],
       [ 0.00048386,  0.00089822,  0.00049322,  0.00059571,  0.01271353,
         0.00049209,  0.0007273 ,  0.00015995]])

[`np.linalg.svd`](https://numpy.org/doc/stable/reference/generated/numpy.linalg.svd.html)

In [13]:
# Singular Value Decomposition of Scaled X
U,D,VT = np.linalg.svd( X )
V = VT.T

In [14]:
D

array([136.36717315, 113.26930451,  95.01041752,  64.91116621,
        51.6575547 ,  39.22381573,  31.30783684,  12.74474325])

In [15]:
sp.linalg.svd(X)[1]

array([136.36717315, 113.26930451,  95.01041752,  64.91116621,
        51.6575547 ,  39.22381573,  31.30783684,  12.74474325])

### Condition Number of a Matrx

$$
    k = \frac{\mu_{max}}{\mu_{min}}
$$

$A$ is ill conditioned if $\alpha\sim 1$. p103. 

$$
    ||A||\cdot||A^{-1}|| = (1+\alpha)\cdot (1+\alpha)^{-1}
$$

The product explodes $\rightarrow$ numerical stability. 

In [16]:
μ_max, μ_min = max(D), min(D)
k = μ_max/μ_min

k

10.699876059002447

Largest value in $\eta_k$ is also the condition number of the matrix $X$

In [17]:
μ_max

136.36717315434439

In [18]:
# Maximum is 64.71 when StandardScaler is not applied

η_k = μ_max/D
η_k

array([ 1.        ,  1.20391993,  1.43528654,  2.10082766,  2.6398302 ,
        3.4766422 ,  4.35568812, 10.69987606])

* $\eta_k\in\:\:[5, 10]$: Moderate collinearity
* $\eta_k\in\:\:[30, 100]$: Stron collinearity

Several large $\eta_k$'s, keys the simultaneous presence of more than one near dependency. 

### Variance Decomposition

"The estimated variance of each regressor coefficient may be decomposed into a sum of terms each of which is associated with a singular value; thereby providinf means for determining the extent to which near dependencies degrade eacg variance." [Belsley, Kuh, Welsch](https://onlinelibrary.wiley.com/doi/book/10.1002/0471725153) p.118 (ebook)

This is the link between numerical analysis and regression analysis. 

* NUMERICAL ANALYSIS: application of singular-value decomposition of $X$ 
* REGRESSION ANALYSIS: variance-covariance matrix of $\hat{\beta}$

Variance-Covariance Matrix of $\hat{\beta} = \sigma^2\cdot A$; where $\sigma$ is the common standard deviation of the $\mathbf{\varepsilon}$, in the linear model. 

$$
    Var[\hat{\beta}] = \sigma^2 \sum_j\frac{V^2_{ij}}{\mu_j^2}
$$

The variance decomposition proportions are:

$$
    \pi_{jk} = \frac{\phi_{kj}}{\phi_k}
$$

where

$$
    \phi_{kj} = \frac{V^2_{ij}}{\mu_j^2} \text{  and,  } \: \phi_k = \sum_j \phi_{kj}
$$

In [21]:
V_sq = np.square(V); mu_sq = np.square(D)

In [56]:
φ_kj = np.zeros((len(features),len(features)))
φ_k  = np.zeros(len(features))
pi_jk  = np.zeros((len(features),len(features)))

for k in range(len(features)):
    φ_kj[k,:] = V_sq[k,:] / mu_sq
    φ_k[k] = φ_kj[k,:].sum()
    pi_jk[:,k]  = φ_kj[k,:] / φ_k[k]
                                  
# Pi-Matrix
Π = pd.DataFrame(pi_jk, 
                 index=[f"μ{i}" for i in range(len(features))], 
                 columns=[f"var[{f}]" for f in features])              

In [57]:
Π

Unnamed: 0,var[IP],var[BT],var[NEL],var[PLTH],var[RGEO],var[KAREA],var[EPS],var[MEFF]
μ0,0.006112,0.003179,0.002282,0.022772,0.003376,0.015423,0.000842,0.020084
μ1,3.8e-05,0.021423,0.007091,0.000469,0.006532,0.031195,0.019384,0.014287
μ2,0.000854,0.024782,0.121021,0.000512,0.006522,0.001316,0.004057,0.069645
μ3,0.000102,0.000173,0.098934,0.020178,0.000932,0.020835,3.5e-05,0.798238
μ4,0.006245,0.009734,0.003501,0.012243,0.002269,0.475317,0.09618,0.011236
μ5,0.017873,0.293154,0.145373,0.209532,0.021743,0.152005,0.022465,0.078343
μ6,0.066796,0.071201,0.383235,0.656652,0.084739,0.01886,2.1e-05,0.004934
μ7,0.901981,0.576355,0.238564,0.077641,0.873886,0.285051,0.857017,0.003233


### Large variance-decomposition proportions are greater than 0.5. 

### Interpretation

* The component associated to $\mu_7$ (MEFF) accounts for most of the variance in IP, RGEO, and EPS; and almost half of BT.
* $\mu_3$ (PLTH) on MEFF $\sim$ 0.8
* $\mu_5$ (KAREA) on BT $\sim$ 0.3
* $\mu_6$ (EPS) on NEL $\sim$ 0.4, on PLTH $\sim$ 0.7

$\rightarrow$ 4 separate near dependencies in $X$, 3 strong, 1 moderate.

**IMPORTANT**: The $\Pi$-Matrix displays all such near
dependencies, treating all columns of X symmetrically and requiring no
prior information on the numbers of near dependencies or their
composition.

The $\Pi-$Matrix helps assessing the damage that collinearity has caused to regression estimates. Example:

<p align="center">
    <img src="example.png" width=700" />
</p>



# DIAGNOSTIC PROCEDURE

1. **Condition Index**
2. **Variance Decomposition**

The magnitude of these two proportions measure a degree to which the corresponding regression estimate has been degraded by the presence of collinearity. 

* Examine the near dependencies: p.126 (ebook)
* See also computational and Statistical problems for a good introduction. 
* 3.4 SUMMARY,INTERPRETATION, AND EXAMPLES OF DIAGNOSING ACTUAL DATA FOR COLLINEARITY. p.165