In [8]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
import sys
sys.path.append('../')
import tokamakTK
from tokamakTK import get_ECT_regression

import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import scipy as sp
import statsmodels.api as sm
import matplotlib.patches as mpatches

from collections import Counter

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer
from yellowbrick.cluster import SilhouetteVisualizer

# Never truncate the column list when a DataFrame is displayed
pd.options.display.max_columns = None

# Relative locations of the input CSV files and of the exported figures
path = "../data/"
fig_path = "../../../LATEX/Latex Images/"

# Plot appearance: apply the style sheet first, then override the font family
sstyle = 'seaborn-v0_8-poster'
plt.style.use(sstyle)
plt.rcParams['font.family'] = 'serif'

# Names of the model terms examined later in the notebook
features = [
    'IP', 'BT', 'NEL', 'PLTH',
    'RGEO', 'KAREA', 'EPS', 'MEFF',
]

In [10]:
# Shot ids obtained from the optimization, one file per alpha_R value
min_subset_ids_6357 = pd.read_csv(path + "R_ids_alpha_0.6357.csv")
min_subset_ids_9998 = pd.read_csv(path + "R_ids_alpha_0.9998.csv")

DB2 = pd.read_csv(path + "DB2P8.csv")
DB5 = pd.read_csv(path + "SELDB5_SVD.csv", low_memory=False)

# ELMy dataset: keep only the rows whose PHASE is one of these four codes
DB5 = DB5.loc[DB5["PHASE"].isin(['HGELM', 'HSELM', 'HGELMH', 'HSELMH'])]

# Append the DB2P8 rows whose id is absent from the filtered DB5
# (original note: there are two such shots)
missing_shots = DB2.loc[~DB2["id"].isin(DB5["id"].values)].reset_index(drop=True)
DB5 = pd.concat([DB5, missing_shots], axis=0, ignore_index=True)

# Flag columns (inserted at position 2, initialised to 0): a row is set to 1
# when its id appears in the corresponding optimization subset.
# The index is unique after the concat above, so a boolean mask selects
# exactly the rows whose id is in the subset.
DB5.insert(loc=2, column="label_6357", value=[0] * len(DB5))
DB5.loc[DB5["id"].isin(min_subset_ids_6357["id"]), "label_6357"] = 1

DB5.insert(loc=2, column="label_9998", value=[0] * len(DB5))
DB5.loc[DB5["id"].isin(min_subset_ids_9998["id"]), "label_9998"] = 1


# Share of DB5 rows represented by each id file (file length / DB5 length)
print(
    "  Subset that decrease alpha-R to 0.6357\n--------\n"
    f"{round((len(min_subset_ids_6357) / len(DB5)) * 100, 2)}% affected alpha_R\n"
    f"{round((1 - len(min_subset_ids_6357) / len(DB5)) * 100, 2)}% did not affect alpha_R"
    "\n\n\n  Subset that decrease alpha-R to 0.9998\n--------\n"
    f"{round((len(min_subset_ids_9998) / len(DB5)) * 100, 2)}% affected alpha_R\n"
    f"{round((1 - len(min_subset_ids_9998) / len(DB5)) * 100, 2)}% did not affect alpha_R"
)

  Subset that decrease alpha-R to 0.6357
--------
23.45% affected alpha_R
76.55% did not affect alpha_R


  Subset that decrease alpha-R to 0.9998
--------
9.88% affected alpha_R
90.12% did not affect alpha_R


In [20]:
# Data for alpha-R ~ 0.64

# Case 1: every row of DB5
complete_data = DB5.copy()
# Case 2: rows flagged in label_6357, together with every row whose id is in DB2
decreased_ds  = DB5.loc[(DB5["label_6357"] == 1) | DB5["id"].isin(DB2["id"].values)]
# Case 3: rows not flagged in label_6357
unaffected_ds = DB5.loc[DB5["label_6357"] == 0]

# One regression per case, kept in the order complete / decreasing / unaffected.
# WARNING (from the original note): the log of the data is taken inside the
# regression builder, so the frames are passed in untransformed.
data = [
    get_ECT_regression(case_frame)
    for case_frame in (complete_data, decreased_ds, unaffected_ds)
]

In [22]:
# Table of per-term F-statistics and p-values, one (FVk, PVk) column pair per
# regression in `data` (k = 1 .. len(data)), one row per model term.
# Each entry comes from the restriction "(<term>=0)" passed to the fitted
# model's f_test, i.e. the hypothesis that this single coefficient is zero.
terms = ["const"] + features

# Derive the column names from the number of cases so that the table shape,
# its labels and the loop below cannot drift apart.
test_columns = [
    f"{prefix}{case}"
    for case in range(1, len(data) + 1)
    for prefix in ("FV", "PV")
]
TESTS = pd.DataFrame(np.zeros((len(terms), len(test_columns))),
                     columns=test_columns,
                     index=terms)

for case, fit in enumerate(data, start=1):
    for term in terms:
        # Run the test once; statistic and p-value come from the same result
        outcome = fit.f_test(f"({term}=0)")
        TESTS.loc[term, f"FV{case}"] = np.round(outcome.fvalue, 5)
        TESTS.loc[term, f"PV{case}"] = np.round(outcome.pvalue, 5)

In [23]:
TESTS

Unnamed: 0,FV1,PV1,FV2,PV2,FV3,PV3
const,7788.09592,0.0,3636.31538,0.0,9938.48449,0.0
IP,4809.5712,0.0,4165.11371,0.0,2370.14462,0.0
BT,25.79122,0.0,1.01125,0.31469,300.46163,0.0
NEL,332.972,0.0,85.68474,0.0,1971.23255,0.0
PLTH,10024.6024,0.0,2017.39751,0.0,14937.76647,0.0
RGEO,3221.36266,0.0,294.37717,0.0,7450.69348,0.0
KAREA,94.69404,0.0,0.00198,0.96452,350.35744,0.0
EPS,2.9596,0.08542,136.93138,0.0,507.59434,0.0
MEFF,157.27391,0.0,25.50366,0.0,182.72516,0.0


### Interpretation

* If the p-value is extremely small, one rejects the null hypothesis (that the feature's coefficient is zero) and concludes that adding said feature helps the model perform better.
    - In other words, the full model outperforms the reduced model.

* If the p-value is ~ 0.1, one fails to reject the null hypothesis, meaning that it is not vital to include said variable in the model.

* If the F-value is really high, then the feature contributes substantially to the model's performance.