In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('../')
import tokamakTK
from tokamakTK import get_ECT_regression

import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy as sp
import statsmodels.api as sm
import matplotlib.patches as mpatches

from collections import Counter
from scipy.stats import f
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

# Notebook-wide configuration shared by the cells below.

# Do not truncate wide DataFrames when they are displayed.
pd.set_option('display.max_columns', None)
# Directory (relative to this notebook) holding the input CSV files.
path = "../data/"
# Output directory for figures; not used in the cells visible here.
fig_path = "../../../LATEX/Latex Images/"

# Serif font for every matplotlib figure.
plt.rc('font',family = 'serif')

# Column names used as the rows of the ANOVA table and removed one at a time
# in the F-test loop (presumably the regressors of the scaling law -- confirm
# against tokamakTK.get_ECT_regression).
features = ['IP', 'BT', 'NEL', 'PLTH', 'RGEO', 'KAREA', 'EPS', 'MEFF']

In [3]:
# Shot ids obtained from the optimization: each file lists the shots whose
# inclusion decreases alpha_R to the value given in the file name.
min_subset_ids_6357 = pd.read_csv(path+"R_ids_alpha_0.6357.csv")
min_subset_ids_9998 = pd.read_csv(path+"R_ids_alpha_0.9998.csv")

DB2 = pd.read_csv(path+"DB2P8.csv")
DB5 = pd.read_csv(path+"SELDB5_SVD.csv", low_memory=False)

# Setting ELMy dataset: keep only the ELMy phases
DB5 = DB5[DB5["PHASE"].isin(['HGELM', 'HSELM', 'HGELMH', 'HSELMH'])]

# Removing spherical tokamaks
DB5 = DB5[~DB5["TOK"].isin(['START','MAST','NSTX'])]

# Some shots from DB2P8 are missing in DB5 (two, per the original note);
# append them. ignore_index=True also gives DB5 a fresh, unique 0..n-1 index.
missing_shots = DB2[~DB2["id"].isin(DB5["id"])].reset_index(drop=True)
DB5 = pd.concat([DB5, missing_shots], axis=0, ignore_index=True)

# Labeling shots that had great impact in decreasing alpha_R: 1 if the shot id
# is listed in the corresponding subset file, 0 otherwise. Each column is
# inserted at position 2, so label_9998 ends up just before label_6357.
for label_col, subset_ids in (("label_6357", min_subset_ids_6357),
                              ("label_9998", min_subset_ids_9998)):
    is_in_subset = DB5["id"].isin(subset_ids["id"])
    DB5.insert(loc=2, column=label_col, value=is_in_subset.astype("int64"))

# Constant column of ones, placed first
DB5.insert(0, 'intercept', np.ones(len(DB5)))

# Share of DB5 covered by each subset.
# NOTE(review): the share is computed from the length of the id file, so it
# assumes every listed id occurs exactly once in DB5 -- confirm.
report_sections = []
for alpha, subset_ids in (("0.6357", min_subset_ids_6357),
                          ("0.9998", min_subset_ids_9998)):
    affected = len(subset_ids)/len(DB5)
    report_sections.append(
        f"  Subset that decrease alpha-R to {alpha}\n--------\n"
        f"{ round(affected*100, 2) }% affected alpha_R\n"
        f"{ round((1 - affected)*100, 2) }% did not affect alpha_R"
    )

print("\n\n\n".join(report_sections))

  Subset that decrease alpha-R to 0.6357
--------
24.37% affected alpha_R
75.63% did not affect alpha_R


  Subset that decrease alpha-R to 0.9998
--------
10.27% affected alpha_R
89.73% did not affect alpha_R


In [4]:
# Data sets on which the F-tests are run, in the order used for the columns
# of the results table.
in_DB2 = DB5["id"].isin(DB2["id"])  # rows of DB5 whose id also appears in DB2

all_cases = [
    DB2.copy(),                              # 0: DB2
    DB5.copy(),                              # 1: DB5
    DB5[(DB5["label_6357"] == 1) | in_DB2],  # 2: shots labelled for alpha_R 0.6357, plus DB2 shots
    DB5[DB5["label_6357"] == 0],             # 3: shots not labelled for alpha_R 0.6357
    DB5[(DB5["label_9998"] == 1) | in_DB2],  # 4: shots labelled for alpha_R 0.9998, plus DB2 shots
    DB5[DB5["label_9998"] == 0],             # 5: shots not labelled for alpha_R 0.9998
]

def get_FPvalues(full_model, reduced_model):
    """Compare two fitted models with ``anova_lm`` and return the F-test result.

    Parameters
    ----------
    full_model
        Fitted model accepted by ``anova_lm``; passed as the second (larger)
        model of the comparison.
    reduced_model
        Fitted model accepted by ``anova_lm``; passed as the first (smaller)
        model of the comparison.

    Returns
    -------
    tuple
        ``(F, p)``: the ``"F"`` and ``"Pr(>F)"`` entries of the second row of
        the ANOVA table, i.e. the row describing ``full_model`` against
        ``reduced_model``.
    """
    # Build the ANOVA table once and read both statistics from the same row,
    # instead of running the comparison twice.
    comparison = anova_lm(reduced_model, full_model).iloc[1]
    return comparison["F"], comparison["Pr(>F)"]

In [18]:
# Creating table with the F-statistic and p-value per case: one row per
# feature and one (FV{i}, PV{i}) column pair per entry of all_cases.
# The labels are derived from len(all_cases) so they always match the shape.
TESTS = pd.DataFrame(
    np.zeros((len(features), len(all_cases)*2)),
    columns=[f"{stat}V{i}" for i in range(len(all_cases)) for stat in ("F", "P")],
    index=features,
)

In [19]:
# Final column labels, one (F, P) pair per case in all_cases.
c = ["F_DB2", "P_DB2", "F_DB5", "P_DB5", "F_d0.64", "P_d0.64", # (d): decreasing; (u): unaffected
                                         "F_u0.64", "P_u0.64", "F_d0.99", "P_d0.99",
                                                               "F_u0.99", "P_u0.99"]

for i, data_ in enumerate(all_cases):

    # get_ECT_regression takes the regression in log-space with intercept.
    # The full fit depends only on the data set, so it is done once per case.
    full_regression = get_ECT_regression(data_)

    # Address the pair of columns of case i by position, so the cell can be
    # re-run after the columns have been renamed below.
    f_col, p_col = TESTS.columns[2*i], TESTS.columns[2*i + 1]

    # `removed` (not `f`) so the loop does not overwrite scipy.stats.f
    for removed in features:

        reduced_features = features.copy()
        reduced_features.remove(removed)

        reduced_regression = get_ECT_regression(data_, features=reduced_features)

        # F-test of the full fit against the fit without `removed`
        f_value, p_value = get_FPvalues(full_regression, reduced_regression)
        TESTS.loc[removed, f_col] = np.round(f_value, 3)
        TESTS.loc[removed, p_col] = np.round(p_value, 3)

TESTS.index.name = "removed"
TESTS.columns = c

In [20]:
# Show the table: one row per removed feature, one F/P column pair per case.
TESTS

Unnamed: 0_level_0,F_DB2,P_DB2,F_DB5,P_DB5,F_d0.64,P_d0.64,F_u0.64,P_u0.64,F_d0.99,P_d0.99,F_u0.99,P_u0.99
removed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
IP,961.584,0.0,5181.435,0.0,4152.285,0.0,2270.725,0.0,2047.643,0.0,3958.4,0.0
BT,138.994,0.0,4.793,0.029,0.103,0.748,167.093,0.0,25.811,0.0,3.127,0.077
NEL,494.275,0.0,231.234,0.0,99.348,0.0,1770.864,0.0,1.323,0.25,596.254,0.0
PLTH,2674.968,0.0,9305.986,0.0,2038.386,0.0,13804.401,0.0,1634.317,0.0,10291.976,0.0
RGEO,2022.427,0.0,2092.035,0.0,236.783,0.0,5458.332,0.0,457.264,0.0,2987.743,0.0
KAREA,94.667,0.0,13.337,0.0,0.762,0.383,240.419,0.0,12.947,0.0,44.857,0.0
EPS,131.137,0.0,0.132,0.717,145.325,0.0,375.206,0.0,13.629,0.0,37.333,0.0
MEFF,36.507,0.0,192.298,0.0,27.408,0.0,188.627,0.0,15.544,0.0,210.307,0.0


In [21]:
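# Export disabled so re-running the notebook does not overwrite the file; uncomment to save the table.
#TESTS.to_csv(path+"ANOVA_tests.csv")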