In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('../')
import tokamakTK
from tokamakTK import get_ECT_regression, HUEOrder, get_pi_matrix, clean_numerical_data, MyCounter

import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import scipy as sp
import statsmodels.api as sm
import matplotlib.patches as mpatches
import matplotlib.colors as mcolors
import seaborn as sns
import plotly.express as px
import plotly.subplots as plsp
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from collections import Counter

# Notebook-wide configuration.

# Do not truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
# Use a serif font family for matplotlib text.
plt.rc('font',family = 'serif')
# Directory prefix for every CSV file read by the cells below.
path = "../data/"
# NOTE(review): presumably the output folder for exported figures; it is not
# used in the cells visible here -- confirm against the rest of the notebook.
fig_path = "../../../LATEX/Latex Images/"

# Column names selected (together with "TAUTH") from DB5 to build the frames
# handed to get_ECT_regression.
coeffs = ['IP', 'BT', 'NEL', 'PLTH', 'RGEO', 'KAREA', 'EPS', 'MEFF']

In [3]:
# Load the shot-id subsets and the two databases, restrict DB5 to the ELMy
# phases, label which DB5 rows belong to each subset, and print what fraction
# of DB5 each subset represents.

# Shot-id subsets obtained from optimization (each file is read for its `id` column).
min_subset_ids_6357 = pd.read_csv(path + "R_ids_alpha_0.6357.csv")
min_subset_ids_9998 = pd.read_csv(path + "R_ids_alpha_0.9998.csv")
min_subset_ids_joe = pd.read_csv(path + "deviation_id.csv")

DB2 = pd.read_csv(path + "DB2P8.csv")
DB5 = pd.read_csv(path + "SELDB5_SVD.csv", low_memory=False)

# Setting ELMy Dataset
DB5 = DB5[DB5["PHASE"].isin(['HGELM', 'HSELM', 'HGELMH', 'HSELMH'])]

# Removing Spherical TOKAMAKS
#DB5 = DB5[~DB5["TOK"].isin(['START','MAST','NSTX'])]

# Some shots from DB2P8 are missing in DB5: append them so that every DB2 id
# is present. `ignore_index=True` leaves DB5 with a fresh, unique integer index.
missing_shots = DB2[~DB2.id.isin(DB5.id.values)].reset_index(drop=True)
DB5 = pd.concat([DB5, missing_shots], axis=0, ignore_index=True)

# (label column, id subset, report heading) -- processed in this order so the
# label columns end up in the same positions as before (each is inserted at
# position 2, pushing the previously inserted ones to the right).
subset_labels = [
    ("label_6357", min_subset_ids_6357, "  Subset that decrease alpha-R to 0.6357"),
    ("label_9998", min_subset_ids_9998, "  Subset that decrease alpha-R to 0.9998"),
    ("label_joe", min_subset_ids_joe, "  Subset given by Joseph Hall           "),
]

report_sections = []
for label_col, subset_ids, heading in subset_labels:
    # Every shot starts as "Unaffected"; shots whose id is in the subset are
    # flagged "Decreasing" (they had great impact in decreasing alpha_R).
    DB5.insert(loc=2, column=label_col, value="Unaffected")
    DB5.loc[DB5.id.isin(subset_ids.id), label_col] = "Decreasing"

    # Fraction is taken as (rows in the id file) / (rows in DB5), exactly as
    # the report was computed before.
    affected_fraction = len(subset_ids) / len(DB5)
    report_sections.append(
        f"{heading}\n--------\n"
        f"{ round( affected_fraction*100       ,2)  }% affected alpha_R\n"
        f"{ round( (1 - affected_fraction)*100 ,2)  }% did not affect alpha_R"
    )

# Constant column of ones, placed first.
DB5.insert(0, 'intercept', np.ones(len(DB5)))

print("\n\n\n".join(report_sections))

  Subset that decrease alpha-R to 0.6357
--------
23.45% affected alpha_R
76.55% did not affect alpha_R


  Subset that decrease alpha-R to 0.9998
--------
9.88% affected alpha_R
90.12% did not affect alpha_R


  Subset given by Joseph Hall           
--------
18.46% affected alpha_R
81.54% did not affect alpha_R


In [8]:
# For `amount` evenly spaced sampling sizes, find the row of the random-sampling
# results where the corresponding `subset_<size>` column is smallest, and record
# the seed stored on that row.
amount = 20
small_dataset = pd.read_csv(path+"decreased_dataset_random_sampling_500_decreasing_points.csv")

# Sampling sizes: `amount` points from 1 to (number of columns - 1), truncated
# to integers. Computed once so the column names and `info["sampling"]` can
# never disagree.
sampling_sizes = np.linspace(1, len(small_dataset.columns) - 1, amount).astype(int)
samples = [f"subset_{size}" for size in sampling_sizes]

# Per-column minimum and the index label of its first occurrence.
values = small_dataset[samples].min()
indxs = small_dataset[samples].idxmin().tolist()

# One row per sampling size: the size itself and the seed of the minimising row.
info = pd.DataFrame({
    "sampling": sampling_sizes,
    "seed": small_dataset.loc[indxs, "seed"].to_numpy().astype(int),
})

In [9]:
DB5_decreas_pts = pd.read_csv(path + "decreasing_dataset_info.csv")

# Draw one weighted sample per row of `info`, using that row's size and seed.
# Rows are weighted by the "decreased" column:
#   - weights on "decreased": sample the decreasing points of the decreasing
#     dataset (smaller alpha_R)
#   - weights on "weights":   sample the complete dataset only (higher alpha_R)
IDs = [
    DB5_decreas_pts.sample(
        n=info["sampling"][k],
        weights='decreased',
        random_state=info["seed"][k],
    )
    for k in range(amount)
]

# Last entry: the optimized subset with alpha_R of approx 0.63.
IDs.append(min_subset_ids_6357)

In [10]:
# One regression frame per entry of IDs: the DB5 rows whose id is either in
# that subset or in DB2, restricted to the regression columns.
regression_columns = coeffs + ["TAUTH"]
# Membership in DB2 does not depend on the subset, so compute it once.
in_DB2 = DB5.id.isin(DB2.id.values)
DF = [
    DB5.loc[DB5.id.isin(subset.id.values) | in_DB2, regression_columns]
    for subset in IDs
]

In [11]:
# "RGEO" entry of the fitted parameters for each frame, in DF order.
[get_ECT_regression(frame).params["RGEO"] for frame in DF]

[2.193381295547305,
 1.8413459046650524,
 1.6257035103070592,
 1.4502291354609658,
 1.3319700443067546,
 1.2232065794708968,
 1.1429833582859634,
 1.0635431828948825,
 0.9926216010787007,
 0.9408375289922396,
 0.8907058348274953,
 0.8433327529157967,
 0.8141162063836194,
 0.7765526617447575,
 0.7319295717685702,
 0.7082993866348661,
 0.6815782756548103,
 0.6617309392275936,
 0.6515977868422362,
 0.6558006008429011,
 0.6357571952782841]