**IMPORTANT**: What is shown here was developed after my thesis defence. It is just a different approach, meant to reinforce my understanding of the topic.

### Note to self: to make `cupy` work: `cthe`, `cnb`

In [1]:
%load_ext autoreload
%autoreload 2

In [14]:
import sys
sys.path.append('../../')
import tokamakTK
from tokamakTK import MyCounter, HUEOrder

import pydotplus
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import scipy as sp
import statsmodels.api as sm
import matplotlib.patches as mpatches

import plotly.express as px
import plotly.subplots as plsp
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.inspection import permutation_importance
from sklearn.tree import plot_tree
from sklearn.tree import export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV,StratifiedKFold
from sklearn.metrics import precision_recall_curve, precision_recall_fscore_support

# Show every column when a wide DataFrame is rendered.
pd.set_option("display.max_columns", None)
# Use a serif font for all matplotlib text.
plt.rc("font", family="serif")

# Directory holding the CSV inputs, relative to this notebook.
path = "../../data/"

In [3]:
# Build the labelled ELMy dataset.
# The id list below was obtained from a previous optimization run.
min_subset_ids = pd.read_csv(path + "R_ids_alpha_0.6357.csv")

DB2 = pd.read_csv(path + "DB2P8.csv")
DB5 = pd.read_csv(path + "SELDB5_SVD.csv", low_memory=False)

# Setting ELMy dataset: keep only the ELMy phases.
DB5 = DB5[DB5["PHASE"].isin(['HGELM', 'HSELM', 'HGELMH', 'HSELMH'])]

# There are two shots from DB2P8 missing in DB5; append every DB2 row whose
# id is not already present.
missing_shots = DB2[~DB2.id.isin(DB5.id.values)].reset_index(drop=True)
DB5 = pd.concat([DB5, missing_shots], axis=0, ignore_index=True)

# Label the shots that had great impact in decreasing alpha_R:
# 1 if the id is listed in min_subset_ids, 0 otherwise.
DB5.insert(loc=2, column="label", value=0)
DB5.loc[DB5.id.isin(min_subset_ids.id), "label"] = 1

# Report the class balance from the rows that were actually labelled, so the
# figure stays correct even if an id in the CSV is duplicated or has no match
# in DB5.
labelled_fraction = int(DB5["label"].sum()) / len(DB5)

print(
    f"{round(labelled_fraction * 100, 2)}% of the data decreased alpha_R\n" +
    f"{round((1 - labelled_fraction) * 100, 2)}% of the data did not decrease alpha_R"
)

23.45% of the data decreased alpha_R
76.55% of the data did not decrease alpha_R


In [4]:
# Build the labelled ELMy dataset.
# NOTE(review): this cell repeats the previous loading cell; the only
# difference is the optional spherical-tokamak filter below, which is
# currently disabled.
# The id list below was obtained from a previous optimization run.
min_subset_ids = pd.read_csv(path + "R_ids_alpha_0.6357.csv")

DB2 = pd.read_csv(path + "DB2P8.csv")
DB5 = pd.read_csv(path + "SELDB5_SVD.csv", low_memory=False)

# Setting ELMy dataset: keep only the ELMy phases.
DB5 = DB5[DB5["PHASE"].isin(['HGELM', 'HSELM', 'HGELMH', 'HSELMH'])]

# REMOVING SPHERICAL TOKAMAKS (disabled; uncomment to apply)
#DB5 = DB5[~DB5.TOK.isin(['MAST', 'NSTX', 'START'])]

# There are two shots from DB2P8 missing in DB5; append every DB2 row whose
# id is not already present.
missing_shots = DB2[~DB2.id.isin(DB5.id.values)].reset_index(drop=True)
DB5 = pd.concat([DB5, missing_shots], axis=0, ignore_index=True)

# Label the shots that had great impact in decreasing alpha_R:
# 1 if the id is listed in min_subset_ids, 0 otherwise.
DB5.insert(loc=2, column="label", value=0)
DB5.loc[DB5.id.isin(min_subset_ids.id), "label"] = 1

# Report the class balance from the rows that were actually labelled, so the
# figure stays correct even if an id in the CSV is duplicated or has no match
# in DB5.
labelled_fraction = int(DB5["label"].sum()) / len(DB5)

print(
    f"{round(labelled_fraction * 100, 2)}% of the data decreased alpha_R\n" +
    f"{round((1 - labelled_fraction) * 100, 2)}% of the data did not decrease alpha_R"
)

23.45% of the data decreased alpha_R
76.55% of the data did not decrease alpha_R


In [5]:
# Feature subset: low entropy and low multicollinearity.
# The order is kept as-is because it determines the column order downstream.
features = [
    'WFICFORM', 'WFFORM', 'RHOSTAR',
    'ZEFFNEO', 'DWDIA', 'BETASTAR',
    'NUSTAR', 'PFLOSS', 'Q95',
]

In [6]:
# Clean the categorical columns of the whole table first.
DB5 = tokamakTK.clean_categorical_data(DB5)

# Partition the selected features by dtype; each group is cleaned separately.
num_features = list(DB5[features].select_dtypes(include=['int', 'float']).columns)
cat_features = list(DB5[features].select_dtypes(include=['object']).columns)

# TOK and DATE are passed to the numeric cleaner together with the numeric
# features; both scaling options are switched off.
data_num = tokamakTK.clean_numerical_data(
    DB5[num_features + ["TOK", "DATE"]],
    SS_scaling=False,
    UL_scale=False,
)
data_cat = DB5[cat_features]

# Assemble the modelling table column-wise: the cleaned numeric block first,
# then the target followed by the encoded categorical block.
label_and_cat = pd.concat(
    [DB5[["label"]], tokamakTK.encode_categorical_ohe(data_cat)],
    axis=1,
)
data_ = pd.concat([data_num, label_and_cat], axis=1)

In [7]:
# Separate the target column from the predictors.
y = data_["label"]
X = data_.drop(columns="label")

In [8]:
# Two-stage stratified split, same seed for both stages:
#   1. hold out 20% of the rows as the test set;
#   2. split the remaining rows in half into train and eval.
X_dev, X_test, y_dev, y_test = train_test_split(
    X, y,
    test_size=0.2, random_state=71, stratify=y,
)
X_train, X_eval, y_train, y_eval = train_test_split(
    X_dev, y_dev,
    test_size=0.5, random_state=71, stratify=y_dev,
)

In [10]:
# Fraction of all rows that landed in each split, as (train, eval, test).
tuple(len(split) / len(X) for split in (X_train, X_eval, X_test))

(0.399872040946897, 0.40003198976327575, 0.20009596928982726)

In [15]:
# Hyperparameter grid for the random forest.
parameters = {
    # 180, 190, ..., 220
    'n_estimators': list(range(180, 221, 10)),
    # Same values as int(np.linspace(15, 25, 5)): the truncation maps
    # 17.5 -> 17 and 22.5 -> 22, so the list is spelled out explicitly.
    'max_depth': [15, 17, 20, 22, 25],
    # Not searched: 'min_impurity_decrease': [0.0004, 0.0005, 0.0006, 0.0007]
    'min_samples_split': [2, 3, 4, 5],
    'max_features': [None, len(features), "sqrt", "log2"],
    # "log_loss" is left out on purpose: scikit-learn documents it as the same
    # Shannon information gain criterion as "entropy", so with a fixed
    # random_state it only re-fits forests identical to the "entropy" ones.
    'criterion': ["gini", "entropy"],
    "bootstrap": [True, False],
}

# Parallelism is requested on the forest (n_jobs=-1); GridSearchCV keeps its
# default n_jobs, so candidates are evaluated one after another.
model = RandomForestClassifier(random_state=71, n_jobs=-1)

grid_search = GridSearchCV(
    model,
    parameters,
    cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=71),
    scoring='recall',  # candidates are ranked by recall
    refit=True,
)

In [16]:
# Run the cross-validated hyperparameter search on the eval split.
grid_search.fit(X=X_eval, y=y_eval)

In [17]:
# Winning hyperparameters and their mean cross-validated score.
(grid_search.best_params_, grid_search.best_score_)

({'bootstrap': False,
  'criterion': 'gini',
  'max_depth': 20,
  'max_features': 'sqrt',
  'min_samples_split': 2,
  'n_estimators': 220},
 0.7325248392752777)

```python
>>> grid_search.best_params_, grid_search.best_score_ 
({'bootstrap': False,
  'criterion': 'gini',
  'max_depth': 20,
  'max_features': 'sqrt',
  'min_samples_split': 2,
  'n_estimators': 220},
 0.7325248392752777)
```

In [18]:
# Number of rows available for the final fit (eval + train).
# Sum the lengths directly: `X_eval + X_train` performs index-aligned,
# element-wise addition of the two feature tables just to count rows.
len(X_eval) + len(X_train)

5001

In [None]:
# Refit the forest with the best hyperparameters on eval + train combined.
model.set_params(**grid_search.best_params_)

# drop=True discards the old row labels. Without it, reset_index() adds an
# "index" column to X (a spurious feature) and turns y into a two-column
# DataFrame, which scikit-learn would treat as a multi-output target.
model.fit(
    pd.concat([X_eval, X_train]).reset_index(drop=True),
    pd.concat([y_eval, y_train]).reset_index(drop=True),
)