In [None]:
# Enable IPython's autoreload extension; mode 2 re-imports edited modules
# automatically, so changes to imported .py files are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import scipy as sp
import statsmodels.api as sm
import matplotlib.patches as mpatches

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedStratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_recall_curve, precision_recall_fscore_support

# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None

# Folder holding the input CSV files. Kept as a plain string with a trailing
# slash because file names are appended to it with `+` in the loading cell.
path = "../data/"

In [None]:
# Ids of the shots selected by the optimization step (obtained from Optimization).
min_subset_ids = pd.read_csv(path + "id_vs_frequency_decreasing_ds.csv")

DB2 = pd.read_csv(path + "DB2P8.csv")
DB5 = pd.read_csv(path + "STDB5_all_cols.csv", low_memory=False)

# Some DB2P8 shots are missing from DB5: append them so DB5 contains both sets.
missing_shots = DB2[~DB2.id.isin(DB5.id.values)].reset_index(drop=True)
DB5 = pd.concat([DB5, missing_shots], axis=0, ignore_index=True)

# Label shots that had great impact in decreasing alpha_R:
# 1 when the shot id appears in `min_subset_ids`, 0 otherwise.
# Built in one step from a boolean mask instead of inserting zeros and
# overwriting rows through an index lookup.
DB5.insert(
    loc=2,
    column="label",
    value=DB5.id.isin(min_subset_ids.id).astype("int64"),
)

# Report the class balance from the labels actually assigned. Using
# len(min_subset_ids) would misreport the share whenever the id list contains
# duplicates or ids that are not present in DB5.
positive_share = float(DB5["label"].mean())
print(
    f"{ round( positive_share*100       ,2)  }% of the data decreased alpha_R\n" +
    f"{ round( (1 - positive_share)*100 ,2)  }% of the data did not decrease alpha_R"
)

### INFORMATION IN THE DATABASE

In [None]:
# Column names of the database, grouped by topic.
# Author's note: DWMHD = DWDIA

plasma_characteristics = [
    "QCYL5", "FBS", "BEIMHD", "PREMAG", "LHTIME", "HYBRID",
    "CONFIG", "DWDIA", "WMHD", "WROT", "TORQ",
]

TOK_characteristics = [
    "TOK_ID", "DIVNAME", "WALMAT", "DIVMAT", "LIMMAT", "PMAIN", "PDIV",
]

# Description of transport barrier
transport = ["ITBTYPE"]

ELM = ["ELMTYPE", "ELMFREQ"]

heating = ["PECRH", "PICRH", "ICSCHEME", "AUXHEAT"]

impurities = ["EVAP", "ZEFF", "ZEFFNEO", "PRAD", "POHM", "ENBI", "OMGAIMP0"]

# corrections on power loss | NBI Power lost by unconfined orbits
power = ["PLTH", "PFLOSS"]

physical_variables = ["RHOSTAR", "BETASTAR", "NUSTAR"]

# Note: "PLTH" is listed here as well as in `power`.
engineering_variables = [
    "KAREA", "EPS", "NEL", "IP", "MEFF", "BT", "PLTH", "RGEO",
]

temperatures = ["TAV", "TEV", "TIV"]

# electron density in SOL | perpendicular | parallel | total | total calcs fast ions
fast_particles = [
    "NESOL", "WFPER", "WFPAR", "WFFORM", "WFICFORM", "WFANIIC",
]

In [None]:
# Two candidate feature sets; they share NEL, TAV, RHOSTAR, NUSTAR and BETASTAR.
entropy_features = [
    "RHOSTAR", "BETASTAR", "KAREA", "EPS",
    "NEL", "TAV", "QCYL5", "NUSTAR",
]
research_features = [
    "TAUTH", "NEL", "TAV", "BT",
    "RHOSTAR", "NUSTAR", "BETASTAR",
]

### TREATMENT OF CATEGORICAL DATA

### RANDOM FOREST IMPLEMENTATION

In [None]:
# Feature columns fed to the classifier.
# NOTE(review): `features` was not defined anywhere in the notebook, so this cell
# raised NameError on a fresh kernel (Restart & Run All). The plot title used
# further down mentions the entropy variables, so `entropy_features` is assumed
# here -- confirm that this is the intended list.
features = list(entropy_features)

# Binary target as a flat 1-D array.
y = DB5["label"].to_numpy()

# Split BEFORE scaling so the scaler statistics come from the training rows only
# (fitting the scaler on the full dataset leaks test-set information).
# The split depends only on `y` and the seed, so the same rows land in each part.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    DB5[features], y, test_size=0.3, random_state=71, stratify=y
)

scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full scaled dataset, kept under the original names for any later use.
DB5_ = pd.DataFrame(scaler.transform(DB5[features]), columns=features)
X = DB5_.to_numpy()

# create the model | Default Configuration (fixed seed for reproducibility)
model = RandomForestClassifier(random_state=71)
model.fit(X_train, y_train)

# Class probabilities for the test rows, shape (n_test, n_classes).
y_pred = model.predict_proba(X_test)

In [None]:
# Accuracy on the training split first, then on the held-out test split.
for split_X, split_y in ((X_train, y_train), (X_test, y_test)):
    print(model.score(split_X, split_y))

In [None]:
# Keep probabilities for the positive outcome only (second predict_proba column).
# Stored under a new name instead of overwriting `y_pred`: the original
# `y_pred = y_pred[:, 1]` made this cell fail with IndexError when run twice.
y_score = y_pred[:, 1]

# calculate pr-curve
precision, recall, thresholds = precision_recall_curve(y_test, y_score)

# Baseline: fraction of positive samples in the test split, drawn as the "No Skill" line.
no_skill = len(y_test[y_test==1]) / len(y_test)

# plot the precision-recall curve for the model
plt.plot([0,1], [no_skill,no_skill], linestyle='--', label='No Skill')
plt.plot(recall, precision, marker='.', label='Default Random Forest', c="g")
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title("Updated Entropy Variables")
plt.legend();

In [None]:
# Hard class predictions for the test split (as opposed to the probabilities in `y_pred`).
y_pred_ = model.predict(X_test)

# Precision, recall, F-score and support per class, reported in the order
# given by `labels` (class 1 first, then class 0). Left as the last expression
# so the notebook displays the result.
precision_recall_fscore_support(
    y_true=y_test,
    y_pred=y_pred_,
    labels=[1, 0],
)