In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import scipy as sp
import statsmodels.api as sm
import matplotlib.patches as mpatches

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer
from yellowbrick.cluster import SilhouetteVisualizer

from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN
from collections import Counter
from kneebow.rotor import Rotor

pd.set_option('display.max_columns', None)
path = "../../data/"

### From the scaling law...

* Thermal energy confinement time: $\tau_{E,th} = W_{th}/P_{l,th}$ [s]
    - Thermal stored energy: $W_{th}$
* Intercept of the regression: $\alpha_0$    
* Plasma current: $I_P$ [MA]
* Vacuum toroidal magnetic field: $B_t$ [T]
* Central line-averaged electron density: $\bar{n}_e$ [$\cdot$10$^{19}$ m $^{-3}$] 
* Thermal power lost due to the transport through the LCFS: $P_{l,th}$ [MW]
* Major radius: $R_{geo}$ [m]
* Elongation of the last-closed flux surface: $\kappa = V/(2\pi R_{geo}\pi a^2)$ 
    - Plasma volume inside the LCFS: V [m$^{3}$]
    - Minor radius: $a$ [m] 
* Inverse aspect ration: $\epsilon = a/R_{geo}$ 
    - large $\epsilon$ = spherical tokamak (NSTX and MAST)
* Effective atomic mass of the plasma $M_{eff}$

$$
    \tau_{E,th} = \alpha_0\cdot I_P^{\alpha_I}\cdot B^{\alpha_B}_t\cdot \bar{n}^{\alpha_n}_e  \cdot P^{\alpha_P}_{l,th}\cdot R^{\alpha_R}_{geo}\cdot \kappa^{\alpha_\kappa}_a\cdot \epsilon^{\alpha_\epsilon}\cdot M^{\alpha_M}_{eff}
$$

It is possible to express $\tau_{E,th}$ in terms of four dimensional variables $\tau_{E,th} = \tau_{E,th}(\hat{n}, \hat{T}, R, B)$ only ; thus, $\omega_{c}\tau_{E,th}$ is expected to scale with $(\rho_*, \nu_*, \beta)$. These analyses are referred as Kadomtsev's transformation and Kadomtsev's constraint.

[[Y. Sarazin et al, 2019]](https://iopscience.iop.org/article/10.1088/1741-4326/ab48a5/meta).


Reading [HDB5V2.3](https://dataspace.princeton.edu/handle/88435/dsp01m900nx49h) database.

In [3]:
DB5 = pd.read_excel("../../data/HDB5V2.3.xlsx")

In [4]:
# Reading data and preparing it

DB5_ = pd.read_csv(path+"DB5.csv") #From DB5 dataset | log data
#subset_ids = pd.read_csv(path+"R_ids_alpha_0.6556.csv")
#DB5_ = (DB5_[DB5_.id.isin(subset_ids.id.values)]).reset_index(drop=True)

In [5]:
DB5_

Unnamed: 0,ind,id,PHASE,TOK,IP,BT,NEL,PLTH,RGEO,KAREA,EPS,MEFF,TAUTH,DATE,SHOT,TIME,Q95,ZEFF,AMIN,VOL,POHM,PNBI,DWDIA,DWMHD,PICRH,PECRH,PL,PFLOSS,TAV,LCOULOMB,QCYL5,TAUBOHM,RHOSTAR,BETASTAR,NUSTAR,OMEGACYCL
0,12405,HDULEH,HGELM,ASDEX,0.2959,2.205,3.789,1.8700,1.6940,0.9748,0.242090,1.5,0.05100,19820622,5980,1.205,4.563,,0.4101,5.4820,68660.0,2599000.0,212000.0,212000.0,0.0,0.0,2456000.0,585700.0,1086.986716,15.406717,3.606992,0.074970,0.006430,0.601125,0.223030,1.470000
1,12406,NAC6N1,HGELM,ASDEX,0.2952,2.205,3.734,2.0240,1.6840,0.9724,0.244240,1.5,0.04902,19820622,5980,1.224,4.671,,0.4113,5.4680,22960.0,2598000.0,0.0,0.0,0.0,0.0,2621000.0,596700.0,1150.578521,15.470883,3.647604,0.072059,0.006596,0.627056,0.195418,1.470000
2,12411,U2T1C7,HSELM,ASDEX,0.2971,2.205,3.410,1.1320,1.6930,0.9895,0.238807,1.5,0.06375,19820622,5982,1.188,4.275,,0.4043,5.4050,0.0,2590000.0,815400.0,942200.0,0.0,0.0,1690000.0,558000.0,927.006370,15.300207,3.544311,0.093713,0.006023,0.461374,0.274718,1.470000
3,12412,422XQB,HGELM,ASDEX,0.2959,2.205,3.775,1.3760,1.6930,0.9744,0.242351,1.5,0.06991,19820622,5982,1.216,4.574,,0.4103,5.4820,31630.0,2599000.0,683900.0,662500.0,0.0,0.0,1961000.0,584900.0,1100.512996,15.420935,3.611224,0.102768,0.006467,0.606357,0.216753,1.470000
4,12413,WZ9FED,HGELM,ASDEX,0.2942,2.204,3.847,2.0330,1.6850,0.9691,0.244926,1.5,0.05151,19820622,5982,1.244,4.758,,0.4127,5.4900,37030.0,2600000.0,0.0,0.0,0.0,0.0,2637000.0,604000.0,1173.621999,15.475806,3.668909,0.075685,0.006642,0.659569,0.193993,1.469333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6245,24961,NGC85H,HSELM,TDEV,-0.2198,1.963,4.919,0.6145,0.8274,1.1080,0.259971,2.0,0.01666,19980407,33119,0.800,3.203,,0.2151,0.8371,102100.0,0.0,-1906.0,-1906.0,0.0,510500.0,614500.0,0.0,588.737748,14.663031,2.793295,0.016352,0.011702,0.533324,0.319289,0.981500
6246,24963,ID1IP4,HSELM,TDEV,-0.2199,1.952,5.109,0.6358,0.8356,1.1010,0.258018,2.0,0.01710,19980408,33155,0.800,3.143,,0.2156,0.8439,125200.0,0.0,401.3,401.3,0.0,511000.0,635800.0,0.0,596.868618,14.657798,2.757587,0.016690,0.011822,0.567921,0.325224,0.976000
6247,24964,K19A5H,HSELM,TDEV,-0.2198,1.951,5.337,0.3762,0.8350,1.1030,0.257844,2.0,0.02543,19980408,33171,0.800,3.131,,0.2153,0.8426,167500.0,0.0,-1447.0,-1447.0,0.0,207200.0,376200.0,0.0,503.655066,14.466163,2.757718,0.024807,0.010880,0.501128,0.471050,0.975500
6248,26411,D26CA1,HSELM,TFTR,0.9794,4.795,3.500,5.1120,2.4540,0.9997,0.327343,2.0,0.17310,19900124,45980,4.600,8.050,,0.8033,31.2500,-732600.0,10310000.0,162800.0,162800.0,0.0,0.0,9410000.0,4297000.0,1916.017316,16.013226,6.010681,0.415007,0.002314,0.206978,0.105809,2.397500


In [6]:
#DB5["TAUTH"] = DB5.TAUTH.apply(lambda x: round(x,3)) 
db5 = DB5[["TOK","DATE","SHOT","TIME"]]. applymap(str) 
keyDB5 = (db5["TOK"] + db5["DATE"] + db5["SHOT"] + db5["TIME"] )
DB5["key"] = keyDB5

db5_ = DB5_[["TOK","DATE","SHOT","TIME"]]. applymap(str) 
keyDB5_ = (db5_["TOK"] + db5_["DATE"] + db5_["SHOT"] + db5_["TIME"] )
DB5_["key"] = keyDB5_

len( keyDB5.unique() ), len( keyDB5_.unique() )

(14153, 6250)

In [7]:
len(DB5), len(DB5_)

(14153, 6250)

In [8]:
# Complete columns for shots of interest
complete_data = DB5[DB5.key.isin(DB5_.key)]

features = ['NEL','TAUTH','BT', 'TEV', 'TE0', 'TE0TSC']

In [9]:
DB5_["PLTH"]

0       1.8700
1       2.0240
2       1.1320
3       1.3760
4       2.0330
         ...  
6245    0.6145
6246    0.6358
6247    0.3762
6248    5.1120
6249    4.9930
Name: PLTH, Length: 6250, dtype: float64

In [10]:
DB5["PLTH"]

0        315400.0
1        325500.0
2        299200.0
3             NaN
4        752300.0
           ...   
14148    120700.0
14149    121000.0
14150    142100.0
14151    125000.0
14152    144900.0
Name: PLTH, Length: 14153, dtype: float64

In [11]:
complete_data

Unnamed: 0,TOK,TOK_ID,DIVNAME,LCUPDATE,DATE,SHOT,TIME,TIME_ID,T1,T2,AUXHEAT,PHASE,HYBRID,ITB,ITBTYPE,ELMTYPE,ELMFREQ,ELMMAX,ELMDUR,ELMINT,OLTIME,LHTIME,TPI,ISEQ,MEFF,PGASA,PGASZ,BGASA,BGASZ,BGASA2,BGASZ2,PELLET,FUELRATE,XGASZ,XGASA,CONFIG,RGEO,RMAG,AMIN,KAPPA,KAPPAA,KAREA,DELTA,DELTAU,DELTAL,INDENT,AREA,VOL,SURFFORM,SEPLIM,XPLIM,WALMAT,DIVMAT,LIMMAT,EVAP,DALFMP,DALFDV,IGRADB,BT,IEML,PREMAG,IP,VSURF,Q95,SH95,BEILI2,BEIMHD,BEPMHD,BETMHD,BEPDIA,BMHDMDIA,TAUCR,FBS,RHOQ2,RHOINV,NEL,NELFORM,DNELDT,NEV,NE0,NE0TSC,ZEFF,ZEFFNEO,PRAD,POHM,ENBI,PINJ,BSOURCE,PINJ2,BSOURCE2,COCTR,PNBI,PFLOSS,ECHFREQ,ECHMODE,ECHLOC,PECRHC,PECRH,ICFREQ,ICSCHEME,ICANTEN,PICRHC,PICRH,PALPHA,DWDIA,DWDIAPAR,DWMHD,DWHC,TEV,TE0,TE0TSC,TIV,TI0,TICX0,WDIA,WMHD,WKIN,WEKIN,WIKIN,WROT,WFPER,WFPAR,WFFORM,WFANI,WFICRH,WFICRHP,WFICFORM,ICFORM,WFANIIC,TAUDIA,TAUMHD,TAUTH1,TAUTH2,WTOT,WTH,PL,PLTH,TAUTOT],TAUTH,TAUC92,TAUC93,H89,HITER96L,H93,HITER92Y,HEPS97,HIPB98Y,HIPB98Y1,HIPB98Y2,HIPB98Y3,HIPB98Y4,OMGAIMP0,OMGAIMPH,OMGAM0,OMGAMH,SPIN,TORQ,TORQBM,TORQIN,VTOR0,VTORV,VTORIMP,STANDARD,SELDB1,SELDB2,SELDB2X,IAEA92,DB2P5,DB2P8,DB3IS,DB3V5,IAE2000N,IAE2000X,HMWS2003,IAE2004S,IAE2004I,DB3DONLY,HMWS2005,OJK2006,SELDB3,SELDB3X,SELDB4,AAREA,STDDB4V5,NESEP,NESOL,PMAIN,PDIV,GP_MAIN,GP_DIV,SELDB5,STD3,key
5,ASDEX,1,DV-IPRE,19940125,19820622,5980,1.205,1205,1.203,1.207,NB,HGELM,UNKNOWN,UNKNOWN,,UNKNOWN,,,,,1.110,1.166,6.0,NONE,1.5,2.0,1,1,1,0,0,NONE,,0,0.0,DN,1.6940,1.8030,0.4101,0.9752,0.975092,0.9748,0.0000,0.0000,0.00000,0.0,0.5152,5.4820,27.088080,0.15640,,SS,TI2,NONE,NONE,,,,2.205,0.0,,295900.0,0.23200,4.563,,2.649,2.0190,2.019,0.009265,1.6670,0.4481,5.423056,,,0.277121,37890000000000000000,0.0,-19590000000000000000,30170000000000000000,51000000000000000000,,,,0.0,68660.0,42000.00,2620000.0,404020.0,0.0,0.0,1.0000,2599000.0,585700.0,0.000000e+00,NONE,NONE,0.0,0.0,0.0,NONE,NONE,0.0,0.0,0.0,212000.0,620500.0,212000.0,1.0,,,,,,,116400.0,141100.0,,,,,,,24260.0,0.5780,0.0,0.0,0.0,0.0,0.0,0.04746,0.05753,,0.05100,119600.0,95370.0,2456000.0,1870000.0,0.04871,0.05100,0.6667,0.6667,1.612,1.413,0.9272,1.103,1.126,1.150,1.119,1.132,1.130,1.0110,,,,,,,,,,,,1.0,111.0,1.111111e+09,1.111111e+09,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.111111e+09,111100.0,0.0,,,,,,,,,1,1.0,ASDEX1982062259801.205
6,ASDEX,1,DV-IPRE,19940125,19820622,5980,1.224,1224,1.222,1.226,NB,HGELM,UNKNOWN,UNKNOWN,,UNKNOWN,,,,,1.110,1.166,7.0,NONE,1.5,2.0,1,1,1,0,0,NONE,,0,0.0,DN,1.6840,1.7970,0.4113,0.9724,0.972233,0.9724,0.0000,0.0000,0.00000,0.0,0.5167,5.4680,26.969199,0.16430,,SS,TI2,NONE,NONE,,,,2.205,0.0,,295200.0,0.07778,4.671,,2.720,2.0900,2.090,0.009540,1.7510,0.4348,16.042211,,,0.273998,37340000000000000000,0.0,-44680000000000000000,29400000000000000000,52000000000000000000,,,,0.0,22960.0,42000.00,2620000.0,404020.0,0.0,0.0,1.0000,2598000.0,596700.0,0.000000e+00,NONE,NONE,0.0,0.0,0.0,NONE,NONE,0.0,0.0,0.0,0.0,-485.4,0.0,1.0,,,,,,,121100.0,144500.0,,,,,,,25220.0,0.5782,0.0,0.0,0.0,0.0,0.0,0.04628,0.05522,,0.04902,124400.0,99230.0,2621000.0,2024000.0,0.04748,0.04902,0.6667,0.6667,1.641,1.470,0.9572,1.139,1.165,1.185,1.152,1.169,1.167,1.0440,,,,,,,,,,,,1.0,111.0,1.111111e+09,1.111111e+09,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.111111e+09,111100.0,0.0,,,,,,,,,1,1.0,ASDEX1982062259801.224
11,ASDEX,1,DV-IPRE,19940125,19820622,5982,1.188,1188,1.186,1.190,NB,HSELM,UNKNOWN,UNKNOWN,,UNKNOWN,,,,,1.115,1.165,5.0,NONE,1.5,2.0,1,1,1,0,0,NONE,,0,0.0,DN,1.6930,1.7870,0.4043,0.9895,0.989248,0.9895,0.0000,0.0000,0.00000,0.0,0.5080,5.4050,26.880691,0.16230,,SS,TI2,NONE,NONE,,,,2.205,0.0,,297100.0,-0.05525,4.275,,2.289,1.6600,1.660,0.007679,1.3130,0.4427,22.850781,,,0.282055,34100000000000000000,0.0,192600000000000000000,28300000000000000000,45000000000000000000,,,,0.0,0.0,42000.00,2620000.0,404020.0,0.0,0.0,1.0000,2590000.0,558000.0,0.000000e+00,NONE,NONE,0.0,0.0,0.0,NONE,NONE,0.0,0.0,0.0,815400.0,1187000.0,942200.0,1.0,,,,,,,92490.0,117000.0,,,,,,,23670.0,0.5724,0.0,0.0,0.0,0.0,0.0,0.05230,0.07126,,0.06375,95840.0,72170.0,1690000.0,1132000.0,0.05671,0.06375,0.6667,0.6667,1.564,1.261,0.8309,1.016,1.040,1.084,1.051,1.039,1.037,0.9231,,,,,,,,,,,,0.0,1.0,1.111101e+09,1.111011e+09,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.111111e+09,111100.0,0.0,,,,,,,,,1,1.0,ASDEX1982062259821.188
12,ASDEX,1,DV-IPRE,19940125,19820622,5982,1.216,1216,1.214,1.218,NB,HGELM,UNKNOWN,UNKNOWN,,UNKNOWN,,,,,1.115,1.165,6.0,NONE,1.5,2.0,1,1,1,0,0,NONE,,0,0.0,DN,1.6930,1.8020,0.4103,0.9748,0.974708,0.9744,0.0000,0.0000,0.00000,0.0,0.5155,5.4820,27.079877,0.15700,,SS,TI2,NONE,NONE,,,,2.205,0.0,,295900.0,0.10690,4.574,,2.658,2.0280,2.028,0.009310,1.6810,0.4425,11.762454,,,0.276798,37750000000000000000,0.0,69950000000000000000,30820000000000000000,49000000000000000000,,,,0.0,31630.0,42000.00,2620000.0,404020.0,0.0,0.0,1.0000,2599000.0,584900.0,0.000000e+00,NONE,NONE,0.0,0.0,0.0,NONE,NONE,0.0,0.0,0.0,683900.0,659500.0,662500.0,1.0,,,,,,,117400.0,141600.0,,,,,,,24490.0,0.5772,0.0,0.0,0.0,0.0,0.0,0.06040,0.07206,,0.06990,120700.0,96200.0,1961000.0,1376000.0,0.06154,0.06991,0.6667,0.6667,1.821,1.553,1.0370,1.242,1.264,1.303,1.260,1.259,1.257,1.1210,,,,,,,,,,,,1.0,111.0,1.111111e+09,1.111011e+09,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.111111e+09,111100.0,0.0,,,,,,,,,1,1.0,ASDEX1982062259821.216
13,ASDEX,1,DV-IPRE,19940125,19820622,5982,1.244,1244,1.242,1.246,NB,HGELM,UNKNOWN,UNKNOWN,,UNKNOWN,,,,,1.115,1.165,7.0,NONE,1.5,2.0,1,1,1,0,0,NONE,,0,0.0,DN,1.6850,1.8020,0.4127,0.9690,0.969012,0.9691,0.0000,0.0000,0.00000,0.0,0.5185,5.4900,27.031096,0.16190,,SS,TI2,NONE,NONE,,,,2.204,0.0,,294200.0,0.12590,4.758,,2.809,2.1780,2.178,0.009887,1.8420,0.4313,9.883040,,,0.272429,38470000000000000000,0.0,-39920000000000000000,29930000000000000000,54000000000000000000,,,,0.0,37030.0,42000.00,2620000.0,404020.0,0.0,0.0,1.0000,2600000.0,604000.0,0.000000e+00,NONE,NONE,0.0,0.0,0.0,NONE,NONE,0.0,0.0,0.0,0.0,140300.0,0.0,1.0,,,,,,,126600.0,149700.0,,,,,,,25150.0,0.5800,0.0,0.0,0.0,0.0,0.0,0.04806,0.05683,,0.05151,129900.0,104700.0,2637000.0,2033000.0,0.04925,0.05151,0.6667,0.6667,1.707,1.538,1.0090,1.194,1.218,1.238,1.202,1.221,1.219,1.0900,,,,,,,,,,,,1.0,111.0,1.111111e+09,1.111111e+09,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.111111e+09,111100.0,0.0,,,,,,,,,1,1.0,ASDEX1982062259821.244
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12561,TDEV,14,NONAME,19990916,19980407,33119,0.800,800,,,EC,HSELM,UNKNOWN,UNKNOWN,,UNKNOWN,,9.023000e+19,0.000341,4.436000e+16,,,0.0,NONE,2.0,2.0,1,0,0,0,0,NONE,,0,0.0,SN,0.8274,0.8511,0.2151,1.0890,1.118636,1.1080,0.1473,0.2460,0.04853,0.0,0.1626,0.8371,7.345441,0.05010,0.07184,CSS,CC,C,BOROC,,,1.0,1.963,0.0,NO,-219800.0,-0.46450,3.203,,1.211,0.6313,,0.006282,0.4250,,0.982723,,,0.366410,49190000000000000000,0.0,-15540000000000000000,42890000000000000000,55000000000000000000,,,,115700.0,102100.0,0.00,0.0,0.0,0.0,0.0,,0.0,0.0,1.100000e+11,X,OUT,510500.0,510500.0,0.0,NONE,NONE,0.0,0.0,0.0,-1906.0,,-1906.0,0.0,880.5,3105.0,2678.0,,,,8007.0,11890.0,,8586.0,,,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,0.0,0.01324,0.01941,,,10240.0,10240.0,614500.0,614500.0,0.01666,0.01666,1.0000,1.0000,1.248,1.211,0.8807,1.155,1.149,1.163,1.159,1.047,1.155,0.9986,,,,,,,,,,,,0.0,0.0,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.111111e+09,11111100.0,0.0,,,,,,,,,1,1.0,TDEV19980407331190.8
12563,TDEV,14,NONAME,19990916,19980408,33155,0.800,800,,,EC,HSELM,UNKNOWN,UNKNOWN,,UNKNOWN,,8.640000e+19,0.000450,4.079000e+16,,,0.0,NONE,2.0,2.0,1,0,0,0,0,NONE,,0,0.0,SN,0.8356,0.8565,0.2156,1.0780,1.116878,1.1010,0.1448,0.2247,0.06482,0.0,0.1631,0.8439,7.394830,0.05115,0.07549,CSS,CC,C,BOROC,,,1.0,1.952,0.0,NO,-219900.0,-0.56920,3.143,,1.251,0.6918,,0.007014,0.3930,,0.810275,,,0.374657,51090000000000000000,0.0,5451000000000000000,45010000000000000000,56000000000000000000,,,,112300.0,125200.0,0.00,0.0,0.0,0.0,0.0,,0.0,0.0,1.100000e+11,X,OUT,511000.0,511000.0,0.0,NONE,NONE,0.0,0.0,0.0,401.3,,401.3,0.0,864.5,2983.0,,233.1,388.6,,7482.0,13170.0,10870.0,8855.0,2019.0,,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,0.0,0.01181,0.02071,0.01709,,10870.0,10870.0,635800.0,635800.0,0.01710,0.01710,1.0000,1.0000,1.289,1.240,0.9094,1.185,1.178,1.188,1.183,1.073,1.182,1.0250,,,,,,,,,,,,0.0,0.0,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.111111e+09,1000.0,0.0,,,,,,,,,1,1.0,TDEV19980408331550.8
12564,TDEV,14,NONAME,19990916,19980408,33171,0.800,800,,,EC,HSELM,UNKNOWN,UNKNOWN,,UNKNOWN,,4.180000e+19,0.000285,1.530000e+16,,,0.0,NONE,2.0,2.0,1,0,0,0,0,NONE,,0,0.0,SN,0.8350,0.8539,0.2153,1.0810,1.119306,1.1030,0.1447,0.2282,0.06121,0.0,0.1630,0.8426,7.390283,0.05020,0.07441,CSS,CC,C,BOROC,,,1.0,1.951,0.0,NO,-219800.0,-0.76200,3.131,,1.188,0.6438,,0.006527,0.3408,,0.604551,,,0.374332,53370000000000000000,0.0,-6362000000000000000,45130000000000000000,63000000000000000000,,,,84760.0,167500.0,0.00,0.0,0.0,0.0,0.0,,0.0,0.0,1.100000e+11,X,OUT,207200.0,207200.0,0.0,NONE,NONE,0.0,0.0,0.0,-1447.0,,-1447.0,0.0,689.6,1928.0,1618.0,238.4,397.4,,6481.0,12240.0,9567.0,7276.0,2291.0,,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,0.0,0.01759,0.03267,0.02553,,9567.0,9567.0,376200.0,376200.0,0.02543,0.02543,1.0000,1.0000,1.469,1.235,0.9445,1.236,1.217,1.248,1.228,1.092,1.204,1.0390,,,,,,,,,,,,0.0,0.0,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.111111e+09,1000.0,0.0,,,,,,,,,1,1.0,TDEV19980408331710.8
14011,TFTR,13,NONAME,19980115,19900124,45980,4.600,4600,,,NB,HSELM,UNKNOWN,UNKNOWN,,UNKNOWN,,,,,,,0.0,NONE,2.0,2.0,1,2,1,0,0,NONE,,0,0.0,LIM,2.4540,2.6720,0.8033,0.9339,0.933779,0.9997,0.0000,0.0000,0.00000,0.0,1.8930,31.2500,75.295603,0.00000,,C,NONE,C,NONE,,,,4.795,0.0,,979400.0,-0.52440,8.050,,2.535,1.3070,1.488,0.001809,0.9427,,11.503916,,,0.165961,35000000000000000000,0.0,13870000000000000000,28470000000000000000,49000000000000000000,,,,,-732600.0,90.07,11400000.0,622711.0,0.0,0.0,0.5011,10310000.0,4297000.0,0.000000e+00,NONE,NONE,0.0,0.0,0.0,NONE,NONE,0.0,0.0,0.0,162800.0,,162800.0,0.0,1797.0,4517.0,,2198.0,6579.0,,1046000.0,1449000.0,885200.0,466400.0,418800.0,,186600.0,184900.0,371500.0,0.5023,0.0,0.0,0.0,0.0,0.0,0.09956,0.12670,0.14460,0.14460,1449000.0,885200.0,9410000.0,5112000.0,0.15400,0.17310,1.0000,1.0000,2.164,2.403,1.0020,1.445,1.465,1.488,1.266,1.309,1.351,1.0880,,,,,,,,,,,,0.0,0.0,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.111111e+09,100.0,0.0,,,,,,,,,1,1.0,TFTR19900124459804.6


In [12]:
# Standard Scaler
X = complete_data[features]
X = StandardScaler().fit_transform(X)
data = pd.DataFrame(X, columns=features)
data["TOK"] = complete_data["TOK"]

In [13]:
complete_data[features]

Unnamed: 0,NEL,TAUTH,BT,TEV,TE0,TE0TSC
5,37890000000000000000,0.05100,2.205,,,
6,37340000000000000000,0.04902,2.205,,,
11,34100000000000000000,0.06375,2.205,,,
12,37750000000000000000,0.06991,2.205,,,
13,38470000000000000000,0.05151,2.204,,,
...,...,...,...,...,...,...
12561,49190000000000000000,0.01666,1.963,880.5,3105.0,2678.0
12563,51090000000000000000,0.01710,1.952,864.5,2983.0,
12564,53370000000000000000,0.02543,1.951,689.6,1928.0,1618.0
14011,35000000000000000000,0.17310,4.795,1797.0,4517.0,


In [14]:
len(data), len(complete_data)

(4624, 4624)

In [15]:
### Computing the optimal epsilon

#%matplotlib notebook

NNeighbours = 2*X.shape[-1]

# getting the average distance
neighbors = NearestNeighbors(n_neighbors = NNeighbours)
neighbors_fit = neighbors.fit(X)
distances, indices = neighbors_fit.kneighbors(X)
distances = np.sort(distances, axis=0)
distances = distances[:,1]

x = list(range(len(distances)))

plt.figure(figsize=(7,4))
plt.scatter(x, distances, s=1)

data_elbow = (pd.DataFrame([x, distances]).T).values.tolist()
rotor = Rotor()
rotor.fit_rotate( data_elbow )
elbow_index = rotor.get_elbow_index()
plt.axvline(elbow_index, c="k")
#plt.axvline(1204, c="r")

#plt.axis([900, max(x), min(distances), max(distances)])

plt.ylabel("Distance")
plt.xlabel("Object")
plt.grid(True, alpha=0.5);

ValueError: Input X contains NaN.
NearestNeighbors does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
# Label -1 means noise
DB = DBSCAN(eps = distances[elbow_index], min_samples=NNeighbours ).fit(X)
labels = DB.labels_
data.insert(loc=0, column="labels", value=labels)
Counter(labels)

In [None]:
labels_unique = list(range(-1, len(Counter(labels)) -1) )

TOKs_per_lbl = [0]*len(labels_unique)
for i,lbl in enumerate(labels_unique):
    df = data[data.labels.isin([lbl])]
    TOKs_per_lbl[i] = df.TOK.unique()

In [None]:
# *** Unique Tokamaks found per label 
# *** This will allow to discern wether the clustering happened in obvious parameters
# Index of the df is the label / columns names can be ignored
pd.DataFrame(TOKs_per_lbl, index=labels_unique)

In [None]:
data_cl = data[~data.labels.isin([-1])][["labels"]+features]
data_cl_mean = data_cl.groupby("labels").mean().T
data_cl_mean.columns = "cluster_"+data_cl_mean.columns.astype(str)
print("Mean values of each feature per cluster")
data_cl_mean

In [None]:
import plotly.graph_objects as go

categories = features
fig = go.Figure()

for i in range(len(Counter(labels))-1):

    fig.add_trace(go.Scatterpolar(
          r=data_cl_mean[f"cluster_{i}"].values,
          theta=categories,
          fill='toself',
          name=f'Cluster {i}'
    ))



fig.show()