In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import scipy as sp
import statsmodels.api as sm

import matplotlib.patches as mpatches

from scipy.optimize import curve_fit
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from collections import Counter

# Names of the dataset columns used as clustering features
# (these are the columns standardized and passed to DBSCAN further down).
coeffs = [
    "IP",
    "BT",
    "NEL",
    "PLTH",
    "RGEO",
    "KAREA",
    "EPS",
    "MEFF",
]

### Clustering Applied only to DB5 Observations

In [3]:
# Re-load the full dataset (the original note marks it as log data) together
# with the file listing the ids of the selected subset.
R = pd.read_csv("../data/R.csv")
subset_ids = pd.read_csv("../data/R_ids_alpha_0.9153.csv")

# Keep only the rows of R whose `id` appears in the subset file.
in_subset = R["id"].isin(subset_ids["id"].to_numpy())
data = R[in_subset]

In [4]:
# Distinct tokamaks present in the selected subset.
# Note: 'MAST' and 'NSTX' are spherical tokamaks.
data["TOK"].unique()

array(['AUG', 'AUGW', 'D3D', 'JET', 'JETILW', 'JT60U', 'MAST', 'NSTX'],
      dtype=object)

In [5]:
# Standardize the clustering features so every column is on a comparable scale.
scaler = StandardScaler()
X = scaler.fit_transform(data.loc[:, coeffs])

# Display the scaled matrix with the feature names as column headers.
pd.DataFrame(X, columns=coeffs)

Unnamed: 0,IP,BT,NEL,PLTH,RGEO,KAREA,EPS,MEFF
0,-0.331526,0.838919,1.287848,-1.486056,-0.673402,0.519328,-0.639600,0.009252
1,-0.331526,0.838919,1.378730,-1.501955,-0.676666,0.410631,-0.616332,0.009252
2,-0.329639,0.836696,0.989826,-1.716891,-0.678299,0.519328,-0.632700,0.009252
3,-1.249247,-0.370778,1.050781,-0.933778,-0.702919,0.268309,-0.470774,0.009252
4,-1.252649,-0.369496,1.065977,-0.903222,-0.709522,0.102640,-0.419011,0.009252
...,...,...,...,...,...,...,...,...
1246,-0.765730,-3.369614,0.043594,-1.253172,-2.465181,3.879360,3.736411,0.009252
1247,-0.764129,-3.370114,0.291230,-0.860210,-2.399232,3.178814,3.718674,0.009252
1248,-1.006378,-3.369113,-0.174843,-0.906633,-2.483505,3.003939,3.960551,0.009252
1249,-1.266881,-3.369614,-0.289471,-2.133162,-2.486677,2.216867,3.717324,0.009252


In [6]:
# Density-based clustering of the standardized features.
# DBSCAN assigns the label -1 to observations it treats as noise.
DB = DBSCAN(eps=1, min_samples=3).fit(X)
labels = DB.labels_

# Attach the cluster labels as the first column of `data`.
# `DataFrame.insert` raises ValueError when the column already exists, so any
# "labels" column left over from a previous execution of this cell is dropped
# first; this keeps the cell safe to re-run. `drop` returns a new frame, so the
# insert below never mutates the frame that was selected from `R`.
data = data.drop(columns="labels", errors="ignore")
data.insert(loc=0, column="labels", value=labels)

# Number of observations per cluster label.
Counter(labels)

Counter({0: 493,
         -1: 23,
         1: 3,
         2: 10,
         3: 596,
         4: 6,
         5: 16,
         6: 3,
         7: 12,
         8: 3,
         9: 12,
         10: 23,
         11: 51})

In [7]:
# Which tokamaks contribute observations to cluster 3?
cluster_3_mask = data["labels"].isin([3])
data.loc[cluster_3_mask, "TOK"].unique()

array(['JET', 'JETILW', 'JT60U'], dtype=object)

In [8]:
# Inspect the observations that were assigned to cluster 3.
in_cluster_3 = data["labels"].isin([3])
data.loc[in_cluster_3]

Unnamed: 0,labels,ind,id,PHASE,TOK,IP,BT,NEL,PLTH,RGEO,...,PL,PFLOSS,TAV,LCOULOMB,QCYL5,TAUBOHM,RHOSTAR,BETASTAR,NUSTAR,OMEGACYCL
2204,3,20129,O7OR68,HGELM,JET,0.710987,1.084513,1.401183,2.333114,1.036027,...,10690000.0,383900.0,1370.999243,15.604307,1.372902,0.282785,-5.978965,-0.795313,-1.868017,1.4790
2205,3,20132,3BVQ5K,HSELM,JET,0.719302,0.859085,1.394014,2.231733,1.035317,...,9609000.0,292900.0,1515.828225,15.708313,1.137579,0.273640,-5.701338,-0.251203,-2.303502,1.1805
2207,3,20134,781QLG,HGELM,JET,0.701611,0.872966,1.647697,2.624669,1.035672,...,14160000.0,356200.0,1485.707086,15.561401,1.200583,0.243589,-5.723262,-0.045353,-1.952194,1.1970
2208,3,20136,U5WGQX,HGELM,JET,0.721735,0.868780,1.699644,2.456164,1.035672,...,11970000.0,303400.0,1378.182464,15.460302,1.151132,0.273802,-5.756638,-0.060158,-1.805965,1.1920
2209,3,20137,JVOX3F,HGELM,JET,0.710004,0.866680,1.706202,2.649715,1.035672,...,14520000.0,364200.0,1329.868755,15.421338,1.182264,0.223150,-5.772382,-0.085087,-1.699428,1.1895
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4632,3,23865,QCHKD5,HSELM,JT60U,0.587787,1.124930,1.147402,1.791759,1.175573,...,6580000.0,580000.0,1758.875154,15.980332,1.023223,0.374220,-5.712294,-0.880791,-2.323288,1.5400
4633,3,23866,G4RME8,HSELM,JT60U,0.587787,1.121678,1.163151,1.918392,1.175573,...,8350000.0,1540000.0,2012.167646,16.106995,1.016946,0.380680,-5.635795,-0.724000,-2.566033,1.5350
4634,3,23867,Y65FJU,HGELM,JT60U,0.587787,1.121678,1.398717,2.170196,1.175573,...,10600000.0,1840000.0,2046.868057,16.006311,1.010867,0.379145,-5.620026,-0.471336,-2.366182,1.5350
4635,3,23868,RDC2YK,HGELM,JT60U,0.587787,1.124930,1.360977,2.312535,1.175573,...,12500000.0,2400000.0,2203.081471,16.098727,1.011065,0.340340,-5.581662,-0.442034,-2.537795,1.5400


In [9]:
# List every column of the frame; "labels" was inserted at position 0 by the
# clustering cell, followed by the original dataset columns.
data.columns

Index(['labels', 'ind', 'id', 'PHASE', 'TOK', 'IP', 'BT', 'NEL', 'PLTH',
       'RGEO', 'KAREA', 'EPS', 'MEFF', 'TAUTH', 'DATE', 'SHOT', 'TIME', 'Q95',
       'ZEFF', 'AMIN', 'VOL', 'POHM', 'PNBI', 'DWDIA', 'DWMHD', 'PICRH',
       'PECRH', 'PL', 'PFLOSS', 'TAV', 'LCOULOMB', 'QCYL5', 'TAUBOHM',
       'RHOSTAR', 'BETASTAR', 'NUSTAR', 'OMEGACYCL'],
      dtype='object')

### Possible number of clusters in the dataset

In [10]:
# Rough estimate of the number of clusters: sqrt(n / 2),
# where n is the number of observations in `data`.
n_obs = len(data)
np.sqrt(n_obs / 2)

25.0099980007996