In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import scipy as sp
import statsmodels.api as sm

import matplotlib.patches as mpatches

from scipy.optimize import curve_fit
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from collections import Counter

pd.set_option('display.max_columns', None)

eng_coeffs = ['IP', 'BT', 'NEL', 'PLTH', 'RGEO', 'KAREA', 'EPS', 'MEFF']
dmn_coeffs = ['RHOSTAR', 'BETASTAR', 'NUSTAR', 'QCYL5']

path="../../data/"

In [3]:
DB5 = pd.read_csv(path+"DB5.csv")

R = pd.read_csv(path+"R.csv") #reintroduce dataset

subset_ids = pd.read_csv(path+"R_ids_alpha_0.6556.csv")

In [4]:
# Data with columns that can be used for clustering
data = DB5[DB5.id.isin(subset_ids.id.values)]
data = data[data.columns[4:]]
data.drop(['SHOT','TIME','DATE'], axis=1, inplace=True)
data.reset_index(drop=True, inplace=True)
data.dropna(axis='columns', inplace=True) # Dropping columns with at least one value missing

In [5]:
data

Unnamed: 0,IP,BT,NEL,PLTH,RGEO,KAREA,EPS,MEFF,TAUTH,Q95,AMIN,VOL,POHM,PNBI,PICRH,PECRH,PL,PFLOSS,TAV,LCOULOMB,QCYL5,TAUBOHM,RHOSTAR,BETASTAR,NUSTAR,OMEGACYCL
0,0.9971,-2.5080,7.120,4.4300,1.6650,1.592,0.299159,2.0,0.089890,3.977,0.4981,12.9800,400100.0,4981000.0,0.0,0.0,5320000.0,887000.0,1020.565150,15.028261,3.013448,0.112722,0.005208,0.819783,0.277214,1.254000
1,0.7961,-1.9230,7.492,6.0300,1.6640,1.610,0.297416,1.5,0.051740,3.839,0.4949,12.9500,425000.0,7078000.0,0.0,0.0,7450000.0,1415000.0,761.312828,14.709729,2.888794,0.066331,0.005113,1.094550,0.495887,1.282000
2,1.1970,-2.9740,9.431,3.3300,1.6700,1.616,0.291916,2.0,0.124400,3.731,0.4875,12.6600,757200.0,2907000.0,0.0,0.0,3750000.0,426000.0,820.096900,14.669024,2.879855,0.184983,0.004022,0.620546,0.551965,1.487000
3,1.1970,-2.9740,9.741,3.3000,1.6680,1.606,0.293285,2.0,0.122900,3.728,0.4892,12.6500,850900.0,2872000.0,0.0,0.0,3740000.0,442300.0,779.255665,14.601770,2.885629,0.182752,0.003907,0.609025,0.624644,1.487000
4,0.7981,-2.0220,5.787,2.2200,1.6580,1.614,0.298251,2.0,0.125300,4.033,0.4945,12.9200,339000.0,2067000.0,0.0,0.0,2400000.0,170400.0,882.464134,14.986514,3.041559,0.126678,0.006050,0.886383,0.303425,1.011000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1480,0.2144,0.2676,4.737,0.8200,0.2936,1.501,0.760899,1.7,0.002414,5.655,0.2234,0.4340,399900.0,846000.0,0.0,0.0,1246000.0,425900.0,228.006993,13.733277,1.790252,0.000380,0.047423,10.703172,0.087205,0.157412
1481,0.1933,0.2742,5.042,0.7771,0.2864,1.476,0.769553,1.7,0.002236,6.266,0.2204,0.4052,353600.0,835000.0,0.0,0.0,1229000.0,451500.0,201.397272,13.577981,1.928926,0.000361,0.044089,9.584175,0.121547,0.161294
1482,0.2165,0.2719,4.547,0.7172,0.2904,1.509,0.756543,1.7,0.002534,5.351,0.2197,0.4173,330600.0,857000.0,0.0,0.0,1158000.0,440400.0,226.703215,13.748011,1.689761,0.000405,0.047323,9.894581,0.079819,0.159941
1483,0.2163,0.2800,5.000,0.4048,0.2815,1.510,0.762700,1.7,0.003731,5.314,0.2147,0.3870,404800.0,0.0,0.0,0.0,404800.0,0.0,184.744734,13.495859,1.691384,0.000615,0.042451,8.361019,0.124365,0.164706


In [6]:
#features = ['BETASTAR', 'RHOSTAR', 'EPS', 'TAUBOHM', 'KAREA', 'NEL', 'TAUTH', 'RGEO'] # Entropy
features = ['TAUTH','NEL','TAV','BT','RHOSTAR','NUSTAR','BETASTAR'] # Research

In [7]:
data_cl = data[features].apply(np.abs).apply(np.log)
data_cl = StandardScaler().fit_transform(data_cl)
data_cl = pd.DataFrame(data_cl, columns=features)
data_cl

Unnamed: 0,TAUTH,NEL,TAV,BT,RHOSTAR,NUSTAR,BETASTAR
0,-0.379057,0.518171,-0.400417,0.499641,-0.116968,0.555033,-0.657922
1,-1.153976,0.648393,-0.964525,-0.087773,-0.155767,1.060010,-0.241961
2,0.076778,1.236926,-0.821359,0.876556,-0.663607,1.153038,-1.058601
3,0.059759,1.319623,-0.919686,0.876556,-0.725039,1.260447,-1.085571
4,0.086891,-0.011876,-0.680277,0.023253,0.200436,0.633481,-0.545521
...,...,...,...,...,...,...,...
1480,-5.453896,-0.523810,-3.285242,-4.449486,4.558633,-0.449193,3.039282
1481,-5.561356,-0.364257,-3.524109,-4.395601,4.404344,-0.160882,2.880376
1482,-5.385834,-0.628484,-3.296280,-4.414230,4.554173,-0.526049,2.926243
1483,-4.843071,-0.385646,-3.690231,-4.349307,4.324162,-0.140979,2.683903


Following function based on: [Sushil Deore, 2020](https://sushildeore99.medium.com/really-what-is-hopkins-statistic-bad1265df4b)

In [8]:
from sklearn.neighbors import NearestNeighbors
from random import sample
from math import isnan

def Hopkins_Statistic(X): 
    np.random.seed(71)
    """
    Expects Dataframe.
    
    Compares given data to random data that contains no cluster tendency,
    or properties using NearestNeighbors.  
    
    The result is a the percentage. This percentage is how differente the
    given data is to the random scattered data. 
    
    If H < 0.5: 
        unlikely that the given data has statistically significant clusters.
    Else:
        likely that the given data has statistically significant clusters.
        
    Therefore, if H ~ 1, then the dataset is significantly clusterable. 
    """
    n, d = X.shape
    m = int(0.1 * n)
    nbrs = NearestNeighbors(n_neighbors=1).fit(X.values)
    
    rand_X = sample(range(0,n,1), m)
    
    ujd = []
    wjd = []
    for j in range(0, m):
        u_dist, _ = (nbrs.kneighbors(np.random.uniform(np.amin( X, axis=0 ),
                                             np.amax( X, axis=0 ),
                                             d
                                            ).reshape(1, -1), 
                                     2, 
                                     return_distance=True)
                    )
        ujd.append(u_dist[0][1])
        w_dist, _ = (nbrs.kneighbors(X.iloc[rand_X[j]].values.reshape(1, -1),
                                     2, 
                                     return_distance=True)
                    )
        wjd.append(w_dist[0][1])
    H = sum(ujd) / (sum(ujd) + sum(wjd))
    if isnan(H):
        print(ujd, wjd)
        H = 0
    return H

In [9]:
# The whole dataset is clusterable
Hopkins_Statistic(data)

0.9751861446843432

In [10]:
Hopkins_Statistic(data_cl)

0.959553498527468

In [11]:
# Complete Dataset  = 0.9751861446843432 | All numeric

# Entropy Features  = 0.9590119913017682

# Research Features = 0.959553498527468