In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import scipy as sp
import statsmodels.api as sm

import matplotlib.patches as mpatches

from scipy.optimize import curve_fit
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from collections import Counter

pd.set_option('display.max_columns', None)

eng_coeffs = ['IP', 'BT', 'NEL', 'PLTH', 'RGEO', 'KAREA', 'EPS', 'MEFF']
dmn_coeffs = ['RHOSTAR', 'BETASTAR', 'NUSTAR', 'QCYL5']

In [3]:
DB5 = pd.read_csv("../data/DB5.csv")

R = pd.read_csv("../data/R.csv") #reintroduce dataset

subset_ids = pd.read_csv("../data/R_ids_alpha_0.9153.csv")

In [4]:
# Data with columns that can be used for clustering
data = DB5[DB5.id.isin(subset_ids.id.values)]
data = data[data.columns[4:]]
data.drop(['SHOT','TIME','DATE'], axis=1, inplace=True)
data.reset_index(drop=True, inplace=True)
data.dropna(axis='columns', inplace=True) # Dropping columns with at least one value missing

In [5]:
data

Unnamed: 0,IP,BT,NEL,PLTH,RGEO,KAREA,EPS,MEFF,TAUTH,Q95,AMIN,VOL,POHM,PNBI,PICRH,PECRH,PL,PFLOSS,TAV,LCOULOMB,QCYL5,TAUBOHM,RHOSTAR,BETASTAR,NUSTAR,OMEGACYCL
0,1.19700,-2.974000,9.431000,3.33000,1.670000,1.616000,0.291916,2.0,0.124400,3.7310,0.487500,12.6600,757200.0,2907000.0,0.0,0.0,3750000.0,426000.0,820.096900,14.669024,2.879855,0.184983,0.004022,0.620546,0.551965,1.487000
1,1.19700,-2.974000,9.741000,3.30000,1.668000,1.606000,0.293285,2.0,0.122900,3.7280,0.489200,12.6500,850900.0,2872000.0,0.0,0.0,3740000.0,442300.0,779.255665,14.601770,2.885629,0.182752,0.003907,0.609025,0.624644,1.487000
2,1.19800,-2.971000,8.482000,2.92000,1.667000,1.616000,0.292322,2.0,0.141400,3.7440,0.487300,12.6200,723400.0,2551000.0,0.0,0.0,3230000.0,303400.0,914.300463,14.830789,2.879126,0.210050,0.004253,0.623469,0.402135,1.485500
3,0.79740,-1.717000,8.668000,4.56000,1.652000,1.593000,0.301998,2.0,0.066930,3.4670,0.498900,12.9300,561900.0,4865000.0,0.0,0.0,5620000.0,1057000.0,644.467277,14.470208,2.608357,0.057459,0.006035,1.344660,0.689996,0.858500
4,0.79620,-1.718000,8.715000,4.64000,1.648000,1.578000,0.305158,2.0,0.054490,3.4580,0.502900,12.9800,710400.0,4895000.0,0.0,0.0,5710000.0,1074000.0,528.858207,14.269801,2.635847,0.046807,0.005421,1.108137,1.008278,0.859000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1246,0.98770,0.439900,6.057000,3.80200,0.865000,1.957826,0.703815,2.0,0.046030,7.0400,0.608800,12.3900,223200.0,4117000.0,0.0,0.0,4227000.0,424800.0,562.468951,14.513332,2.013432,0.010124,0.018034,12.493422,0.072123,0.219950
1247,0.98840,0.439800,6.615000,4.75500,0.886200,1.881048,0.701309,2.0,0.038960,7.4770,0.621500,12.7100,185000.0,6099000.0,0.0,0.0,5615000.0,859600.0,593.569119,14.523087,1.975415,0.008567,0.018152,14.405352,0.071525,0.219900
1248,0.88790,0.440000,5.604000,4.63100,0.859200,1.862356,0.736266,2.0,0.033680,8.7210,0.632600,12.6400,169200.0,6021000.0,0.0,0.0,5678000.0,1048000.0,619.975894,14.649543,2.336153,0.007410,0.018217,12.735053,0.059717,0.220000
1249,0.79120,0.439900,5.380000,2.30400,0.858200,1.780501,0.701119,2.0,0.043780,6.6710,0.601700,10.9200,457500.0,1966000.0,0.0,0.0,2417000.0,112700.0,404.580446,14.243109,2.218914,0.009629,0.015476,7.982013,0.133629,0.219950


In [6]:
data_cl = data[eng_coeffs + dmn_coeffs].apply(np.abs).apply(np.log)
data_cl = StandardScaler().fit_transform(data_cl)
data_cl = pd.DataFrame(data_cl, columns=eng_coeffs + dmn_coeffs)
data_cl

Unnamed: 0,IP,BT,NEL,PLTH,RGEO,KAREA,EPS,MEFF,RHOSTAR,BETASTAR,NUSTAR,QCYL5
0,-0.331526,0.838919,1.287848,-1.486056,-0.673402,0.519328,-0.639600,0.009252,-0.480611,-1.130573,1.163935,0.191352
1,-0.331526,0.838919,1.378730,-1.501955,-0.676666,0.410631,-0.616332,0.009252,-0.541185,-1.158636,1.274633,0.201343
2,-0.329639,0.836696,0.989826,-1.716891,-0.678299,0.519328,-0.632700,0.009252,-0.364176,-1.123536,0.880523,0.190089
3,-1.249247,-0.370778,1.050781,-0.933778,-0.702919,0.268309,-0.470774,0.009252,0.366157,0.027325,1.363679,-0.302588
4,-1.252649,-0.369496,1.065977,-0.903222,-0.709522,0.102640,-0.419011,0.009252,0.141961,-0.262354,1.703128,-0.250292
...,...,...,...,...,...,...,...,...,...,...,...,...
1246,-0.765730,-3.369614,0.043594,-1.253172,-2.465181,3.879360,3.736411,0.009252,2.650822,3.365018,-0.657292,-1.593963
1247,-0.764129,-3.370114,0.291230,-0.860210,-2.399232,3.178814,3.718674,0.009252,2.664368,3.578237,-0.664753,-1.689053
1248,-1.006378,-3.369113,-0.174843,-0.906633,-2.483505,3.003939,3.960551,0.009252,2.671895,3.393701,-0.826214,-0.852377
1249,-1.266881,-3.369614,-0.289471,-2.133162,-2.486677,2.216867,3.717324,0.009252,2.331470,2.694186,-0.105411,-1.109212


Following function based on: [Sushil Deore, 2020](https://sushildeore99.medium.com/really-what-is-hopkins-statistic-bad1265df4b)

In [7]:
from sklearn.neighbors import NearestNeighbors
from random import sample
from math import isnan

def Hopkins_Statistic(X): 
    np.random.seed(71)
    """
    Expects Dataframe.
    
    Compares given data to random data that contains no cluster tendency,
    or properties using NearestNeighbors.  
    
    The result is a the percentage. This percentage is how differente the
    given data is to the random scattered data. 
    
    If H < 0.5: 
        unlikely that the given data has statistically significant clusters.
    Else:
        likely that the given data has statistically significant clusters.
        
    Therefore, if H ~ 1, then the dataset is significantly clusterable. 
    """
    n, d = X.shape
    m = int(0.1 * n)
    nbrs = NearestNeighbors(n_neighbors=1).fit(X.values)
    
    rand_X = sample(range(0,n,1), m)
    
    ujd = []
    wjd = []
    for j in range(0, m):
        u_dist, _ = (nbrs.kneighbors(np.random.uniform(np.amin( X, axis=0 ),
                                             np.amax( X, axis=0 ),
                                             d
                                            ).reshape(1, -1), 
                                     2, 
                                     return_distance=True)
                    )
        ujd.append(u_dist[0][1])
        w_dist, _ = (nbrs.kneighbors(X.iloc[rand_X[j]].values.reshape(1, -1),
                                     2, 
                                     return_distance=True)
                    )
        wjd.append(w_dist[0][1])
    H = sum(ujd) / (sum(ujd) + sum(wjd))
    if isnan(H):
        print(ujd, wjd)
        H = 0
    return H

In [8]:
# The whole dataset is clusterable
Hopkins_Statistic(data)

0.9714877974862515

In [9]:
Hopkins_Statistic(data_cl)

0.9445365162033262