In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import scipy as sp
import statsmodels.api as sm

import matplotlib.patches as mpatches

from scipy.optimize import curve_fit
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from collections import Counter

pd.set_option('display.max_columns', None)

eng_coeffs = ['IP', 'BT', 'NEL', 'PLTH', 'RGEO', 'KAREA', 'EPS', 'MEFF']
dmn_coeffs = ['RHOSTAR', 'BETASTAR', 'NUSTAR', 'QCYL5']

path="../../data/"

In [3]:
DB5 = pd.read_csv(path+"DB5.csv")

subset_ids = pd.read_csv(path+"R_ids_alpha_0.9998.csv")

In [4]:
# Data with columns that can be used for clustering
data = DB5[DB5.id.isin(subset_ids.id.values)]
data = data[data.columns[4:]]
data.drop(['SHOT','TIME','DATE'], axis=1, inplace=True)
data.reset_index(drop=True, inplace=True)
data.dropna(axis='columns', inplace=True) # Dropping columns with at least one value missing

In [5]:
#features = ['BETASTAR', 'RHOSTAR', 'EPS', 'TAUBOHM', 'KAREA', 'NEL', 'TAUTH', 'RGEO'] # Entropy
features = ['TAUTH','NEL','TAV','BT','RHOSTAR','NUSTAR','BETASTAR'] # Research

In [6]:
data_cl = data[features].apply(np.abs).apply(np.log)
data_cl = StandardScaler().fit_transform(data_cl)
data_cl = pd.DataFrame(data_cl, columns=features)

Following function based on: [Sushil Deore, 2020](https://sushildeore99.medium.com/really-what-is-hopkins-statistic-bad1265df4b)

In [7]:
from sklearn.neighbors import NearestNeighbors
from random import sample
from math import isnan

np.random.seed(71)

def Hopkins_Statistic(X): 
    """
    Expects Dataframe.
    
    Compares given data to random data that contains no cluster tendency,
    or properties using NearestNeighbors.  
    
    The result is a the percentage. This percentage is how differente the
    given data is to the random scattered data. 
    
    If H < 0.5: 
        unlikely that the given data has statistically significant clusters.
    Else:
        likely that the given data has statistically significant clusters.
        
    Therefore, if H ~ 1, then the dataset is significantly clusterable. 
    """
    n, d = X.shape
    m = int(0.1 * n)
    nbrs = NearestNeighbors(n_neighbors=1).fit(X.values)
    
    rand_X = sample(range(0,n,1), m)
    
    ujd = []
    wjd = []
    for j in range(0, m):
        u_dist, _ = (nbrs.kneighbors(np.random.uniform(np.amin( X, axis=0 ),
                                             np.amax( X, axis=0 ),
                                             d
                                            ).reshape(1, -1), 
                                     2, 
                                     return_distance=True)
                    )
        ujd.append(u_dist[0][1])
        w_dist, _ = (nbrs.kneighbors(X.iloc[rand_X[j]].values.reshape(1, -1),
                                     2, 
                                     return_distance=True)
                    )
        wjd.append(w_dist[0][1])
    H = sum(ujd) / (sum(ujd) + sum(wjd))
    if isnan(H):
        print(ujd, wjd)
        H = 0
    return H

In [8]:
# The whole dataset is clusterable
Hopkins_Statistic(data)

0.9645303529563176

In [9]:
Hopkins_Statistic(data_cl)

0.9486352242984367

In [10]:
# Complete Dataset  = 0.9749382627806085 | All numeric

# Entropy Features  = 0.9644433188916054

# Research Features = 0.9587835352350436