# HOPKINS

In [1]:
import numpy as np
from sklearn.neighbors import NearestNeighbors
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from random import sample
from numpy.random import uniform

In [2]:
def hopkins_statistic(X, sample_ratio=0.05):
    """Compute the Hopkins statistic of ``X`` as a measure of clustering tendency.

    A random subset of the real observations and an equally sized set of
    points drawn uniformly from the bounding box of ``X`` are each matched to
    their nearest real observation. With ``u`` the summed nearest-neighbour
    distances of the uniform points and ``w`` those of the sampled
    observations, the statistic is ``H = u / (u + w)``. Values around 0.5 are
    expected for uniformly scattered data, values close to 1 for clustered
    data.

    Parameters
    ----------
    X : pandas.DataFrame or array-like of shape (n_samples, n_features)
        Numeric observations. Anything ``np.asarray`` can turn into a 2-D
        float array is accepted.
    sample_ratio : float, default 0.05
        Fraction of the rows used as probe points (5%, following Lawson and
        Jurs). At least one and at most ``n_samples`` probes are used.

    Returns
    -------
    float
        ``u / (u + w)``, or ``nan`` when every measured distance is zero.

    Raises
    ------
    ValueError
        If ``X`` is not two-dimensional or has fewer than two rows.

    Notes
    -----
    Sampling relies on the global ``random`` and ``numpy.random`` generators;
    seed both to get reproducible values.
    """
    X = np.asarray(X, dtype=float)  # accepts DataFrames as well as plain arrays
    if X.ndim != 2 or X.shape[0] < 2:
        raise ValueError(
            "hopkins_statistic expects a 2-D input with at least two rows"
        )
    n_rows, n_dims = X.shape

    # int(n_rows * ratio) is 0 for small inputs, which would make H = 0/0;
    # always keep at least one probe and never ask for more rows than exist.
    sample_size = min(n_rows, max(1, int(n_rows * sample_ratio)))

    # a uniform random sample in the bounding box of the original data space
    X_uniform_random_sample = uniform(
        X.min(axis=0), X.max(axis=0), (sample_size, n_dims)
    )

    # a random sample (without replacement) of rows from the original data X
    random_indices = sample(range(n_rows), sample_size)
    X_sample = X[random_indices]

    # unsupervised learner used for the nearest-neighbour searches
    nbrs = NearestNeighbors(n_neighbors=2).fit(X)

    # u_distances: distance from each uniform point to its nearest real point
    u_distances, _ = nbrs.kneighbors(X_uniform_random_sample, n_neighbors=1)
    u_distances = u_distances[:, 0]

    # w_distances: distance from each sampled real point to its second nearest
    # neighbour (the nearest one is the point itself, at distance 0)
    w_distances, _ = nbrs.kneighbors(X_sample, n_neighbors=2)
    w_distances = w_distances[:, 1]

    u_sum = np.sum(u_distances)
    w_sum = np.sum(w_distances)
    total = u_sum + w_sum
    if total == 0:
        # every row is identical: the ratio is undefined
        return np.nan

    # compute and return Hopkins' statistic
    return u_sum / total

# GEN DATA

In [3]:
import pandas as pd
from sklearn.datasets import make_blobs


# Scratch call with default arguments: the returned tuple is only displayed, not stored.
make_blobs()

(array([[  4.73291121, -10.70695271],
        [ -9.36004453,  -0.9919627 ],
        [ -0.88510389,  -4.55423468],
        [-10.15973397,  -1.13864071],
        [ -1.16637445,  -4.00826475],
        [  3.6118037 , -10.23925523],
        [ -3.16657637,  -5.15226808],
        [ -3.16358268,  -6.21473592],
        [-10.07578949,  -0.81656735],
        [  4.07108227, -10.84278068],
        [  2.95606717,  -8.54135685],
        [ -8.66898318,  -1.43256944],
        [ -1.1310387 ,  -5.53193017],
        [ -2.93391343,  -5.71460609],
        [ -2.16915427,  -5.05701433],
        [  4.05412246, -10.32704363],
        [ -1.40501456,  -3.28885217],
        [  2.93093554, -10.79265462],
        [  4.40608644,  -9.80389251],
        [  3.49994827, -10.58354452],
        [  5.10961159,  -9.26402005],
        [ -8.46387234,  -1.87100999],
        [ -1.50189124,  -5.07322767],
        [ -9.88016071,   0.26034744],
        [ -1.05972749,  -3.60792321],
        [ -2.56865938,  -3.52310338],
        [-10

# GEN CENTERS

In [4]:
n = 3  # number of dimensions
n_points = 2**n  # 2**n points; per the author's note, the maximum before two points become proportional


In [5]:
# 3 DIMENSIONS
# COUNT IN BINARY TO GET COORDINATES

# 000
# 001
# 010
# 011
# 100
# 101
# 110
# 111

In [6]:
# Binary digits of 4 as a list of ints (the coordinates of one grid corner).
# int() parses each digit; eval() is unnecessary for this.
[int(digit) for digit in bin(4)[2:]]

[1, 0, 0]

In [7]:
def clust_centers(n_dim, n_centers):
    """Return cluster centres placed on an integer grid in ``n_dim`` dimensions.

    The first ``2**n_dim`` centres are the corners of the unit hypercube,
    enumerated by counting in binary (``00``, ``01``, ``10``, ``11`` for two
    dimensions). When more centres are requested, the non-zero corners scaled
    by 2 are appended in the same order.

    Parameters
    ----------
    n_dim : int
        Number of coordinates per centre.
    n_centers : int
        Number of centres wanted.

    Returns
    -------
    list of list of int
        The centres. At most ``2 * 2**n_dim - 1`` centres exist (all corners
        plus the doubled non-zero corners); asking for more returns that
        maximum without raising.
    """
    n_corners = 2**n_dim
    # Zero-padded binary representation of i, one digit per coordinate.
    unary_grid = [
        [int(digit) for digit in bin(i)[2:].zfill(n_dim)]
        for i in range(n_corners)
    ]
    if n_centers <= n_corners:
        return unary_grid[:n_centers]

    # Doubling the origin would give the origin again, so it is skipped.
    double_grid = [[2 * coord for coord in corner] for corner in unary_grid[1:]]
    return unary_grid + double_grid[:n_centers - n_corners]
clust_centers(2,7)

[[0, 0], [0, 1], [1, 0], [1, 1], [0, 2], [2, 0], [2, 2]]

In [8]:
import gen_data
# Synthetic dataset from the project-local gen_data module (argument meanings are defined there, not shown here).
# NOTE(review): 30*0.1 evaluates to 3.0000000000000004 in floating point — presumably 3 is intended; confirm.
# This df is overwritten by the generate_data call in the following cell.
df = gen_data.generate_data(n_clusters=5,clust_std=30*0.1, n_num=20,n_cat=10,n_indiv=250)
#df.columns = ['X','Y','Z','S','W']
#df.dtypes

In [9]:
# Regenerate the synthetic dataset with different settings; this replaces any previous df.
df = gen_data.generate_data(n_clusters=9,clust_std=2,n_num=15,n_cat=15,cat_unique=3,n_indiv=250)

In [10]:
import plotly.express as px

In [11]:
from prince import FAMD
# Reduce df to three FAMD components and show them as a 3-D scatter plot.
pca = FAMD(n_components=3)  # the variable is called `pca`, but the model is a FAMD
rr = pca.fit_transform(df)
rr.columns = list('XYZ')

px.scatter_3d(rr, x='X', y='Y', z='Z')

In [12]:
from gen_data import generate_data
from prince import FAMD
import plotly.express as px

In [13]:
# Generate a fresh synthetic dataset, then project it onto three FAMD components.
df = generate_data(
    n_clusters=3,
    clust_std=0.1,
    n_num=20,
    n_cat=20,
    cat_unique=7,
    n_indiv=1000,
)
pca = FAMD(n_components=3)  # the variable is called `pca`, but the model is a FAMD
rr = pca.fit_transform(df)
rr.columns = list('XYZ')

px.scatter_3d(rr, x='X', y='Y', z='Z')

In [14]:
hopkins_statistic(rr)

0.8464784967301255

In [15]:
import umap.umap_ as umap
# Embed the numeric and the categorical columns separately with UMAP, combine the two
# embeddings into one 3-D point cloud and plot it.
n_components=3
intersection=False  # only referenced by the commented-out code further down
df2 = df.copy()
# Standardize every non-object column (zero mean, unit population standard deviation).
numerical = df2.select_dtypes(exclude='object')
for c in numerical.columns:
    numerical[c] = (numerical[c] - numerical[c].mean())/numerical[c].std(ddof=0)
    
## Preprocessing categorical: one-hot encode the object columns
categorical = df2.select_dtypes(include='object')
categorical = pd.get_dummies(categorical)
# Embedding numerical & categorical.
# random_state is commented out, so results presumably differ between runs.
fit1 = umap.UMAP(#random_state=12,
                #n_neighbors=int(np.log2(len(df))),
                n_components=n_components).fit_transform(numerical)
fit2 = umap.UMAP(metric='hamming', 
                #n_neighbors=int(np.log2(len(df))),
                n_components=n_components).fit_transform(categorical)
# Earlier combination attempts, kept for reference:
# intersection will resemble the numerical embedding more.
#if intersection:
#    embedding = fit1 * fit2
#
## union will resemble the categorical embedding more.
#else:
#
#    embedding = fit1 + fit2
import numpy as np
# gamma: half of the mean per-column standard deviation of the (already standardized) numeric frame
gamma = np.mean(np.std(numerical))/2
# Current combination: squared numeric coordinates plus gamma-weighted categorical coordinates
embedding = np.square(fit1)+fit2*gamma
#fit2 *= gamma

#embedding = fit1+fit2 - np.multiply(fit1, fit2)


um = pd.DataFrame(embedding) # one row of combined coordinates per observation

# Actual plotting
um.columns = ['X','Y','Z']
#um['cluster'] = df['cluster'].astype(str)
fig = px.scatter_3d(um, 
                x='X',y='Y',z='Z')
fig.update_layout(showlegend=False)
fig.update_layout(margin=dict(l=0, r=0, b=0, t=0))
fig.show()


IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html


gradient function is not yet implemented for hamming distance metric; inverse_transform will be unavailable



In [16]:
hopkins_statistic(um)

0.9820325676442346

In [17]:
import pacmap
# PaCMAP instance configured for 2 components.
# NOTE(review): `pm` is reassigned in a later cell without this instance being used in between —
# looks like leftover scratch; confirm before removing.
pm = pacmap.PaCMAP(n_components=2, n_neighbors=None, MN_ratio=0.5, FP_ratio=2.0)


In [18]:
import numpy as np
# Same pipeline as the UMAP cell, but with PaCMAP: embed numeric and categorical columns
# separately, combine the embeddings and plot them.
# Relies on n_components being defined by an earlier cell.
df2 = df.copy()
# Standardize every non-object column (zero mean, unit population standard deviation).
numerical = df2.select_dtypes(exclude='object')
for c in numerical.columns:
    numerical[c] = (numerical[c] - numerical[c].mean())/numerical[c].std(ddof=0)
    
## Preprocessing categorical: one-hot encode the object columns
categorical = df2.select_dtypes(include='object')
categorical = pd.get_dummies(categorical)
# Embedding numerical & categorical
fit1 = pacmap.PaCMAP(#random_state=12,
                n_neighbors=10,
                n_components=n_components).fit_transform(numerical)

fit2 = pacmap.PaCMAP(distance='hamming', 
                n_neighbors=10,
                n_components=n_components).fit_transform(categorical)

# gamma: half of the mean per-column standard deviation of the (already standardized) numeric frame
gamma = np.mean(np.std(numerical))/2
# Squared numeric coordinates plus gamma-weighted categorical coordinates
embedding = np.square(fit1)+fit2*gamma

um = pd.DataFrame(embedding) # each point's combined PaCMAP coordinates (not UMAP)

# Actual plotting
um.columns = ['X','Y','Z']
#um['cluster'] = df['cluster'].astype(str)
fig = px.scatter_3d(um, 
                x='X',y='Y',z='Z')
fig.update_layout(showlegend=False)
fig.update_layout(margin=dict(l=0, r=0, b=0, t=0))
fig.show()


apply_pca = True for Hamming distance. This option will be ignored.



In [19]:
hopkins_statistic(um)

0.9636479846703794

In [20]:
# FAMD with as many components as df has columns, followed by a 3-component PaCMAP of the result.
famd = FAMD(n_components=len(df.columns)).fit_transform(df)
# apply_pca=False: PaCMAP is told not to run its own PCA step on the FAMD coordinates
pm = pacmap.PaCMAP(n_components=3,apply_pca=False).fit_transform(famd)
pm = pd.DataFrame(pm)
pm.columns = ['X','Y','Z']
#um['cluster'] = df['cluster'].astype(str)
fig = px.scatter_3d(pm, 
                x='X',y='Y',z='Z')
fig.update_layout(showlegend=False)
fig.update_layout(margin=dict(l=0, r=0, b=0, t=0))
fig.show()


Running ANNOY Indexing on high-dimensional data. Nearest-neighbor search may be slow!



In [57]:
from sklearn.manifold import SpectralEmbedding
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import cdist


# Spectral embedding of the mixed-type data from a precomputed affinity matrix.
df2 = df.copy()
if 'cluster' in df2.columns:
    df2.pop('cluster')  # presumably the cluster label; removed so it does not enter the distances
numerical = df2.select_dtypes('number')
categorical = df2.select_dtypes('object')
# Scaling: standardize the numeric columns (fit_transform returns a NumPy array)
scaler = StandardScaler()
numerical = scaler.fit_transform(numerical)
# Integer-code every categorical column: its unique values are replaced by 1..k
categorical = categorical.apply(lambda x: x.replace(x.unique(),list(range(1,1+len(x.unique())))))
# Gamma parameter to compute pairwise distances.
# numerical is an ndarray here, so np.std is taken over all of its entries at once.
gamma = np.mean(np.std(numerical))/2
# Compute pairwise distance matrix: squared Euclidean on the numeric part
# plus gamma times the Hamming distance on the coded categoricals
distances = (cdist(numerical,numerical,'sqeuclidean')) + cdist(categorical,categorical,'hamming')*gamma
distances = np.nan_to_num(distances)  # replace NaN / infinite entries by finite numbers
#for i in range(len(distances[0])):
#    for j in range(len(distances)):
#        distances[i][j] = 1-distances[i][j]
#distances = np.nan_to_num(distances)
##g_mat = 1-gower.gower_matrix(df)

# Turn distances into affinities exp(-d), then zero out affinities below 1e-4.
# NOTE(review): the saved output warns "Graph is not fully connected"; presumably this
# cut-off (or the unscaled exp(-d)) removes too many edges — confirm.
kernel = pd.DataFrame(distances).apply(lambda x: np.exp(-x))
kernel[kernel < .0001] = 0
#kernel = np.exp(-(distances**2))

###### LAPLACIAN EMBEDDINGS
#lap = SpectralEmbedding(3,affinity="precomputed").fit_transform(np.interp(distances, (distances.min(), distances.max()), (0, +1)))
lap = SpectralEmbedding(3,affinity="precomputed").fit_transform(kernel)


Graph is not fully connected, spectral embedding may not work as expected.



In [58]:
# Wrap the spectral embedding `lap` in a DataFrame (reusing the name `um`) and plot it in 3-D.
um=pd.DataFrame(lap)
# Actual plotting
um.columns = ['X','Y','Z']
#um['cluster'] = df['cluster'].astype(str)
fig = px.scatter_3d(um, 
                x='X',y='Y',z='Z')
fig.update_layout(showlegend=False)
fig.update_layout(margin=dict(l=0, r=0, b=0, t=0))
fig.show()

In [22]:
# NOTE(review): this cell's execution count (22) is lower than that of the cells above that
# define `um` (57/58), so the saved output may belong to a different `um` — re-run in order to confirm.
hopkins_statistic(um)

0.8918331645738151

In [23]:
import umap.umap_ as umap
# UMAP on a precomputed mixed distance: squared Euclidean on standardized numeric columns
# plus gamma times Hamming on the one-hot encoded categorical columns.
# cdist is not imported in this cell; it has to be in scope from an earlier cell.
n_components=3
intersection=False  # not used in this cell
df2 = df.copy()
# Standardize every non-object column (zero mean, unit population standard deviation).
numerical = df2.select_dtypes(exclude='object')
for c in numerical.columns:
    numerical[c] = (numerical[c] - numerical[c].mean())/numerical[c].std(ddof=0)
    
## Preprocessing categorical: one-hot encode the object columns
categorical = df2.select_dtypes(include='object')
categorical = pd.get_dummies(categorical)
# Embedding numerical & categorical


import gower
# NOTE(review): g_mat is computed but not used anywhere in this cell
g_mat = gower.gower_matrix(df)
# gamma: half of the mean per-column standard deviation of the (already standardized) numeric frame
gamma = np.mean(np.std(numerical))/2

# Compute pairwise distance matrix
distances = (cdist(numerical,numerical,'sqeuclidean')) + cdist(categorical,categorical,'hamming')*gamma

fit1 = umap.UMAP(n_components=n_components,metric='precomputed').fit_transform(distances)
#print(fit1)

#um = pd.DataFrame(embedding) # Each points' UMAP coordinate 
um=pd.DataFrame(fit1)  # one row of UMAP coordinates per observation
# Actual plotting
um.columns = ['X','Y','Z']
#um['cluster'] = df['cluster'].astype(str)
fig = px.scatter_3d(um, 
                x='X',y='Y',z='Z')
fig.update_layout(showlegend=False)
fig.update_layout(margin=dict(l=0, r=0, b=0, t=0))
fig.show()


using precomputed metric; inverse_transform will be unavailable



In [24]:
# Second UMAP pass: re-embed the 3-D coordinates in `um` with 30 neighbours and plot the result.
um2 = umap.UMAP(n_neighbors=30,n_components=3).fit_transform(um)
um2=pd.DataFrame(um2)
um2.columns = ['X','Y','Z']
#um['cluster'] = df['cluster'].astype(str)
fig = px.scatter_3d(um2, 
                x='X',y='Y',z='Z')
fig.update_layout(showlegend=False)
fig.update_layout(margin=dict(l=0, r=0, b=0, t=0))
fig.show()

In [25]:
import umap.umap_ as umap
# UMAP on a precomputed mixed distance, with the categorical term weighted by 2*gamma.
# The distance matrix is also kept under the name `huang` for later cells.
# cdist is not imported in this cell; it has to be in scope from an earlier cell.
n_components=3
intersection=False  # not used in this cell
df2 = df.copy()
# Standardize every non-object column (zero mean, unit population standard deviation).
numerical = df2.select_dtypes(exclude='object')
for c in numerical.columns:
    numerical[c] = (numerical[c] - numerical[c].mean())/numerical[c].std(ddof=0)
    
## Preprocessing categorical: one-hot encode the object columns
categorical = df2.select_dtypes(include='object')
categorical = pd.get_dummies(categorical)
# Embedding numerical & categorical


import gower
# NOTE(review): g_mat is computed but not used anywhere in this cell
g_mat = gower.gower_matrix(df)
# gamma: half of the mean per-column standard deviation of the (already standardized) numeric frame
gamma = np.mean(np.std(numerical))/2

# Compute pairwise distance matrix: squared Euclidean + 2*gamma * Hamming
distances = (cdist(numerical,numerical,'sqeuclidean')) + cdist(categorical,categorical,'hamming')*gamma*2
huang = distances  # same array object as `distances`, not a copy

fit1 = umap.UMAP(n_components=n_components,metric='precomputed',min_dist=0.1,spread=1).fit_transform(distances)


#um = pd.DataFrame(embedding) # Each points' UMAP coordinate 
um=pd.DataFrame(fit1)  # one row of UMAP coordinates per observation
# Actual plotting
um.columns = ['X','Y','Z']
#um['cluster'] = df['cluster'].astype(str)
fig = px.scatter_3d(um, 
                x='X',y='Y',z='Z')
fig.update_layout(showlegend=False)
fig.update_layout(margin=dict(l=0, r=0, b=0, t=0))
fig.show()


using precomputed metric; inverse_transform will be unavailable



In [26]:
import numpy as np
# UMAP on a precomputed mixed distance: city-block (L1) on standardized numeric columns
# plus gamma times Hamming on the one-hot encoded categorical columns.
# Relies on n_components, cdist, umap and px being defined by earlier cells.
df2 = df.copy()
# Standardize every non-object column (zero mean, unit population standard deviation).
numerical = df2.select_dtypes(exclude='object')
for c in numerical.columns:
    numerical[c] = (numerical[c] - numerical[c].mean())/numerical[c].std(ddof=0)

## Preprocessing categorical: one-hot encode the object columns
categorical = df2.select_dtypes(include='object')
categorical = pd.get_dummies(categorical)

# Weight of the categorical term. It is computed before it is used so that the cell
# does not depend on a gamma value left in the kernel by another cell.
gamma = np.mean(np.std(numerical))/2

# Pairwise distance matrix
distances = (cdist(numerical,numerical,'cityblock')) + cdist(categorical,categorical,'hamming')*gamma

fit1 = umap.UMAP(metric='precomputed',
                n_neighbors=10,
                n_components=n_components).fit_transform(distances)

embedding = fit1

um = pd.DataFrame(embedding) # one row of UMAP coordinates per observation

# Actual plotting
um.columns = ['X','Y','Z']
#um['cluster'] = df['cluster'].astype(str)
fig = px.scatter_3d(um,
                x='X',y='Y',z='Z')
fig.update_layout(showlegend=False)
fig.update_layout(margin=dict(l=0, r=0, b=0, t=0))
fig.show()


using precomputed metric; inverse_transform will be unavailable



In [27]:
from sklearn.manifold import MDS


# Metric MDS into 80 dimensions from the precomputed `huang` distance matrix (fixed seed).
model = MDS(n_components=80, dissimilarity='precomputed', random_state=2002)
out = model.fit(huang).embedding_  # one row of MDS coordinates per observation

out

array([[ 2.95873355,  2.95741093, -1.76392729, ...,  3.29949194,
         0.43282129,  2.15070628],
       [ 1.32077692,  0.22961808, -0.02584782, ...,  5.76774741,
        -0.78060143,  2.11612671],
       [ 1.45808394, -0.16869409, -1.28064082, ..., -2.13969614,
        -1.94234629, -0.78688625],
       ...,
       [ 0.9303611 , -1.00704121, -0.2473072 , ...,  0.21034954,
        -3.37224982, -1.31544871],
       [-0.21008778,  1.16510719, -2.0660337 , ...,  1.12737854,
         0.92330759,  3.85184213],
       [ 3.18997375,  1.95106395, -1.68582913, ..., -0.03862068,
        -0.76398929,  3.38772446]])

In [28]:
# Mean ratio between the original distances and the distances in the MDS embedding.
# The diagonal of cdist(out, out) is zero, so the ratio is undefined there; the
# RuntimeWarning is silenced and nanmean skips the resulting NaN entries.
with np.errstate(divide='ignore', invalid='ignore'):
    ratio_mds80 = huang / cdist(out, out)
np.nanmean(ratio_mds80)


invalid value encountered in divide



0.9730377802118063

In [29]:
# Spread of the ratio between the original distances and the MDS-embedding distances.
# The diagonal of cdist(out, out) is zero, so the ratio is undefined there; the
# RuntimeWarning is silenced and nanstd skips the resulting NaN entries.
with np.errstate(divide='ignore', invalid='ignore'):
    ratio_mds80 = huang / cdist(out, out)
np.nanstd(ratio_mds80)


invalid value encountered in divide



0.11605144603116788

In [30]:
# 98th percentile of the ratio between the original and the MDS-embedding distances.
# The diagonal of cdist(out, out) is zero, so the ratio is undefined there; the
# RuntimeWarning is silenced and nanquantile skips the resulting NaN entries.
with np.errstate(divide='ignore', invalid='ignore'):
    ratio_mds80 = huang / cdist(out, out)
np.nanquantile(ratio_mds80, .98)


invalid value encountered in divide



1.1829837180130391

In [31]:
from prince import PCA
heu = PCA(n_components=3).fit_transform(out)

In [32]:
px.scatter_3d(pd.DataFrame(heu),0,1,2)

In [33]:
# 3-component UMAP of the MDS coordinates `out`; columns 0, 1, 2 are plotted.
# This overwrites the value previously stored in `heu`.
heu = umap.UMAP(n_components=3).fit_transform(out)
px.scatter_3d(pd.DataFrame(heu),0,1,2)

In [34]:
# Re-embed `heu` with a Minkowski metric of order p = 1/5 and plot columns 0, 1, 2.
# NOTE(review): the cell overwrites its own input, so every re-run embeds the previous
# result again; also, for p < 1 the Minkowski form does not satisfy the triangle
# inequality — confirm both are intended.
heu = umap.UMAP(n_components=3, metric='minkowski',metric_kwds={'p':1/5}).fit_transform(heu)
px.scatter_3d(pd.DataFrame(heu),0,1,2)

In [35]:
numerical.shape[1]

20

In [36]:
# Two-stage UMAP: first embed the precomputed mixed distances into df.shape[1] dimensions,
# then reduce that embedding to 3 dimensions with a squared-Euclidean metric and plot it.
# Uses numerical, categorical and gamma as left by earlier cells.
distances = (cdist(numerical,numerical,'sqeuclidean')) + cdist(categorical,categorical,'hamming')*gamma

fit1 = umap.UMAP(n_components=df.shape[1],metric='precomputed').fit_transform(distances)
fit2 = umap.UMAP(n_components=3,metric='sqeuclidean').fit_transform(fit1)
px.scatter_3d(pd.DataFrame(fit2),0,1,2)


using precomputed metric; inverse_transform will be unavailable


gradient function is not yet implemented for sqeuclidean distance metric; inverse_transform will be unavailable



In [37]:
from sklearn.manifold import Isomap

iso = Isomap(n_components=40,metric="precomputed").fit_transform(huang)

In [38]:
# Mean ratio between the original distances and the distances in the Isomap embedding.
# The diagonal of cdist(iso, iso) is zero, so the ratio is undefined there; the
# RuntimeWarning is silenced and nanmean skips the resulting NaN entries.
with np.errstate(divide='ignore', invalid='ignore'):
    ratio_iso = huang / cdist(iso, iso)
np.nanmean(ratio_iso)


invalid value encountered in divide



0.6442186160999672

In [39]:
# Mean ratio between the original distances and the distances between the rows of `rr`.
# The diagonal of cdist(rr, rr) is zero, so the ratio is undefined there; the
# RuntimeWarning is silenced and nanmean skips the resulting NaN entries.
with np.errstate(divide='ignore', invalid='ignore'):
    ratio_rr = huang / cdist(rr, rr)
np.nanmean(ratio_rr)


invalid value encountered in divide



24.51486176616867

In [40]:
from sklearn.manifold import MDS


# Metric MDS into 3 dimensions from the precomputed `huang` distance matrix (fixed seed).
model = MDS(n_components=3, dissimilarity='precomputed', random_state=2002)
emb = model.fit(huang).embedding_  # one row of MDS coordinates per observation
#np.nanmean(huang,cdist(emb,emb))

In [41]:
# Mean ratio between the original distances and the distances in the 3-D MDS embedding.
# The diagonal of cdist(emb, emb) is zero, so the ratio is undefined there; the
# RuntimeWarning is silenced and nanmean skips the resulting NaN entries.
with np.errstate(divide='ignore', invalid='ignore'):
    ratio_mds3 = huang / cdist(emb, emb)
np.nanmean(ratio_mds3)


invalid value encountered in divide



1.0945419608456468

In [42]:
e2 = pd.DataFrame(emb)
px.scatter_3d(e2,0,1,2)

In [43]:
from rpy2 import robjects

pi = robjects.r['pi']
pi

2023-01-14 12:00:01.547 INFO    rpy2.situation: cffi mode is CFFI_MODE.ANY
2023-01-14 12:00:01.596 INFO    rpy2.situation: R home found: C:\PROGRA~1\R\R-41~1.2
2023-01-14 12:00:01.806 ERROR   rpy2.situation: Unable to determine R library path: Command '('C:\\PROGRA~1\\R\\R-41~1.2\\bin\\Rscript', '-e', 'cat(Sys.getenv("LD_LIBRARY_PATH"))')' returned non-zero exit status 1.
2023-01-14 12:00:01.807 INFO    rpy2.situation: LD_LIBRARY_PATH: 
2023-01-14 12:00:01.818 INFO    rpy2.rinterface_lib.embedded: Default options to initialize R: rpy2, --quiet, --no-save


0
3.141593


In [44]:
from rpy2.robjects.packages import importr
utils = importr('utils')

In [45]:
utils.chooseCRANmirror(ind=1)

<rpy2.rinterface_lib.sexp.NULLType object at 0x000002122D1E8940> [RTYPES.NILSXP]

In [46]:
# Install the R package 'kmed' only when it is missing, so that re-running the notebook
# does not download and reinstall it every time.
from rpy2.robjects.packages import isinstalled

if not isinstalled('kmed'):
    utils.install_packages('kmed')

Exception ignored from cffi callback <function _consolewrite_ex at 0x000002122F09F010>:
Traceback (most recent call last):
  File "c:\Users\cleme\AppData\Local\Programs\Python\Python310\lib\site-packages\rpy2\rinterface_lib\callbacks.py", line 133, in _consolewrite_ex
    s = conversion._cchar_to_str_with_maxlen(buf, n, _CCHAR_ENCODING)
  File "c:\Users\cleme\AppData\Local\Programs\Python\Python310\lib\site-packages\rpy2\rinterface_lib\conversion.py", line 138, in _cchar_to_str_with_maxlen
    s = ffi.string(c, maxlen).decode(encoding)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 106: invalid continuation byte






package 'kmed' successfully unpacked and MD5 sums checked

The downloaded binary packages are in
	C:\Users\cleme\AppData\Local\Temp\RtmpkFqtcP\downloaded_packages


<rpy2.rinterface_lib.sexp.NULLType object at 0x000002122D1E8940> [RTYPES.NILSXP]

In [47]:
importr('kmed')

rpy2.robjects.packages.Package as a <module 'kmed'>

In [48]:
from rpy2 import robjects
# Run an R snippet that builds a 7-row mixed data set (3 numeric iris columns, 3 binary and
# 3 three-level columns drawn with sample()) and computes kmed::distmix with method "ahmad".
# The R code sets no seed, so the sampled columns presumably change between runs.
# NOTE: this reuses the name `rr`, replacing the value an earlier cell stored under it.
rr = robjects.r(
    '''
    library(kmed)
    a <- matrix(sample(1:2, 7*3, replace = TRUE), 7, 3)
    a1 <- matrix(sample(1:3, 7*3, replace = TRUE), 7, 3)
    mixdata <- cbind(iris[1:7,1:3], a, a1)
    colnames(mixdata) <- c(paste(c("num"), 1:3, sep = ""),
                        paste(c("bin"), 1:3, sep = ""),
                        paste(c("cat"), 1:3, sep = ""))
    distmix(mixdata, method = "ahmad", idnum = 1:3, idbin = 4:6, idcat = 7:9)
    '''
)
# Flatten the returned R object into a Python list for display
list(rr)

[0.0,
 5.129999999999999,
 6.7116,
 5.333611111111112,
 4.046711111111112,
 2.6911111111111112,
 0.26,
 5.129999999999999,
 0.0,
 0.20560000000000012,
 5.7111111111111095,
 7.16471111111111,
 3.6044444444444443,
 5.089999999999999,
 6.7116,
 0.20560000000000012,
 0.0,
 7.386044444444444,
 5.397777777777777,
 4.775377777777777,
 6.5116,
 5.333611111111112,
 5.7111111111111095,
 7.386044444444444,
 0.0,
 1.2121000000000006,
 10.220277777777776,
 5.0136111111111115,
 4.046711111111112,
 7.16471111111111,
 5.397777777777777,
 1.2121000000000006,
 0.0,
 8.031377777777776,
 4.226711111111112,
 2.6911111111111112,
 3.6044444444444443,
 4.775377777777777,
 10.220277777777776,
 8.031377777777776,
 0.0,
 3.3311111111111122,
 0.26,
 5.089999999999999,
 6.5116,
 5.0136111111111115,
 4.226711111111112,
 3.3311111111111122,
 0.0]

In [49]:
rr = robjects.r(
    '''
    library(kmed)
    df <- read.csv("penguins.csv")
    sapply(df, class)

    '''
)
list(rr)

['character',
 'character',
 'numeric',
 'numeric',
 'integer',
 'integer',
 'character',
 'integer']

In [50]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
0,0.114070,-0.199153,0.153123,0.303670,-0.026368,-0.094547,0.037739,0.220101,-0.028716,0.228900,...,2,3,2,2,4,1,4,3,5,6
1,0.062808,-0.153880,-0.125579,0.304854,0.297609,0.122795,-0.010155,0.227065,-0.089763,0.157627,...,4,4,6,4,2,5,2,5,6,2
2,0.030880,0.091022,0.081529,0.093533,0.292342,0.130896,0.134958,0.137253,0.159708,0.098639,...,2,1,2,1,2,4,6,4,6,2
3,0.124283,-0.030016,-0.143757,0.221702,-0.217228,0.098914,-0.052882,-0.022503,-0.024244,-0.236986,...,5,0,3,6,2,6,1,6,2,1
4,0.181943,0.087442,0.121391,0.144561,-0.085337,0.010163,-0.259214,0.122444,-0.147765,0.109965,...,0,2,0,0,3,0,4,2,3,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,-0.141683,-0.171053,-0.163513,0.201766,0.037786,0.137030,0.006882,0.020535,0.163380,0.230117,...,5,6,2,5,3,6,6,2,3,0
996,0.048317,0.112019,-0.052480,-0.040376,-0.108678,0.152326,-0.030797,-0.098943,-0.283984,-0.044491,...,3,0,3,5,2,5,2,1,0,6
997,-0.009088,-0.000527,0.066762,0.076951,0.152308,0.220454,-0.078848,0.154417,0.206234,0.120558,...,6,1,4,2,5,6,6,0,2,0
998,0.229168,0.045210,0.039306,0.051882,-0.105200,-0.139480,-0.201166,0.225620,-0.024361,-0.212245,...,1,4,3,5,6,0,3,2,0,2


In [51]:
kmed = importr("kmed")


In [52]:
import rpy2.robjects as ro
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
from rpy2.robjects import r

from rpy2.robjects.conversion import localconverter
# Build a small mixed data set in R and compute its kmed::distmix distance (method 'huang')
# while the pandas <-> R converter is active.
# NOTE(review): the saved output of this cell is "RecursionError: maximum recursion depth
# exceeded in comparison"; the cause cannot be determined from the notebook alone.
with localconverter(ro.default_converter + pandas2ri.converter):
    base = importr('base')
    # NOTE(review): smry is not used again in this cell
    smry = base.summary(df)
    kmed = importr('kmed')
    # 7 rows: 3 numeric iris columns, 3 binary columns, 3 three-level columns
    mixdata = robjects.r(
        '''
        library(kmed)
        a <- matrix(sample(1:2, 7*3, replace = TRUE), 7, 3)
        a1 <- matrix(sample(1:3, 7*3, replace = TRUE), 7, 3)
        mixdata <- cbind(iris[1:7,1:3], a, a1)
        colnames(mixdata) <- c(paste(c("num"), 1:3, sep = ""),
                            paste(c("bin"), 1:3, sep = ""),
                            paste(c("cat"), 1:3, sep = ""))
        mixdata
        ''')
    #ahmad_dey = kmed.distmix(df, method = "ahmad",
    # idnum = '1:3',
    #  idbin = '4:6',
    #   idcat = '7:9')
    # Column positions are passed as R integer vectors (1-based)
    d_mat = kmed.distmix(
                                data=mixdata, 
                                method='huang',  
                                idbin=ro.IntVector([4,5,6]),
                                idcat=ro.IntVector([7,8,9]),
                                idnum=ro.IntVector([1,2,3])
                            )
d_mat

RecursionError: maximum recursion depth exceeded in comparison

In [None]:
df.dtypes

0     float64
1     float64
2     float64
3     float64
4     float64
5     float64
6     float64
7     float64
8     float64
9     float64
10    float64
11    float64
12    float64
13    float64
14    float64
15    float64
16    float64
17    float64
18    float64
19    float64
20     object
21     object
22     object
23     object
24     object
25     object
26     object
27     object
28     object
29     object
30     object
31     object
32     object
33     object
34     object
35     object
36     object
37     object
38     object
39     object
dtype: object

In [None]:
# Gower distance matrix of df computed by the R package kmed.
with localconverter(ro.default_converter + pandas2ri.converter):
    kmed = importr('kmed')
    bin_ids, num_ids, cat_ids = [], [], []
    # Sort every column into binary / numeric / categorical.
    # R column positions are 1-based, hence start=1.
    for position, name in enumerate(df.columns, start=1):
        column = df[name]
        if column.nunique() <= 2:
            bin_ids.append(position)
        elif np.issubdtype(column.dtype, np.number):
            num_ids.append(position)
        else:
            cat_ids.append(position)

    # An empty group is passed to R as NULL instead of an empty integer vector.
    dist_matrix = kmed.distmix(
        data=df,
        method='gower',
        idbin=ro.IntVector(bin_ids) if bin_ids else ro.r("NULL"),
        idnum=ro.IntVector(num_ids) if num_ids else ro.r("NULL"),
        idcat=ro.IntVector(cat_ids) if cat_ids else ro.r("NULL"),
    )
dist_matrix

array([[0.        , 0.52373808, 0.60295174, ..., 0.53125236, 0.54089492,
        0.55630639],
       [0.52373808, 0.        , 0.53983349, ..., 0.46609312, 0.57584806,
        0.55254993],
       [0.60295174, 0.53983349, 0.        , ..., 0.55839509, 0.51113395,
        0.37260062],
       ...,
       [0.53125236, 0.46609312, 0.55839509, ..., 0.        , 0.61260741,
        0.5479417 ],
       [0.54089492, 0.57584806, 0.51113395, ..., 0.61260741, 0.        ,
        0.48027054],
       [0.55630639, 0.55254993, 0.37260062, ..., 0.5479417 , 0.48027054,
        0.        ]])

In [None]:
import gower
gower.gower_matrix(df)

array([[0.        , 0.5237381 , 0.6029517 , ..., 0.5312524 , 0.5408949 ,
        0.55630636],
       [0.5237381 , 0.        , 0.5398335 , ..., 0.46609312, 0.57584804,
        0.55254996],
       [0.6029517 , 0.5398335 , 0.        , ..., 0.5583951 , 0.51113397,
        0.37260062],
       ...,
       [0.5312524 , 0.46609312, 0.5583951 , ..., 0.        , 0.6126074 ,
        0.5479417 ],
       [0.5408949 , 0.57584804, 0.51113397, ..., 0.6126074 , 0.        ,
        0.48027053],
       [0.55630636, 0.55254996, 0.37260062, ..., 0.5479417 , 0.48027053,
        0.        ]], dtype=float32)

In [None]:
df.select_dtypes(include=np.number)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.077349,0.074148,-0.092470,0.193703,-0.218828,0.246698,0.111918,0.025719,-0.176475,0.243335,0.174239,0.014178,-0.108676,-0.183157,-0.081796,0.093888,0.009925,-0.069189,-0.244326,0.151888
1,-0.142318,0.022775,0.037407,0.057913,-0.170825,0.055001,0.086226,-0.173003,0.085066,0.086405,0.202272,0.069699,-0.268689,-0.028508,-0.118063,-0.016561,0.036411,-0.131414,0.013103,0.144950
2,-0.170864,0.301652,0.107116,0.142521,-0.082950,0.024189,-0.042392,-0.062138,0.267601,0.074326,-0.125821,0.033158,-0.030709,0.019687,-0.136489,-0.018986,-0.044199,-0.061214,0.111561,0.004839
3,0.029628,0.247126,-0.116451,0.197002,0.016649,-0.020406,-0.074012,-0.091724,0.176399,0.188235,0.101382,0.172200,-0.037606,0.023433,-0.181943,0.071289,0.149179,0.045107,0.063140,-0.128308
4,-0.021397,-0.025215,-0.116732,-0.133430,-0.183832,0.263251,0.216057,0.057672,-0.010534,0.006271,0.287941,-0.112596,0.022179,-0.012596,0.098104,0.093130,-0.014412,0.041114,-0.170531,0.207990
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.001623,0.159430,-0.022497,0.173181,-0.100692,0.006630,-0.080779,-0.048222,0.094408,0.172071,-0.056245,0.204697,0.178168,-0.158004,-0.148042,0.046431,0.108189,-0.189891,0.255394,-0.115379
996,-0.038734,0.017685,-0.116510,0.185744,-0.157890,0.025947,0.070185,-0.050878,-0.087024,0.053669,0.047632,0.163718,-0.186831,-0.216969,-0.049331,-0.028820,0.315693,0.012807,-0.135882,-0.017251
997,-0.051276,0.100574,0.127614,0.220622,-0.131036,-0.014322,0.225449,-0.135867,0.028687,0.084498,0.119589,0.207920,-0.156705,-0.266049,-0.343788,-0.005173,0.161376,-0.071874,-0.147590,0.033607
998,0.195975,0.035911,0.263587,0.193149,-0.224440,-0.078130,-0.082987,-0.037897,0.225106,0.012049,0.036563,0.044522,0.164838,-0.111105,-0.128710,0.027970,-0.287878,-0.189525,0.352325,-0.212521


In [None]:
import pandas as pd
import numpy as np
# Apply the mixed-distance + UMAP pipeline to titanic.csv and colour the points by the first column.
# cdist, umap and px are not imported in this cell; they have to be in scope from earlier cells.
sponge = pd.read_csv('titanic.csv').dropna()  # rows with any missing value are dropped
df = sponge.iloc[:,1:]  # every column but the first is a feature (this replaces the previous df)
target = sponge.iloc[:,0]  # first column, used only to colour the plot
# NOTE(review): the df preview saved in this notebook lists an `alive` column, which looks like
# a text copy of the target; if so, it leaks the target into the distances — confirm.
# Standardize every non-object column (zero mean, unit population standard deviation).
numerical = df.select_dtypes(exclude='object')
for c in numerical.columns:
    numerical[c] = (numerical[c] - numerical[c].mean())/numerical[c].std(ddof=0)
    
## Preprocessing categorical: one-hot encode the object columns
categorical = df.select_dtypes(include='object')
categorical = pd.get_dummies(categorical)

# gamma: half of the mean per-column standard deviation of the (already standardized) numeric frame
gamma = np.mean(np.std(numerical))/2
# Squared Euclidean on the numeric part plus gamma times Hamming on the one-hot part
distances = (cdist(numerical,numerical,'sqeuclidean')) + cdist(categorical,categorical,'hamming')*gamma

fit1 = umap.UMAP(n_components=3,n_neighbors=15,metric='precomputed').fit_transform(distances)

um = pd.DataFrame(fit1)
um.columns = ['X','Y','Z']
# target values are cast to strings before being passed as the colour
fig = px.scatter_3d(um, 'X', 'Y', 'Z', color=list(target.astype(str)))
fig.show()


using precomputed metric; inverse_transform will be unavailable



In [None]:
target.value_counts()

1    123
0     59
Name: survived, dtype: int64

In [None]:
df

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
3,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
6,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
10,3,female,4.0,1,1,16.7000,S,Third,child,False,G,Southampton,yes,False
11,1,female,58.0,0,0,26.5500,S,First,woman,False,C,Southampton,yes,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,1,female,47.0,1,1,52.5542,S,First,woman,False,D,Southampton,yes,False
872,1,male,33.0,0,0,5.0000,S,First,man,True,B,Southampton,no,True
879,1,female,56.0,0,1,83.1583,C,First,woman,False,C,Cherbourg,yes,False
887,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
