In [1]:
pip install --upgrade kmodes

Collecting kmodes
  Downloading https://files.pythonhosted.org/packages/9b/34/fffc601aa4d44b94e945a7cc72f477e09dffa7dce888898f2ffd9f4e343e/kmodes-0.11.0-py2.py3-none-any.whl
Installing collected packages: kmodes
Successfully installed kmodes-0.11.0


In [2]:
import pathlib
import pandas as pd
import numpy as np

from kmodes.kprototypes import KPrototypes

# scikit-learn bootstrap
from sklearn.utils import resample

In [3]:
sourcepath = 'https://raw.githubusercontent.com/FTPGitHub/TDG/main/datos'

In [4]:
# Lee dataset estandarizado de transacciones
data1 = pd.read_csv("{}/transacciones.csv".format(sourcepath))

In [5]:
data1['AISLE']=data1['AISLE'].astype(str)

In [6]:
data1.dtypes

AISLE        object
IG           object
HTS          object
PLANNING     object
ABC          object
UOM          object
CAT          object
COSTX       float64
QTYVAR      float64
dtype: object

In [7]:
data1.head(5)

Unnamed: 0,AISLE,IG,HTS,PLANNING,ABC,UOM,CAT,COSTX,QTYVAR
0,17,IG56,HTS119,1/1,B,PC,CAT17,-0.029976,-0.05325
1,17,IG56,HTS119,1/1,B,PC,CAT17,0.092472,-0.022999
2,17,IG56,HTS119,1/1,C,PC,CAT17,0.26613,0.004373
3,17,IG56,HTS119,1/1,B,PC,CAT17,-0.025112,-0.060323
4,15,IG56,HTS119,1/1,C,PC,CAT17,-0.077982,-0.062534


In [None]:
data1.shape[0]

16239

In [None]:
catColumnsPos = [data1.columns.get_loc(col) for col in list(data1.select_dtypes('object').columns)]

In [None]:
catColumnsPos

[0, 1, 2, 3, 4, 5, 6]

In [None]:
# Fit the cluster 
k = 2
gamma = 0.5
import time
start_time = time.time()
kprototype = KPrototypes(n_jobs = -1, n_clusters = k, init = 'Huang', gamma = gamma, random_state = 0)
kprototype.fit_predict(data1, categorical = catColumnsPos)
print("--- %s seconds ---" % (time.time() - start_time))

--- 66.5556890964508 seconds ---


In [None]:
data1['Cluster'] = kprototype.labels_

In [None]:
def add2set (nuevos):
  conjunto = {999999}
  conjunto.update(nuevos)
  conjunto.discard(999999)
  return(conjunto)


In [None]:
# k = 2 (dos conjuntos de puntos)
orig0 = {}
orig1 = {}

In [None]:
orig0 = add2set(data1[data1['Cluster']==0].index)
orig1 = add2set(data1[data1['Cluster']==1].index)

In [None]:
data1.drop('Cluster', inplace=True, axis=1)

In [None]:
# Inicializamos la matriz de distancias con ceros
import time
start_time = time.time()
nBootstrap = 100
gamma = 0.5
jaccard = [ [ 0 for i in range(k) ] for j in range(nBootstrap) ]
boot0 = {}
boot1 = {}

In [None]:
pd.options.mode.chained_assignment = None

In [None]:
for i in range(nBootstrap):
  print('bootstrap # ',i);
  boot = resample(data1, replace=True, n_samples=data1.shape[0], random_state=i)
  # Fit the cluster
  kprototypebs = KPrototypes(n_jobs = -1, n_clusters = k, init = 'Huang', gamma = gamma, random_state = 1)
  kprototypebs.fit_predict(boot, categorical = catColumnsPos)
  boot['Cluster'] = kprototypebs.labels_
  print("--- %s seconds ---" % (time.time() - start_time));
  boot0 = add2set(boot[boot['Cluster']==0].index)
  boot1 = add2set(boot[boot['Cluster']==1].index)
  jaccard[i][0]=max(len(orig0 & boot0) / len(orig0 | boot0), len(orig0 & boot1) / len(orig0 | boot1))
  jaccard[i][1]=max(len(orig1 & boot0) / len(orig1 | boot0), len(orig1 & boot1) / len(orig1 | boot1))
  boot.drop('Cluster', inplace=True, axis=1)
  print(jaccard[i][0:k]);

bootstrap #  0
--- 52.933316469192505 seconds ---
[0.6092843326885881, 0.6328904710444345]
bootstrap #  1
--- 103.35523295402527 seconds ---
[0.586073500967118, 0.6288600838734274]
bootstrap #  2
--- 156.33497881889343 seconds ---
[0.6150870406189555, 0.6322138276934798]
bootstrap #  3
--- 201.3859179019928 seconds ---
[0.5783365570599613, 0.6347229283172343]
bootstrap #  4
--- 249.95306539535522 seconds ---
[0.6312741312741312, 0.6301361149980919]
bootstrap #  5
--- 298.76789593696594 seconds ---
[0.6354961832061069, 0.6320508744038156]
bootstrap #  6
--- 347.529177904129 seconds ---
[0.5996131528046421, 0.630809333079026]
bootstrap #  7
--- 394.9036509990692 seconds ---
[0.5846702317290553, 0.6335707925200356]
bootstrap #  8
--- 445.86148262023926 seconds ---
[0.6365384615384615, 0.6336980027986261]
bootstrap #  9
--- 493.97935461997986 seconds ---
[0.6305609284332688, 0.6388959552276774]
bootstrap #  10
--- 542.9597480297089 seconds ---
[0.031243839936920953, 0.6003491489494357]
boo

In [None]:
pd.options.mode.chained_assignment = 'warn'

In [None]:
jaccard = pd.DataFrame(jaccard)

In [None]:
jaccard.describe()

Unnamed: 0,0,1
count,100.0,100.0
mean,0.555371,0.612528
std,0.172055,0.069502
min,0.030806,0.313573
25%,0.583404,0.628307
50%,0.613681,0.630163
75%,0.634429,0.632405
max,0.680851,0.638896
