In [1]:
pip install --upgrade kmodes

Collecting kmodes
  Downloading https://files.pythonhosted.org/packages/9b/34/fffc601aa4d44b94e945a7cc72f477e09dffa7dce888898f2ffd9f4e343e/kmodes-0.11.0-py2.py3-none-any.whl
Installing collected packages: kmodes
Successfully installed kmodes-0.11.0


In [2]:
import pathlib
import pandas as pd
import numpy as np

# scikit-learn bootstrap
from sklearn.utils import resample
from sklearn.metrics import silhouette_score
from kmodes.kprototypes import KPrototypes

In [3]:
sourcepath = 'https://raw.githubusercontent.com/FTPGitHub/TDG/main/datos'

In [4]:
# Lee dataset estandarizado de transacciones
data1 = pd.read_csv("{}/transacciones.csv".format(sourcepath))

In [5]:
data1['AISLE']=data1['AISLE'].astype(str)

In [6]:
data1.dtypes

AISLE        object
IG           object
HTS          object
PLANNING     object
ABC          object
UOM          object
CAT          object
COSTX       float64
QTYVAR      float64
dtype: object

In [7]:
data1.head(5)

Unnamed: 0,AISLE,IG,HTS,PLANNING,ABC,UOM,CAT,COSTX,QTYVAR
0,17,IG56,HTS119,1/1,B,PC,CAT17,-0.029976,-0.05325
1,17,IG56,HTS119,1/1,B,PC,CAT17,0.092472,-0.022999
2,17,IG56,HTS119,1/1,C,PC,CAT17,0.26613,0.004373
3,17,IG56,HTS119,1/1,B,PC,CAT17,-0.025112,-0.060323
4,15,IG56,HTS119,1/1,C,PC,CAT17,-0.077982,-0.062534


La versión original del código lee la matriz de distancias previamente calculada desde Google Drive o desde un archivo local en el PC según donde se encuentre almacenada.

Desafortunadamente el archivo pesa cerca de 5 GB (matriz 16239 x 16239) y GitHub no permite archivos mayores a 100 MB. Por lo tanto se procede a recalcularla a continuación utilizando numba para optimización de desempeño.

In [None]:
# dm = pd.read_csv("/content/gdrive/MyDrive/distancias.csv")

In [None]:
# Posición de las variables categóricas
catColumnsPos = [data1.columns.get_loc(col) for col in list(data1.select_dtypes('object').columns)]
print('Categorical columns : {}'.format(list(data1.select_dtypes('object').columns)))
print('Categorical columns position : {}'.format(catColumnsPos))

Categorical columns : ['AISLE', 'IG', 'HTS', 'PLANNING', 'ABC', 'UOM', 'CAT']
Categorical columns position : [0, 1, 2, 3, 4, 5, 6]


In [None]:
# Posición de las variables numéricas]
numColumnsPos = [data1.columns.get_loc(col) for col in list(data1.select_dtypes('float64').columns)]
print('Numerical columns : {}'.format(list(data1.select_dtypes('float64').columns)))
print('Numerical columns position : {}'.format(numColumnsPos))

Numerical columns : ['COSTX', 'QTYVAR']
Numerical columns position : [7, 8]


In [None]:
import numba as nb
import time

In [None]:
data1.reset_index(drop=True)
N = data1.shape[0]

dataNum= data1.iloc[:,numColumnsPos]
dataNum=dataNum.to_numpy()

dataCat=data1.iloc[:,catColumnsPos]
dataCat=dataCat.to_numpy()

# gamma = 0.5 (constante)
catWeight=0.5

start_time = time.time()

In [None]:
@nb.njit
def dist_num_njit(data):
    distancias = np.zeros((len(data),len(data)), dtype=np.float64)
    valor = nb.float64(0)

    for i in range(len(data)):
        for j in range(i+1, len(data)):
            valor = nb.float64(np.sum((data[i] - data[j]) ** 2))
            distancias[i][j] = valor
    return distancias

In [None]:
def dist_cat(data, catWeight):
    distancias = np.zeros((len(data),len(data)), dtype=np.float64)
    for i in range(len(data)):
        for j in range(i+1, len(data)):
            distancias[i][j] = np.float64(np.sum(data[i]!=data[j])*catWeight)
    return distancias

In [None]:
@nb.njit
def traspose_njit (data):
    distancias = data
    for i in range(len(data)):
        for j in range(1+i,len(data)):
            distancias[j][i] = distancias[i][j]
    return distancias            

In [None]:
# Todavía no está implementado correctamente
def get_scores(data,catWeight):
    %time dist_num_np = dist_num_njit(dataNum[range(i)])
    %time dist_cat_np = dist_cat(dataCat[range(i)], catWeight)
    %time dm = np.sum([dist_cat_np, dist_num_np], axis=0)
    %time dm = traspose_njit(dm)

In [None]:
start_time = time.time()
for i in [10, dataNum.shape[0]]:
  print('Número de filas a procesar: {}'.format(i))
  %time dist_num_np = dist_num_njit(dataNum[range(i)])
  %time dist_cat_np = dist_cat(dataCat[range(i)], catWeight)
  %time dm = np.sum([dist_cat_np, dist_num_np], axis=0)
  %time dm = traspose_njit(dm)
  print()
print('Tiempo total = {}'.format(time.time() - start_time))

Número de filas a procesar: 10
CPU times: user 781 ms, sys: 20 ms, total: 801 ms
Wall time: 784 ms
CPU times: user 727 µs, sys: 0 ns, total: 727 µs
Wall time: 733 µs
CPU times: user 98 µs, sys: 0 ns, total: 98 µs
Wall time: 102 µs
CPU times: user 148 ms, sys: 0 ns, total: 148 ms
Wall time: 147 ms

Número de filas a procesar: 16239
CPU times: user 16.9 s, sys: 1.04 s, total: 17.9 s
Wall time: 17.7 s
CPU times: user 26min 17s, sys: 5.43 s, total: 26min 22s
Wall time: 26min 18s
CPU times: user 1.83 s, sys: 1.34 s, total: 3.17 s
Wall time: 3.15 s
CPU times: user 1.4 s, sys: 12.1 ms, total: 1.41 s
Wall time: 1.39 s

Tiempo total = 1601.8853166103363


**Algoritmo Silueta Bootstrap**

In [None]:
import time
start_time = time.time()
# Coeficiente silueta para K 2-4
kValues = [2, 3, 4]
gamma = 0.5
nBootstrap = 100
# Inicializamos la matriz de coeficientes silueta con ceros
silueta = [ [ 0 for j in range(nBootstrap + 1) ] for i in range(len(kValues)) ]
catColumnsPos = [data1.columns.get_loc(col) for col in list(data1.select_dtypes('object').columns)]
N = data1.shape[0]

In [None]:
for k in kValues:
  print('Dataset original, k = ', k)
  kprototype = KPrototypes(n_jobs = -1, n_clusters = k, init = 'Huang', gamma = gamma, random_state = 0)
  kprototype.fit_predict(data1, categorical = catColumnsPos)
  print("--- %s seconds ---" % (time.time() - start_time))
  silueta[k-min(kValues)][0] = silhouette_score(dm, kprototype.labels_, metric='precomputed')
  print("Coeficiente silueta --- %s seconds ---" % (time.time() - start_time))
  print(silueta[k-min(kValues)][0])

for b in range(1,nBootstrap+1):
  scores = np.zeros((len(data1),len(data1)), dtype=np.float64)
  print('bootstrap # ',b);
  boot = resample(data1, replace=True, n_samples=N, random_state=b)
  bootIndex = boot.index
  for i in range(N):
    for j in range(i+1,N):
      scores[i][j]=dm[bootIndex[i]][bootIndex[j]]
  print("Matriz de distancias (triangulo superior) --- %s seconds ---" % (time.time() - start_time))
  # Completamos la matriz
  for i in range(0,N):
    for j in range(0,i):
      scores[i][j]=scores[j][i]
  print("Espejo --- %s seconds ---" % (time.time() - start_time))
  for k in kValues:
    print('Bootstrap, k = ', k)
    kprototypebs = KPrototypes(n_jobs = -1, n_clusters = k, init = 'Huang', gamma = gamma, random_state = 1)
    kprototypebs.fit_predict(boot, categorical = catColumnsPos)
    print("--- %s seconds ---" % (time.time() - start_time));
    silueta[k-min(kValues)][b] = silhouette_score(scores, kprototypebs.labels_, metric='precomputed')
    print("Coeficiente silueta --- %s seconds ---" % (time.time() - start_time))
    print(silueta[k-min(kValues)][b])

Dataset original, k =  2
--- 80.45178961753845 seconds ---
Coeficiente silueta --- 83.46208024024963 seconds ---
0.7368588147337156
Dataset original, k =  3
--- 171.46498107910156 seconds ---
Coeficiente silueta --- 174.4525866508484 seconds ---
0.7182825853232915
Dataset original, k =  4
--- 259.15191531181335 seconds ---
Coeficiente silueta --- 261.9385132789612 seconds ---
0.10914930712265486
bootstrap #  1
Matriz de distancias (triangulo superior) --- 606.0396528244019 seconds ---
Espejo --- 716.2900447845459 seconds ---
Bootstrap, k =  2
--- 782.3157691955566 seconds ---
Coeficiente silueta --- 784.1398751735687 seconds ---
0.7853497277166785
Bootstrap, k =  3
--- 878.8913578987122 seconds ---
Coeficiente silueta --- 880.6931409835815 seconds ---
0.7989985671062392
Bootstrap, k =  4
--- 981.536144733429 seconds ---
Coeficiente silueta --- 982.9725198745728 seconds ---
0.129275946751179
bootstrap #  2
Matriz de distancias (triangulo superior) --- 1326.0416538715363 seconds ---
Espe

In [None]:
for i in kValues:
  print('Mediana para k = ',i);
  print(median(silueta[i-min(kValues)][1:nBootstrap+1]));

In [None]:
silueta = pd.DataFrame(silueta)