In [1]:
import pandas as pd

FIGURES_PATH = 'out/figures/'
DATASETS_PATH = 'out/datasets/'

In [2]:
import pickle
from datetime import timedelta
from helper import save

import psutil
from pandarallel import pandarallel
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
from distances import Distances
from clustering import Clustering, get_dists

tqdm.pandas()
from helper import *

In [8]:
class Tester:
    def __init__(self):
        return

    @staticmethod
    def run_statistics(path: str = 'data_processed',
                       nrows: int = 1_000_000,
                       top_lim: int = 10_000,
                       k: int = 100,
                       methods: list[str] = None,
                       field: str = 'product_id'
                       ):
        d = Distances(path, nrows)

        print(f'Collecting data...')
        pp = d.get_pp(top_lim=top_lim, batch_size=100_000, field=field)

        stats = dict()
        if methods is None:
            methods = ['min_dist', 'max_dist', 'average', 'weighted', 'ward']

        print(f'Starting agglomerative clustering with \n\tfile: {path},\n\tfield: {field},\n\tnrows: {nrows},\n\ttop_lim: {top_lim}')
        for m in methods:
            c = Clustering(get_dists=get_dists)
            print(f'Clustering with method {m}')
            c.fit(metric='euclidean', method=m, type='agglomerative', top_lim=top_lim, k=k, dists=pp)
            stats[m] = c.get_stats()

        return stats

In [9]:
t = Tester()
stats = t.run_statistics(methods=['euclidean', 'max_dist', 'min_dist', 'average', 'weighted', 'ward'], nrows=2_000_000, top_lim=1_000, k=100)

Collecting data...
Top of dataset length: 650013
INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


  0%|          | 0/7 [00:00<?, ?it/s]

Starting agglomerative clustering with 
	file: data_processed,
	field: product_id,
	nrows: 2000000,
	top_lim: 1000
Clustering with method euclidean
Starting counting distances between clusters...


  0%|          | 0/1086 [00:00<?, ?it/s]

Starting collapsing closest clusters...


  0%|          | 0/986 [00:00<?, ?it/s]

Clustering with method max_dist
Starting counting distances between clusters...


  0%|          | 0/1086 [00:00<?, ?it/s]

Starting collapsing closest clusters...


  0%|          | 0/986 [00:00<?, ?it/s]

Clustering with method min_dist
Starting counting distances between clusters...


  0%|          | 0/1086 [00:00<?, ?it/s]

Starting collapsing closest clusters...


  0%|          | 0/986 [00:00<?, ?it/s]

Clustering with method average
Starting counting distances between clusters...


  0%|          | 0/1086 [00:00<?, ?it/s]

Starting collapsing closest clusters...


  0%|          | 0/986 [00:00<?, ?it/s]

Clustering with method weighted
Starting counting distances between clusters...


  0%|          | 0/1086 [00:00<?, ?it/s]

Starting collapsing closest clusters...


  0%|          | 0/986 [00:00<?, ?it/s]

Clustering with method ward
Starting counting distances between clusters...


  0%|          | 0/1086 [00:00<?, ?it/s]

Starting collapsing closest clusters...


  0%|          | 0/986 [00:00<?, ?it/s]

In [10]:
stats

{'euclidean': {'min_distances': [0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
  

In [3]:
c = Clustering(get_dists=get_dists)
clusters = c.fit(metric='euclidean', method='ward', type='centroid', k=100, dists_path='pp_category_id_2000000')

  0%|          | 0/10000 [00:00<?, ?it/s]

MemoryError: Unable to allocate 1.05 TiB for an array with shape (38031, 100, 38031) and data type int64

In [None]:
c.get_stats()