In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import math
from sklearn.metrics import silhouette_score
from sklearn.metrics.cluster import normalized_mutual_info_score
from sklearn.metrics.cluster import adjusted_rand_score
from collections import defaultdict
from joblib import Parallel, delayed
from funcy import log_durations
import logging
from tqdm import tqdm
import math
import torch
from torch import Tensor
import os

In [2]:
# Determine the location of the dataframe containing the typo-corrected text
file_location = "../data/example_data/output/cleaned_products.parquet"

# Read the dataframe
df  = pd.read_parquet(file_location)

# Display dataframe
display(df)

Unnamed: 0,products_id,products_and_services,clustered_id,cleaned_text
0,164399edbf8e880dc2e856f50d51e720bd0a8abe,"fish, frozen and deep-frozen",a18df3877d3f9598d7c8fbae0adc2cad4acf37c6,fish frozen deepfrozen
1,b0d3c55743b1b858ec2843c8870116bb8af543fd,drilling and test boring - equipment,49659f8efe8d9a92455f0d378783469558ae7df1,drilling test boring equipment
2,b14c038972e6a52bfbf3ffbe77def57a62c5b9cf,well-management services,b14c038972e6a52bfbf3ffbe77def57a62c5b9cf,wellmanagement service
3,abadc2542b4b5c1ecfe41c22afb2347b1d9b65af,electronic data processing - software,35596a3df5495e2dc5d18cff45c58cadda91040c,electronic data processing software
4,60c58ad2ef34d96fae028f1039fab03dec9eb9a2,communication,60c58ad2ef34d96fae028f1039fab03dec9eb9a2,communication
...,...,...,...,...
31541,a56bfdd9971ddba76de33e5dd394faab63d2c58c,trading in non-ferrous products,5af4a5f264253d48a9504c6e9e9de651f5528121,trading nonferrous product
31542,d16685f9db86a7e446d5a4c763a17016ffdfa613,precision weights for scales,b52520ccdfafa1b05949ffe08c0fdde9e2556a9e,precision weight scale
31543,37c8e6d302d907a76f49d45a91949c86dd5fcc03,weights and masses - measurement and verificat...,822c0e12996351ae9cf05354936d074bb4c6103b,weight mass measurement verification instrument
31544,4aa756effa61af41058cf80f475a03b439232cfe,manicure scissors,4aa756effa61af41058cf80f475a03b439232cfe,manicure scissors


In [8]:
print('Embedding data')
model = SentenceTransformer('all-MiniLM-L6-v2')
print('Model loaded')

sentences = df['cleaned_text'].tolist()
unique_sentences = df['cleaned_text'].unique()
print('Unique sentences', len(unique_sentences))

sentences = df['cleaned_text'].tolist()
unique_sentences = df['cleaned_text'].unique()
embeddings = model.encode(unique_sentences, show_progress_bar=True, batch_size=128)

mapping = {sentence: embedding for sentence, embedding in zip(unique_sentences, embeddings)}
embeddings = np.array([mapping[sentence] for sentence in sentences])

Embedding data
Model loaded
Unique sentences 30841


Batches:   0%|          | 0/241 [00:00<?, ?it/s]

In [11]:
embeddings

array([[-0.04650404,  0.00217801, -0.02107469, ..., -0.02285529,
        -0.00280993,  0.05536779],
       [-0.04808221, -0.0097314 ,  0.01712464, ...,  0.00614742,
        -0.01118629, -0.07062028],
       [-0.06738088,  0.04993471,  0.0725988 , ...,  0.00713424,
         0.01262637, -0.03323374],
       ...,
       [-0.04802645,  0.07145448, -0.047667  , ..., -0.04117569,
        -0.02189348, -0.05587696],
       [-0.04348597,  0.0291066 , -0.01997651, ...,  0.00454498,
         0.00567288,  0.01318082],
       [-0.10508741,  0.02357166, -0.04046485, ...,  0.02065877,
         0.10108754, -0.06006776]], dtype=float32)

In [3]:
def embed_data(data, key='text', model_name='all-MiniLM-L6-v2', cores=1, gpu=False, batch_size=128):
    """
    Embed the sentences/text using the MiniLM language model (which uses mean pooling)
    """
    print('Embedding data')
    model = SentenceTransformer(model_name)
    print('Model loaded')

    sentences = data[key].tolist()
    unique_sentences = data[key].unique()
    print('Unique sentences', len(unique_sentences))

    if cores == 1:
        embeddings = model.encode(unique_sentences, show_progress_bar=True, batch_size=batch_size, convert_to_tensor=True)
    else:
        devices = ['cpu'] * cores
        if gpu:
            devices = None  # use all CUDA devices

        # Start the multi-process pool on multiple devices
        print('Multi-process pool starting')
        pool = model.start_multi_process_pool(devices)
        print('Multi-process pool started')

        chunk_size = math.ceil(len(unique_sentences) / cores)

        # Compute the embeddings using the multi-process pool
        embeddings = model.encode_multi_process(unique_sentences, pool, batch_size=batch_size, chunk_size=chunk_size)
        model.stop_multi_process_pool(pool)

    print("Embeddings computed")
    mapping = {sentence: embedding for sentence, embedding in zip(unique_sentences, embeddings)}
    embeddings = np.array([mapping[sentence] for sentence in sentences])
  
    return embeddings

In [4]:
def cos_sim(a: Tensor, b: Tensor):
    """
    Computes the cosine similarity cos_sim(a[i], b[j]) for all i and j.
    :return: Matrix with res[i][j]  = cos_sim(a[i], b[j])
    """

    if not isinstance(a, torch.Tensor):
        a = torch.tensor(np.array(a))

    if not isinstance(b, torch.Tensor):
        b = torch.tensor(np.array(b))
    
    if len(a.shape) == 1:
        a = a.unsqueeze(0)

    if len(b.shape) == 1:
        b = b.unsqueeze(0)

    a_norm = torch.nn.functional.normalize(a, p=2, dim=1)
    b_norm = torch.nn.functional.normalize(b, p=2, dim=1)
    return torch.mm(a_norm, b_norm.transpose(0, 1))


def get_embeddings(ids, embeddings):
    return np.array([embeddings[idx] for idx in ids])


def reorder_and_filter_cluster(
    cluster_idx, cluster, cluster_embeddings, cluster_head_embedding, threshold
):
    cos_scores = cos_sim(cluster_head_embedding, cluster_embeddings)
    sorted_vals, indices = torch.sort(cos_scores[0], descending=True)
    bigger_than_threshold = sorted_vals > threshold
    indices = indices[bigger_than_threshold]
    sorted_vals = sorted_vals.numpy()
    return cluster_idx, [(cluster[i][0], sorted_vals[i]) for i in indices]


def get_ids(cluster):
    return [transaction[0] for transaction in cluster]


def reorder_and_filter_clusters(clusters, embeddings, threshold, parallel):
    results = parallel(
        delayed(reorder_and_filter_cluster)(
            cluster_idx,
            cluster,
            get_embeddings(get_ids(cluster), embeddings),
            get_embeddings([cluster_idx], embeddings),
            threshold,
        )
        for cluster_idx, cluster in tqdm(clusters.items())
    )

    clusters = {k: v for k, v in results}

    return clusters


def get_embeddings(ids, embeddings):
    return np.array([embeddings[idx] for idx in ids])


def get_clustured_ids(clusters):
    clustered_ids = set(
        [transaction[0] for cluster in clusters.values() for transaction in cluster]
    )
    clustered_ids |= set(clusters.keys())
    return clustered_ids


def get_clusters_ids(clusters):
    return list(clusters.keys())


def get_unclustured_ids(ids, clusters):
    clustered_ids = get_clustured_ids(clusters)
    unclustered_ids = list(set(ids) - clustered_ids)
    return unclustered_ids


def sort_clusters(clusters):
    return dict(
        sorted(clusters.items(), key=lambda x: len(x[1]), reverse=True)
    )  # sort based on size


def sort_cluster(cluster):
    return list(
        sorted(cluster, key=lambda x: x[1], reverse=True)
    )  # sort based on similarity


def filter_clusters(clusters, min_cluster_size):
    return {k: v for k, v in clusters.items() if len(v) >= min_cluster_size}


def unique(collection):
    return list(dict.fromkeys(collection))


def unique_txs(collection):
    seen = set()
    return [x for x in collection if not (x[0] in seen or seen.add(x[0]))]



def chunk(txs, chunk_size):
    n = math.ceil(len(txs) / chunk_size)
    k, m = divmod(len(txs), n)
    return (txs[i * k + min(i, m) : (i + 1) * k + min(i + 1, m)] for i in range(n))



def online_community_detection(
    ids,
    embeddings,
    clusters=None,
    threshold=0.7,
    min_cluster_size=3,
    chunk_size=2500,
    iterations=10,
    cores=1,
):
    if clusters is None:
        clusters = {}

    with Parallel(n_jobs=cores) as parallel:
        for iteration in range(iterations):
            print("1. Nearest cluster")
            unclustered_ids = get_unclustured_ids(ids, clusters)
            cluster_ids = list(clusters.keys())
            print("Unclustured", len(unclustered_ids))
            print("Clusters", len(cluster_ids))
            clusters = nearest_cluster(
                unclustered_ids,
                embeddings,
                clusters,
                chunk_size=chunk_size,
                parallel=parallel,
            )
            print("\n\n")

            print("2. Create new clusters")
            unclustered_ids = get_unclustured_ids(ids, clusters)
            print("Unclustured", len(unclustered_ids))
            new_clusters = create_clusters(
                unclustered_ids,
                embeddings,
                clusters={},
                min_cluster_size=3,
                chunk_size=chunk_size,
                threshold=threshold,
                parallel=parallel,
            )
            new_cluster_ids = list(new_clusters.keys())
            print("\n\n")

            print("3. Merge new clusters", len(new_cluster_ids))
            max_clusters_size = 25000
            while True:
                new_cluster_ids = list(new_clusters.keys())
                old_new_cluster_ids = new_cluster_ids
                new_clusters = create_clusters(
                    new_cluster_ids,
                    embeddings,
                    new_clusters,
                    min_cluster_size=1,
                    chunk_size=max_clusters_size,
                    threshold=threshold,
                    parallel=parallel,
                )
                new_clusters = filter_clusters(new_clusters, 2)

                new_cluster_ids = list(new_clusters.keys())
                print("New merged clusters", len(new_cluster_ids))
                if len(old_new_cluster_ids) < max_clusters_size:
                    break

            new_clusters = filter_clusters(new_clusters, min_cluster_size)
            print(
                f"New clusters with min community size >= {min_cluster_size}",
                len(new_clusters),
            )
            clusters = {**new_clusters, **clusters}
            print("Total clusters", len(clusters))
            clusters = sort_clusters(clusters)
            print("\n\n")

            print("4. Nearest cluster")
            unclustered_ids = get_unclustured_ids(ids, clusters)
            cluster_ids = list(clusters.keys())
            print("Unclustured", len(unclustered_ids))
            print("Clusters", len(cluster_ids))
            clusters = nearest_cluster(
                unclustered_ids,
                embeddings,
                clusters,
                chunk_size=chunk_size,
                parallel=parallel,
            )
            clusters = sort_clusters(clusters)

            unclustered_ids = get_unclustured_ids(ids, clusters)
            clustured_ids = get_clustured_ids(clusters)
            print("Clustured", len(clustured_ids))
            print("Unclustured", len(unclustered_ids))
            print(
                f"Percentage clustured {len(clustured_ids) / (len(clustured_ids) + len(unclustered_ids)) * 100:.2f}%"
            )

            print("\n\n")
    return clusters


def get_ids(cluster):
    return [transaction[0] for transaction in cluster]


def nearest_cluster_chunk(
    chunk_ids, chunk_embeddings, cluster_ids, cluster_embeddings, threshold
):
    cos_scores = cos_sim(chunk_embeddings, cluster_embeddings)
    top_val_large, top_idx_large = cos_scores.topk(k=1, largest=True)
    top_idx_large = top_idx_large[:, 0].tolist()
    top_val_large = top_val_large[:, 0].tolist()
    cluster_assignment = []
    for i, (score, idx) in enumerate(zip(top_val_large, top_idx_large)):
        cluster_id = cluster_ids[idx]
        if score < threshold:
            cluster_id = None
        cluster_assignment.append(((chunk_ids[i], score), cluster_id))
    return cluster_assignment


def nearest_cluster(
    transaction_ids,
    embeddings,
    clusters=None,
    parallel=None,
    threshold=0.75,
    chunk_size=2500,
):
    cluster_ids = list(clusters.keys())
    if len(cluster_ids) == 0:
        return clusters
    cluster_embeddings = get_embeddings(cluster_ids, embeddings)

    c = list(chunk(transaction_ids, chunk_size))

    with log_durations(logging.info, "Parallel jobs nearest cluster"):
        out = parallel(
            delayed(nearest_cluster_chunk)(
                chunk_ids,
                get_embeddings(chunk_ids, embeddings),
                cluster_ids,
                cluster_embeddings,
                threshold,
            )
            for chunk_ids in tqdm(c)
        )
        cluster_assignment = [assignment for sublist in out for assignment in sublist]

    for (transaction_id, similarity), cluster_id in cluster_assignment:
        if cluster_id is None:
            continue
        clusters[cluster_id].append(
            (transaction_id, similarity)
        )  # TODO sort in right order

    clusters = {
        cluster_id: unique_txs(sort_cluster(cluster))
        for cluster_id, cluster in clusters.items()
    }  # Sort based on similarity

    return clusters


def create_clusters(
    ids,
    embeddings,
    clusters=None,
    parallel=None,
    min_cluster_size=3,
    threshold=0.75,
    chunk_size=2500,
):
    to_cluster_ids = np.array(ids)
    np.random.shuffle(
        to_cluster_ids
    )  # TODO evaluate performance without, try sorted list

    c = list(chunk(to_cluster_ids, chunk_size))

    with log_durations(logging.info, "Parallel jobs create clusters"):
        out = parallel(
            delayed(fast_clustering)(
                chunk_ids,
                get_embeddings(chunk_ids, embeddings),
                threshold,
                min_cluster_size,
            )
            for chunk_ids in tqdm(c)
        )

    # Combine output
    new_clusters = {}
    for out_clusters in out:
        for idx, cluster in out_clusters.items():
            # new_clusters[idx] = unique([(idx, 1)] + new_clusters.get(idx, []) + cluster)
            new_clusters[idx] = unique_txs(cluster + new_clusters.get(idx, []))

    # Add ids from old cluster to new cluster
    for cluster_idx, cluster in new_clusters.items():
        community_extended = []
        for (idx, similarity) in cluster:
            community_extended += [(idx, similarity)] + clusters.get(idx, [])
        new_clusters[cluster_idx] = unique_txs(community_extended)

    new_clusters = reorder_and_filter_clusters(
        new_clusters, embeddings, threshold, parallel
    )  # filter to keep only the relevant
    new_clusters = sort_clusters(new_clusters)

    clustered_ids = set()
    for idx, cluster_ids in new_clusters.items():
        filtered = set(cluster_ids) - clustered_ids
        cluster_ids = [
            cluster_idx for cluster_idx in cluster_ids if cluster_idx in filtered
        ]
        new_clusters[idx] = cluster_ids
        clustered_ids |= set(cluster_ids)

    new_clusters = filter_clusters(new_clusters, min_cluster_size)
    new_clusters = sort_clusters(new_clusters)
    return new_clusters


def fast_clustering(ids, embeddings, threshold=0.70, min_cluster_size=10):
    """
    Function for Fast Clustering

    Finds in the embeddings all communities, i.e. embeddings that are close (closer than threshold).
    """

    # Compute cosine similarity scores
    cos_scores = cos_sim(embeddings, embeddings)

    # Step 1) Create clusters where similarity is bigger than threshold
    bigger_than_threshold = cos_scores >= threshold
    indices = bigger_than_threshold.nonzero()

    cos_scores = cos_scores.numpy()

    extracted_clusters = defaultdict(lambda: [])
    for row, col in indices.tolist():
        extracted_clusters[ids[row]].append((ids[col], cos_scores[row, col]))

    extracted_clusters = sort_clusters(extracted_clusters)  # FIXME

    # Step 2) Remove overlapping clusters
    unique_clusters = {}
    extracted_ids = set()

    for cluster_id, cluster in extracted_clusters.items():
        add_cluster = True
        for transaction in cluster:
            if transaction[0] in extracted_ids:
                add_cluster = False
                break

        if add_cluster:
            unique_clusters[cluster_id] = cluster
            for transaction in cluster:
                extracted_ids.add(transaction[0])

    new_clusters = {}
    for cluster_id, cluster in unique_clusters.items():
        community_extended = []
        for idx in cluster:
            community_extended.append(idx)
        new_clusters[cluster_id] = unique_txs(community_extended)

    new_clusters = filter_clusters(new_clusters, min_cluster_size)

    return new_clusters


In [5]:
ids = df.products_id

In [6]:
embeddings = embed_data(df, 'products_and_services', cores=1)
embeddings = {idx: embedding for idx, embedding in zip(ids, embeddings)}

Embedding data
Model loaded
Unique sentences 31546


Batches:   0%|          | 0/247 [00:00<?, ?it/s]

Embeddings computed


In [7]:
clusters = {}

In [8]:
clusters = online_community_detection(ids, embeddings, clusters, chunk_size=5000)

1. Nearest cluster
Unclustured 31546
Clusters 0



2. Create new clusters
Unclustured 31546


100%|██████████| 7/7 [00:01<00:00,  5.74it/s]
100%|██████████| 1845/1845 [00:00<00:00, 2842.66it/s]





3. Merge new clusters 1845


100%|██████████| 1/1 [00:00<00:00, 20.41it/s]
100%|██████████| 791/791 [00:00<00:00, 2601.81it/s]


New merged clusters 791
New clusters with min community size >= 3 791
Total clusters 791



4. Nearest cluster
Unclustured 25588
Clusters 791


100%|██████████| 6/6 [00:00<00:00, 20.27it/s]


Clustured 8102
Unclustured 23444
Percentage clustured 25.68%



1. Nearest cluster
Unclustured 23444
Clusters 791


100%|██████████| 5/5 [00:00<00:00, 18.73it/s]





2. Create new clusters
Unclustured 23444


100%|██████████| 5/5 [00:00<00:00,  5.21it/s]
100%|██████████| 943/943 [00:00<00:00, 3207.27it/s]





3. Merge new clusters 943


100%|██████████| 1/1 [00:00<00:00, 55.56it/s]
100%|██████████| 664/664 [00:00<00:00, 2801.53it/s]


New merged clusters 664
New clusters with min community size >= 3 664
Total clusters 1455



4. Nearest cluster
Unclustured 20393
Clusters 1455


100%|██████████| 5/5 [00:00<00:00, 22.03it/s]


Clustured 12079
Unclustured 19467
Percentage clustured 38.29%



1. Nearest cluster
Unclustured 19467
Clusters 1455


100%|██████████| 4/4 [00:00<00:00, 16.61it/s]





2. Create new clusters
Unclustured 19467


100%|██████████| 4/4 [00:00<00:00,  5.92it/s]
100%|██████████| 552/552 [00:00<00:00, 3305.20it/s]





3. Merge new clusters 552


100%|██████████| 1/1 [00:00<00:00, 100.00it/s]
100%|██████████| 505/505 [00:00<00:00, 3175.92it/s]


New merged clusters 505
New clusters with min community size >= 3 505
Total clusters 1960



4. Nearest cluster
Unclustured 17669
Clusters 1960


100%|██████████| 4/4 [00:00<00:00, 14.87it/s]


Clustured 14306
Unclustured 17240
Percentage clustured 45.35%



1. Nearest cluster
Unclustured 17240
Clusters 1960


100%|██████████| 4/4 [00:00<00:00, 16.13it/s]





2. Create new clusters
Unclustured 17240


100%|██████████| 4/4 [00:00<00:00,  5.46it/s]
100%|██████████| 250/250 [00:00<00:00, 3246.64it/s]





3. Merge new clusters 250


100%|██████████| 1/1 [00:00<00:00, 166.56it/s]
100%|██████████| 247/247 [00:00<00:00, 3207.56it/s]


New merged clusters 247
New clusters with min community size >= 3 247
Total clusters 2207



4. Nearest cluster
Unclustured 16453
Clusters 2207


100%|██████████| 4/4 [00:00<00:00, 14.39it/s]


Clustured 15240
Unclustured 16306
Percentage clustured 48.31%



1. Nearest cluster
Unclustured 16306
Clusters 2207


100%|██████████| 4/4 [00:00<00:00, 15.04it/s]





2. Create new clusters
Unclustured 16306


100%|██████████| 4/4 [00:00<00:00,  7.71it/s]
100%|██████████| 170/170 [00:00<00:00, 3207.42it/s]





3. Merge new clusters 170


100%|██████████| 1/1 [00:00<00:00, 250.15it/s]
100%|██████████| 170/170 [00:00<00:00, 2881.17it/s]


New merged clusters 170
New clusters with min community size >= 3 170
Total clusters 2377



4. Nearest cluster
Unclustured 15773
Clusters 2377


100%|██████████| 4/4 [00:00<00:00,  9.88it/s]


Clustured 15845
Unclustured 15701
Percentage clustured 50.23%



1. Nearest cluster
Unclustured 15701
Clusters 2377


100%|██████████| 4/4 [00:00<00:00, 14.70it/s]





2. Create new clusters
Unclustured 15701


100%|██████████| 4/4 [00:00<00:00,  8.16it/s]
100%|██████████| 157/157 [00:00<00:00, 3018.99it/s]





3. Merge new clusters 157


100%|██████████| 1/1 [00:00<00:00, 250.11it/s]
100%|██████████| 157/157 [00:00<00:00, 2343.18it/s]


New merged clusters 157
New clusters with min community size >= 3 157
Total clusters 2534



4. Nearest cluster
Unclustured 15220
Clusters 2534


100%|██████████| 4/4 [00:00<00:00, 14.65it/s]


Clustured 16370
Unclustured 15176
Percentage clustured 51.89%



1. Nearest cluster
Unclustured 15176
Clusters 2534


100%|██████████| 4/4 [00:00<00:00, 14.60it/s]





2. Create new clusters
Unclustured 15176


100%|██████████| 4/4 [00:00<00:00,  6.98it/s]
100%|██████████| 96/96 [00:00<00:00, 3096.33it/s]





3. Merge new clusters 96


100%|██████████| 1/1 [00:00<00:00, 124.99it/s]
100%|██████████| 96/96 [00:00<00:00, 2823.16it/s]


New merged clusters 96
New clusters with min community size >= 3 96
Total clusters 2630



4. Nearest cluster
Unclustured 14881
Clusters 2630


100%|██████████| 3/3 [00:00<00:00, 11.63it/s]


Clustured 16703
Unclustured 14843
Percentage clustured 52.95%



1. Nearest cluster
Unclustured 14843
Clusters 2630


100%|██████████| 3/3 [00:00<00:00, 12.00it/s]





2. Create new clusters
Unclustured 14843


100%|██████████| 3/3 [00:00<00:00,  5.67it/s]
100%|██████████| 146/146 [00:00<00:00, 2979.50it/s]





3. Merge new clusters 146


100%|██████████| 1/1 [00:00<00:00, 333.15it/s]
100%|██████████| 145/145 [00:00<00:00, 2958.97it/s]


New merged clusters 145
New clusters with min community size >= 3 145
Total clusters 2775



4. Nearest cluster
Unclustured 14395
Clusters 2775


100%|██████████| 3/3 [00:00<00:00, 11.72it/s]


Clustured 17185
Unclustured 14361
Percentage clustured 54.48%



1. Nearest cluster
Unclustured 14361
Clusters 2775


100%|██████████| 3/3 [00:00<00:00,  8.47it/s]





2. Create new clusters
Unclustured 14361


100%|██████████| 3/3 [00:00<00:00,  4.74it/s]
100%|██████████| 83/83 [00:00<00:00, 3073.73it/s]





3. Merge new clusters 83


100%|██████████| 1/1 [00:00<00:00, 166.70it/s]
100%|██████████| 83/83 [00:00<00:00, 3073.73it/s]


New merged clusters 83
New clusters with min community size >= 3 83
Total clusters 2858



4. Nearest cluster
Unclustured 14107
Clusters 2858


100%|██████████| 3/3 [00:00<00:00, 11.45it/s]


Clustured 17454
Unclustured 14092
Percentage clustured 55.33%



1. Nearest cluster
Unclustured 14092
Clusters 2858


100%|██████████| 3/3 [00:00<00:00, 10.56it/s]





2. Create new clusters
Unclustured 14092


100%|██████████| 3/3 [00:00<00:00,  6.52it/s]
100%|██████████| 84/84 [00:00<00:00, 3111.01it/s]





3. Merge new clusters 84


100%|██████████| 1/1 [00:00<00:00, 333.46it/s]
100%|██████████| 84/84 [00:00<00:00, 2624.88it/s]


New merged clusters 84
New clusters with min community size >= 3 84
Total clusters 2942



4. Nearest cluster
Unclustured 13836
Clusters 2942


100%|██████████| 3/3 [00:00<00:00, 10.91it/s]

Clustured 17719
Unclustured 13827
Percentage clustured 56.17%








In [9]:
for i, cluster in enumerate(clusters):
    for product_id in clusters[cluster]:
        df.loc[df["products_id"].str.contains(product_id[0]),"cluster_id"] = int(i)+1

In [10]:
indices = df[df['cluster_id'].notnull()].products_id

In [11]:
silhouette_score([embeddings[x] for x in indices.iloc[:]],df.loc[df['cluster_id'].notnull(), 'cluster_id'])

0.1166907

In [13]:
complete_df

Unnamed: 0,products_id,clustered_id_x,products_and_services,clustered_id_y,cleaned_text,predicted_cluster
0,0184628897818527ff8610ee5e277c042d54bd78,0184628897818527ff8610ee5e277c042d54bd78,magnetic platens,0184628897818527ff8610ee5e277c042d54bd78,magnetic platen,2120.0
1,b5da318747baea08381c0f76f43061621540c5f8,b5da318747baea08381c0f76f43061621540c5f8,platens,b5da318747baea08381c0f76f43061621540c5f8,platen,2120.0
2,319a82778e111a5231977d8c56dcaeff867a2f8f,d5c69d047908660c5b1360d5f22c3a87d410ded6,data loggers,d5c69d047908660c5b1360d5f22c3a87d410ded6,data logger,2145.0
3,92fb45ad67b286563d7630ea1b20776c1b136046,d5c69d047908660c5b1360d5f22c3a87d410ded6,data-loggers,d5c69d047908660c5b1360d5f22c3a87d410ded6,dataloggers,2145.0
4,e029962abf3d76f83e3c80e11db5699136e09854,e029962abf3d76f83e3c80e11db5699136e09854,data logger recorders,e029962abf3d76f83e3c80e11db5699136e09854,data logger recorder,2145.0
...,...,...,...,...,...,...
17714,71576cb159bee5ed36bdec2f240e74b75f60b18e,da626fbdee5dc2c14e1f294fde700571dfb78c0b,"air-conditioning systems, vehicles",da626fbdee5dc2c14e1f294fde700571dfb78c0b,airconditioning system vehicle,42.0
17715,c5d893093bc7070c63c19110a4fc0e4a265031ca,c5d893093bc7070c63c19110a4fc0e4a265031ca,air conditioning systems for building,c5d893093bc7070c63c19110a4fc0e4a265031ca,air conditioning system building,42.0
17716,8dd049875009bb697ad73f715bfa881b5b664bcc,c5d893093bc7070c63c19110a4fc0e4a265031ca,air conditioning and ventilation systems for b...,c5d893093bc7070c63c19110a4fc0e4a265031ca,air conditioning ventilation system building,42.0
17717,a7bada8d0c2c93f0e80c8ab2226ba7394ff45ff8,38c28b8fbe87a29272225385b1f2b0295e8288fc,cnc turned plastic parts,38c28b8fbe87a29272225385b1f2b0295e8288fc,cnc turned plastic part,1115.0


In [20]:
x = df[df['cluster_id'].notnull()]

In [21]:
# get rows where cluster_id value is equal to 1
x[x['cluster_id'] == 1]

Unnamed: 0,products_id,products_and_services,clustered_id,cleaned_text,cluster_id
94,15ea1aa6b5579a4524c4e1b6fc255993ba43336c,welding work - steels and metal,21e4006d4ebaddc61e70dc1108bd4a82bfdb4cd1,welding work steel metal,1.0
359,20ffa4c9bae74888ad3c7929e7037b1c9ec204ca,robotised welding,20ffa4c9bae74888ad3c7929e7037b1c9ec204ca,robotised welding,1.0
465,055453216d6334f7018a1b6302a23bca4e682203,"welding, plastics - machinery",9268d0b2eeaa712fbcf9304a3c1bca5213428a4d,welding plastic machinery,1.0
1462,e1f0f79e4d4076a66868349d634cc5cf69bdd1d7,mechanized welding,e1f0f79e4d4076a66868349d634cc5cf69bdd1d7,mechanized welding,1.0
1705,b06f16e1a0318eaf4bfe21901587c20badf74ddd,iron welding,b06f16e1a0318eaf4bfe21901587c20badf74ddd,iron welding,1.0
...,...,...,...,...,...
29359,37db9e2fbc522a86217e46e86bdbfc7d5966d198,gas welding - equipment and supplies,0d71ff724ce0306736dc85f0b844814a6dd11ad0,gas welding equipment supply,1.0
29360,282383bdb9038a6780c0c2a3911c02e427c1b0fb,welding gases,282383bdb9038a6780c0c2a3911c02e427c1b0fb,welding gas,1.0
29821,5c7cbd9e41a08cd75001c7906d9c1f416cdf8ef2,supplies for welding,bb3c3fc23a71b902a0e891183f9126862cea49e9,supply welding,1.0
30480,1434bc30c8413149af64043dc803ff3e703d86a4,welding plastic materials,1434bc30c8413149af64043dc803ff3e703d86a4,welding plastic material,1.0


In [15]:
complete_df = df[df['cluster_id'].notnull()].rename(columns={"clustered_id": "truth_cluster", "cluster_id": "predicted_cluster"})
ari = np.round(adjusted_rand_score(complete_df["truth_cluster"].values, complete_df["predicted_cluster"].values), 3)
nmi = np.round(normalized_mutual_info_score(complete_df["truth_cluster"].values, complete_df["predicted_cluster"].values), 3)

In [1]:
ari

NameError: name 'ari' is not defined