In [3]:
import os
import pickle
import numpy as np
from sklearn.cluster import KMeans
import csv

def load_embeddings(file_path):
    with open(file_path, 'rb') as f:
        return pickle.load(f)

def apply_kmeans_clustering(folder_path, optimal_clusters_csv, output_txt):
    # Read the optimal number of clusters for each art_id
    optimal_clusters = {}
    with open(optimal_clusters_csv, 'r') as f:
        reader = csv.reader(f)
        next(reader)  # Skip header
        for row in reader:
            filename, n_clusters = row
            art_id = filename.split('_')[1].split('.')[0]
            optimal_clusters[art_id] = int(n_clusters)

    with open(output_txt, 'w') as outfile:
        for file_name in os.listdir(folder_path):
            if file_name.endswith('.pkl'):
                art_id = file_name.split('_')[1].split('.')[0]

                if art_id not in optimal_clusters:
                    print(f"Skipping {art_id}: No optimal cluster information")
                    continue

                file_path = os.path.join(folder_path, file_name)
                print(f"Processing {file_name}...")
                embeddings_dict = load_embeddings(file_path)
                phrases, embeddings_list = zip(*embeddings_dict.items())
                embeddings_array = np.array(embeddings_list)

                n_clusters = optimal_clusters[art_id]
                
                if n_clusters == 1:
                    # If only one cluster, just use the phrase closest to the mean
                    mean_embedding = np.mean(embeddings_array, axis=0)
                    distances = np.linalg.norm(embeddings_array - mean_embedding, axis=1)
                    closest_idx = np.argmin(distances)
                    core_keyphrases = [phrases[closest_idx]]
                else:
                    if embeddings_array.shape[0] < n_clusters:
                        print(f"Not enough embeddings for {n_clusters} clusters. Using {embeddings_array.shape[0]} clusters instead.")
                        n_clusters = embeddings_array.shape[0]

                    kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(embeddings_array)

                    # Find the keyphrase closest to each centroid
                    core_keyphrases = []
                    for centroid in kmeans.cluster_centers_:
                        distances = np.linalg.norm(embeddings_array - centroid, axis=1)
                        closest_idx = np.argmin(distances)
                        core_keyphrases.append(phrases[closest_idx])

                # Write results to the output file
                outfile.write(f"embeddings_{art_id}.csv.pkl: {', '.join(core_keyphrases)}\n")

# Set the paths
embeddings_folder = '../../embeddings'
optimal_clusters_csv = '../optimal_n_clusters.csv'
output_txt = 'keyphrases_by_product_kmeans.txt'

# Run the clustering and output results
apply_kmeans_clustering(embeddings_folder, optimal_clusters_csv, output_txt)

Processing embeddings_89386107.csv.pkl...
Processing embeddings_39442672.csv.pkl...
Processing embeddings_70538846.csv.pkl...
Processing embeddings_415599.csv.pkl...


  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


Processing embeddings_29481683.csv.pkl...
Processing embeddings_79552913.csv.pkl...
Processing embeddings_29442663.csv.pkl...
Processing embeddings_50324511.csv.pkl...
Processing embeddings_70301542.csv.pkl...


  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


Processing embeddings_40501892.csv.pkl...


  super()._check_params_vs_input(X, default_n_init=10)


Processing embeddings_39329384.csv.pkl...
Processing embeddings_80401292.csv.pkl...
Processing embeddings_39442686.csv.pkl...
Processing embeddings_99017445.csv.pkl...
Processing embeddings_20454504.csv.pkl...


  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


Processing embeddings_90495596.csv.pkl...
Processing embeddings_501889.csv.pkl...
Processing embeddings_49294529.csv.pkl...
Processing embeddings_50334591.csv.pkl...


  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


Processing embeddings_79278250.csv.pkl...
Processing embeddings_60278170.csv.pkl...
Processing embeddings_70339291.csv.pkl...


  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


Processing embeddings_79294523.csv.pkl...
Processing embeddings_89278259.csv.pkl...
Processing embeddings_9442640.csv.pkl...
Processing embeddings_69336359.csv.pkl...
Processing embeddings_49285817.csv.pkl...


  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


Processing embeddings_90510865.csv.pkl...
Processing embeddings_40415601.csv.pkl...
Processing embeddings_49481658.csv.pkl...
Processing embeddings_40346924.csv.pkl...


  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


Processing embeddings_59902658.csv.pkl...
Processing embeddings_29278304.csv.pkl...
Processing embeddings_59425916.csv.pkl...
Processing embeddings_19490273.csv.pkl...


  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


Processing embeddings_29481659.csv.pkl...
Processing embeddings_10324513.csv.pkl...
Processing embeddings_89294532.csv.pkl...
Processing embeddings_89442679.csv.pkl...


  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


Processing embeddings_79442694.csv.pkl...
Processing embeddings_79278293.csv.pkl...
Processing embeddings_19442710.csv.pkl...


  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


Processing embeddings_10409932.csv.pkl...


  super()._check_params_vs_input(X, default_n_init=10)


Processing embeddings_275848.csv.pkl...


  super()._check_params_vs_input(X, default_n_init=10)


Processing embeddings_9442720.csv.pkl...
Processing embeddings_69329387.csv.pkl...
Processing embeddings_50339292.csv.pkl...


  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


Processing embeddings_19285833.csv.pkl...
Processing embeddings_39278290.csv.pkl...
Processing embeddings_59442713.csv.pkl...
Processing embeddings_69481681.csv.pkl...


  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


Processing embeddings_19278328.csv.pkl...
Processing embeddings_99294517.csv.pkl...
Processing embeddings_49294548.csv.pkl...


  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


Processing embeddings_79285830.csv.pkl...
Processing embeddings_80428852.csv.pkl...
Processing embeddings_20275814.csv.pkl...


  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


Processing embeddings_39017486.csv.pkl...
Processing embeddings_324518.csv.pkl...


  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


Processing embeddings_20213312.csv.pkl...
Processing embeddings_79294542.csv.pkl...


  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


Processing embeddings_9442621.csv.pkl...
Processing embeddings_80512959.csv.pkl...
Processing embeddings_79521356.csv.pkl...


  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


Processing embeddings_90305803.csv.pkl...
Processing embeddings_40334558.csv.pkl...
Processing embeddings_19931865.csv.pkl...


  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


Processing embeddings_20510864.csv.pkl...
Processing embeddings_60221683.csv.pkl...
Processing embeddings_99442645.csv.pkl...


  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


Processing embeddings_50454507.csv.pkl...
Processing embeddings_89285820.csv.pkl...
Processing embeddings_10345073.csv.pkl...


  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


Processing embeddings_10487816.csv.pkl...
Processing embeddings_40303439.csv.pkl...
Skipping 70309840: No optimal cluster information
Skipping 9189200: No optimal cluster information
Skipping 40314640: No optimal cluster information
Skipping 59481653: No optimal cluster information
Skipping 59189127: No optimal cluster information
Skipping 9442616: No optimal cluster information
Skipping 9442664: No optimal cluster information
Skipping 9481684: No optimal cluster information
Skipping 19442654: No optimal cluster information
Skipping 19278253: No optimal cluster information
Skipping 59278307: No optimal cluster information
Skipping 314642: No optimal cluster information
Skipping 90286649: No optimal cluster information
Skipping 40311873: No optimal cluster information
Skipping 20423720: No optimal cluster information
Skipping 10305741: No optimal cluster information
Skipping 89442655: No optimal cluster information
Skipping 9294526: No optimal cluster information
Skipping 80334603: No o

  super()._check_params_vs_input(X, default_n_init=10)
