In [10]:
import pandas as pd
import json


input_file_path = r"D:\Sharif University of Tech\Data\Library Recommender\Pypi data\OriginalItems.json"
with open(input_file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)


data_df = pd.DataFrame(data)

In [11]:
import pandas as pd
import numpy as np
import re
from sentence_transformers import SentenceTransformer
import faiss
import hdbscan
from sklearn.cluster import KMeans
import torch


def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    return text

In [12]:
models = [
    SentenceTransformer('all-MiniLM-L6-v2'),
    SentenceTransformer('paraphrase-MiniLM-L6-v2'),
    SentenceTransformer('distilbert-base-nli-mean-tokens')
]

def generate_ensemble_embeddings(texts):
    all_embeddings = []
    for model in models:
        embeddings = model.encode(texts, show_progress_bar=False, convert_to_tensor=True)
        all_embeddings.append(embeddings)
    concatenated_embeddings = torch.cat(all_embeddings, dim=1)
    return concatenated_embeddings.cpu().numpy()

.gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.02k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/550 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/265M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/450 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [14]:
data_df['text'] = data_df['Summary'].str[0] + " " + data_df['Description'].str[0]
data_df['text'] = data_df['text'].fillna('')

In [15]:
data_df['text'] = data_df['text'].apply(preprocess_text)
ensemble_embeddings = generate_ensemble_embeddings(data_df['text'].tolist())

Lets create a clustring system

In [16]:
hdbscan_clusterer = hdbscan.HDBSCAN(min_cluster_size=5, gen_min_span_tree=True)
hdbscan_labels = hdbscan_clusterer.fit_predict(ensemble_embeddings)
data_df['hdbscan_cluster'] = hdbscan_labels

def refine_clusters_with_kmeans(embeddings, hdbscan_labels, n_subclusters=5):
    unique_clusters = set(hdbscan_labels) - {-1}
    refined_labels = np.array(hdbscan_labels)

    for cluster_id in unique_clusters:
        mask = hdbscan_labels == cluster_id
        cluster_embeddings = embeddings[mask]

        kmeans = KMeans(n_clusters=n_subclusters, random_state=42)
        kmeans_labels = kmeans.fit_predict(cluster_embeddings)

        refined_labels[mask] = kmeans_labels + cluster_id * n_subclusters

    return refined_labels

In [17]:
refined_labels = refine_clusters_with_kmeans(ensemble_embeddings, hdbscan_labels)
data_df['refined_cluster'] = refined_labels

Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


In [18]:
dimension = ensemble_embeddings.shape[1]
faiss_indexes = {}

for cluster_id in set(refined_labels):
    cluster_mask = refined_labels == cluster_id
    cluster_embeddings = ensemble_embeddings[cluster_mask].astype('float32')

    index = faiss.IndexFlatL2(dimension)
    index.add(cluster_embeddings)
    faiss_indexes[cluster_id] = index

In [23]:
from sklearn.metrics.pairwise import cosine_similarity

def find_nearest_cluster(query_embedding, data_df, embeddings):
    similarities = cosine_similarity(query_embedding.reshape(1, -1), embeddings)
    nearest_index = np.argmax(similarities)
    return data_df.iloc[nearest_index]['refined_cluster']

def semantic_search_refined_cluster(query, data_df, top_n=5):
    query_processed = preprocess_text(query)
    query_embedding = np.hstack([
        model.encode([query_processed], convert_to_tensor=True).cpu().numpy()
        for model in models
    ]).astype('float32')

    refined_cluster_id = find_nearest_cluster(query_embedding, data_df, ensemble_embeddings)

    if refined_cluster_id in faiss_indexes:
        index = faiss_indexes[refined_cluster_id]
        _, top_n_indices = index.search(query_embedding.reshape(1, -1), top_n)

        cluster_mask = data_df['refined_cluster'] == refined_cluster_id
        cluster_libraries = data_df[cluster_mask].iloc[top_n_indices[0]]
        return cluster_libraries[['Package', 'Summary', 'Description']]
    else:
        return pd.DataFrame()


In [25]:
query = "data visualization library"
similar_libraries = semantic_search_refined_cluster(query, data_df)
print(similar_libraries)

                     Package  \
5571  [KraitUtilities 1.0.0]   
5845         [dbspace 0.3.1]   
7237  [dewi-dataclass 1.0.0]   
8560      [geoai-gdal 3.4.3]   
7352        [sample-lib 0.1]   

                                                Summary Description  
5571  [A package for data preprocessing and visualiz...          []  
5845  [Library for data-congruent, model-centric DBS...          []  
7237  [DEWI DataClass: a (mutable) data class / conf...          []  
8560        [GDAL: Geospatial Data Abstraction Library]          []  
7352  [A sample library that illustrates the usage o...          []  
