In [7]:
import pandas as pd
import numpy as np
import re
from sentence_transformers import SentenceTransformer
import faiss
import hdbscan
from sklearn.cluster import KMeans
import torch
from joblib import dump, load
import psutil
import time

class SemanticSearchModel:
    def __init__(self):
        self.models = [
            SentenceTransformer('all-MiniLM-L6-v2'),
            SentenceTransformer('paraphrase-MiniLM-L6-v2'),
            SentenceTransformer('distilbert-base-nli-mean-tokens')
        ]
        self.hdbscan_clusterer = hdbscan.HDBSCAN(min_cluster_size=5, gen_min_span_tree=True)
        self.faiss_indexes = {}
        self.data_df = None
        self.ensemble_embeddings = None

    def preprocess_text(self, text):
        text = text.lower()
        text = re.sub(r'[^\w\s]', '', text)
        return text

    def generate_ensemble_embeddings(self, texts):
        all_embeddings = []
        for model in self.models:
            embeddings = model.encode(texts, show_progress_bar=False, convert_to_tensor=True)
            all_embeddings.append(embeddings)
        concatenated_embeddings = torch.cat(all_embeddings, dim=1)
        return concatenated_embeddings.cpu().numpy()

    def fit(self, data_df):
        self.data_df = data_df
        self.data_df['text'] = self.data_df['Summary'].str[0] + " " + self.data_df['Description'].str[0]
        self.data_df['text'] = self.data_df['text'].fillna('')
        self.data_df['text'] = self.data_df['text'].apply(self.preprocess_text)
        self.ensemble_embeddings = self.generate_ensemble_embeddings(self.data_df['text'].tolist())
        broad_clusters_labels = self.hdbscan_clusterer.fit_predict(self.ensemble_embeddings)
        self.data_df['broad_cluster'] = broad_clusters_labels
        hdbscan_labels = self.hdbscan_clusterer.fit_predict(self.ensemble_embeddings)
        self.data_df['hdbscan_cluster'] = hdbscan_labels
        refined_labels = self.refine_clusters_with_kmeans(self.ensemble_embeddings, hdbscan_labels)
        self.data_df['refined_cluster'] = refined_labels
        self.build_faiss_indexes(refined_labels)

    def refine_clusters_with_kmeans(self, embeddings, hdbscan_labels, n_subclusters=5):
        unique_clusters = set(hdbscan_labels) - {-1}
        refined_labels = np.array(hdbscan_labels)

        for cluster_id in unique_clusters:
            mask = hdbscan_labels == cluster_id
            cluster_embeddings = embeddings[mask]

            kmeans = KMeans(n_clusters=n_subclusters, random_state=42)
            kmeans_labels = kmeans.fit_predict(cluster_embeddings)

            refined_labels[mask] = kmeans_labels + cluster_id * n_subclusters

        return refined_labels

    def build_faiss_indexes(self, refined_labels):
        dimension = self.ensemble_embeddings.shape[1]
        for cluster_id in set(refined_labels):
            cluster_mask = refined_labels == cluster_id
            cluster_embeddings = self.ensemble_embeddings[cluster_mask].astype('float32')

            index = faiss.IndexFlatL2(dimension)
            index.add(cluster_embeddings)
            self.faiss_indexes[cluster_id] = index

    def semantic_search_refined_cluster(self, query, top_n=5):
        query_processed = self.preprocess_text(query)
        query_embedding = np.hstack([
            model.encode([query_processed], convert_to_tensor=True).cpu().numpy()
            for model in self.models
        ]).astype('float32')

        refined_cluster_id = self.find_nearest_cluster(query_embedding)

        if refined_cluster_id in self.faiss_indexes:
            index = self.faiss_indexes[refined_cluster_id]
            _, top_n_indices = index.search(query_embedding.reshape(1, -1), top_n)

            cluster_mask = self.data_df['refined_cluster'] == refined_cluster_id
            cluster_libraries = self.data_df[cluster_mask].iloc[top_n_indices[0]]
            return cluster_libraries[['Package', 'Summary', 'Description']]
        else:
            return pd.DataFrame()

    def find_nearest_cluster(self, query_embedding):
        from sklearn.metrics.pairwise import cosine_similarity
        similarities = cosine_similarity(query_embedding.reshape(1, -1), self.ensemble_embeddings)
        nearest_index = np.argmax(similarities)
        return self.data_df.iloc[nearest_index]['refined_cluster']

In [8]:
import json

input_file_path = r"D:\Sharif University of Tech\Data\Library Recommender\Pypi data\1\Pypi_data_Feb_19_2024.json"
with open(input_file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

In [9]:
data_df = pd.DataFrame(data)

In [10]:
len(data_df)

396564

In [11]:
import psutil
import time

start_time = time.time()

In [12]:
model = SemanticSearchModel()
model.fit(data_df)

Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_me

In [13]:
dump(model, 'semantic_search_model.joblib')

['semantic_search_model.joblib']

In [14]:
execution_time = time.time() - start_time

cpu_usage = psutil.cpu_percent()
ram_usage = psutil.virtual_memory().percent

with open('metrics.txt', 'w') as f:
    f.write(f'CPU Usage: {cpu_usage}%\n')
    f.write(f'RAM Usage: {ram_usage}%\n')
    f.write(f'Execution Time: {execution_time} seconds\n')

In [15]:
query = "I need a package for dataset management"
similar_libraries = model.semantic_search_refined_cluster(query, 10)
print(similar_libraries)

                               Package  \
86087   django-paranoid-sessions 0.2.0   
174056                lessweb.py 0.1.1   
188322                 mdx_audio 0.1.4   
235649                    OneLib 0.3.4   
247318               pecan-mount 0.0.2   
256069               polyarchiv 0.14.3   
263367          pulsebuildmonitor 0.90   
278489                     pymop 0.2.4   
280705                   pyparis 0.0.1   
287278                    pyTDMS 0.0.3   

                                                  Summary  \
86087   make Django work harder to prevent session-ste...   
174056                           Masking the web.py magic   
188322                   Markdown 2.0 extension for audio   
235649                                 my little tool set   
247318                                   Mount Pecan apps   
256069  Multiple-source backup tool: backup files|MySQ...   
263367         monitor mozilla tinderbox builds via pulse   
278489              Multi-Objective Optimization 