In [9]:
import pandas as pd
import json


input_file_path = r"D:\Sharif University of Tech\Data\Library Recommender\Pypi data\OriginalItems.json"
with open(input_file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)


data_df = pd.DataFrame(data)

In [10]:
data_df['text'] = data_df['Summary'].str[0] + " " + data_df['Description'].str[0]
data_df['text'] = data_df['text'].fillna('')

# Embedding Generation with RoBERTa

In [11]:
from transformers import RobertaModel, RobertaTokenizer
import torch


model_name = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaModel.from_pretrained(model_name)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
model.eval()

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropou

recommended to use GPU if possible

In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropou

The pre processing :

In [12]:
import re

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

and the embedding itself :

In [13]:
def generate_embeddings(texts, tokenizer, model, device):
    encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt', max_length=512)
    encoded_input = encoded_input.to(device)

    with torch.no_grad():
        model_output = model(**encoded_input)

    embeddings = model_output.last_hidden_state
    attention_mask = encoded_input['attention_mask']
    mask_expanded = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
    sum_embeddings = torch.sum(embeddings * mask_expanded, 1)
    sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9)
    mean_pooled_embeddings = sum_embeddings / sum_mask

    return mean_pooled_embeddings.cpu().numpy()


In [14]:
preprocessed_descriptions = [preprocess_text(desc) for desc in data_df['text']]
embeddings = generate_embeddings(preprocessed_descriptions, tokenizer, model, device)

RuntimeError: [enforce fail at alloc_cpu.cpp:114] data. DefaultCPUAllocator: not enough memory: you tried to allocate 15252062208 bytes.

# Clustering and FAISS Index Creation

lets start with a HDBSCAN

In [None]:
import hdbscan


clusterer_hdbscan = hdbscan.HDBSCAN(min_cluster_size=5, prediction_data=True)
hdbscan_labels = clusterer_hdbscan.fit_predict(embeddings)
data_df['hdbscan_cluster'] = hdbscan_labels

and now refine with K-means

In [None]:
from sklearn.cluster import KMeans
import numpy as np

def refine_clusters_with_kmeans(data, embeddings, n_clusters=5):
    refined_labels = np.copy(data['hdbscan_cluster'])
    current_label_max = refined_labels.max() + 1

    for cluster_id in set(data['hdbscan_cluster']):
        if cluster_id == -1:
            continue
        cluster_mask = (data['hdbscan_cluster'] == cluster_id)
        cluster_embeddings = embeddings[cluster_mask]

        kmeans = KMeans(n_clusters=n_clusters, random_state=42)
        kmeans_labels = kmeans.fit_predict(cluster_embeddings)

        refined_labels[cluster_mask] = kmeans_labels + current_label_max
        current_label_max += n_clusters

    return refined_labels

data_df['refined_cluster'] = refine_clusters_with_kmeans(data_df, embeddings, n_clusters=5)

Creating FAISS Indices for Refined Clusters

In [14]:
import faiss


dimension = embeddings.shape[1]
faiss_indices = {}

for cluster_id in set(data_df['refined_cluster']):
    cluster_embeddings = embeddings[data_df['refined_cluster'] == cluster_id].astype('float32')

    cluster_index = faiss.IndexFlatL2(dimension)
    cluster_index.add(cluster_embeddings)

    faiss_indices[cluster_id] = cluster_index

ModuleNotFoundError: No module named 'faiss'

# Enhanced Semantic Search

In [None]:
def generate_query_embedding(query, tokenizer, model, device):
    query_processed = preprocess_text(query)
    encoded_input = tokenizer(query_processed, return_tensors='pt', padding=True, truncation=True, max_length=512)
    encoded_input = encoded_input.to(device)

    with torch.no_grad():
        outputs = model(**encoded_input)

    embeddings = outputs.last_hidden_state
    attention_mask = encoded_input['attention_mask']
    mask_expanded = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
    sum_embeddings = torch.sum(embeddings * mask_expanded, 1)
    sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9)
    mean_pooled_embedding = sum_embeddings / sum_mask

    return mean_pooled_embedding.cpu().numpy()

In [None]:
def search_similar_packages(query_embedding, faiss_indices, data_df, top_n=10):
    distances, indices = faiss_indices.search(query_embedding, top_n)
    similar_packages = data_df.iloc[indices[0]]
    return similar_packages[['Package', 'Summary', 'Description']]


Now lets try it out :

In [None]:
query = "Fast numerical computations library"

In [None]:
query_embedding = generate_query_embedding(query, tokenizer, model, device)

In [None]:
faiss_index_global = faiss.IndexFlatL2(embeddings.shape[1])
faiss_index_global.add(embeddings.astype('float32'))

In [None]:
similar_packages = search_similar_packages(query_embedding, faiss_index_global, data_df, top_n=5)
print(similar_packages)