In [1]:
import pandas as pd
import json


input_file_path = r"D:\Sharif University of Tech\Data\Library Recommender\Pypi data\OriginalItems.json"
with open(input_file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)


data_df = pd.DataFrame(data)

In [2]:
data_df['text'] = data_df['Summary'].str[0] + " " + data_df['Description'].str[0]
data_df['text'] = data_df['text'].fillna('')

Lets start Generating Embeddings for Library Descriptions

In [3]:
from sentence_transformers import SentenceTransformer
import numpy as np


model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = np.array(model.encode(data_df['text'].tolist()))
data_df['embeddings'] = list(embeddings)

Semantic Search Function

In [4]:
from sklearn.metrics.pairwise import cosine_similarity

def semantic_search(query, data_df, top_n=5):
    query_embedding = model.encode([query])
    similarities = cosine_similarity(query_embedding, np.array(list(data_df['embeddings'])))
    top_n_indices = similarities[0].argsort()[-top_n:][::-1]
    similar_libraries = data_df.iloc[top_n_indices]

    return similar_libraries[['Package', 'Summary', 'Description']]

Because we have a very large data set, we can use FAISS for Large Scale Efficiency

In [5]:
import faiss


embedding_matrix = np.array(list(data_df['embeddings'])).astype('float32')
index = faiss.IndexFlatL2(embedding_matrix.shape[1])
index.add(embedding_matrix)

In [6]:
def semantic_search_faiss(query, data_df, top_n=5):
    query_embedding = model.encode([query]).astype('float32')
    _, top_n_indices = index.search(query_embedding, top_n)
    similar_libraries = data_df.iloc[top_n_indices[0]]

    return similar_libraries[['Package', 'Summary', 'Description']]

Lets try it out :

In [7]:
query = "data visualization library"
similar_libraries = semantic_search(query, data_df)
print(similar_libraries)

                                      Package  \
5571                   [KraitUtilities 1.0.0]   
2559              [ainwater-package-test 0.1]   
5668                           [astetik 1.13]   
4577                         [MetrPlot 0.0.4]   
8747  [datasette-nteract-data-explorer 0.5.1]   

                                                Summary  \
5571  [A package for data preprocessing and visualiz...   
2559  [Graphics library for some uses in exploratory...   
5668  [Astetik data visualization and reporting libr...   
4577  [It is a Python library for build a very nice ...   
8747     [automatic visual data explorer for datasette]   

                                            Description  
5571                                                 []  
2559                                                 []  
5668  [ astetik provides a very high level overlay o...  
4577                                       [ MetrPlot ]  
8747  [ datasette-nteract-data-explorer      An auto...  
