In [35]:
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import re


def create_ngrams(text, n=10):
    """Split ``text`` into running (overlapping) n-grams of ``n`` words.

    Words are delimited by whitespace, commas or semicolons. The delimiters
    are dropped and the words of each n-gram are re-joined with single spaces.

    Args:
        text: The string to split.
        n: Number of words per n-gram; must be at least 1.

    Returns:
        A list of n-gram strings in order of appearance. A text with fewer
        than ``n`` words yields a single entry holding all of its words, so
        short texts still produce a usable candidate; a text without any
        words yields an empty list.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be at least 1")

    # Every maximal run of non-separator characters is one word.
    words = re.findall(r"[^\s,;]+", text)
    if not words:
        return []
    if len(words) < n:
        # Not enough words for a full window: keep the whole text as one n-gram.
        return [' '.join(words)]
    return [' '.join(words[i:i + n]) for i in range(len(words) - n + 1)]

# Function to find the most relevant snippet from the text
def find_most_relevant_snippet(query, text, vectorizer):
    """Return the n-gram of ``text`` that best matches ``query``.

    The text is split with ``create_ngrams`` (default window size), the query
    and all n-grams are vectorized together, and the n-gram whose vector has
    the highest cosine similarity to the query vector wins.

    Args:
        query: The search string.
        text: The document text to extract a snippet from.
        vectorizer: Object whose ``fit_transform(list_of_str)`` returns a
            row-sliceable matrix (the notebook passes a ``TfidfVectorizer``).
            It is re-fitted on every call.

    Returns:
        A ``(snippet, score)`` tuple. When ``text`` contains no words the
        result is ``("", 0.0)``.
    """
    sentences = create_ngrams(text)
    if not sentences:
        # No n-gram could be built (e.g. the text is shorter than the window).
        # Scoring an empty candidate list would make cosine_similarity /
        # argmax raise, so fall back to the whole text as the only candidate.
        whole_text = ' '.join(text.split())
        if not whole_text:
            return "", 0.0
        sentences = [whole_text]

    # Row 0 is the query; rows 1.. are the candidate snippets.
    tfidf_matrix = vectorizer.fit_transform([query] + sentences)

    # One similarity value per candidate snippet.
    cosine_similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])[0]

    most_relevant_idx = int(np.argmax(cosine_similarities))
    return sentences[most_relevant_idx], cosine_similarities[most_relevant_idx]


# Sentence-embedding model; search_documents() calls its encode() on the query.
model = SentenceTransformer('paraphrase-distilroberta-base-v1')
# Pre-computed corpus. Code in this file reads the keys "embeddings", "ids",
# "title", "url", "full_text" and "abstract" from each entry.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code - only load this file from a trusted source.
# NOTE(review): presumably the stored embeddings were produced with the same
# model as above - confirm, otherwise the similarity scores are meaningless.
documents = np.load("documents_with_embeddings.npy", allow_pickle=True)

# Function to find the most relevant documents
def search_documents(query, documents, model, top_n=3):
    """Rank ``documents`` by cosine similarity between embeddings and the query.

    Args:
        query: Text handed to ``model.encode``.
        documents: Sized iterable of mappings; each must provide its embedding
            vector under the ``"embeddings"`` key (one 1-D vector per
            document, with the same length as the query embedding).
        model: Object exposing ``encode(query)`` that returns the query's
            embedding vector.
        top_n: Maximum number of results to return.

    Returns:
        A list of ``(doc, score)`` tuples sorted by descending score and cut
        to ``top_n`` entries. Documents with equal scores keep their input
        order. An empty ``documents`` collection yields ``[]``.
    """
    # len() rather than truthiness: ``documents`` may be a NumPy array.
    if len(documents) == 0:
        return []

    query_embedding = model.encode(query)

    # Score the whole corpus with one call instead of one call per document.
    doc_matrix = np.stack([np.asarray(doc["embeddings"]) for doc in documents])
    scores = cosine_similarity([query_embedding], doc_matrix)[0]

    # sorted() is stable, so ties keep their original relative order.
    ranked = sorted(zip(documents, scores), key=lambda pair: pair[1], reverse=True)
    return ranked[:top_n]




In [36]:
# Export this notebook (search.ipynb) as a plain Python script (search.py) using nbconvert.
# NOTE(review): the export presumably contains every code cell, including the
# module-level model/document loading - confirm that is wanted in search.py.
!jupyter nbconvert --to script search.ipynb

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[NbConvertApp] Converting notebook search.ipynb to script
[NbConvertApp] Writing 3099 bytes to search.py


In [37]:
# Test the search function: run one query and print the top matches.
query = "Earth Observation for Kenya"
relevant_docs = search_documents(query, documents, model)
# A single vectorizer is shared by all snippet look-ups; it is re-fitted per call.
vectorizer = TfidfVectorizer()

print(f"Query: {query}")
for i, (doc, score) in enumerate(relevant_docs):
    print(f"Document {i+1}:")
    print(f"PID: {doc['ids']}")
    print(f"Title: {doc['title']}")
    print(f"Link: {doc['url']}")

    # The snippet score is bound to a real name rather than `_`: at notebook
    # top level, assigning `_` shadows IPython's "last output" variable.
    most_relevant_snippet, snippet_score = find_most_relevant_snippet(query, doc['full_text'], vectorizer)
    print(f"Relevant snippet: {most_relevant_snippet}")

    print(f"Abstract: {doc['abstract']}")
    print(f"Similarity score: {score:.4f}")
    print()

Query: Earth Observation for Kenya
Document 1:
PID: ['P154784']
Title: Kenya - Climate Smart Agriculture Project : Environmental Assessment (Vol. 2) : Environmental and Social Impact Assessment Report for the Desilting and Expansion of Kabarbesi Water Pan Sub-Project Located in Emining Ward-Mogotio Sub-County, Baringo County
Link: http://documents.worldbank.org/curated/en/841071615445783266/Environmental-and-Social-Impact-Assessment-Report-for-the-Desilting-and-Expansion-of-Kabarbesi-Water-Pan-Sub-project-Located-in-Emining-Ward-Mogotio-Sub-County-Baringo-County
Relevant snippet: of the Climate Smart Agriculture Project for Kenya is to
Abstract: The development objective of the Climate Smart Agriculture Project for Kenya is to increase agricultural productivity and build resilience to climate change risks in the targeted smallholder farming and pastoral communities in Kenya, and in the event of an eligible crisis or emergency, to provide immediate and effective response. Suggested meas