In [1]:
import pandas as pd
data = pd.read_csv('saved_paragraphs.csv')
org_data = pd.read_csv('original_paragraphs.csv')

In [19]:
import networkx as nx
import spacy
from sentence_transformers import SentenceTransformer, util
import torch
def search_paragraphs(query, paragraphs):
    nlp = spacy.load("en_core_web_sm")

    # Load Sentence Transformer model
    model_name = "all-mpnet-base-v2"
    sentence_transformer = SentenceTransformer(model_name)

    # Extract content from the list of tuples
    paragraph_contents = [content for idx, content in paragraphs]

    query_embedding = sentence_transformer.encode([query])[0]

    # Calculate paragraph embeddings
    paragraph_embeddings = sentence_transformer.encode(paragraph_contents)

    # Calculate cosine similarity
    scores = util.pytorch_cos_sim(torch.tensor([query_embedding]), torch.tensor(paragraph_embeddings))[0]
    scores = scores.cpu().numpy()

    # Get the indices of top results
    top_indices = scores.argsort()[::-1][:5]

    # Create a list of tuples with score, index, and content
    top_results = [(scores[idx], idx, paragraphs[idx][1]) for idx in top_indices]

    return top_results

In [3]:
paragarphs=[]
cnt = 0
entity_graphs = []  # Replace with actual entity graphs
for i in data['para_content']:
    #entity_graphs.append(generate_entity_graph(i))
    #print(i)
    
    paragarphs.append((cnt,i))
    cnt = cnt + 1
   # print(paragarphs)

In [4]:
len(paragarphs)

67

In [5]:
paragarphs[1][0]

1

In [6]:
from sklearn.metrics import silhouette_score
import re
import re 
from unicodedata import normalize
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def text_normalization(text):
    """ 
    Perform text normalization on a paragraph of text.
    
    This involves:
    - Lowercasing 
    - Fixing whitespace issues
    - Removing punctuation
    - Expanding contractions
    - Removing accented characters
    - Lemmatizing text  
    - Removing stop words
    
    It returns the normalized text as a string.
    """
    
    # Lowercase
    text = text.lower()
    
    # Fix whitespace
    text = re.sub(r'\s+', ' ', text)
    
    # Remove punctuation    
    text = re.sub(r'[^\w\s]', '', text)
    
    # Expand contractions
    contractions = {"ain't": "am not", "aren't": "are not"} 
    text = text.replace("n't", " not")
    text = re.sub('|'.join(contractions.keys()), 
                  lambda x: contractions[x.group()], text) 
    
    # Remove accented characters
    text = (normalize('NFKD', text)
                  .encode('ascii', 'ignore')
                  .decode('utf-8', 'ignore'))
    
    # Get root form of words (lemmatize)
    lemmatizer = WordNetLemmatizer()
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])
    
    # Remove stop words
    stop_words = set(stopwords.words('english'))  
    text = ' '.join([word for word in text.split() 
                    if word not in stop_words])

    return text

In [7]:
# Search query
user_query = """There is some guy who made a project for pringing and etc"""

# Call the search function
result = search_paragraphs(user_query, paragarphs)



  scores = util.pytorch_cos_sim(torch.tensor([query_embedding]), torch.tensor(paragraph_embeddings))[0]


In [8]:
print(f"Search Query: {user_query}")
for score, idx, _ in result:
    print(f"Score: {score}\nParagraph:\n{org_data.iloc[idx][1]}")
    print("-----------------------------")

Search Query: There is some guy who made a project for pringing and etc
Score: 0.4448288381099701
Paragraph:
"I am a great inventor, you must know, andI manufacture my products in this lonely spot. ""What are your products?" enquired the Wizard.
-----------------------------
Score: 0.3671092987060547
Paragraph:
A simple example is modeling a sequence of days with sunny or rainy weather, where the weather on a given day is randomly determined based solely on the weather of the previous day. This pure probabilistic approach enables modeling phenomena from physics, biology, and beyond for theoretical and practical insights. This establishes a lightweight, yet versatile approach compared to heavy parameterization with neural networks. By optimizing cumulative future reward, the model learns probable transitions between words reflecting sensible narratives. For example, initializing the state with "Alice was" and sampling subsequent actions as words yield:"Alice was heading to the store whe

In [15]:
import numpy as np
from datasets import load_dataset

dataset = load_dataset("ai2_arc", "ARC-Easy")

Downloading data:   0%|          | 0.00/681M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2251 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2376 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/570 [00:00<?, ? examples/s]

In [16]:
paragraphs_valid = dataset['validation']

In [17]:
paragraphs_valid

Dataset({
    features: ['id', 'question', 'choices', 'answerKey'],
    num_rows: 570
})

In [39]:
def search_paragraphs_test(query, paragraphs):
    nlp = spacy.load("en_core_web_sm")
    
    # Load Sentence Transformer model
    model_name = "all-mpnet-base-v2"
    sentence_transformer = SentenceTransformer(model_name)

    # Extract content from the list of tuples
    paragraph_contents = [str(para['question']) + " " + str(para['choices']) for para in paragraphs]
  
  # Rest of implementation

    query_embedding = sentence_transformer.encode([query])[0]

    paragraph_embeddings = sentence_transformer.encode(paragraph_contents)  

    # Calculate cosine similarity
    scores = util.pytorch_cos_sim(torch.tensor([query_embedding]), torch.tensor(paragraph_embeddings))[0]
    scores = scores.cpu().numpy()

    # Get the indices of top results
    top_indices = scores.argsort()[::-1][:5]
    
      # Return paragraphs
    top_results = []
    for idx in top_indices:
        para = paragraph_contents[idx]
        top_results.append((scores[idx], idx,para))

    return top_results

In [47]:
paragraphs_train = dataset['train']
paragraphs_valid = dataset['validation']

def test_search():

  queries = ["science technology", "chemistry research"]
  
  rel_paragraphs = [paragraphs_valid[5], paragraphs_valid[2]]
  
  mrr_scores = []
  
  for query, rel_para in zip(queries, rel_paragraphs):
  
    results = search_paragraphs_test(query, paragraphs_train)
    #print(results)
    top_ids = [res[1] for res in results]
    #print(top_ids)
    rank = next((i for i, idx in enumerate(top_ids) if idx==rel_para['id']), None)
    print(rel_para['id'])
    if rank is not None:
      mrr_scores.append(1/float(rank + 1))
    else:
      mrr_scores.append(0)
      
  mean_mrr = np.mean(mrr_scores)
  
  print("MRR Score:", mean_mrr)

  return mean_mrr,results
  
_, results = test_search()

MCAS_2016_8_3
ACTAAP_2014_7_6
MRR Score: 0.0


In [45]:
results[0][2]

"Which of these involves the formation of a new chemical substance? {'text': ['evaporation of gasoline', 'mixing salt and pepper', 'dissolving sugar in tea', 'rusting of an iron chain'], 'label': ['A', 'B', 'C', 'D']}"