In [1]:
import spacy
import fitz
from py2neo import Graph, Node , Relationship
import pandas as pd

In [43]:
file_name = "file.pdf"
pdf_file = fitz.open("file.pdf")

In [2]:
# Connect to Neo4j
graph = Graph("bolt://localhost:7689", auth=("neo4j", "password"))
nlp = spacy.load("en_core_web_sm")

In [45]:
# Create a node for the PDF document
pdf_node = Node("PDF", name="file.pdf")
graph.create(pdf_node)

In [46]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

model = T5ForConditionalGeneration.from_pretrained("Michau/t5-base-en-generate-headline")
tokenizer = T5Tokenizer.from_pretrained("Michau/t5-base-en-generate-headline")

def michau_transformer_gen_headlines(article):
    encoding = tokenizer.encode_plus(article, return_tensors="pt")
    input_ids = encoding["input_ids"]
    attention_masks = encoding["attention_mask"]

    beam_outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_masks,
        max_length= 10,
        num_beams=3,
        early_stopping=True
    )

    result = tokenizer.decode(beam_outputs[0])
    return result.replace("<pad>", "").replace("</s>","")


In [47]:
for pg_no in range(len(pdf_file)):
    # Get the current page object
    page = pdf_file[pg_no]

    # Extract the text from the page object
    page_text = page.get_text()
    # heading = page_text.split('\n')[0]
    heading = michau_transformer_gen_headlines(page_text)


    page_text = page_text.split("\n", 1)
    page_text = page_text[1] if len(page_text) > 1 else ""

    page_text = page_text.replace("\n", " ")
    page_text = page_text.replace("➢", " ")

    doc = nlp(page_text) 

    # doc_dict = {}
    # for token in doc:
    #     doc_dict[token.i] = {
    #         'text': token.text,
    #         'lemma': token.lemma_,
    #         'pos': token.pos_,
    #         'tag': token.tag_,
    #         'ent': token.ent_type_
    #     } 

    # doc_dict = {
    # key.encode(): value
    # for key, value in doc_dict.items()
    # }

    page_node = Node("PAGE", id = pg_no, text = page_text)
    pg_rel = Relationship(pdf_node, heading, page_node, data = heading)

    graph.create(page_node)
    graph.create(pg_rel)
    
    sent_after_node = None
    for sent in doc.sents:
        sentence_node = Node("Sentence",topic = heading, text= sent.text)
        sent_rel = Relationship(page_node, "CONTAINS", sentence_node)
    
        graph.create(sentence_node)
        graph.create(sent_rel) 
        
        if sent_after_node:
            sent_after = Relationship(sent_after_node, 'AFTER', sentence_node)
            graph.create(sent_after)
        sent_after_node = sentence_node  



In [195]:
# Get the current page object
page = pdf_file[5]

# Extract the text from the page object
page_text = page.get_text()
page_text = page_text.replace("\n", " ")
page_text = page_text.replace("➢", " ")
print(page_text)

Attendance regularization requests   In case of a missed punch, you have an option to regularize your attendance.   Select the date you wish to raise the regularization for.   Actual reason (for missing the punch) to be selected and time needs to be entered using the dropdown time menu.   This request goes to your reporting manager for approval. Post approval your attendance will be regularized.   You can track the status of your “Attendance Regularization Request” under “Attendance Management” menu. PATH :- Home Page -> Attendance -> Attendance Regularization Request 


In [8]:
nlp = spacy.load("en_core_web_sm")
doc = nlp(page_text)
print(doc)
for sent in doc.sents:
    print(sent.text)

In [2]:
import spacy

# Load the Spacy model
nlp = spacy.load("en_core_web_sm")

# Define the question
question = "User can “Create Leave Request” under “Leave” menu. Select the type of leave you wish to apply for. To apply for a half-day leave, click on “First Half” or “Second Half”. For a full-day leave request, choose “Full- Day” On right-hand side, the user can view the holiday list as well as his/her leave balance as on date Fill up the details for the leave period and submit. This request goes to the reporting manager for approval. Post approval, your leave will appear on your calendar. You can track the status in the Leave Menu under “Leave Request” PATH :- Home Page -> Leave -> Create Leave Request"

# Apply preprocessing steps using Spacy
doc = nlp(question)

# Tokenization
tokens = [token.text for token in doc]

# Lowercasing
lowercase_tokens = [token.lower() for token in tokens]

# Stopword Removal
filtered_tokens = [token for token in lowercase_tokens if not nlp.vocab[token].is_stop]

# Lemmatization
lemmatized_tokens = [token.lemma_ for token in nlp(" ".join(filtered_tokens))]

# Punctuation Removal
punct_removed_tokens = [token for token in lemmatized_tokens if not nlp.vocab[token].is_punct]

# Named Entity Recognition
entities = [(ent.text, ent.label_) for ent in doc.ents]

print("Original Question: ", question)
print("Preprocessed Question: ", " ".join(punct_removed_tokens))
print("Named Entities: ", entities)

Original Question:  User can “Create Leave Request” under “Leave” menu. Select the type of leave you wish to apply for. To apply for a half-day leave, click on “First Half” or “Second Half”. For a full-day leave request, choose “Full- Day” On right-hand side, the user can view the holiday list as well as his/her leave balance as on date Fill up the details for the leave period and submit. This request goes to the reporting manager for approval. Post approval, your leave will appear on your calendar. You can track the status in the Leave Menu under “Leave Request” PATH :- Home Page -> Leave -> Create Leave Request
Preprocessed Question:  user create leave request leave menu select type leave wish apply apply half day leave click half second half day leave request choose full- day right hand user view holiday list leave balance date fill detail leave period submit request go report manager approval post approval leave appear calendar track status leave menu leave request path home page >

In [None]:
process

In [194]:
from py2neo import Graph

# Create a Graph object and connect to the database
graph = Graph("bolt://localhost:7689", auth=("neo4j", "password"))


# Define a Cypher query to get all nodes with the given label and property value
query = f"MATCH (n)-[r]->(m) WHERE n.name = 'file.pdf' RETURN r.data as heading"

# Run the query and print the results
results = graph.run(query)
df = pd.DataFrame(results.data())

topics = set(df.heading)
topics

{' Attendance Regularization Requests',
 '3i HR Team - For Any Issue',
 '3i Infotech Employee & Reporting',
 'Attendance Calendar - Real Time Attendance',
 'Attendance Regularization Transactions PATH',
 'Attendance Regularizations Approval (',
 'HonoHR Application URL: https://3',
 'Leave Balance PATH :- Home Page',
 'Leave Request - How to Apply for',
 'Leave Transactions PATH :- Home',
 'Leave and Attendance for Employee & Report',
 'Out on Duty Request - How to Track',
 'Out on Duty Transactions (OD)',
 'Out on Duty Transactions PATH :',
 'Reporting Manager Create Employee’s Roster',
 'Reporting Manager Create Leave Request on behalf of',
 'Reporting Manager PATH :- Home',
 'Reporting Manager To Approve Leave Request',
 'Reporting Manager Upload Shift Roster',
 'Reporting Manager View Employee’s Leave Details',
 'Shift Change Request Approval',
 'Shift Roster - View Shi'}

In [187]:
import spacy
from py2neo import Graph
import streamlit as st

nlp = spacy.load("en_core_web_md")

# Create a Graph object and connect to the database
graph = Graph("bolt://localhost:7687", auth=("neo4j", "password"))

# Define a Cypher query to get all nodes with the given label and property value
query = f"MATCH (n)-[r]->(m) WHERE n.name = 'file.pdf' RETURN r.data AS heading"

# Run the query and extract information
results = graph.run(query)
df = pd.DataFrame(results.data())
topics = set(df.heading)

def process(query):
    doc_query = nlp(query)
    
    # Tokenization
    tokens = [token.text for token in doc_query]

    # Lowercasing
    lowercase_tokens = [token.lower() for token in tokens]

    # Stopword Removal
    filtered_tokens = [token for token in lowercase_tokens if not nlp.vocab[token].is_stop]

    # Lemmatization
    lemmatized_tokens = [token.lemma_ for token in nlp(" ".join(filtered_tokens))]

    # Punctuation Removal
    punct_removed_tokens = [token for token in lemmatized_tokens if not nlp.vocab[token].is_punct]

    query_processed =  " ".join(punct_removed_tokens)

    return query_processed


def query_sim(query_processed):
    doc = nlp(query_processed)
    sim_list = []
    for topic in topics:
        rel = nlp(topic)
        sim = doc.similarity(rel)
        sim_list.append(sim)
    max_sim = max(sim_list)
    max_index = sim_list.index(max_sim)
    sim_topic = topic[max_index]

    return sim_topic

def answer(sim_topic):
    # Define a Cypher query to get all nodes with the given label and property value
    answer_query = f"MATCH (n) WHERE r.data = {sim_topic} RETURN r.data AS heading"

    # Run the query and extract information
    results = graph.run(query)
    df = pd.DataFrame(results.data())
    answer = set(df.heading)
    st.write(answer)

ConnectionUnavailable: Cannot open connection to ConnectionProfile('bolt://localhost:7687')

In [196]:
from transformers import BartTokenizer, BartForConditionalGeneration
import torch

# Load the pre-trained BART model and tokenizer
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

# Set the input text
text = page_text

# Encode the input text
inputs = tokenizer.encode(text, return_tensors='pt')

# Generate the summary
summary_ids = model.generate(inputs, num_beams=4, max_length=70, early_stopping=True)
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

print("Input text: ", text)
print("Generated summary: ", summary)



Input text:  Attendance regularization requests   In case of a missed punch, you have an option to regularize your attendance.   Select the date you wish to raise the regularization for.   Actual reason (for missing the punch) to be selected and time needs to be entered using the dropdown time menu.   This request goes to your reporting manager for approval. Post approval your attendance will be regularized.   You can track the status of your “Attendance Regularization Request” under “Attendance Management” menu. PATH :- Home Page -> Attendance -> Attendance Regularization Request 
Generated summary:  In case of a missed punch, you have an option to regularize your attendance. Select the date you wish to raise the regularization for. Actual reason (for missing the punch) to be selected and time needs to be entered. This request goes to your reporting manager for approval. Post approval your attendance will be regularized.


In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Define the list of documents
documents = sentence

# Create the TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Fit the vectorizer to the documents
vectorizer.fit(documents)

# Get the TF-IDF matrix for the documents
tfidf_matrix = vectorizer.transform(documents)

# Get the feature names (i.e. the words) from the vectorizer
feature_names = vectorizer.get_feature_names_out()

# Print the TF-IDF matrix for each document
for i, document in enumerate(documents):
    print("Document", i)
    for j, feature in enumerate(feature_names):
        print(feature, tfidf_matrix[i, j])
    print()

Document 0
approval 0.0
approve 0.39349963397048116
approved 0.0
as 0.14963330713297865
attendance 0.0
can 0.19674981698524058
day 0.0
duty 0.2992666142659573
for 0.0
from 0.19674981698524058
manager 0.19674981698524058
od 0.2992666142659573
on 0.2992666142659573
once 0.0
or 0.19674981698524058
out 0.2992666142659573
page 0.19674981698524058
path 0.0
present 0.0
reflect 0.0
reject 0.19674981698524058
request 0.0
requests 0.19674981698524058
status 0.0
text 0.19674981698524058
that 0.0
the 0.0
this 0.19674981698524058
to 0.19674981698524058
transactions 0.19674981698524058
will 0.0
you 0.19674981698524058

Document 1
approval 0.0
approve 0.0
approved 0.30746098821535434
as 0.2338320064840948
attendance 0.0
can 0.0
day 0.30746098821535434
duty 0.0
for 0.30746098821535434
from 0.0
manager 0.0
od 0.0
on 0.0
once 0.30746098821535434
or 0.0
out 0.0
page 0.0
path 0.0
present 0.30746098821535434
reflect 0.30746098821535434
reject 0.0
request 0.0
requests 0.0
status 0.30746098821535434
text 0.0

In [14]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [15]:
doc = nlp('text: To approve Out on Duty Transactions  (OD) :-   As  a Manager you can approve or  reject Out on Duty (OD) requests  from this page.   Once approved, the status will  reflect as “Present” for that day. PATH :- Attendance -> Out On Duty Request Approval (OD) ')

In [16]:
sentence = [sent.text for sent in doc.sents]

In [22]:
import spacy

# load the English language model
nlp = spacy.load('en_core_web_sm')

# define the paragraph to analyze
paragraph = 'To approve Out on Duty Transactions  (OD) :-   As  a Manager you can approve or  reject Out on Duty (OD) requests  from this page.   Once approved, the status will  reflect as “Present” for that day. PATH :- Attendance -> Out On Duty Request Approval (OD) '

# create a spaCy document object
doc = nlp(paragraph)

# initialize a list to store identified entities
entities = []

# loop through each entity in the document
for ent in doc.ents:
    # if the entity is a noun or proper noun, add it to the list
    if ent.label_ == 'NOUN' or ent.label_ == 'PROPN':
        entities.append(ent.text)

# join the entities into a string and print it as the subject matter
subject = ', '.join(entities)
print("The subject matter of the paragraph is:", subject)


The subject matter of the paragraph is: 


In [97]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Sample strings
string1 = "The quick brown fox jumps over the lazy dog."
string2 = "The quick brown fox jumps over the lazy cat."

# Create TF-IDF vectors for the strings
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform([string1, string2])

# Calculate cosine similarity between the vectors
cosine_sim = cosine_similarity(tfidf[0], tfidf[1])[0][0]

print("Cosine similarity:", cosine_sim)


Cosine similarity: 0.8350499057520981


In [104]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

def simplet5_transformer_gen_headlines(input, samples=3):
    """
    This function will generate the given number of one-line summaries for the given text input.
    
    Args:
    input (str): Text to summarize
    samples (int): Number of samples for one-line summary, default set to 3
    
    Returns:
    preds (list): List of generated summaries
    """
    model_name = "snrspeaks/t5-one-line-summary"
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    input_ids = tokenizer.encode(input, return_tensors="pt", add_special_tokens=True)
    generated_ids = model.generate(input_ids=input_ids,
                                    num_beams=5,
                                    max_length=50,
                                    repetition_penalty=2.5,
                                    length_penalty=1,
                                    early_stopping=True,
                                    num_return_sequences=samples)
    preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]
    return preds


In [4]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

# Load the pre-trained GPT-2 model and tokenizer
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Define the input question and context
question = "What is the capital of France?"
context = "France is a country located in Western Europe. Its capital is Paris."

# Encode the input question and context as input IDs and attention masks
inputs = tokenizer(question, context, return_tensors='pt')
input_ids = inputs['input_ids']
attention_mask = inputs['attention_mask']

# Use the model to generate an answer to the question
start_scores, end_scores = model(input_ids, attention_mask=attention_mask, output_attentions=False).values()
start_index = torch.argmax(start_scores)
end_index = torch.argmax(end_scores)
answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[0][start_index:end_index+1]))

print(answer)


ValueError: Unrecognized configuration class <class 'transformers.models.gpt2.configuration_gpt2.GPT2Config'> for this kind of AutoModel: AutoModelForQuestionAnswering.
Model type should be one of AlbertConfig, BartConfig, BertConfig, BigBirdConfig, BigBirdPegasusConfig, BloomConfig, CamembertConfig, CanineConfig, ConvBertConfig, Data2VecTextConfig, DebertaConfig, DebertaV2Config, DistilBertConfig, ElectraConfig, ErnieConfig, ErnieMConfig, FlaubertConfig, FNetConfig, FunnelConfig, GPTJConfig, IBertConfig, LayoutLMv2Config, LayoutLMv3Config, LEDConfig, LiltConfig, LongformerConfig, LukeConfig, LxmertConfig, MarkupLMConfig, MBartConfig, MegaConfig, MegatronBertConfig, MobileBertConfig, MPNetConfig, MvpConfig, NezhaConfig, NystromformerConfig, OPTConfig, QDQBertConfig, ReformerConfig, RemBertConfig, RobertaConfig, RobertaPreLayerNormConfig, RoCBertConfig, RoFormerConfig, SplinterConfig, SqueezeBertConfig, XLMConfig, XLMRobertaConfig, XLMRobertaXLConfig, XLNetConfig, XmodConfig, YosoConfig.

In [5]:
import time

def print_letter_by_letter(text, delay=0.03):
    for char in text:
        print(char, end='', flush=True)
        time.sleep(delay)
    print()

# Example usage
answer = "This is the answer to the user's query."
print_letter_by_letter(answer)


This is the answer to the user's query.


In [3]:
import re

def remove_special_characters(text):
    # Define the pattern to match special characters (excluding URLs)
    pattern = r'[^a-zA-Z0-9\s\/:.-]'
    
    # Remove special characters using regular expressions
    cleaned_text = re.sub(pattern, ' ', text)
    
    return cleaned_text

# Example usage
text_with_special_chars = "https://www.linkedin.com/feed/"
cleaned_text = remove_special_characters(text_with_special_chars)
print(cleaned_text)  # Output: Hello How are you doing Excited


https://www.linkedin.com/feed/


In [10]:
from py2neo import Graph

# Connect to the Neo4j database
uri = "bolt://localhost:7689"
username = "neo4j"
password = "password"
graph = Graph(uri, auth=(username, password))

# Define the Cypher query
cypher_query = "MATCH (pdf:PDF) RETURN pdf.name"

# Execute the query and retrieve the data property values
try:

    data_list = graph.run(cypher_query).to_table()[0]

except:
    data_list = []

# Print the list of data property values
print(data_list)

[]


In [36]:
import fitz

def extract_table_text(pdf_path, page_number):
    pdf_doc = fitz.open(pdf_path)
    page = pdf_doc[page_number - 1]  # Pages are 0-indexed in fitz
    
    # Get the table blocks on the page
    table_blocks = page.get_text_blocks()
    print(table_blocks)
    table_text = []
    
    for block in table_blocks:
        for row in block.rows:
            table_text.append([cell.get_text().strip() for cell in row])
    
    return table_text

# Provide the path to your PDF file
pdf_path = 'Document_Check_List_-_India (1).pdf'

# Specify the page number containing the table
page_number = 1

table_text = extract_table_text(pdf_path, page_number)
print(table_text)


[(98.8799819946289, 87.33995819091797, 358.5650634765625, 121.94580078125, ' \nDate :                                   \nLocation :  \n \n', 0, 0), (98.87995910644531, 122.49993133544922, 385.5649108886719, 146.66571044921875, 'Employee Name :  \n \nClaimant Name : \n \n', 1, 0), (98.87995910644531, 147.2198944091797, 393.0048828125, 171.3857421875, 'Claim intimation no :  \n \nCorporate Name :  \n', 2, 0), (98.87995910644531, 171.8198699951172, 237.56546020507812, 208.2257080078125, 'Type of Claim (Main claim/Pre-\nPost/Deduction/Deficiency) \n \n', 3, 0), (310.79986572265625, 171.8198699951172, 416.5250244140625, 183.9857177734375, 'No of Pages submitted : \n', 4, 0), (305.3998718261719, 208.7403564453125, 376.5857849121094, 224.21226501464844, 'CHECK LIST \n', 5, 0), (98.87995910644531, 224.49986267089844, 499.20489501953125, 248.90570068359375, 'Sr. no \nParticulars  \nCollected  \nYes/No \n', 6, 0), (98.87995910644531, 249.33982849121094, 456.2450256347656, 262.666259765625, '1 \

AttributeError: 'tuple' object has no attribute 'rows'

In [48]:
import fitz

def extract_table_text(pdf_path, page_number):
    pdf_doc = fitz.open(pdf_path)
    page = pdf_doc[page_number - 1]  # Pages are 0-indexed in fitz
    
    # Get the table blocks on the page
    table_blocks = page.get_text_blocks()
    
    text_list = []
    
    for block in table_blocks:
        for row in block:
            if isinstance(row, str):
                row = row.replace('\n', ' ')
                if not row.isspace():
                    text_list.append(row)
            
    return text_list

# Provide the path to your PDF file
pdf_path = 'Document_Check_List_-_India (1).pdf'

# Specify the page number containing the table
page_number = 1

table_text = extract_table_text(pdf_path, page_number)
print(table_text)


['  Date :                                    Location :     ', 'Employee Name :     Claimant Name :    ', 'Claim intimation no :     Corporate Name :   ', 'Type of Claim (Main claim/Pre- Post/Deduction/Deficiency)    ', 'No of Pages submitted :  ', 'CHECK LIST  ', 'Sr. no  Particulars   Collected   Yes/No  ', '1  Duly Filled & signed Claim Form of IRDA.    ', '2  Original Discharge Card / Summary/Transfer Summary/Death  Summary    ', '3  Original Final Bill of the Hospital with breakup of all charges     ', '4  Original Bill Paid Receipt (Deposit/Final payment receipt) with  revenue stamp    ', '5  Original Investigation Reports (ECG, USG, CT Scan, X-ray, Blood  report, A scan etc)    ', '6  All Imaging Films, ECG Strips, Doppler / Angiogram CD etc    ', '7  Original Pharmacy bill with supporting prescriptions.     ', '8  Hospital Registration Certificate (in case of a unknown small  hospital)    ', '9  Any other original documents related to the claim.    ', '10  MLC/FIR in case of A

In [59]:
import spacy

nlp = spacy.load('en_core_web_sm')

# Get the word or sentence to get synonyms for
word_or_sentence = "dog"

# Get the token object for the word or sentence
token = nlp(word_or_sentence)

# Get the synonyms for the token
synonyms = token.lemma_.synsets[0].lemmas

# Print the synonyms
print(synonyms)

AttributeError: 'spacy.tokens.doc.Doc' object has no attribute 'lemma_'

In [81]:
import nltk
from nltk.corpus import wordnet
#Creating a list 
synonyms = []
for syn in wordnet.synsets("destroy"):
    for lm in syn.lemmas():
             synonyms.append(lm.name())#adding into synonyms
print (set(synonyms))

{'destruct', 'demolish', 'ruin', 'destroy', 'put_down'}


In [63]:
!pip install nltk





In [86]:
import spacy

def generate_synonyms(word):
    nlp = spacy.load("en_core_web_sm")
    synonyms = []

    # Retrieve the word's token from the spaCy model
    token = nlp.vocab[word]

    # Retrieve similar words based on word vector similarity
    for lexeme in token:
        if lexeme.has_vector and lexeme.text != word:
            synonyms.append(lexeme.lemma_.name())

    return synonyms

def enhance_search(query):
    nlp = spacy.load("en_core_web_sm")
    enhanced_query = []

    # Tokenize the query
    tokens = nlp(query)

    for token in tokens:
        # Add the original token
        enhanced_query.append(token.text)

        # Generate synonyms for nouns, verbs, adjectives, and adverbs
        if token.pos_ in ["NOUN", "VERB", "ADJ", "ADV"]:
            synonyms = generate_synonyms(token.text)
            enhanced_query.extend(synonyms)

    return " ".join(enhanced_query)

# User query
query = "happy birthday"

# Enhance the search query with synonyms
enhanced_query = enhance_search(query)

print("Original Query:", query)
print("Enhanced Query:", enhanced_query)


TypeError: 'spacy.lexeme.Lexeme' object is not iterable

In [88]:
import nltk

def enhance_search(query):
    enhanced_query = []

    # Tokenize the query
    tokens = nltk.word_tokenize(query)

    # Get the synsets for each token
    for token in tokens:
        synsets = nltk.wordnet.synsets(token)

        # Get the lemmas for each synset
        for synset in synsets:
            for lemma in synset.lemmas():
                enhanced_query.append(lemma.name())

    return " ".join(enhanced_query)

# User input
query = input("Enter a search query: ")

# Enhance the search query
enhanced_query = enhance_search(query)

# Print the enhanced search query
print("Enhanced search query:", enhanced_query)


LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/english.pickle[0m

  Searched in:
    - 'C:\\Users\\cilvo/nltk_data'
    - 'c:\\Users\\cilvo\\AppData\\Local\\Programs\\Python\\Python310\\nltk_data'
    - 'c:\\Users\\cilvo\\AppData\\Local\\Programs\\Python\\Python310\\share\\nltk_data'
    - 'c:\\Users\\cilvo\\AppData\\Local\\Programs\\Python\\Python310\\lib\\nltk_data'
    - 'C:\\Users\\cilvo\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - ''
**********************************************************************


In [10]:
from parrot import Parrot
import torch
import warnings
warnings.filterwarnings("ignore")


# uncomment to get reproducable paraphrase generations
def random_state(seed):
  torch.manual_seed(seed)
  if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)
random_state(1234)


#Init models (make sure you init ONLY once if you integrate this to your code)
parrot = Parrot(model_tag="prithivida/parrot_paraphraser_on_T5", use_gpu=False)

phrases = ["what is the main objective of computers?",
           "What are the famous places we should not miss in Russia?"
]

for phrase in phrases:
  print("-"*100)
  print("Input_phrase: ", phrase)
  print("-"*100)
  para_phrases = parrot.augment(input_phrase=phrase)
  for para_phrase in para_phrases:
   print(para_phrase)


----------------------------------------------------------------------------------------------------
Input_phrase:  what is the main objective of computers?
----------------------------------------------------------------------------------------------------
("what's the main purpose of a computer?", 25)
----------------------------------------------------------------------------------------------------
Input_phrase:  What are the famous places we should not miss in Russia?
----------------------------------------------------------------------------------------------------
("list the places to visit in russia that we shouldn't miss?", 46)
('list some good places to visit in russia?', 40)
('list some of the best places to visit in russia?', 38)
('list some of the most amazing places we should not miss in russia?', 26)


'recommend some places to visit in russia?'

In [7]:
!pip install git+https://github.com/PrithivirajDamodaran/Parrot_Paraphraser.git

Collecting git+https://github.com/PrithivirajDamodaran/Parrot_Paraphraser.git
  Cloning https://github.com/PrithivirajDamodaran/Parrot_Paraphraser.git to c:\users\cilvo\appdata\local\temp\pip-req-build-lt4h_jvv
  Resolved https://github.com/PrithivirajDamodaran/Parrot_Paraphraser.git to commit 720a87a1ee557d8ed8d9a021adbdd1dd5616c5f9
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting python-Levenshtein (from parrot==1.0)
  Downloading python_Levenshtein-0.21.0-py3-none-any.whl (9.4 kB)
Collecting fuzzywuzzy (from parrot==1.0)
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Collecting Levenshtein==0.21.0 (from python-Levenshtein->parrot==1.0)
  Downloading Levenshtein-0.21.0-cp310-cp310-win_amd64.whl (100 kB)
     ------------------------------------ 100.9/100.9 kB 725.0 kB/s eta 0:00:00
Collecting rapidfuzz<4.0.0,>=2.3.0 (from Levenshtein==0.21.0->python-Levenshtein->parrot==1.0)
  Downloading rapidfuzz-3.0.0-cp

  Running command git clone --filter=blob:none --quiet https://github.com/PrithivirajDamodaran/Parrot_Paraphraser.git 'C:\Users\cilvo\AppData\Local\Temp\pip-req-build-lt4h_jvv'


In [4]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("prithivida/parrot_paraphraser_on_T5")
model = AutoModelForSeq2SeqLM.from_pretrained("prithivida/parrot_paraphraser_on_T5")

paragraph = '''User can Create Leave Request under Leave menu. Select the type of leave you wish to apply for. To apply for a half-day leave click on First Half or Second Half . For a full-day leave request choose Full- Day On right-hand side the user can view the holiday list as well as his/her leave balance as on date Fill up the details for the leave period and submit. This request goes to the reporting manager for approval. Post approval your leave will appear on your calendar. You can track the status in the Leave Menu under Leave Request PATH :- Home Page - Leave - Create Leave Request'''

inputs = tokenizer.encode("paraphrase: " + paragraph, return_tensors="pt")

paraphrases = model.generate(inputs, max_length=150, num_return_sequences=5, num_beams=5, temperature=1.0)

for i, paraphrase in enumerate(paraphrases):
    print(f"Paraphrase {i + 1}: {tokenizer.decode(paraphrase, skip_special_tokens=True)}")


Paraphrase 1: To apply for a half-day leave click on First Half or Second Half. For a full-day leave request choose Full- Day On the right-hand side the user can view the holiday list as well as his/her leave balance as on date Fill up the details for the leave period and submit. This request goes to the reporting manager for approval.
Paraphrase 2: User can create Leave Request under Leave menu. Select the type of leave you wish to apply for. To apply for a half-day leave click on First Half or Second Half. For a full-day leave request choose Full- Day On the right-hand side the user can view the holiday list as well as his/her leave balance as on date Fill up the details for the leave period and submit. This request goes to the reporting manager for approval
Paraphrase 3: User can Create Leave Request under Leave menu. Select the type of leave you wish to apply for. To apply for a half-day leave click on First Half or Second Half. For a full-day leave request choose Full- Day On the 

In [9]:
paraphrases_decoded =list(set([tokenizer.decode(paraphrase, skip_special_tokens=True) for paraphrase in paraphrases]))

print (len(paraphrases_decoded))

5


In [14]:
from pyinstrument import Profiler

profiler = Profiler()
profiler.start()

# Create a Graph object and connect to the database
graph = Graph("bolt://localhost:7689", auth=("neo4j", "password"))

# Define a Cypher query to get all nodes with the given label and property value
query = f"MATCH (n)-[r:Topic]->(m) RETURN r.data AS headline, r.heading_list as list"

# Run the query and extract information
results = graph.run(query)
df = pd.DataFrame(results.data())
topics = list(set(df.headline))
topics_paras = list(df.list)
# print(df)


# code you want to profile

profiler.stop()

print(profiler.output_text(unicode=True, color=True))


  _     ._   __/__   _ _  _  _ _/_   Recorded: 17:48:09  Samples:  157
 /_//_/// /_\ / //_// / //_'/ //     Duration: 1.003     CPU time: 0.031
/   _/                      v4.4.0

Program: c:\Users\cilvo\AppData\Local\Programs\Python\Python310\lib\site-packages\ipykernel_launcher.py --ip=127.0.0.1 --stdin=9003 --control=9001 --hb=9000 --Session.signature_scheme="hmac-sha256" --Session.key=b"db182ec2-703f-4fac-aaf6-e1809f3cf457" --shell=9002 --transport="tcp" --iopub=9004 --f=c:\Users\cilvo\AppData\Roaming\jupyter\runtime\kernel-v2-47240lgEHmvG1c9q.json

[31m0.997[0m ZMQInteractiveShell.run_ast_nodes[0m  [2mIPython\core\interactiveshell.py:3274[0m
└─ [31m0.995[0m [48;5;24m[38;5;15m<module>[0m  [2m..\..\..\Temp\ipykernel_18220\412372190.py:1[0m
   ├─ [33m0.464[0m DataFrame.__init__[0m  [2mpandas\core\frame.py:609[0m
   │     [118 frames hidden]  [2mpandas, numpy, <built-in>, <__array_f...[0m
   ├─ [33m0.331[0m DataFrame.__getattr__[0m  [2mpandas\core\generic.py:58

Unnamed: 0,headline,list
0,Reporting Manager To Approve Leave Request,[ Reporting Manager To Approve Leave Request]
1,Shift Change Request Approval,[ Shift Change Request Approval]
2,Reporting Manager Create Leave Request on beh...,[ Reporting Manager Create Leave Request on be...
3,Reporting Manager View Employee’s Leave Details,[ Reporting Manager View Employee’s Leave Deta...
4,Reporting Manager Create Employee’s Roster un...,[ Reporting Manager Create Employee’s Roster u...
5,Reporting Manager Upload Shift Roster,[]
6,3i HR Team For Any Issues / Queries,[ 3i HR Team For Any Issues / Queries]
7,Infotech Leave and Attendance Manual User Manu...,[]
8,HonoHR Application URL: https://3i.honohr.com,[]
9,Leave and Attendance for Employee & Reporting...,[ Leave and Attendance for Employee & Reportin...


In [13]:
ara = []
len(ara)

0

In [None]:
def sim_list_maker_paraphrase(query_processed, sim_func):
    '''
    Calculates the similarity scores between the paraphrased queries and the topics using a specified similarity function.

    Args:
        query_processed (str): The processed query string.
        sim_func (function): The similarity function to calculate the similarity scores.

    Returns:
        list: A list of similarity scores between the paraphrased queries and topics.
    '''
    queries = paraphrase(query_processed)
    sim_list = []
    
    topic_para_index = 0
    while topic_para_index < len(topics_paras):
        topic_para = topics_paras[topic_para_index]
        topic_sim = []
        
        topic_index = 0
        while topic_index < len(topic_para):
            topic = topic_para[topic_index]
            sim_query = []
            
            query_index = 0
            while query_index < len(queries):
                query = queries[query_index]
                sim = sim_func(query, topic)
                sim_query.append(sim)
                query_index += 1
            
            avg_sim_query = sum(sim_query) / len(sim_query)
            topic_sim.append(avg_sim_query)
            topic_index += 1
        
        avg_topic = sum(topic_sim) / len(topic_sim)
        sim_list.append(avg_topic)
        topic_para_index += 1
    
    return sim_list


In [5]:
cypher_query = "MATCH (pdf:PDF) RETURN pdf.name"
pdf_list = list (graph.run(cypher_query).to_table()[0])

In [7]:
pdf_list

['Group_Mediclaim_Policy_Infotech (1).pdf']