In [1]:
# Mount Google Drive into the Colab filesystem at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# Install required packages

In [2]:
!pip install pypdf2
!pip install nltk
!pip install -U gensim



# Import necessary libraries

In [3]:
import nltk
from gensim.models import Word2Vec
import gensim.downloader as api
import sklearn
from sklearn.metrics.pairwise import cosine_similarity
import PyPDF2

# Function to extract text from a PDF file
Open the PDF file in binary mode

Create a PDF reader object

Loop through the pages in the PDF file

Get the page object

Extract the text from the page

In [4]:
def extract_text_from_pdf(pdf_path):
  """Return the text of every page of a PDF, concatenated in page order.

  Args:
    pdf_path: Path of the PDF file; it is opened in binary mode.

  Returns:
    One string made of each page's ``extract_text()`` result, joined with
    no separator between pages.
  """
  with open(pdf_path, 'rb') as pdf_file:
    reader = PyPDF2.PdfReader(pdf_file)
    # Pull the text while the file handle is still open.
    page_texts = [page.extract_text() for page in reader.pages]
  return "".join(page_texts)

In [5]:
  # nltk.sent_tokenize needs the Punkt sentence-tokenizer data. Recent NLTK
  # releases load it from the 'punkt_tab' resource while older ones use
  # 'punkt'; nltk is installed unpinned above, so fetch both to keep a fresh
  # run working with either.
  nltk.download('punkt')
  nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Function to tokenize sentences from a string
Use NLTK's sentence tokenizer to tokenize the text

Return the tokenized sentences

In [6]:
def tokenize_sentences(text):
  """Split a string into sentences using NLTK's sentence tokenizer.

  Args:
    text: The string to split.

  Returns:
    The value returned by ``nltk.sent_tokenize(text)``.
  """
  return nltk.sent_tokenize(text)

# Load the Word2Vec model

In [7]:
# Load the word vectors from the local cache when it exists; otherwise
# download them once and write the cache for later runs.
#
# KeyedVectors is imported here because the notebook's import cell binds only
# `Word2Vec` and `api` -- the bare name `gensim` is never defined, so the
# previous `gensim.models.Keyedvectors` lookup always failed and the bare
# `except:` hid that by re-downloading the model on every run.
from gensim.models import KeyedVectors

try:
  v2w_model = KeyedVectors.load('./w2vecmodel.mod')
  print("w2v Model Successfully loaded")
except FileNotFoundError:
  # No cache yet: fetch the pretrained vectors and save them next to the
  # notebook. Any other load error is left to surface instead of silently
  # triggering a new download.
  v2w_model = api.load('word2vec-google-news-300')
  v2w_model.save("./w2vecmodel.mod")
  print("w2v Model Saved")

w2v Model Saved


# Function to get the word embedding of a word

In [8]:
def get_word_embedding(word, model):
  """Look up the embedding of ``word``, falling back to a zero vector.

  Args:
    word: Key to look up in ``model``.
    model: Mapping-like embedding store indexed with ``model[word]``.

  Returns:
    ``model[word]`` when the key is present; otherwise a list of zeros whose
    length equals the embedding size.

  Raises:
    KeyError: If ``word`` is missing and the embedding size cannot be
      determined (no ``vector_size`` attribute and no 'pc' entry).
  """
  try:
    return model[word]
  except KeyError:
    # Out-of-vocabulary word: fall through and build the zero vector.
    pass

  # Prefer the size advertised by the model; otherwise measure the vector of
  # the sample word 'pc', as the original implementation did.
  size = getattr(model, 'vector_size', None)
  if size is None:
    size = len(model['pc'])
  return [0]*size

# Function to get the phrase embedding of a phrase
Initialize a zero vector of the same size as the embedding vectors

Initialize a counter for the number of words in the phrase

Loop through the words in the phrase

Increment the counter

Add the word embedding to the phrase embedding

Reshape the phrase embedding to a 2D array

In [9]:
import numpy as np
def get_phrase_embedding(phrase, embeddingmodel):
  """Sum the word embeddings of a phrase into a single row vector.

  The phrase is split on whitespace and each piece is looked up with
  ``get_word_embedding``; the vectors are added element-wise.

  Args:
    phrase: String to embed.
    embeddingmodel: Embedding model passed through to ``get_word_embedding``.

  Returns:
    A NumPy array reshaped to one row (``reshape(1, -1)``).
  """
  # Look up 'computer' only to learn the embedding length.
  width = len(get_word_embedding('computer', embeddingmodel))
  total = np.array([0] * width)
  word_count = 0  # tallied per the notebook outline; not used in the result
  for token in phrase.split():
    word_count += 1
    total = total + np.array(get_word_embedding(token, embeddingmodel))
  return total.reshape(1, -1)

# Function to retrieve answer to a question from a list of sentences
Initialize the maximum similarity score and the index of the most similar sentence

Loop through the sentence embeddings

Calculate the cosine similarity between the question embedding and the current sentence embedding

If the current similarity score is greater than the maximum similarity score, update the maximum
similarity score and the index of the most similar sentence

Return the index of the most similar sentence

In [10]:
def retrieve_and_print_faq_answer(question_embedding, sentence_embeddings, sentences):
  """Find the sentence embedding most similar to the question embedding.

  Args:
    question_embedding: Embedding of the question, as accepted by
      ``cosine_similarity``.
    sentence_embeddings: Iterable of sentence embeddings to compare against.
    sentences: Not used by this function.

  Returns:
    The position of the embedding with the highest cosine similarity, or -1
    when no embedding scores above the initial value of -1 (for example when
    ``sentence_embeddings`` is empty). On ties the earliest position wins.
  """
  best_index, best_score = -1, -1
  for position, candidate in enumerate(sentence_embeddings):
    score = cosine_similarity(candidate, question_embedding)[0][0]
    # Strict comparison keeps the first of several equally good matches.
    if score > best_score:
      best_index, best_score = position, score
  return best_index

# Main function
Extract the text from the PDF file

Tokenize the sentences from the text

Create a list of sentence embeddings

Get the phrase embedding for the question

Retrieve and print the answer to the question

In [11]:
def main(pdf_path, question):
  """Answer a question with the most similar sentence from a PDF.

  Extracts the PDF text, splits it into sentences, embeds every sentence and
  the question with the module-level ``v2w_model``, and prints the sentence
  whose embedding is closest to the question's.

  Args:
    pdf_path: Path of the PDF file to search.
    question: The question text.

  Returns:
    None. The question and the answer (or a not-found notice) are printed.
  """
  pdf_text = extract_text_from_pdf(pdf_path)
  sentences = tokenize_sentences(pdf_text)

  sent_embeddings = [get_phrase_embedding(sent, v2w_model) for sent in sentences]

  question_embedding = get_phrase_embedding(question, v2w_model)
  index = retrieve_and_print_faq_answer(question_embedding, sent_embeddings, sentences)

  print("Question: ", question)
  if index < 0:
    # The retrieval returns -1 when nothing matched (e.g. the PDF produced no
    # sentences); indexing with it would raise IndexError on an empty list or
    # silently pick the last sentence otherwise.
    print("Answer: ", "No matching sentence was found in the document.")
    return
  print("Answer: ", sentences[index])

# Run the main function if the script is run directly
Get the path to the PDF file and the question from the user

Run the main function

In [13]:
if __name__ == "__main__":
  # Prompt for the PDF path first, then the question, and run the lookup.
  pdf_path, question = (
    input("Enter the path to the PDF file: "),
    input("Enter your question: "),
  )
  main(pdf_path, question)

Enter the path to the PDF file: /content/gdrive/MyDrive/Colab Notebooks/acme_terms.pdf
Enter your question: what is acme corporation’s approach to handling customer information
Question:  what is acme corporation’s approach to handling customer information
Answer:  Personal information 
collected is used solely for improving our services and enhancing user experience.
