In [None]:
!pip install transformers

In [73]:
import torch
import numpy as np
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
tokenizer_for_bert = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')


In [4]:
def bert_question_answer(question, passage, max_len=512):
    #Tokenize input question and passage
    #Add special tokens - [CLS] and [SEP]
    input_ids = tokenizer_for_bert.encode (question, passage,  max_length= max_len, truncation=True)


    #Getting number of tokens in 1st sentence (question) and 2nd sentence (passage that contains answer)
    sep_index = input_ids.index(102)
    len_question = sep_index + 1
    len_passage = len(input_ids)- len_question


    #Need to separate question and passage
    #Segment ids will be 0 for question and 1 for passage
    segment_ids =  [0]*len_question + [1]*(len_passage)

    #Converting token ids to tokens
    tokens = tokenizer_for_bert.convert_ids_to_tokens(input_ids)

    #Getting start and end scores for answer
    #Converting input arrays to torch tensors before passing to the model
    start_token_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([segment_ids]) )[0]
    end_token_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([segment_ids]) )[1]

    #Converting scores tensors to numpy arrays
    start_token_scores = start_token_scores.detach().numpy().flatten()
    end_token_scores = end_token_scores.detach().numpy().flatten()

    #Getting start and end index of answer based on highest scores
    answer_start_index = np.argmax(start_token_scores)
    answer_end_index = np.argmax(end_token_scores)


    #Getting scores for start and end token of the answer
    start_token_score = np.round(start_token_scores[answer_start_index], 2)
    end_token_score = np.round(end_token_scores[answer_end_index], 2)


    #Combining subwords starting with ## and get full words in output.
    #It is because tokenizer breaks words which are not in its vocab.
    answer = tokens[answer_start_index]
    for i in range(answer_start_index + 1, answer_end_index + 1):
        if tokens[i][0:2] == '##':
            answer += tokens[i][2:]
        else:
            answer += ' ' + tokens[i]

    # If the answer didn't find in the passage
    if ( answer_start_index == 0) or (start_token_score < 0 ) or  (answer == '[SEP]') or ( answer_end_index <  answer_start_index):
        answer = "Sorry!, I could not find an answer in the passage."

    return (answer_start_index, answer_end_index, start_token_score, end_token_score,  answer)


In [None]:
pip install PyPDF2

In [6]:
import os
import PyPDF2
import re

# Directory containing the PDF files
directory_path = '/content/drive/MyDrive/BOOK'

# List to store the extracted text from all PDF files
all_text = []

# Iterate through the PDF files in the directory
for filename in os.listdir(directory_path):
    if filename.endswith('.pdf'):
        pdf_path = os.path.join(directory_path, filename)

        # Open the PDF file
        with open(pdf_path, 'rb') as file:
            # Create a PDF reader object
            reader = PyPDF2.PdfReader(file)

            # Extract text from each page
            pages_text = []
            for page in reader.pages:
                pages_text.append(page.extract_text())

            # Concatenate all the page text into a single string
            pdf_text = ' '.join(pages_text)

            # Append the PDF text to the list
            all_text.append(pdf_text)

# Split the text into words using regular expressions
words = re.findall(r'\b\w+\b', ' '.join(all_text))

# Remove extra spaces and special characters from words
cleaned_words = [re.sub(r'\s+', ' ', word) for word in words]
cleaned_words = [re.sub(r'[^\w\s]', '', word) for word in cleaned_words]

# Store the cleaned words in a text file
txt_path = '/content/drive/MyDrive/BOOK/PDF/extracted_words3.txt'
with open(txt_path, 'w') as file:
    for word in cleaned_words:
        file.write(word + ' ')

# Store the cleaned words in a variable
text = ' '.join(cleaned_words)


In [8]:
len(text)

2635706

In [7]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [66]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Split the passage into smaller chunks based on word count with overlap
chunk_size = 512
overlap = 20
texts = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size - overlap)]

# Initialize the TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Compute the TF-IDF matrix for the text chunks
tfidf_matrix = vectorizer.fit_transform(texts)

# Convert the query chunk to a TF-IDF vector
query_tfidf = vectorizer.transform([query_chunk])

# Compute the cosine similarity between the query chunk and text chunks
similarity_scores = cosine_similarity(query_tfidf, tfidf_matrix)

# Sort the similarity scores in descending order
sorted_indices = similarity_scores.argsort()[0][::-1]

# Select the top similar chunks
top_similar_chunks = [texts[idx] for idx in sorted_indices[:5]]

# Load the similar chunks into separate variables
neighbor1 = top_similar_chunks[0]
neighbor2 = top_similar_chunks[1]
neighbor3 = top_similar_chunks[2]
neighbor4 = top_similar_chunks[3]
neighbor5 = top_similar_chunks[4]

# Print the top similar chunks
print("Top Similar Chunks:")
for i, chunk in enumerate(top_similar_chunks, 1):
    print(f"Chunk {i}:", chunk)


Top Similar Chunks:
Chunk 1: milar tools to compile your Python code into fast optimized machine code Andriy Burkov The Hundred Page Machine Learning Book Draft 11 TheHundred PageMachineLearningBookAndriy Burkov All models are wrong but some are useful George Box The book is distributed on the read ﬁrst buy later principle Andriy BurkovThe Hundred Page Machine Learning Book Draft 9 Unsupervised Learning Unsupervised learning deals with problems in which your dataset doesn t have labels This property is what makes it very problematic fo
Chunk 2: eated using the dataset of people could take as input a feature vector describing a person and output a probability that the person has cancer 1In this book if a term is in bold t h a tm e a n st h a tt h i st e r mc a nb ef o u n di nt h ei n d e xa tt h ee n do ft h e book Andriy Burkov The Hundred Page Machine Learning Book Draft 3 1 2 2 Unsupervised Learning Inunsupervised learning the dataset is a collection of unlabeled examples xi N i 1 A

In [65]:
query_chunk = "what is unsupervised learning?"

In [67]:
print ('\nQuestion 1:\n', query_chunk)
_, _ , _ , _, ans  = bert_question_answer( query_chunk, neighbor1)
print('\nAnswer from BERT: ', ans ,  '\n')


Question 1:
 what is unsupervised learning?

Answer from BERT:  deals with problems in which your dataset doesn t have labels 



In [63]:
from transformers import pipeline
summarized_chunks = []

# Load the summarization pipeline with default max_length and min_length
summarizer = pipeline("summarization")

# Summarize each top similar chunk
for i, chunk in enumerate(top_similar_chunks, 1):
    summarized_chunk = summarizer(chunk)[0]['summary_text']
    summarized_chunks.append(summarized_chunk)
    # print(f"Neighbor {i} Summarized:")
    # print(summarized_chunk)

# Store the summarized chunks in separate variables
neighbor1 = summarized_chunks[0]
neighbor2 = summarized_chunks[1]
neighbor3 = summarized_chunks[2]
neighbor4 = summarized_chunks[3]
neighbor5 = summarized_chunks[4]
# Concatenate the summarized chunks with a separator
summary = ' '.join(summarized_chunks[:5])[:512]

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.
Your max_length is set to 142, but your input_length is only 98. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=49)
Your max_length is set to 142, but your input_length is only 107. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=53)
Your max_length is set to 142, but your input_length is only 102. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=51)
Your max_len

In [55]:
print ('\nQuestion 1:\n', query_chunk)
_, _ , _ , _, ans  = bert_question_answer( query_chunk, summary)
print('\nAnswer from BERT: ', ans ,  '\n')


Question 1:
 what is suppervised learning?

Answer from BERT:  supervised learning is the type of machine learning most frequently used in practice . the data for supervised learning is a collection of pairs input output 

