In [5]:
from document_extraction_ngram_bm25 import *
from bs4 import BeautifulSoup
import fitz  # PyMuPDF
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
import shutil
def get_text_from_page(page):
    """Extracts text from a single page."""
    html_text = page.get_text('html')
    soup = BeautifulSoup(html_text, 'html.parser')
    text_blocks = soup.find_all(['p', 'span'])  # Get paragraphs and span tags which may contain text.
    page_text = " ".join(block.text for block in text_blocks)
    return page_text.strip()

def combine_into_dataframe(book_path):
    """Combines all pages text into a pandas DataFrame."""
    doc = fitz.open(book_path)
    doc_info = {
        'page_no': [],
        'text': []
    }
    for page_no in range(len(doc)):
        print(f'Processing page: {page_no}')
        page = doc[page_no]
        page_text = get_text_from_page(page)
        
        doc_info['page_no'].append(page_no)
        doc_info['text'].append(page_text)
        
    df = pd.DataFrame.from_dict(doc_info)
    return df
# Define a simple Document class if it's not already defined
class Document:
    def __init__(self, page_content, metadata=None):
        self.page_content = page_content
        self.metadata = metadata if metadata is not None else {}

# Now, modify the split_text function to create Document objects from strings
from langchain.text_splitter import RecursiveCharacterTextSplitter  # This import may need to be updated

def split_text(doc_text):
    # Create an instance of the RecursiveCharacterTextSplitter with the desired configuration
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=600,
        chunk_overlap=100,
        length_function=len,
        add_start_index=True,
    )
    
    # Create a Document object from the input text string
    documents = [Document(doc_text)]
    
    # Now split the documents into chunks using the text_splitter
    chunks = text_splitter.split_documents(documents)
    
    # Here you can handle the chunks as needed, e.g., print the number of chunks
    print(f"Split 1 document into {len(chunks)} chunks.")
    
    # Return the chunks if necessary
    return chunks

# Use the function with the text from the DataFrame


In [6]:
def prompt_generation(ranked_text,query):
    context = ""
    for i in range(len(ranked_text)):
        context = f"{i+1}: {ranked_text['text'].iloc[i]}"
    prompt = f"""Generate legal advice for {query} 
                using the following contexual information {context}"""
    return prompt

In [16]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
def generate_text(prompt,tokenizer,model):
    """
    Generate text from a pre-trained language model given a prompt and a model name.

    Parameters:
    prompt (str): The prompt text to feed to the language model.
    model_name (str): The model identifier on Hugging Face's model hub.

    Returns:
    str: The text generated by the model.
    """

    # Encode the prompt text
    input_ids = tokenizer.encode(prompt, return_tensors='pt')

    # Generate text using the model
    output = model.generate(input_ids, max_length=100,max_new_tokens = 50)

    # Decode the generated text
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    
    return generated_text

In [17]:
print(df)

   page_no                                               text
0        0  1  1  THE GOODS AND SERVICES TAX (COMPENSATION...
1        1  2  2  THE GOODS AND SERVICES TAX (COMPENSATION...
2        2  3  3  (n) “State tax” means the State goods an...
3        3  4  4  (b) tax levied under the Central Sales T...
4        4  5  5  (iii) any collection of taxes on account...
5        5  6  6  (2) The cess shall be levied on such sup...
6        6  7  7  (5) The accounts of the Fund, as certifi...
7        7  8  8  THE SCHEDULE   THE SCHEDULE   [See secti...


In [18]:
def chunking_process(book_df):
    chunked_dict = {}
    chunked_dict['text'] = []
    for i in range(len(book_df)):
        chunked_data = split_text(book_df['text'].iloc[i])
        chunked_dict['text'].extend([page_data.page_content for page_data in chunked_data])
    return pd.DataFrame.from_dict(chunked_dict)

In [None]:
model_name='bigscience/bloom-3b'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

In [None]:
book_path = 'A2017-15_2.pdf'
df = combine_into_dataframe(book_path)
chunked_data = chunking_process(df)
query = 'is service tax chargeable after the coming of GST ?'
top_results = calculate_bm25_scores(chunked_data,query)
generated_text = generate_text(prompt_generation(top_results,query),tokenizer,model)
print(generated_text)

Processing page: 0
Processing page: 1
Processing page: 2
Processing page: 3
Processing page: 4
Processing page: 5
Processing page: 6
Processing page: 7
Split 1 document into 3 chunks.
Split 1 document into 11 chunks.
Split 1 document into 16 chunks.
Split 1 document into 17 chunks.
Split 1 document into 18 chunks.
Split 1 document into 17 chunks.
Split 1 document into 17 chunks.
Split 1 document into 10 chunks.


Both `max_new_tokens` (=50) and `max_length`(=100) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


In [None]:
# Assume df is your DataFrame and it has a column 'text' with text data
# chunks_data = split_text(df['text'].iloc[0])
# query = str(input())