In [10]:
from document_extraction_ngram_bm25 import *
from bs4 import BeautifulSoup
import fitz  # PyMuPDF
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
import shutil
def get_text_from_page(page):
    """Extracts text from a single page."""
    html_text = page.get_text('html')
    soup = BeautifulSoup(html_text, 'html.parser')
    text_blocks = soup.find_all(['p', 'span'])  # Get paragraphs and span tags which may contain text.
    page_text = " ".join(block.text for block in text_blocks)
    return page_text.strip()

def combine_into_dataframe(book_path):
    """Combines all pages text into a pandas DataFrame."""
    doc = fitz.open(book_path)
    doc_info = {
        'page_no': [],
        'text': []
    }
    for page_no in range(len(doc)):
        print(f'Processing page: {page_no}')
        page = doc[page_no]
        page_text = get_text_from_page(page)
        
        doc_info['page_no'].append(page_no)
        doc_info['text'].append(page_text)
        
    df = pd.DataFrame.from_dict(doc_info)
    return df
# Define a simple Document class if it's not already defined
class Document:
    def __init__(self, page_content, metadata=None):
        self.page_content = page_content
        self.metadata = metadata if metadata is not None else {}

# Now, modify the split_text function to create Document objects from strings
from langchain.text_splitter import RecursiveCharacterTextSplitter  # This import may need to be updated

def split_text(doc_text):
    # Create an instance of the RecursiveCharacterTextSplitter with the desired configuration
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=600,
        chunk_overlap=100,
        length_function=len,
        add_start_index=True,
    )
    
    # Create a Document object from the input text string
    documents = [Document(doc_text)]
    
    # Now split the documents into chunks using the text_splitter
    chunks = text_splitter.split_documents(documents)
    
    # Here you can handle the chunks as needed, e.g., print the number of chunks
    print(f"Split 1 document into {len(chunks)} chunks.")
    
    # Return the chunks if necessary
    return chunks

# Use the function with the text from the DataFrame


In [11]:
def prompt_generation(ranked_text,query):
    context = ""
    for i in range(len(ranked_text)):
        context = f"{i+1}: {ranked_text['text'].iloc[i]}"
    prompt = f"""Generate legal advice for {query} 
                using the following contexual information {context}"""
    return prompt

In [48]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
def generate_text(prompt,tokenizer,model):
    """
    Generate text from a pre-trained language model given a prompt and a model name.

    Parameters:
    prompt (str): The prompt text to feed to the language model.
    model_name (str): The model identifier on Hugging Face's model hub.

    Returns:
    str: The text generated by the model.
    """

    # Encode the prompt text
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)
    print("Done")
    # Generate text using the model
    output = model.generate(
    input_ids,
    max_length=250,
    max_new_tokens=250,
    temperature=0.5,  # Adjust temperature for determinism
    top_p=0.95,       # Narrow down while allowing some diversity
    no_repeat_ngram_size=2  # Prevent repeating n-grams
    )
    
    # Decode the generated text
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    cleaned_text = generated_text.split("Generate legal advice for")[1].strip()  # Simplified example
    return cleaned_text

In [13]:
def chunking_process(book_df):
    chunked_dict = {}
    chunked_dict['text'] = []
    for i in range(len(book_df)):
        chunked_data = split_text(book_df['text'].iloc[i])
        chunked_dict['text'].extend([page_data.page_content for page_data in chunked_data])
    return pd.DataFrame.from_dict(chunked_dict)

In [14]:
model_name='tomrb/bettercallbloom-3b'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/266 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/96.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/798 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/31.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.91G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/4.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  return self.fget.__get__(instance, owner)()


In [16]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [23]:
model = model.to(device)

In [59]:
book_path = '/kaggle/input/bookpdf/A2017-15_2.pdf'
df = combine_into_dataframe(book_path)
chunked_data = chunking_process(df)
query = "Is Service charge applicable over GST?"
top_results = calculate_bm25_scores(chunked_data,query,[10])
prompt = prompt_generation(top_results,query)
generated_text = generate_text(prompt,tokenizer,model)

Processing page: 0
Processing page: 1
Processing page: 2
Processing page: 3
Processing page: 4
Processing page: 5
Processing page: 6
Processing page: 7
Split 1 document into 3 chunks.
Split 1 document into 11 chunks.
Split 1 document into 16 chunks.
Split 1 document into 17 chunks.
Split 1 document into 18 chunks.
Split 1 document into 17 chunks.
Split 1 document into 17 chunks.
Split 1 document into 10 chunks.


Both `max_new_tokens` (=250) and `max_length`(=250) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Done


In [60]:
print(prompt)

Generate legal advice for Is Service charge applicable over GST? 
                using the following contexual information 109: (m) “State” means,––  ( m ) “State” means,––  (i) for the purposes of sections 3, 4, 5, 6 and 7 the States as defined under the Central Goods  ( i ) for the purposes of sections 3, 4, 5, 6 and 7 the States as defined under the Central Goods  and Services Tax Act; and  and Services Tax Act; and  (ii) for the purposes of sections 8, 9, 10, 11, 12, 13 and 14 the States as defined under the Central  ( ii ) for the purposes of sections 8, 9, 10, 11, 12, 13 and 14 the States as defined under the Central  Goods and Services Tax Act and the Union territories as defined under the Union Territories


In [61]:
print(generated_text)

Is Service charge applicable over GST? 
                using the following contexual information 109: (m) “State” means,––  ( m ) “State” means,––  (i) for the purposes of sections 3, 4, 5, 6 and 7 the States as defined under the Central Goods  ( i ) for the purposes of sections 3, 4, 5, 6 and 7 the States as defined under the Central Goods  and Services Tax Act; and  and Services Tax Act; and  (ii) for the purposes of sections 8, 9, 10, 11, 12, 13 and 14 the States as defined under the Central  ( ii ) for the purposes of sections 8, 9, 10, 11, 12, 13 and 14 the States as defined under the Central  Goods and Services Tax Act and the Union territories as defined under the Union Territories  goods and services tax Act, and (j) the territories under section 1 of the Customs ( j ) the Territories under Section 1  of Customs Acts, including the Northern Territory, the Australian Capital Territory,  the Tasmania, South Australia, Queensland, Western Australia and New  South Wales, as well a

In [62]:
query2 = "How is revenue calculated over an year"
top_results2 = calculate_bm25_scores(chunked_data,query2,[10])
prompt2 = prompt_generation(top_results2,query2)
generated_text2 = generate_text(prompt2,tokenizer,model)

Both `max_new_tokens` (=250) and `max_length`(=250) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Done


In [63]:
print(prompt2)

Generate legal advice for How is revenue calculated over an year 
                using the following contexual information 109: (m) “State” means,––  ( m ) “State” means,––  (i) for the purposes of sections 3, 4, 5, 6 and 7 the States as defined under the Central Goods  ( i ) for the purposes of sections 3, 4, 5, 6 and 7 the States as defined under the Central Goods  and Services Tax Act; and  and Services Tax Act; and  (ii) for the purposes of sections 8, 9, 10, 11, 12, 13 and 14 the States as defined under the Central  ( ii ) for the purposes of sections 8, 9, 10, 11, 12, 13 and 14 the States as defined under the Central  Goods and Services Tax Act and the Union territories as defined under the Union Territories


In [64]:
print(generated_text2)

How is revenue calculated over an year 
                using the following contexual information 109: (m) “State” means,––  ( m ) “State” means,––  (i) for the purposes of sections 3, 4, 5, 6 and 7 the States as defined under the Central Goods  ( i ) for the purposes of sections 3, 4, 5, 6 and 7 the States as defined under the Central Goods  and Services Tax Act; and  and Services Tax Act; and  (ii) for the purposes of sections 8, 9, 10, 11, 12, 13 and 14 the States as defined under the Central  ( ii ) for the purposes of sections 8, 9, 10, 11, 12, 13 and 14 the States as defined under the Central  Goods and Services Tax Act and the Union territories as defined under the Union Territories  goods and services tax Act, and (iii) the Federal Territory of Fiji, the  Federal  Territory  of  Fiji,  the Federated States of Micronesia, Federally  independent  States  in  Micronesia, Solomon Islands, Vanuatu, New Zealand, Fiji and Papua New Guinea.”

I am not sure if this is the right place to

In [68]:
query3 = "My friend is refusing to return money what should I do?"
top_results3 = calculate_bm25_scores(chunked_data,query3,[10])
prompt3 = prompt_generation(top_results3,query3)
generated_text3 = generate_text(prompt3,tokenizer,model)

Both `max_new_tokens` (=250) and `max_length`(=250) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Done


In [69]:
print(prompt3)

Generate legal advice for My friend is refusing to return money what should I do? 
                using the following contexual information 109: (m) “State” means,––  ( m ) “State” means,––  (i) for the purposes of sections 3, 4, 5, 6 and 7 the States as defined under the Central Goods  ( i ) for the purposes of sections 3, 4, 5, 6 and 7 the States as defined under the Central Goods  and Services Tax Act; and  and Services Tax Act; and  (ii) for the purposes of sections 8, 9, 10, 11, 12, 13 and 14 the States as defined under the Central  ( ii ) for the purposes of sections 8, 9, 10, 11, 12, 13 and 14 the States as defined under the Central  Goods and Services Tax Act and the Union territories as defined under the Union Territories


In [70]:
print(generated_text3)

My friend is refusing to return money what should I do? 
                using the following contexual information 109: (m) “State” means,––  ( m ) “State” means,––  (i) for the purposes of sections 3, 4, 5, 6 and 7 the States as defined under the Central Goods  ( i ) for the purposes of sections 3, 4, 5, 6 and 7 the States as defined under the Central Goods  and Services Tax Act; and  and Services Tax Act; and  (ii) for the purposes of sections 8, 9, 10, 11, 12, 13 and 14 the States as defined under the Central  ( ii ) for the purposes of sections 8, 9, 10, 11, 12, 13 and 14 the States as defined under the Central  Goods and Services Tax Act and the Union territories as defined under the Union Territories  goods and services tax Act, and (j) the words “or any other State” in the definition of “goods” and “services”  in section 1 of the Act shall be replaced by the  words, “and any goods or services”, respectively, in order to ensure that the provisions of  the act are not affected by 