In [1]:
import os
import PyPDF2
from PyPDF2 import PdfReader
import pandas as pd
import re
import sys
from document_retrieval_BM25 import get_query_results



In [2]:
# Input and output folders are resolved relative to the PARENT of the current working directory.
dataset_path = os.path.join(os.path.dirname(os.getcwd()), 'Dataset')
csv_path = os.path.join(os.path.dirname(os.getcwd()),'res','csv_etl_files')
# File names (not full paths) found in the Dataset folder; os.listdir returns them in arbitrary order.
pdf_paths = os.listdir(dataset_path)

In [5]:
# Show the file names discovered in the Dataset folder.
print(pdf_paths)

['A2019-17.pdf', 'A2017-10_3.pdf', 'A2016-36_0.pdf', 'A2016-30_3.pdf', 'A2016-16_0.pdf', 'A2015-11.pdf', 'A2017-12_5.pdf', 'A2019-10.pdf', 'A2016-32_0.pdf', 'A2017-14_1.pdf', 'Indian Institutes of Management Act 2017.pdf', 'A2017-16_2.pdf', 'A2017-13_1.pdf', 'A2018-02_0.pdf', 'A2018-24_0.pdf', 'A2016-11_1.pdf', 'A2018-04_0.pdf', 'A2016-49_1.pdf', 'A2018-25.pdf', 'A2017-15_2.pdf', 'A2016-17_1.pdf', 'A2017-26_1.pdf', 'A2016-4_1.pdf', 'A2017-02_2.pdf', 'A2018-17.pdf', 'A2018-03.pdf', 'A2017-20_0.pdf', 'A2017-22_1.pdf', 'A2016-2_0.pdf', 'A2016-18_0.pdf', 'A2015-22.pdf', 'A2019-21.pdf', 'A2017-27_1.pdf', 'A2019-20.pdf', 'A2017-23_0.pdf', 'The Insolvency and Bankruptcy Code, 2016..pdf', 'A2016-38_1.pdf']


In [9]:
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF.

    Args:
        pdf_path: Path of the PDF file to open (read in binary mode).

    Returns:
        A list of ``(page_text, page_number)`` tuples in page order, where
        ``page_number`` starts at 1.
    """
    pages_with_numbers = []
    with open(pdf_path, 'rb') as pdf_file:
        pdf = PdfReader(pdf_file)
        # Text is extracted while the file is still open; numbering is 1-based.
        for number, page in enumerate(pdf.pages, start=1):
            pages_with_numbers.append((page.extract_text(), number))
    return pages_with_numbers

def process_data_and_save_to_dataframe(pdf_path):
    """Turn a PDF into a line-level DataFrame.

    Every non-blank line of every page becomes one row, except lines that are
    recognised as titles. A line counts as a title when it is upper-case and,
    once leading/trailing dots are stripped, consists of letters only (so it
    contains no spaces or digits). The most recent title seen on the SAME page
    is attached to the rows that follow it; the title resets to '' at the
    start of each page.

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        A DataFrame built from records with the keys 'Title',
        'Paragraph Text' and 'Page No'.
    """
    records = []

    for page_text, page_no in extract_text_from_pdf(pdf_path):
        current_title = ''
        for raw_line in page_text.split('\n'):
            stripped = raw_line.strip()
            if not stripped:
                # Blank lines carry no content.
                continue

            looks_like_title = (
                stripped.isupper()
                and stripped.strip('.').isalpha()
                and not stripped.isdigit()
            )
            if looks_like_title:
                # Title lines update the running title and are not emitted as rows.
                current_title = stripped
                continue

            records.append({
                'Title': current_title,
                'Paragraph Text': stripped,
                'Page No': page_no,
            })

    return pd.DataFrame(records)


In [25]:
def split_text(text, max_length=500, overlap=50, min_final_chunk_length=150):
    """Cut ``text`` into consecutive chunks, preferring to break at a space.

    The chunks are contiguous slices of ``text``: joining them reproduces the
    input exactly and no characters are shared between chunks.

    Args:
        text: The string to split.
        max_length: Nominal chunk size. Text no longer than this is returned
            as a single chunk.
        overlap: Extra characters past ``max_length`` that are searched for a
            space to break on, so a chunk may be up to ``max_length + overlap``
            characters long. It does not make chunks overlap.
        min_final_chunk_length: If the last chunk is shorter than this, it is
            appended to the previous chunk.

    Returns:
        A list of chunk strings.
    """
    total = len(text)
    if total <= max_length:
        return [text]

    pieces = []
    cursor = 0

    while cursor < total:
        window_end = min(cursor + max_length, total)

        if window_end == total:
            # The remainder fits in one window: take it all and stop.
            pieces.append(text[cursor:])
            break

        # Right-most space between the cursor and the extended window end.
        cut = text.rfind(' ', cursor, window_end + overlap)
        if cut <= cursor:
            # No usable space (none found, or only the one the chunk starts
            # on): fall back to a hard cut at the nominal window end.
            cut = window_end

        pieces.append(text[cursor:cut])
        cursor = cut

    # Fold an undersized trailing chunk into its predecessor.
    if len(pieces) > 1 and len(pieces[-1]) < min_final_chunk_length:
        tail = pieces.pop()
        pieces[-1] += tail

    return pieces


In [26]:
def _append_paragraph_chunks(combined, title, page_no, paragraph_text):
    """Split one paragraph with split_text and append one row per chunk to ``combined``."""
    chunks = split_text(paragraph_text)
    combined['Title'].extend([title] * len(chunks))
    combined['Paragraph Text'].extend(chunks)
    combined['Page No'].extend([page_no] * len(chunks))


def combining_text_info(dataframe):
    """Merge line-level rows into numbered paragraphs and chunk them.

    A paragraph starts at a row whose text matches ``<digits>. <text>`` and
    runs until the next such row. The lines of a paragraph are concatenated
    and passed through ``split_text``; each resulting chunk becomes one output
    row carrying the 'Title' and 'Page No' of the row where THAT paragraph
    started.

    Rows that appear before the first numbered paragraph are discarded. If the
    frame is empty or contains no numbered paragraph, an empty frame with the
    three expected columns is returned.

    Args:
        dataframe: Frame with the columns 'Title', 'Paragraph Text' and
            'Page No', one row per extracted line.

    Returns:
        A DataFrame with the columns 'Title', 'Paragraph Text' and 'Page No',
        one row per chunk.
    """
    columns = ['Title', 'Paragraph Text', 'Page No']
    combined = {name: [] for name in columns}
    if dataframe.empty:
        return pd.DataFrame(combined, columns=columns)

    paragraph_start_pattern = re.compile(r'^(\d+\.\s+.+)')
    # Work on plain lists and positions so the lookup does not depend on the
    # frame's index labels.
    titles = dataframe['Title'].tolist()
    page_numbers = dataframe['Page No'].tolist()

    paragraph_start = None  # position of the row that opened the current paragraph
    paragraph_text = ""
    for position, raw_text in enumerate(dataframe['Paragraph Text'].tolist()):
        text = str(raw_text).strip()
        if paragraph_start_pattern.match(text):
            if paragraph_start is not None:
                # Flush the paragraph that just ended, using ITS OWN start row
                # for the title and page number.
                _append_paragraph_chunks(
                    combined,
                    titles[paragraph_start],
                    page_numbers[paragraph_start],
                    paragraph_text,
                )
            paragraph_start = position
            paragraph_text = ""

        if paragraph_start is not None:
            # NOTE(review): lines are concatenated without a separator, so the
            # last word of one line is fused with the first word of the next.
            paragraph_text += text

    if paragraph_start is not None:
        # The last paragraph is chunked exactly like the others.
        _append_paragraph_chunks(
            combined,
            titles[paragraph_start],
            page_numbers[paragraph_start],
            paragraph_text,
        )

    return pd.DataFrame(combined, columns=columns)

In [27]:
def pipeline(dataset_path,pdf_paths,csv_path):
    """Convert every PDF in ``pdf_paths`` into a chunked CSV file.

    For each PDF the lines are extracted with
    ``process_data_and_save_to_dataframe``, merged and chunked with
    ``combining_text_info``, and written to ``csv_path`` under the same base
    name with a ``.csv`` extension.

    Args:
        dataset_path: Folder that contains the PDF files.
        pdf_paths: File names (relative to ``dataset_path``) to process.
            Entries whose extension is not ``.pdf`` (case-insensitive) are
            skipped, so stray directory entries do not abort the run.
        csv_path: Folder the CSV files are written to; created if missing.
    """
    os.makedirs(csv_path, exist_ok=True)
    for pdf_name in pdf_paths:
        # splitext handles extensions of any length, unlike slicing off three characters.
        stem, extension = os.path.splitext(pdf_name)
        if extension.lower() != '.pdf':
            continue
        df = process_data_and_save_to_dataframe(os.path.join(dataset_path, pdf_name))
        com_df = combining_text_info(df)
        com_df.to_csv(os.path.join(csv_path, stem + '.csv'))


In [3]:
from doc_spell_checker import split_and_correct
from tqdm import tqdm
def split_and_correct(csv_path):
    """Rewrite every file in ``csv_path`` with corrected 'Paragraph Text' values.

    Each file is read with pandas, every entry of its 'Paragraph Text' column
    is passed through ``correct_concatenated_and_spaced_text``, and the frame
    is written back to the same path. Progress is shown with tqdm.

    NOTE(review): this definition rebinds the name ``split_and_correct`` that
    is imported from ``doc_spell_checker`` in the same cell, and
    ``correct_concatenated_and_spaced_text`` is not imported in any visible
    cell — confirm where it is expected to come from.
    NOTE(review): ``to_csv`` is called with its default index handling, so the
    row index is written as an extra column each time a file is rewritten.

    Args:
        csv_path: Folder whose files are rewritten in place.
    """
    file_names = os.listdir(csv_path)
    for position in tqdm(range(len(file_names))):
        target = csv_path+"/"+file_names[position]
        columns = pd.read_csv(target).to_dict(orient='list')
        columns['Paragraph Text'] = [
            correct_concatenated_and_spaced_text(paragraph)
            for paragraph in columns['Paragraph Text']
        ]
        pd.DataFrame.from_dict(columns).to_csv(target)

[nltk_data] Downloading package words to
[nltk_data]     /Users/devansharora/nltk_data...
[nltk_data]   Package words is already up-to-date!


'word1word2' corrected to: 'w o r d 1 w o r d 2'
'word1.word2' corrected to: 'w o r d 1. w o r d 2'
'definitions.chapter' corrected to: 'd e f i n i t i o n s. chapter'
'anexampletestword' corrected to: 'a n e x a m p l e t e s t w o r d'


In [6]:
# Apply the spell-correction pass to every CSV in csv_path.
# NOTE(review): `pipeline_spell_correction` is not defined or imported in any visible cell (the
# preceding cell defines `split_and_correct(csv_path)`); presumably it survives from an earlier
# kernel state — confirm, otherwise this fails on a fresh Restart & Run All.
pipeline_spell_correction(csv_path)

100%|███████████████████████████████████████████| 37/37 [00:00<00:00, 37.24it/s]


In [7]:
import csv
import pickle
import spacy
from collections import defaultdict
def create_inverted_index_docs(tokenized_documents):
    """Build a token -> document-position inverted index.

    Args:
        tokenized_documents: Iterable of token iterables, one per document.

    Returns:
        A ``defaultdict(set)`` mapping each token to the set of positions
        (0-based, in input order) of the documents that contain it.
    """
    postings = defaultdict(set)
    occurrences = (
        (term, doc_position)
        for doc_position, doc_terms in enumerate(tokenized_documents)
        for term in doc_terms
    )
    for term, doc_position in occurrences:
        postings[term].add(doc_position)
    return postings

def preprocess_documents(documents):
    """Split row dictionaries into three parallel lists.

    Args:
        documents: Sequence of mappings that each provide the keys 'Title',
            'Paragraph Text' and 'Page No'.

    Returns:
        A tuple ``(titles, bodies, pages)`` of lists in input order.
    """
    def column(field):
        # One pass over the documents per field.
        return [doc[field] for doc in documents]

    return column('Title'), column('Paragraph Text'), column('Page No')
# Module-level spaCy pipeline read by preprocess_text_spacy; the parser and NER components are disabled.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])  # For preprocessing

def preprocess_text_spacy(text):
    """Lower-case ``text`` and return its lemmas as one space-separated string.

    The text is run through the module-level ``nlp`` pipeline; stop words and
    punctuation tokens are dropped.
    """
    kept_lemmas = []
    for token in nlp(text.lower()):
        if token.is_stop or token.is_punct:
            continue
        kept_lemmas.append(token.lemma_)
    return ' '.join(kept_lemmas)
def read_csv(file_path):
    """Read a UTF-8 CSV file into a list of row dictionaries.

    Args:
        file_path: Path of the CSV file; the first row is used as the header.

    Returns:
        A list with one mapping per data row, keyed by column name. All values
        are strings.
    """
    # newline='' lets the csv module do its own newline handling, so line
    # breaks embedded in quoted fields are returned unchanged.
    with open(file_path, 'r', encoding='utf-8', newline='') as csvfile:
        return list(csv.DictReader(csvfile))
def create_inverted_index(dir_path=None, output_path='inverted_index.pkl'):
    """Build an inverted index over the 'Paragraph Text' of every ETL CSV file.

    The CSV files are read in sorted file-name order, each paragraph is
    normalised with ``preprocess_text_spacy`` and split on whitespace, and the
    resulting index is pickled to ``output_path`` before being returned.
    Document positions in the index refer to rows in that concatenated order.

    Args:
        dir_path: Folder containing the CSV files. Defaults to
            ``<parent of cwd>/res/csv_etl_files``.
        output_path: File the pickled index is written to. Defaults to
            ``inverted_index.pkl`` in the current working directory.

    Returns:
        The index produced by ``create_inverted_index_docs``.
    """
    if dir_path is None:
        dir_path = os.path.join(os.path.dirname(os.getcwd()),'res','csv_etl_files')
    print(dir_path)

    # Sorted order keeps document positions stable between runs; entries that
    # are not CSV files are ignored.
    csv_files = sorted(
        name for name in os.listdir(dir_path) if name.lower().endswith('.csv')
    )
    file_paths = [os.path.join(dir_path, name) for name in csv_files]

    print("Files present", dir_path, ":")
    dataset = []
    for file_path in file_paths:
        dataset.extend(read_csv(file_path))

    # Only the paragraph bodies are indexed.
    _, bodies, _ = preprocess_documents(dataset)
    # preprocess_text_spacy uses the module-level spaCy pipeline, so the model
    # is not loaded again here.
    tokenized_documents = [preprocess_text_spacy(body).split() for body in bodies]

    inverted_index = create_inverted_index_docs(tokenized_documents)
    with open(output_path, 'wb') as file:
        pickle.dump(inverted_index, file)
    return inverted_index



In [8]:
# Build the inverted index over the ETL CSV files (the call also pickles it) and keep it for the vocabulary checks.
inv = create_inverted_index()

/Users/devansharora/Desktop/IR/IR-Project/res/csv_etl_files
Files present /Users/devansharora/Desktop/IR/IR-Project/res/csv_etl_files :


In [9]:
import nltk
# Fetch the WordNet corpus into the local nltk_data folder.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/devansharora/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [10]:
import nltk
# Fetch the NLTK word list into the local nltk_data folder.
nltk.download('words')

# NOTE(review): `wn` is not referenced in any visible cell — confirm it is still needed.
from nltk.corpus import wordnet as wn
# Vocabulary set used to flag index terms that are not in the NLTK word list.
words = set(nltk.corpus.words.words())

[nltk_data] Downloading package words to
[nltk_data]     /Users/devansharora/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [11]:
import re

def contains_number(input_string):
    """Return True when ``input_string`` contains at least one digit, else False."""
    digit_match = re.search(r'\d', input_string)
    return digit_match is not None

In [12]:
# Index terms that are not in the word list and contain no digit.
key_list = list(inv.keys())
error_keys = []
for key in key_list:
    if key in words or contains_number(key):
        continue
    error_keys.append(key)
print(error_keys)

['definitions.chapter', 'iiauction', 'classify', 'icoal', 'disbursal', 'proceeds.chapter', 'iiitreatment', 'utilisation', 'secured', 'sandpermitte', 'forpayment', 'allottee.chapter', 'vpower', 'eddate', 'iicoal', 'vcertain', 'arra', 'ngement', 'arrangements.chapter', 'vi', 'realisation', 'offence', 'iv', 'saving.schedule', 'ii.schedule', 'iii.schedule', 'aview', 'thereto.wherea', 'vide', 'judgman', 'tdate', 'order;and', 'mini', 'nation;and', 'iof', 'pedientin', 'interest.be', '-sixth', 'india', 'specialprovision', 'october', 'expendiency', '―additional', '-five', 'tonne', 'coalextracted;(b', '―allotment', 'torder', '―appointe', 'ofseptember', 'stoodcancelle', 'and(ii', 'april', 'september', '―bank', 'securitisation', '―coal', '―company', 'thecompanie', 'sact', '―corporation', 'itin', 'sectio', '―financial', 'interestact', '―government', '―mine', 'miningoperation', 'centre', 'administrativeoffice', 'conveying', 'marcate', 'law;(k', '―nominated', 'ral', '―notificati', 'on‖', 'anotificati

In [13]:
# Index terms that are not in the word list and contain at least one digit.
error_num_keys = []
for key in key_list:
    if key in words or not contains_number(key):
        continue
    error_num_keys.append(key)

In [14]:
# Counts, in order: unknown terms without digits, unknown terms with digits, all index terms.
print(len(error_keys))
print(len(error_num_keys))
print(len(inv.keys()))

9333
5764
19930


In [28]:
# Run the PDF -> CSV extraction for every file name in pdf_paths.
# NOTE(review): this call is placed after the cells that read the CSV folder, so a fresh
# top-to-bottom run would build the index before the CSV files are (re)generated — confirm the intended order.
pipeline(dataset_path,pdf_paths,csv_path)

In [24]:
# Scratch check: length of a sample text fragment (presumably used to choose
# min_final_chunk_length for split_text — TODO confirm, or remove this cell).
a = " in the Sixty -sixth Year of the  Republic of India as follows: —CHAPTER I"
print(len(a))

74


In [4]:
# Run a sample query through get_query_results (imported from document_retrieval_BM25).
query="What is service charge?"
results = get_query_results(query)

/Users/devansharora/Desktop/IR/IR-Project/res/csv_etl_files
Files present /Users/devansharora/Desktop/IR/IR-Project/res/csv_etl_files :


In [5]:
# Display the raw retrieval result for the sample query.
print(results)

{"query": "service charge", "retrieval_results": ["supply clearly identifies the service in question and its supplier in non-taxable territory;(b)) the intermediary involved in the supply does not authorise the charge to the customer or take part in its charge which is that the intermediary neither collects or processes payment in any manner nor is responsible for the payment between the non-taxable online recipient and the supplier of suchservices;(c)) the intermediary involved in the supply does not authorise delivery; and(d)) the general terms and conditions of the supply are not set by the", "payment or reward in consideration of the adoption, except as permitted under the adoption regulations framed by the Authority towards the adoption fees or service charge or child care corpus.(2)) The adoption proceedings shall be held in camera and the case shall be disposed of by the court within aperiod of two months from the date of filing.", "82. Tax to be first charge on property.", "of 