### 0. Import libraries and load data

In [1]:
import os
import json
import pandas as pd

# Folder holding the fold files; every entry in it is listed and read as a CSV.
# NOTE: keep the trailing slash -- code in this notebook may build paths by
# plain string concatenation with these folder constants.
DATA_FOLDER_PATH = "./folds_data/-paraphrase-multilingual-MiniLM-L12-v2_3015/"
# JSON file with the grouped sections; it is loaded into a dict and iterated
# as {filename: {page_number: grouped_sections}}.
GROUPED_SECTIONS_FILE_PATH = "../datascraping/data/grouped_sections.json"
# Folder where the resulting "<fold>_pages.csv" files are written.
STORAGE_PATH = "./rag_pages/"

In [2]:
# load grouped sections
# JSON text is UTF-8; pass the encoding explicitly so decoding does not depend
# on the platform default (which would corrupt non-ASCII text on some systems
# and make later section matching fail).
with open(GROUPED_SECTIONS_FILE_PATH, encoding="utf-8") as file:
    grouped_sections = dict(json.load(file))

# get filenames of folds
# os.listdir returns entries in arbitrary order; sort them so the folds are
# always processed (and logged) in the same order.
dataset_names = sorted(os.listdir(DATA_FOLDER_PATH))

### 1. Get and store the pages that contain the sections from each fold

In [3]:
def find_sections(grouped_sections, filename, section):
    """
    Locate the single page of ``filename`` whose grouped sections contain ``section``.

    Parameters
    ----------
    grouped_sections : dict
        Mapping of the form ``{filename: {page_number: grouped_sections_}}``.
    filename : hashable
        Key of the PDF to search in ``grouped_sections``.
    section : object
        Value tested with ``section in grouped_sections_`` against every page
        of that file (a substring test if the page value is a string, a
        membership test if it is a list).

    Returns
    -------
    tuple
        ``(filename, page_number, grouped_sections_)`` for the unique match.

    Raises
    ------
    ValueError
        If the section is not found in the file (this includes the file being
        absent from ``grouped_sections``), or if it is found on more than one
        page. ``ValueError`` is a subclass of ``Exception``, so callers that
        catch ``Exception`` keep working.
    """
    # Only pages of the requested file can ever match, so look the file up
    # directly instead of scanning every PDF in the mapping.
    pages = grouped_sections.get(filename, {})

    matches = [
        (filename, page_number, grouped_sections_)
        for page_number, grouped_sections_ in pages.items()
        if section in grouped_sections_
    ]

    # Report the two failure modes separately so the caller can tell which
    # one actually happened and where.
    if not matches:
        raise ValueError(f"Section not found in the data for file {filename!r}.")
    if len(matches) > 1:
        duplicated_pages = [page_number for _, page_number, _ in matches]
        raise ValueError(
            f"Section duplicated in the data for file {filename!r} "
            f"(found on pages {duplicated_pages})."
        )

    return matches[0]


In [4]:
# Make sure the output folder exists before any fold is written.
os.makedirs(STORAGE_PATH, exist_ok=True)

for dataset_name in dataset_names:
    # The folder listing may contain non-fold entries (e.g. a
    # ".ipynb_checkpoints" directory); only CSV files are folds.
    if not dataset_name.lower().endswith(".csv"):
        continue

    fold_sections = pd.read_csv(os.path.join(DATA_FOLDER_PATH, dataset_name), index_col=0)
    sections = fold_sections["section_text"].tolist()
    filenames = fold_sections["filename"].tolist()

    print(f"{dataset_name} - {len(sections)} sections")

    # One (filename, page number, page text) tuple per section of the fold;
    # find_sections raises if a section is missing or ambiguous.
    filtered_grouped_sections = []

    for filename, section in zip(filenames, sections):
        filtered_grouped_sections.append(
            find_sections(grouped_sections, filename, section)
        )

    filtered_grouped_sections_df = pd.DataFrame(
        filtered_grouped_sections, columns=["filename", "page number", "page text"]
    )

    # Strip the extension with splitext rather than a fixed-width slice, and
    # write "<fold>_pages.csv" without the index column.
    output_name = os.path.splitext(dataset_name)[0] + "_pages.csv"
    filtered_grouped_sections_df.to_csv(os.path.join(STORAGE_PATH, output_name), index=False)


test_fold_one.csv - 50 sections
test_fold_two.csv - 50 sections
train_fold_one.csv - 200 sections
train_fold_two.csv - 200 sections
