In [1]:
# Mount Google Drive at /content/drive so the dataset paths used in later
# cells resolve. This relies on the google.colab package (Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install PyMuPDF
!pip install json


Collecting PyMuPDF
  Downloading PyMuPDF-1.24.7-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.5/3.5 MB 27.9 MB/s eta 0:00:00
Collecting PyMuPDFb==1.24.6 (from PyMuPDF)
  Downloading PyMuPDFb-1.24.6-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (15.7 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 15.7/15.7 MB 63.1 MB/s eta 0:00:00
Installing collected packages: PyMuPDFb, PyMuPDF
Successfully installed PyMuPDF-1.24.7 PyMuPDFb-1.24.6
ERROR: Could not find a version that satisfies the requirement json (from versions: none)
ERROR: No matching distribution found for json


In [3]:
import fitz  # PyMuPDF
import os
import json

def extract_text_from_pdf(pdf_path):
    """Return the text of each page of a PDF as a list of strings.

    Pages are read in order. Any exception raised while opening or reading
    the file is reported with ``print`` instead of being propagated; the
    pages collected before the failure are still returned (an empty list if
    nothing could be read).

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list holding one ``get_text()`` result per page that was read.
    """
    page_texts = []
    try:
        with fitz.open(pdf_path) as document:
            total_pages = len(document)
            for index in range(total_pages):
                current_page = document.load_page(index)
                page_texts.append(current_page.get_text())
    except Exception as error:
        # Best-effort: report the problem, then fall through and return
        # whatever was gathered so far.
        print(f"Error reading {pdf_path}: {error}")
    return page_texts


In [4]:
def convert_to_json_format(pdf_pages, title, subject):
    """Bundle a book's page values and metadata into a single dict.

    Args:
        pdf_pages: Iterable of per-page values, in page order.
        title: Value stored under the "title" key.
        subject: Value stored under the "subject" key.

    Returns:
        A dict with the keys "title", "subject" and "content" (in that
        order), where "content" maps "Page1", "Page2", ... (1-based) to the
        corresponding entry of ``pdf_pages``.
    """
    pages_by_label = {}
    for page_number, page_text in enumerate(pdf_pages, start=1):
        pages_by_label[f"Page{page_number}"] = page_text

    book_record = {"title": title, "subject": subject}
    book_record["content"] = pages_by_label
    return book_record


In [5]:
import os

# Root folder of the dataset; each subject has its own sub-folder of PDFs.
base_path = '/content/drive/MyDrive/Dataset_498R'
subjects = ['eng', 'math', 'science']
data = []

for subject in subjects:
    folder_path = os.path.join(base_path, subject)
    if not os.path.isdir(folder_path):
        # Report a missing subject folder instead of skipping it silently.
        print(f"Skipping missing folder: {folder_path}")
        continue
    # os.listdir returns names in arbitrary order; sort so the generated JSON
    # lists books in the same order on every run.
    for filename in sorted(os.listdir(folder_path)):
        # Compare the extension case-insensitively so "BOOK.PDF" is not dropped.
        if not filename.lower().endswith('.pdf'):
            continue
        pdf_path = os.path.join(folder_path, filename)
        pdf_pages = extract_text_from_pdf(pdf_path)
        book_json = convert_to_json_format(pdf_pages, filename, subject)
        data.append(book_json)

# Save the data to a JSON file next to the subject folders.
json_output_path = os.path.join(base_path, 'books_data.json')
with open(json_output_path, 'w') as json_file:
    json.dump(data, json_file, indent=4)


In [6]:
import re

# input_file: JSON file that is read and cleaned by this cell.
# output_file: destination for the cleaned copy written at the end of the cell.
input_file = '/content/drive/MyDrive/Dataset_498R/books_data.json'
output_file = '/content/drive/MyDrive/Dataset_498R/filtered_books_data.json'
def filter_and_format_data(data):
    """Clean the page text of every book record.

    For each page, occurrences of "Reprint YYYY-YY" (e.g. "Reprint 2024-25")
    are removed, every run of whitespace is collapsed to a single space, and
    leading/trailing whitespace is stripped.

    Args:
        data: Iterable of dicts. Each may carry "title", "subject" and
            "content" (a mapping of page label to page text); missing keys
            fall back to 'Unknown Title', 'Unknown Subject' and an empty
            mapping respectively.

    Returns:
        A new list of dicts with the keys "title", "subject" and "content",
        where "content" holds the cleaned text for each page label.
    """
    def _clean_page(raw_text):
        # Drop the reprint marker first, then normalize whitespace.
        without_marker = re.sub(r'Reprint \d{4}-\d{2}', '', raw_text)
        return re.sub(r'\s+', ' ', without_marker).strip()

    return [
        {
            "title": record.get('title', 'Unknown Title'),
            "subject": record.get('subject', 'Unknown Subject'),
            "content": {
                page_label: _clean_page(page_text)
                for page_label, page_text in record.get('content', {}).items()
            },
        }
        for record in data
    ]

# Load the book records from input_file. Note: this rebinds the module-level
# name `data`.
with open(input_file, 'r') as infile:
    data = json.load(infile)

# Clean every page's text with filter_and_format_data.
formatted_data = filter_and_format_data(data)


# Write the cleaned records to output_file as JSON indented by 4 spaces.
with open(output_file, 'w') as outfile:
    json.dump(formatted_data, outfile, indent=4)

print(f"Filtered and formatted data has been saved to {output_file}")

Filtered and formatted data has been saved to /content/drive/MyDrive/Dataset_498R/filtered_books_data.json
