## 1. Imports & Initial Setup
This first cell imports every module the notebook needs, grouped in one place. It also works around SSL certificate errors that can occur when NLTK downloads its data by switching Python's default HTTPS context to an unverified one, and then runs the NLTK downloader.

In [None]:
# Import statements
import os
import ssl
import nltk
import openai
import PyPDF2
from langchain_core.documents import Document
from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.parsers import GrobidParser
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.finetuning import (
    generate_qa_embedding_pairs,
    EmbeddingQAFinetuneDataset
)
from llama_index.core import SimpleDirectoryReader
from llama_index.llms.openai import OpenAI
from llama_index.core.node_parser import SemanticSplitterNodeParser
from llama_index.finetuning import SentenceTransformersFinetuneEngine
from huggingface_hub import notebook_login
from sentence_transformers import SentenceTransformer

# Work around SSL certificate failures (see the NLTK download below).
# NOTE: this installs the *unverified* context factory as the default HTTPS
# context factory, so it trades certificate checking for convenience.
if hasattr(ssl, "_create_unverified_context"):
    _create_unverified_https_context = ssl._create_unverified_context
    ssl._create_default_https_context = _create_unverified_https_context

# Fetch NLTK resources; called without arguments, so no specific package is
# requested here.
nltk.download()


## 2. Split PDF Files into Parts
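This cell defines functions that split large PDF files into smaller parts. The number of parts depends on the page count: 250–499 pages give 2 parts, 500–750 pages give 3 parts, and more than 750 pages give 4 parts; PDFs with fewer than 250 pages are not split.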

In [2]:
# Function to split PDFs into smaller parts
import os
import ssl
import nltk
import openai
import PyPDF2
def split_pdf(file_path, parts, output_dir):
    """Split one PDF into ``parts`` consecutive page ranges.

    Each range is written to ``output_dir`` as ``<base>_part_<n>.pdf``, where
    ``<base>`` is the input file name without its extension and ``n`` starts
    at 1. Pages are divided with integer division; any remainder goes to the
    last part.

    Args:
        file_path: Path of the PDF to split.
        parts: Requested number of output files; must be at least 1. When it
            exceeds the page count it is reduced to the page count so that no
            empty PDF is written.
        output_dir: Directory receiving the parts; created if it is missing.

    Raises:
        ValueError: If ``parts`` is smaller than 1.
    """
    # Validate before touching the file system so a bad call has no side effects.
    if parts < 1:
        raise ValueError(f"parts must be at least 1, got {parts}")

    # Writing the parts would fail if the target directory did not exist yet.
    os.makedirs(output_dir, exist_ok=True)

    # Get the base name of the file without the extension
    base_name = os.path.splitext(os.path.basename(file_path))[0]

    # Open the existing PDF; the reader needs the handle for the whole loop
    with open(file_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        total_pages = len(reader.pages)

        # Asking for more parts than pages would produce empty output files.
        parts = min(parts, total_pages)
        if parts == 0:
            # Zero-page document: there is nothing to write.
            return

        # Calculate the number of pages per part
        pages_per_part = total_pages // parts

        for part in range(parts):
            start_page = part * pages_per_part
            # Include all remaining pages in the last part
            is_last_part = part == parts - 1
            end_page = total_pages if is_last_part else start_page + pages_per_part

            writer = PyPDF2.PdfWriter()
            for page in range(start_page, end_page):
                writer.add_page(reader.pages[page])

            # Save the split PDF file
            part_path = os.path.join(output_dir, f"{base_name}_part_{part + 1}.pdf")
            with open(part_path, "wb") as part_file:
                writer.write(part_file)

# Process a directory of PDFs and split them
def _parts_for_page_count(total_pages):
    """Return the number of parts for a PDF of ``total_pages`` pages (0 = do not split)."""
    if total_pages > 750:
        return 4
    if total_pages >= 500:
        return 3
    if total_pages >= 250:
        return 2
    return 0


def process_directory(directory, destination_directory):
    """Split every sufficiently long PDF found directly in ``directory``.

    The page count of each ``.pdf`` file (extension matched case-insensitively)
    decides the number of parts: 250-499 pages -> 2, 500-750 pages -> 3, more
    than 750 pages -> 4. The parts are written by ``split_pdf``.

    Args:
        directory: Directory scanned (non-recursively) for PDF files.
        destination_directory: Directory receiving the split parts; created up
            front so it exists even when no PDF qualifies for splitting.

    NOTE(review): PDFs with fewer than 250 pages are neither split nor copied,
    so anything that reads only ``destination_directory`` never sees them -
    confirm this is intended.
    """
    os.makedirs(destination_directory, exist_ok=True)

    for filename in os.listdir(directory):
        if not filename.lower().endswith('.pdf'):
            continue

        file_path = os.path.join(directory, filename)

        # Only the page count is needed here; close the handle before
        # split_pdf opens the same file again.
        with open(file_path, "rb") as file:
            total_pages = len(PyPDF2.PdfReader(file).pages)

        # Split the PDF if required
        parts = _parts_for_page_count(total_pages)
        if parts > 0:
            split_pdf(file_path, parts, destination_directory)

# Example usage for processing PDFs.
# This runs as soon as the cell executes: every PDF in `directory` that is long
# enough to be split is written in parts to `destination_directory`.
directory = "./books"  # Path to input books directory (scanned for .pdf files)
destination_directory = "./splitted_books"  # Output directory for split PDFs (<name>_part_<n>.pdf)
process_directory(directory, destination_directory)


## 3. Extract Text from Split PDFs Using Grobid Parser
In this cell, the split PDF files are processed using GrobidParser to extract text content and save it as .txt files.

In [None]:
# Ensure the output directory for text files exists
output_dir = './cleaned_docs'
os.makedirs(output_dir, exist_ok=True)

# Directory containing the split PDFs
source_directory = './splitted_books'

# Grobid parser configuration (without sentence segmentation)
parser = GrobidParser(segment_sentences=False)

# Names of PDFs that could not be parsed, reported in the final summary
failed_files = []

# Process each PDF in the source directory
for filename in os.listdir(source_directory):
    if not filename.lower().endswith('.pdf'):
        continue

    file_path = os.path.join(source_directory, filename)

    # Extract content using the GenericLoader. A failure on one file is
    # reported and recorded, and the remaining files are still processed.
    try:
        content_list = GenericLoader.from_filesystem(file_path, parser=parser).load()
    except Exception as e:
        print(f"Failed to process {filename}: {e}")
        failed_files.append(filename)
        continue

    # Save extracted text to a .txt file named after the PDF
    base_name = os.path.splitext(filename)[0]
    output_text_file = os.path.join(output_dir, f"{base_name}.txt")

    # Separate the extracted chunks with a newline; writing them back-to-back
    # fuses the last word of one chunk with the first word of the next.
    with open(output_text_file, 'w', encoding='utf-8') as text_file:
        text_file.write("\n".join(document.page_content for document in content_list))

# Only claim full success when every PDF was actually parsed
if failed_files:
    print(f"Finished with {len(failed_files)} PDF(s) that could not be processed: {failed_files}")
else:
    print("All PDFs have been processed and text files are saved in 'cleaned_docs'.")


## 4. Load Documents for Semantic Splitting
This cell loads the extracted text files and splits them into semantic nodes, which are later used to build the dataset for fine-tuning.



In [None]:
# Set OpenAI API key.
# A key already exported in the environment is kept; the placeholder is only
# used as a fallback and must be replaced with a real key before running.
os.environ.setdefault("OPENAI_API_KEY", "your-openai-api-key")

# Initialize OpenAI embedding model.
# Constructed with no arguments, so the library's defaults apply; load_corpus
# passes this instance to the semantic splitter.
embed_model = OpenAIEmbedding()

# Function to load documents and parse them into semantic nodes
def load_corpus(docs, for_training=False, verbose=False):
    """Split ``docs`` into semantic nodes using the module-level ``embed_model``.

    Args:
        docs: Sequence of loaded documents to split.
        for_training: Kept for backward compatibility. Every document is
            parsed regardless of its value; previously ``False`` left the
            result unassigned and the function raised ``UnboundLocalError``.
        verbose: When true, show parsing progress and print the node count.

    Returns:
        The nodes produced by the semantic splitter for all of ``docs``.
    """
    parser = SemanticSplitterNodeParser(
        embed_model=embed_model,
        normalize=True,
        breakpoint_percentile_threshold=95  # Default percentile threshold
    )

    # Parse a copy of the full document list in both modes so that `nodes`
    # is always defined before it is reported and returned.
    nodes = parser.build_semantic_nodes_from_documents(docs[:], show_progress=verbose)

    if verbose:
        print(f'Parsed {len(nodes)} nodes')

    return nodes

# Load documents from the 'cleaned_docs' folder.
# Only regular .txt files are taken, so stray entries in the folder are not
# handed to the reader, and the names are sorted so every run loads the files
# in the same order.
books_folder_path = "cleaned_docs"
filenames = sorted(
    f for f in os.listdir(books_folder_path)
    if f.endswith(".txt") and os.path.isfile(os.path.join(books_folder_path, f))
)

# Construct full file paths
SEC_FILE = [os.path.join(books_folder_path, filename) for filename in filenames]
print(f"Loading files {SEC_FILE}")

# Read and load document data
reader = SimpleDirectoryReader(input_files=SEC_FILE)
docs = reader.load_data()
print(f'Loaded {len(docs)} docs')

# Parse the loaded documents into semantic nodes
train_nodes = load_corpus(docs, for_training=True, verbose=True)


## 5. Fine-Tuning and Saving the Model
This section generates a QA dataset from the semantic nodes and fine-tunes the embedding model on it. The fine-tuned model is uploaded to the Hugging Face Hub in the next section.

In [None]:
# One path shared by the save and the load below. The two calls previously
# used different file names ("..._semantics.json" vs "..._semantic.json"),
# so the load could not find the file that had just been written.
TRAIN_DATASET_PATH = "train_dataset_17_book_grobid_semantic.json"

# Generate QA pairs from semantic nodes
train_dataset = generate_qa_embedding_pairs(
    llm=OpenAI(model="gpt-4o-mini"),
    nodes=train_nodes
)

# Save the dataset to JSON
train_dataset.save_json(TRAIN_DATASET_PATH)

# Load the training dataset for fine-tuning
train_dataset = EmbeddingQAFinetuneDataset.from_json(TRAIN_DATASET_PATH)

# Set environment variable for MPS (Apple Silicon support).
# Set before the engine is constructed so the value is already in place for
# everything the engine does.
# NOTE(review): confirm at which point PyTorch reads this variable.
os.environ['PYTORCH_MPS_HIGH_WATERMARK_RATIO'] = '0.0'

# Initialize fine-tuning engine
finetune_engine = SentenceTransformersFinetuneEngine(
    train_dataset,
    model_id="sentence-transformers/all-mpnet-base-v2",
    model_output_path="fine_tuned_model_17book_grobid_semantic"
)

# Start fine-tuning
finetune_engine.finetune()


## 6. Upload Fine-Tuned Model to Hugging Face Hub
Finally, this cell handles uploading the fine-tuned model to the Hugging Face Hub.

In [None]:
# Authenticate with the Hugging Face Hub before uploading
notebook_login()

# Local model directory and the Hub repository it is published to
fine_tuned_model_dir = 'fine_tuned_model_17book_grobid_semantic'
hub_repo_id = "AhmetAytar/all-mpnet-base-v2-fine-tuned_17_textbook_grobid_semantic"

# Load the fine-tuned model from disk
model = SentenceTransformer(fine_tuned_model_dir)

# Publish the model to the Hub repository.
# NOTE(review): check whether the installed sentence-transformers version
# still supports save_to_hub or expects a different upload method.
model.save_to_hub(hub_repo_id)