<a href="https://colab.research.google.com/github/Ayu369-gen/LLM_trainable_data/blob/main/Book_clean.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# %%
import os
import re
import json
import shutil
from typing import Optional, List
from pathlib import Path
# Install pymupdf and pymupdf4llm
try:
    import fitz # PyMuPDF
    from pymupdf4llm import to_markdown
except ImportError:
    print("Installing pymupdf and pymupdf4llm...")
    !pip install pymupdf pymupdf4llm --quiet
    import fitz # PyMuPDF
    from pymupdf4llm import to_markdown

# Install llama_api_client
try:
    import llama_api_client
except ImportError:
    print("Installing llama_api_client...")
    !pip install llama_api_client --quiet
    import llama_api_client

from llama_api_client import LlamaAPIClient

# No tkinter needed in Colab
# import tkinter as tk
# from tkinter import filedialog, messagebox

from datetime import datetime
from google.colab import files # Import Colab files utility

# Set up the Llama API client.
# The key is never written into the notebook. It is read from the
# LLAMA_API_KEY environment variable or, failing that, from a Colab Secret of
# the same name (key icon in the left sidebar of Colab).
_PLACEHOLDER_API_KEY = "YOUR_LLAMA_API"  # template value; must not be treated as a real key

client = None  # stays None when no usable key is found; the functions below check for this
try:
    api_key = (os.environ.get("LLAMA_API_KEY") or "").strip()

    if not api_key or api_key == _PLACEHOLDER_API_KEY:
        # Remove the template value so the __main__ guard at the bottom of
        # this cell does not mistake it for a configured key.
        os.environ.pop("LLAMA_API_KEY", None)
        api_key = ""
        try:
            from google.colab import userdata
            api_key = (userdata.get("LLAMA_API_KEY") or "").strip()
        except Exception as secret_error:
            # Broad on purpose: the secret may be missing, notebook access may
            # not be granted, or the code may be running outside Colab.
            print(f"Could not read LLAMA_API_KEY from Colab Secrets: {secret_error}")

        if api_key and api_key != _PLACEHOLDER_API_KEY:
            # Export it so the rest of the cell sees one source of truth.
            os.environ["LLAMA_API_KEY"] = api_key
        else:
            api_key = ""

    if not api_key:
        raise ValueError("LLAMA_API_KEY not found. Please set it as a Colab Secret or environment variable.")

    client = LlamaAPIClient(
        api_key=api_key,
        base_url="https://api.llama.com/v1/",
    )
except ValueError as e:
    print(f"Configuration Error: {e}")
    print("Please set your LLAMA_API_KEY.")
    client = None  # Set client to None if key is missing

# Function to handle file upload in Colab
def upload_pdf_files() -> List[str]:
    """
    Ask the user for files through Colab's upload widget and list what arrived.

    The names are taken from the keys of the mapping returned by
    ``files.upload()``. No extension filtering happens here, so non-PDF
    uploads are returned as well.

    Returns:
        List[str]: Names of the uploaded files, or an empty list when nothing
        was uploaded.
    """
    print("Please upload PDF files using the file upload widget below:")
    received = files.upload()  # blocks until the browser dialog is completed

    # Presumably the uploads land in the current working directory, so the
    # bare names double as paths - TODO confirm against the Colab docs.
    names = list(received.keys())
    for name in names:
        print(f'Uploaded file: {name}')

    if len(names) == 0:
        print("No files were selected or uploaded.")
    return names


# Function to handle file download in Colab
def download_processed_files(jsonl_dir: str) -> None:
    """
    Send every ``.jsonl`` file found in a directory to the user's browser.

    Each file is downloaded independently, so one failure does not stop the
    remaining downloads. Any other error (for example a missing directory)
    is reported with a message and swallowed.

    Args:
        jsonl_dir (str): Directory that holds the JSONL files to download.
    """
    try:
        print(f"\nPreparing files for download from: {jsonl_dir}")
        pending = [entry for entry in os.listdir(jsonl_dir) if entry.endswith('.jsonl')]

        if len(pending) == 0:
            print("No JSONL files found to download.")
            return

        print("Downloading processed files...")
        for entry in pending:
            target = os.path.join(jsonl_dir, entry)
            try:
                files.download(target)
                print(f"Downloaded: {entry}")
            except Exception as download_error:
                # Keep going: the other files may still download fine.
                print(f"Error downloading {entry}: {download_error}")

        print("Download process finished.")

    except Exception as e:
        print(f"Error preparing files for download: {e}")


def convert_pdf_to_text(pdf_path: str, output_dir: str) -> Optional[str]:
    """
    Convert a PDF file to markdown text using pymupdf4llm and save it to disk.

    The markdown is written to ``<output_dir>/<pdf stem>_pymupdf4llm.md``;
    ``output_dir`` is created if it does not exist.

    Args:
        pdf_path (str): Path to the PDF file
        output_dir (str): Directory to save the markdown file

    Returns:
        Optional[str]: Path to the generated markdown file, or None if the
        conversion fails or the PDF yields no text at all.
    """
    try:
        os.makedirs(output_dir, exist_ok=True)
        pdf_name = Path(pdf_path).stem
        md_path = os.path.join(output_dir, f"{pdf_name}_pymupdf4llm.md")
        print(f"Converting {pdf_path} to markdown...")
        markdown_text = to_markdown(pdf_path)
        print(f"Markdown text length: {len(markdown_text)}")

        # A blank result would otherwise reach the LLM as an empty chunk,
        # wasting a call and inviting made-up output. Treat it as a failed
        # conversion so the caller skips the file.
        if not markdown_text.strip():
            print(f"Error converting PDF to markdown: no extractable text found in {pdf_path}")
            return None

        with open(md_path, 'w', encoding='utf-8') as f:
            f.write(markdown_text)
        print(f"Markdown file saved to {md_path}")
        print("Markdown file created:", md_path)
        return md_path
    except Exception as e:
        # Any failure (unreadable PDF, I/O error, ...) is reported and
        # signalled to the caller with None rather than raised.
        print(f"Error converting PDF to markdown: {e}")
        return None

def _split_oversized_paragraph(paragraph: str, limit: int) -> List[str]:
    """Break a paragraph longer than ``limit`` into pieces of at most ``limit`` characters.

    Cuts are made at the last whitespace character inside the window when
    there is one, otherwise at exactly ``limit`` characters. A paragraph that
    already fits is returned unchanged as a single-element list.
    """
    if len(paragraph) <= limit:
        return [paragraph]

    pieces = []
    remaining = paragraph
    while len(remaining) > limit:
        # Scan backwards for a whitespace cut point; index 0 is excluded so
        # every iteration consumes at least one character.
        cut = next((i for i in range(limit, 0, -1) if remaining[i].isspace()), limit)
        piece = remaining[:cut].rstrip()
        if piece:
            pieces.append(piece)
        remaining = remaining[cut:].lstrip()
    if remaining:
        pieces.append(remaining)
    return pieces


def chunk_text(text: str, max_chunk_size: int = 4000) -> List[str]:
    """
    Split text into chunks of maximum size while trying to keep paragraphs together.

    Paragraphs (separated by a blank line) are packed greedily into chunks.
    A paragraph that is longer than ``max_chunk_size`` on its own is split
    further so that no returned chunk exceeds the limit. Chunks that would be
    empty after stripping whitespace are not returned.

    Args:
        text (str): Input text to chunk
        max_chunk_size (int): Maximum size of each chunk, in characters

    Returns:
        List[str]: List of non-empty text chunks (empty list for blank input)

    Raises:
        ValueError: If ``max_chunk_size`` is not a positive integer.
    """
    if max_chunk_size <= 0:
        raise ValueError("max_chunk_size must be a positive integer")

    chunks = []
    current_chunk = ""

    def flush() -> None:
        # Only keep chunks that still contain something after stripping.
        stripped = current_chunk.strip()
        if stripped:
            chunks.append(stripped)

    # Split by paragraphs
    for paragraph in text.split('\n\n'):
        for piece in _split_oversized_paragraph(paragraph, max_chunk_size):
            # +2 accounts for the '\n\n' delimiter appended after each piece
            if len(current_chunk) + len(piece) + 2 <= max_chunk_size:
                current_chunk += piece + '\n\n'
            else:
                flush()
                # Start a new chunk with the piece that did not fit
                current_chunk = piece + '\n\n'

    # Add the last accumulated chunk if it's not empty
    flush()

    return chunks


def query_llama_api(text: str, prompt: str) -> Optional[str]:
    """
    Run one chat completion against the Llama API and return the reply text.

    The request consists of a fixed system message plus a user message made
    of ``prompt`` followed by ``text`` under an "### Input Text:" heading.

    Args:
        text (str): Input text to process (research paper content).
        prompt (str): Instructions for the LLM.

    Returns:
        Optional[str]: The stripped reply text, or None when the client is not
        initialized, the response lacks ``completion_message.content.text``,
        or the request raises any other exception.
    """
    if client is None:
        print("API client not initialized due to missing API key.")
        return None

    try:
        system_message = {
            "role": "system",
            "content": "You are an expert in marine science and aquaculture content processing. Extract ONLY the plain text content related to marine science and aquaculture. Remove all formatting, headers, numbers, and special characters. Output should be clean, continuous text without any structural elements.",
        }
        user_message = {
            "role": "user",
            "content": f"{prompt}\n\n### Input Text:\n{text}",
        }
        completion = client.chat.completions.create(
            model="Llama-4-Maverick-17B-128E-Instruct-FP8",
            messages=[system_message, user_message],
            temperature=0.3,
            top_p=0.9,
        )
        # The reply text is read from completion_message.content.text; a
        # missing attribute anywhere along that path lands in the
        # AttributeError handler below.
        return completion.completion_message.content.text.strip()
    except AttributeError:
        print("API response structure unexpected. Could not find completion_message.content.text.")
        return None
    except Exception as e:
        print(f"Error querying API: {e}")
        return None


def process_text_chunk(chunk: str, prompt: str) -> Optional[List[str]]:
    """
    Clean one text chunk through the Llama API and break the reply into paragraphs.

    Runs of blank (or whitespace-only) lines in the reply are collapsed to a
    single blank line, then the reply is split on blank lines and each
    non-empty paragraph becomes one segment.

    Args:
        chunk (str): Text chunk to process
        prompt (str): Instructions for the LLM

    Returns:
        Optional[List[str]]: The non-empty paragraphs of the reply, or None
        when the API call returned None or the reply contained no text.
    """
    reply = query_llama_api(chunk, prompt)
    if reply is None:
        return None

    # Normalize paragraph breaks before splitting on them.
    normalized = re.sub(r'\n\s*\n+', '\n\n', reply).strip()

    trimmed = (piece.strip() for piece in normalized.split('\n\n'))
    segments = [piece for piece in trimmed if piece]

    if not segments:
        return None
    return segments


def clean_research_paper(input_file: str, output_file: str) -> None:
    """
    Use Llama API to process marine science and aquaculture content while retaining detailed scientific information.
    Output is saved in JSONL format with each line as a complete text segment for pretraining.

    The input is read as UTF-8, split with ``chunk_text``, and each chunk is
    passed through ``process_text_chunk``. Every returned segment is written
    as one ``{"text": ...}`` JSON object per line. Failures are reported with
    a printed message rather than raised; nothing is written when no segment
    was produced.

    Args:
        input_file (str): Path to the input text file containing the content.
        output_file (str): Path to save the cleaned output text file in JSONL format.
            A bare filename (no directory part) is written to the current directory.
    """
    # Check if the API client is available before proceeding
    if client is None:
        print(f"Skipping processing for {input_file} due to missing API key.")
        return

    # Read the input file
    try:
        with open(input_file, 'r', encoding='utf-8') as file:
            content = file.read()
    except FileNotFoundError:
        print(f"Error: Input file '{input_file}' not found.")
        return
    except UnicodeDecodeError:
        print(f"Error: File encoding issue for '{input_file}'. Ensure the file is UTF-8 encoded.")
        return
    except Exception as e:
        print(f"Error reading input file '{input_file}': {e}")
        return

    # Define the prompt for the Llama API
    prompt = """Extract ONLY the plain text content related to marine science and aquaculture from the following text. Remove all formatting, headers, and structural elements.

**Content to Extract:**
- Marine biology and ecology
- Aquaculture systems and practices
- Fisheries management
- Marine conservation
- Oceanography and marine ecosystems
- Marine species biology and behavior
- Aquaculture technology and innovations
- Marine resource management
- Water quality and environmental parameters
- Marine food production systems

**Output Rules:**
1. Output ONLY plain text content.
2. Remove all headers, subheaders, and section titles.
3. Remove all numerical markings and bullet points.
4. Remove all formatting (bold, italic, etc.).
5. Remove all special characters and symbols.
6. Convert content into continuous paragraphs.
7. Maintain proper sentence structure and punctuation.
8. Keep all scientific terms and technical descriptions.
9. Preserve the meaning and context of the content.
10. DO NOT include any meta-instructions or processing commentary.

Remember: Your output should be clean, continuous text without any structural elements or formatting."""

    # Split content into chunks if needed
    chunks = chunk_text(content)
    all_segments = []
    empty_chunks = 0  # chunks for which process_text_chunk returned nothing

    print(f"Processing {len(chunks)} chunks from {input_file}...")
    for i, chunk in enumerate(chunks, 1):
        print(f"Processing chunk {i}/{len(chunks)}...")
        segments = process_text_chunk(chunk, prompt)
        if segments:
            all_segments.extend(segments)
        else:
            empty_chunks += 1

    # Surface dropped chunks instead of losing them silently.
    if empty_chunks:
        print(f"Warning: {empty_chunks} of {len(chunks)} chunks produced no output (API error or no relevant content).")

    if not all_segments:
        print(f"No relevant content extracted from {input_file}. Skipping JSONL creation.")
        return

    # Write each segment as a separate JSON object in JSONL format
    try:
        # Ensure the output directory exists. dirname is '' for a bare
        # filename, and os.makedirs('') raises, so only create it when present.
        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        with open(output_file, 'w', encoding='utf-8') as file:
            for segment in all_segments:
                file.write(json.dumps({"text": segment}) + '\n')
        print(f"Cleaned content saved to '{output_file}' in JSONL format.")
    except Exception as e:
        print(f"Error writing to output file '{output_file}': {e}")

def process_multiple_files(input_files: List[str], base_output_dir: str) -> None:
    """
    Run the PDF -> markdown -> cleaned JSONL pipeline over a list of files.

    Two subdirectories, ``markdown`` and ``jsonl``, are created under
    ``base_output_dir``. Files without a ``.pdf`` extension (checked
    case-insensitively) and PDFs whose conversion fails are skipped with a
    message. Each remaining file is cleaned into
    ``jsonl/cleaned_<pdf stem>.jsonl``.

    Args:
        input_files (List[str]): List of input file paths within the Colab environment.
        base_output_dir (str): Base directory to save the processed files (markdown and jsonl).
    """
    md_dir = os.path.join(base_output_dir, 'markdown')
    jsonl_dir = os.path.join(base_output_dir, 'jsonl')
    for folder in (md_dir, jsonl_dir):
        os.makedirs(folder, exist_ok=True)

    print("Files to process:", input_files)

    if not input_files:
        print("No files provided to process.")
        return

    for source in input_files:
        source_path = Path(source)

        # Guard clause: anything that is not a PDF is reported and skipped.
        if source_path.suffix.lower() != '.pdf':
            print(f"Skipping non-PDF file: {source}. Please upload only PDF files.")
            continue

        markdown_file = convert_pdf_to_text(source, md_dir)
        if not markdown_file:
            print(f"Skipping processing of {source} due to PDF conversion error.")
            continue

        # Clean the converted markdown into its JSONL counterpart.
        target = os.path.join(jsonl_dir, f"cleaned_{source_path.stem}.jsonl")
        print(f"\nProcessing file: {markdown_file}")
        clean_research_paper(markdown_file, target)


def main():
    """Drive the notebook workflow: upload files, process them, download the JSONL results.

    Output goes to a timestamped directory under ``/content`` so repeated runs
    do not overwrite each other. Returns early when nothing was uploaded.
    """
    # The timestamp is taken before the upload dialog opens.
    run_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    base_output_dir = f'/content/cleaned_output_{run_stamp}'

    uploaded = upload_pdf_files()
    if not uploaded:
        print("No PDF files uploaded. Exiting...")
        return

    process_multiple_files(uploaded, base_output_dir)

    # The JSONL results are downloaded from the 'jsonl' subdirectory of the
    # run's output directory.
    download_processed_files(os.path.join(base_output_dir, 'jsonl'))

if __name__ == '__main__':
    # Only start the workflow when an API key is present in the environment;
    # otherwise tell the user how to provide one.
    if not os.environ.get("LLAMA_API_KEY"):
        print("LLAMA_API_KEY is not set. Please set your API key.")
        print("Go to 'Secrets' (key icon) in the left sidebar in Google Colab and add 'LLAMA_API_KEY' with your API key.")
    else:
        main()

Please upload PDF files using the file upload widget below:


Saving Marine ecological field methods _ a guide for marine (undergrad & grad).pdf to Marine ecological field methods _ a guide for marine (undergrad & grad).pdf
Uploaded file: Marine ecological field methods _ a guide for marine (undergrad & grad).pdf
Files to process: ['Marine ecological field methods _ a guide for marine (undergrad & grad).pdf']
Converting Marine ecological field methods _ a guide for marine (undergrad & grad).pdf to markdown...
Markdown text length: 503230
Markdown file saved to /content/cleaned_output_20250531_011831/markdown/Marine ecological field methods _ a guide for marine (undergrad & grad)_pymupdf4llm.md
Markdown file created: /content/cleaned_output_20250531_011831/markdown/Marine ecological field methods _ a guide for marine (undergrad & grad)_pymupdf4llm.md

Processing file: /content/cleaned_output_20250531_011831/markdown/Marine ecological field methods _ a guide for marine (undergrad & grad)_pymupdf4llm.md
Processing 141 chunks from /content/cleaned_ou

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded: cleaned_Marine ecological field methods _ a guide for marine (undergrad & grad).jsonl
Download process finished.


In [None]:
!pip install llama_api_client

Collecting llama_api_client
  Downloading llama_api_client-0.1.1-py3-none-any.whl.metadata (14 kB)
Downloading llama_api_client-0.1.1-py3-none-any.whl (83 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/83.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m83.7/83.7 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: llama_api_client
Successfully installed llama_api_client-0.1.1


In [None]:
!pip install pymupdf4llm

Collecting pymupdf4llm
  Downloading pymupdf4llm-0.0.24-py3-none-any.whl.metadata (4.7 kB)
Collecting pymupdf>=1.25.5 (from pymupdf4llm)
  Downloading pymupdf-1.26.0-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf4llm-0.0.24-py3-none-any.whl (28 kB)
Downloading pymupdf-1.26.0-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf, pymupdf4llm
Successfully installed pymupdf-1.26.0 pymupdf4llm-0.0.24
