<a href="https://colab.research.google.com/github/Ayu369-gen/LLM_trainable_data/blob/main/Research_clean.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pymupdf4llm

Collecting pymupdf4llm
  Downloading pymupdf4llm-0.0.24-py3-none-any.whl.metadata (4.7 kB)
Collecting pymupdf>=1.25.5 (from pymupdf4llm)
  Downloading pymupdf-1.26.0-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf4llm-0.0.24-py3-none-any.whl (28 kB)
Downloading pymupdf-1.26.0-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (24.1 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 24.1/24.1 MB 63.5 MB/s eta 0:00:00
Installing collected packages: pymupdf, pymupdf4llm
Successfully installed pymupdf-1.26.0 pymupdf4llm-0.0.24


In [None]:
!pip install llama_api_client

Collecting llama_api_client
  Downloading llama_api_client-0.1.1-py3-none-any.whl.metadata (14 kB)
Downloading llama_api_client-0.1.1-py3-none-any.whl (83 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 83.7/83.7 kB 3.2 MB/s eta 0:00:00
Installing collected packages: llama_api_client
Successfully installed llama_api_client-0.1.1


In [None]:
import os
import re
import json
import shutil
from typing import Optional, List
from pathlib import Path
import fitz  # PyMuPDF
from pymupdf4llm import to_markdown
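# tkinter imports are disabled (kept below, commented out, for reference);
# file selection uses the Colab upload widget imported further down.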
# import tkinter as tk
# from tkinter import filedialog, messagebox
from datetime import datetime

# Import Colab file upload utilities
from google.colab import files

# Set up the Llama API client
# NOTE: "YOUR_LLAMA_API" is a placeholder and must be replaced with a real key
# before any API call can succeed. The assignment below unconditionally
# overwrites any LLAMA_API_KEY value already present in the environment.
try:
    from llama_api_client import LlamaAPIClient
    os.environ["LLAMA_API_KEY"] = "YOUR_LLAMA_API"
    # Module-level client shared by query_llama_api().
    client = LlamaAPIClient(
        api_key=os.environ.get("LLAMA_API_KEY"),
        base_url="https://api.llama.com/v1/",
    )
except ImportError:
    # Any ImportError raised inside the try body (the import itself or, in
    # principle, client construction) is re-raised with an install hint.
    raise ImportError("llama_api_client is required. Please install it using 'pip install llama_api_client'.")

# NOTE: the former select_pdf_files helper has been removed; main() collects
# its input files through files.upload() instead.

def download_jsonl_files(jsonl_dir: str, download_dir: str) -> None:
    """Stage every ``.jsonl`` file for download and hand each one to Colab.

    Args:
        jsonl_dir: Directory that is scanned for ``*.jsonl`` files.
        download_dir: Directory the files are copied into (created on demand).

    Any exception raised while staging is reported on stdout and swallowed;
    a failure to download one file is reported and does not stop the others.
    """
    def has_jsonl_suffix(name: str) -> bool:
        return name.endswith('.jsonl')

    try:
        os.makedirs(download_dir, exist_ok=True)
        print(f"\nCopying files to download directory: {download_dir}")
        for entry in filter(has_jsonl_suffix, os.listdir(jsonl_dir)):
            shutil.copy2(
                os.path.join(jsonl_dir, entry),
                os.path.join(download_dir, entry),
            )
        print(f"Files prepared for download in: {download_dir}")

        # Offer download in Colab. The staging directory is listed again, so
        # every .jsonl file found there is offered.
        print("\nOffering files for download:")
        for staged_name in filter(has_jsonl_suffix, os.listdir(download_dir)):
            staged_path = os.path.join(download_dir, staged_name)
            try:
                files.download(staged_path)
                print(f" - Downloaded {staged_name}")
            except Exception as download_error:
                print(f" - Failed to download {staged_name}: {download_error}")

    except Exception as e:
        print(f"Error during file processing or download preparation: {e}")


def extract_metadata(pdf_path: str) -> dict:
    """Read basic bibliographic metadata from a PDF.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        A dict with the keys ``title``, ``authors``, ``creation_date`` and
        ``keywords``. Fields that are absent (or stored as ``None``) are
        returned as empty strings. If the document cannot be opened or read,
        the error is printed and an empty dict is returned.
    """
    try:
        # The context manager closes the document even if reading the
        # metadata raises, so the file handle is never leaked.
        with fitz.open(pdf_path) as doc:
            # PyMuPDF may report no metadata at all (None) and may store
            # individual fields as None; normalise both to "empty".
            metadata = doc.metadata or {}

        return {
            "title": metadata.get("title") or "",
            "authors": metadata.get("author") or "",
            "creation_date": metadata.get("creationDate") or "",
            "keywords": metadata.get("keywords") or "",
        }
    except Exception as e:
        print(f"Error extracting metadata: {e}")
        return {}

def convert_pdf_to_text(pdf_path: str, output_dir: str) -> Optional[str]:
    """Render a PDF as markdown with pymupdf4llm and store it on disk.

    Args:
        pdf_path: Path of the PDF to convert.
        output_dir: Directory for the ``<stem>_pymupdf4llm.md`` file
            (created if it does not exist).

    Returns:
        The path of the written markdown file, or ``None`` when any step
        fails (the error is printed).
    """
    try:
        os.makedirs(output_dir, exist_ok=True)
        target_path = os.path.join(
            output_dir, f"{Path(pdf_path).stem}_pymupdf4llm.md"
        )
        print(f"Converting {pdf_path} to markdown...")
        rendered = to_markdown(pdf_path)
        print(f"Markdown text length: {len(rendered)}")
        with open(target_path, 'w', encoding='utf-8') as sink:
            sink.write(rendered)
        print(f"Markdown file saved to {target_path}")
    except Exception as e:
        print(f"Error converting PDF to markdown: {e}")
        return None
    return target_path

def chunk_text(text: str, max_chunk_size: int = 4000) -> List[str]:
    """Split markdown text into chunks that follow research-paper sections.

    A new chunk starts at every recognised section header (Abstract,
    Introduction, Methods, ...; markdown heading level 1-3, case-insensitive).
    Within a section, lines are accumulated until adding the next line would
    push the chunk past ``max_chunk_size`` characters; the follow-up chunk is
    then prefixed with the current section header so it keeps its context.

    Args:
        text: The markdown text to split.
        max_chunk_size: Soft upper bound on chunk length in characters. A
            single line longer than this is kept intact, so a chunk may
            exceed the bound.

    Returns:
        The list of non-empty, whitespace-stripped chunks. Empty or
        whitespace-only input yields an empty list.
    """
    # Common research paper sections, matched as complete heading lines.
    section_titles = (
        'Abstract',
        'Introduction',
        'Methods',
        'Materials and Methods',
        'Results',
        'Discussion',
        'Conclusion',
        'References',
        'Acknowledgements',
    )
    # One precompiled alternation instead of testing nine patterns per line.
    header_pattern = re.compile(
        r'^#{1,3}\s*(?:' + '|'.join(section_titles) + r')\s*$',
        re.IGNORECASE,
    )

    chunks: List[str] = []

    def emit(piece: str) -> None:
        # Skip buffers that hold only whitespace (e.g. blank lines before the
        # first header) so no empty chunk is handed to the caller.
        stripped = piece.strip()
        if stripped:
            chunks.append(stripped)

    current_chunk = ""
    current_section = ""

    for line in text.split('\n'):
        if header_pattern.match(line):
            # A header always closes the running chunk and opens a new one.
            emit(current_chunk)
            current_chunk = line + '\n'
            current_section = line
        elif len(current_chunk) + len(line) + 1 <= max_chunk_size:
            current_chunk += line + '\n'
        else:
            emit(current_chunk)
            # Repeat the section header so the continuation keeps its context.
            if current_section:
                current_chunk = current_section + '\n' + line + '\n'
            else:
                current_chunk = line + '\n'

    emit(current_chunk)
    return chunks

def query_llama_api(text: str, prompt: str) -> Optional[str]:
    """Ask the Llama chat endpoint to clean ``text`` according to ``prompt``.

    The request is first sent with explicit sampling parameters
    (``temperature=0.3``, ``top_p=0.9``). If that attempt raises, the error is
    printed and the same request is retried once without them.

    Args:
        text: The text to be processed; appended to the user message.
        prompt: The instructions placed in front of the text.

    Returns:
        The stripped response text, or ``None`` if both attempts fail.
    """
    model_name = "Llama-4-Maverick-17B-128E-Instruct-FP8"
    system_instruction = "You are an expert in marine science and aquaculture research paper analysis. Extract ONLY the plain text content related to marine science and aquaculture. Remove all formatting, headers, numbers, and special characters. Output should be clean, continuous text without any structural elements."

    def build_messages():
        # A fresh list per attempt, mirroring two independent requests.
        return [
            {"role": "system", "content": system_instruction},
            {"role": "user", "content": f"{prompt}\n\n### Input Text:\n{text}"},
        ]

    try:
        primary = client.chat.completions.create(
            model=model_name,
            messages=build_messages(),
            temperature=0.3,
            top_p=0.9,
        )
        return primary.completion_message.content.text.strip()
    except Exception as e:
        print(f"Error querying API: {e}")

    # Fallback with default parameters if specific ones cause issues
    try:
        retry = client.chat.completions.create(
            model=model_name,
            messages=build_messages(),
        )
        return retry.completion_message.content.text.strip()
    except Exception as fallback_e:
        print(f"Fallback API call failed: {fallback_e}")
        return None

def process_text_chunk(chunk: str, prompt: str) -> Optional[List[str]]:
    """Clean one chunk through the Llama API and post-process the reply.

    Args:
        chunk: The text chunk to send.
        prompt: The extraction instructions sent with the chunk.

    Returns:
        ``None`` when the API yields nothing; otherwise a list holding the
        surviving lines joined into a single space-separated segment (the
        list is empty when no line survives filtering).
    """
    api_reply = query_llama_api(chunk, prompt)
    if not api_reply:
        return None

    # Collapse runs of blank lines and trim the reply.
    normalized = re.sub(r'\n\s*\n+', '\n\n', api_reply).strip()

    kept_lines = []
    for raw_line in normalized.split('\n'):
        candidate = raw_line.strip()
        if not candidate:
            continue

        # Drop lines that consist solely of a citation marker, e.g. "[1]"
        # or "(Smith et al., 2020)"; anything else is kept verbatim.
        bare_index = re.match(r'^\[\d+\]$', candidate)
        bare_author_year = re.match(r'^\([^)]+,\s*\d{4}\)$', candidate)
        if bare_index or bare_author_year:
            continue

        kept_lines.append(candidate)

    if not kept_lines:
        return []

    merged = ' '.join(kept_lines)
    return [merged] if merged.strip() else []

def clean_research_paper(input_file: str, output_file: str, metadata: dict) -> None:
    """
    Clean one markdown research paper and write the result as JSONL.

    The file is read as UTF-8, split with chunk_text(), and every chunk is
    passed through process_text_chunk() together with the extraction prompt
    below. Each resulting segment becomes one JSON line of the form
    {"text": ..., "metadata": metadata}.

    A missing or non-UTF-8 input file is reported and the function returns
    without creating the output file. Errors while writing are reported too.
    """
    extraction_prompt = """Extract ONLY the plain text content related to marine science and aquaculture from the following research paper text. Remove all formatting, headers, and structural elements.

**Content to Extract:**
- Marine biology and ecology
- Aquaculture systems and practices
- Fisheries management
- Marine conservation
- Oceanography and marine ecosystems
- Marine species biology and behavior
- Aquaculture technology and innovations
- Marine resource management
- Water quality and environmental parameters
- Marine food production systems
- Experimental methods and results
- Statistical analyses
- Scientific observations and conclusions

**Output Rules:**
1. Output ONLY plain text content
2. Remove all headers, subheaders, and section titles
3. Remove all numerical markings, bullet points, and reference citations (e.g., [1], (Smith et al., 2020))
4. Remove all formatting (bold, italic, etc.)
5. Remove all special characters and symbols except basic punctuation
6. Convert content into continuous paragraphs
7. Maintain proper sentence structure and punctuation
8. Preserve all scientific terms, technical descriptions, and data
9. Preserve the meaning and context of the content
10. Exclude reference lists, acknowledgments, and non-scientific content
11. DO NOT include any meta-instructions or processing commentary
12. Preserve numerical data and statistical results
13. Maintain methodology descriptions
14. Keep species names and technical terms intact

Remember: Your output should be clean, continuous text without any structural elements or formatting."""

    try:
        with open(input_file, 'r', encoding='utf-8') as source:
            paper_text = source.read()
    except FileNotFoundError:
        print(f"Error: Input file '{input_file}' not found.")
        return
    except UnicodeDecodeError:
        print("Error: File encoding issue. Ensure the file is UTF-8 encoded.")
        return

    pieces = chunk_text(paper_text)
    total = len(pieces)
    collected = []

    print(f"Processing {total} chunks from {input_file}...")
    for position, piece in enumerate(pieces, start=1):
        print(f"Processing chunk {position}/{total}...")
        # A failed chunk (None) or an empty result contributes nothing.
        collected.extend(process_text_chunk(piece, extraction_prompt) or [])

    try:
        with open(output_file, 'w', encoding='utf-8') as sink:
            for passage in collected:
                record = {"text": passage, "metadata": metadata}
                sink.write(json.dumps(record) + '\n')
        print(f"Cleaned content saved to '{output_file}' in JSONL format.")
    except Exception as e:
        print(f"Error writing to output file: {e}")

def process_multiple_files(input_files: List[str], output_dir: str) -> None:
    """Run the full clean-up pipeline over a batch of input files.

    Args:
        input_files: Paths of the files to process. ``.pdf`` files are first
            converted to markdown; any other file is used as-is.
        output_dir: Root directory; ``markdown/`` and ``jsonl/`` are created
            beneath it. Results are written to
            ``jsonl/cleaned_<input stem>.jsonl``.
    """
    md_dir, jsonl_dir = (
        os.path.join(output_dir, sub) for sub in ('markdown', 'jsonl')
    )
    for needed_dir in (md_dir, jsonl_dir):
        os.makedirs(needed_dir, exist_ok=True)

    print("Selected files:", input_files)

    for input_file in input_files:
        source = Path(input_file)

        if source.suffix.lower() == '.pdf':
            # PDFs carry metadata and must be converted to markdown first.
            metadata = extract_metadata(input_file)
            md_file = convert_pdf_to_text(input_file, md_dir)
            if not md_file:
                print(f"Skipping {input_file} due to conversion error")
                continue
        else:
            # Anything else is treated as ready-to-use markdown without
            # metadata.
            metadata = {}
            md_file = input_file

        cleaned_path = os.path.join(jsonl_dir, f"cleaned_{source.stem}.jsonl")
        print(f"\nProcessing file: {md_file}")
        clean_research_paper(md_file, cleaned_path, metadata)

def main():
    """Colab entry point: upload files, clean them, offer the JSONL results.

    Uploaded files are written to ``uploaded_pdfs/``, processed into a
    timestamped ``cleaned_output_<ts>/`` directory, and the resulting JSONL
    files are staged under ``downloads/processed_jsonl_<ts>/`` for download.
    """
    run_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_dir = f'cleaned_output_{run_stamp}'
    jsonl_dir = os.path.join(output_dir, 'jsonl')
    # Relative path so the staging directory lives in the Colab working dir.
    download_dir = os.path.join("downloads", f"processed_jsonl_{run_stamp}")

    print("Please upload your PDF files.")
    # Colab's upload widget returns a dictionary {filename: content}.
    uploaded = files.upload()

    # The uploaded bytes have to be written to disk before they can be
    # processed by path.
    saved_paths = []
    upload_dir = "uploaded_pdfs"
    os.makedirs(upload_dir, exist_ok=True)

    if not uploaded:
        print("No files uploaded. Exiting...")
        return

    for upload_name, payload in uploaded.items():
        destination = os.path.join(upload_dir, upload_name)
        with open(destination, 'wb') as sink:
            sink.write(payload)
        saved_paths.append(destination)
        print(f"Saved uploaded file: {destination}")

    if not saved_paths:
        print("No files found in the upload directory. Exiting...")
        return

    # Process the uploaded files, then offer the resulting JSONL files for
    # download via Colab's download function.
    process_multiple_files(saved_paths, output_dir)
    download_jsonl_files(jsonl_dir, download_dir)


if __name__ == '__main__':
    main()