In [None]:
import os
import json
import re
import aiohttp
import asyncio
from aiohttp import ClientSession
from datetime import datetime
from urllib.parse import urlparse

import openai
# Configure the OpenAI client from the environment. os.getenv returns None
# when OPENAI_API_KEY is not set, so no key is configured here in that case.
openai.api_key = os.getenv("OPENAI_API_KEY") 

##########################
# PDF Downloader 
##########################

# Directory where downloaded documents are stored, and the file that the
# download step writes its per-URL log entries to.
DOWNLOAD_DIR = 'Downloads'
LOG_FILE = 'download_log.txt'

# Create the download directory up front. exist_ok=True avoids the
# check-then-create race and, unlike a bare existence check, fails loudly if
# the path is occupied by a regular file instead of a directory.
os.makedirs(DOWNLOAD_DIR, exist_ok=True)

def get_unique_filename(url, original_filename):
    """Build a timestamped, sanitized filename from ``original_filename``.

    The result has the form ``<stem>_<YYYYmmdd_HHMMSS><ext>``. A missing
    extension becomes ``.pdf``; an empty stem, or the literal stem ``pdf``,
    becomes ``document``. ``url`` is accepted but is not used in the name.
    """
    stem, ext = os.path.splitext(original_filename)

    # Fall back to sensible defaults when either half of the name is unusable.
    ext = ext or '.pdf'
    if not stem or stem == 'pdf':
        stem = 'document'

    # Second-resolution timestamp is what makes the name "unique".
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    return sanitize_filename(f"{stem}_{stamp}{ext}")

def sanitize_filename(filename):
    """Return ``filename`` with characters that are unsafe in file names removed.

    The removed set is: backslash, forward slash, colon, asterisk, question
    mark, double quote, angle brackets and the pipe character. All other
    characters are kept unchanged.
    """
    # The backslash has to be written as `\\` inside the character class;
    # `\/` alone is just an escaped '/' and would let backslashes through,
    # which act as path separators on Windows.
    return re.sub(r'[\\/:*?"<>|]', '', filename)

async def download_file(semaphore: asyncio.Semaphore, session: ClientSession, url: str, filename: str):
    """Download ``url`` into DOWNLOAD_DIR while holding ``semaphore``.

    Args:
        semaphore: Limits how many downloads run at the same time.
        session: aiohttp session used for the GET request.
        url: Address of the document to fetch.
        filename: Preferred name; it is passed through get_unique_filename
            before being used on disk.

    Returns:
        The path of the saved (or already existing) file, or None when the
        response is not a PDF/octet-stream, the body is empty, or any error
        occurs. Errors are printed, never raised.
    """
    async with semaphore:
        file_path = None
        started_writing = False  # True once this call has created file_path
        try:
            print(f"Attempting to download from URL: {url}")  # Debug print
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            }

            # Generate a unique filename
            unique_filename = get_unique_filename(url, filename)
            file_path = os.path.join(DOWNLOAD_DIR, unique_filename)

            # Check if the file already exists in the download directory
            if os.path.exists(file_path):
                print(f"File {unique_filename} already exists, skipping download.")
                return file_path  # Return the path of the existing file

            # aiohttp expects a ClientTimeout object; a bare number is deprecated.
            timeout = aiohttp.ClientTimeout(total=30)
            async with session.get(url, headers=headers, timeout=timeout) as response:
                response.raise_for_status()  # Raise an exception for HTTP errors

                # Only accept responses that declare a PDF or generic binary body
                content_type = response.headers.get('Content-Type', '').lower()
                if 'application/pdf' not in content_type and 'application/octet-stream' not in content_type:
                    print(f"Warning: Content-Type is {content_type}")
                    return None  # Stop if it's not a PDF

                # Stream the body in 8 KiB chunks. The network reads are
                # awaited; the disk writes themselves are ordinary blocking calls.
                started_writing = True
                with open(file_path, 'wb') as f:
                    while True:
                        chunk = await response.content.read(8192)
                        if not chunk:
                            break
                        f.write(chunk)

            # An empty body is a failed download: report it the same way as
            # every other failure (None) instead of returning a message string
            # that callers could mistake for a file path.
            if os.path.getsize(file_path) == 0:
                os.remove(file_path)
                print(f"Downloaded file is empty: {url}")
                return None

            return file_path  # Return the path of the downloaded file
        except Exception as e:
            print(f"Failed to download {url}. Error: {e}")
            # Do not leave a truncated file behind when the transfer broke
            # part-way through. Pre-existing files are never touched because
            # started_writing is only set after the existence check.
            if started_writing and os.path.exists(file_path):
                try:
                    os.remove(file_path)
                except OSError as cleanup_error:
                    print(f"Could not remove partial file {file_path}. Error: {cleanup_error}")
            return None
        

async def process_json(session: ClientSession, semaphore: asyncio.Semaphore, json_data):
    """Download the first PDF resource of every publication and write a log.

    Args:
        session: aiohttp session handed to download_file.
        semaphore: Concurrency limiter handed to download_file.
        json_data: Mapping of author -> list of publication dicts. Only the
            first entry of each publication's ``resources`` list is looked at,
            and only when its ``file_format`` is ``'PDF'``.

    Returns:
        A list with one download_file result per attempted download, in the
        order the downloads were queued. Empty when nothing qualified.

    Note:
        The "Saved as" value written to LOG_FILE is the sanitized title, not
        the timestamped name download_file actually uses on disk.
    """
    log_entries = []  # Lines for LOG_FILE, each terminated with a newline
    pending = []  # (url, title) pairs waiting to be downloaded

    for author, results in json_data.items():
        log_entries.append(f'Starting: {author}\n')
        for idx, item in enumerate(results, start=1):
            resources = item.get('resources')
            if not resources:
                continue

            resource = resources[0]
            if resource.get('file_format') != 'PDF':
                continue

            pdf_link = resource.get('link')
            if not pdf_link:
                continue  # Nothing to fetch without a URL

            # A missing title becomes '', which get_unique_filename turns
            # into its 'document' default.
            title = sanitize_filename(item.get('title') or '')

            pending.append((pdf_link, title))
            log_entries.append(f"{idx}. URL: {pdf_link}\n   Saved as: {title}\n")

    # Download in batches, accumulating the results of every batch.
    downloaded_files = []
    batch_size = 5
    for start in range(0, len(pending), batch_size):
        # Coroutines are created right before they are awaited so none is
        # left un-awaited if an earlier batch fails.
        batch = [
            download_file(semaphore, session, link, title)
            for link, title in pending[start:start + batch_size]
        ]
        downloaded_files.extend(await asyncio.gather(*batch))

    # Write the log once; the entries do not change between batches.
    with open(LOG_FILE, 'w', encoding='utf-8') as log_file:
        log_file.writelines(log_entries)

    return downloaded_files

##########################
# Filter 
##########################

def is_coral_related(text):
    """Ask the OpenAI chat model whether ``text`` is on topic.

    Sends a yes/no question to the model and returns True when the reply
    starts with "yes" (case-insensitive, ignoring leading quotes or
    punctuation). Any exception from the API call is printed and treated as
    "not related", so this function never raises.

    NOTE(review): the prompt asks about "the ocean" while the function name
    says "coral" -- confirm which criterion is intended.
    """
    prompt = (
        "Determine whether the following text is related to the ocean. "
        "Only respond with 'Yes' if it is related and 'No' if it is not.\n\n"
        f"Text: \"{text}\"\n\n"
        "Response:"
    )

    try:
        response = openai.chat.completions.create(
            model="gpt-4o-mini",  # Or use "gpt-3.5-turbo"
            messages=[
                {"role": "system", "content": "You are an expert in marine biology."},
                {"role": "user", "content": prompt}
            ],
            temperature=0  # Keep responses consistent
        )

        answer = response.choices[0].message.content.strip().lower()
        print(answer + " - " + text)
        # Accept "yes", "yes.", "'yes'" and similar; an exact comparison would
        # silently drop a relevant item over a trailing period or quote.
        return re.match(r"\W*yes\b", answer) is not None

    except Exception as e:
        print(f"Error: {e}")
        return False


async def filter_json_data(data):
    """Keep only publications that have a PDF resource and pass the topic check.

    Args:
        data: Mapping of author -> list of publication dicts.

    Returns:
        A new mapping with the same shape. Each kept publication is a shallow
        copy whose ``resources`` list contains only the PDF entries. Authors
        with no remaining publications are omitted.

    Note:
        Although declared ``async``, this function awaits nothing; the call to
        is_coral_related is an ordinary blocking call.
    """
    filtered_data = {}

    for author, publications in data.items():
        filtered_publications = []

        # `or []` guards against fields that are present but null in the JSON.
        for publication in publications or []:
            # First check if there are PDF resources
            resources = [resource for resource in (publication.get('resources') or [])
                         if resource.get('file_format') == 'PDF']

            if not resources:
                continue  # Skip if no PDF resources

            title = publication.get('title') or ''
            snippet = publication.get('snippet') or ''
            text = title + '\n' + snippet

            # A PDF is available, so spend an API call on the topic check
            if is_coral_related(text):
                filtered_publication = publication.copy()
                filtered_publication['resources'] = resources
                filtered_publications.append(filtered_publication)

        # Only keep the author if they have publications after filtering
        if filtered_publications:
            filtered_data[author] = filtered_publications

    return filtered_data

##########################
# Run
##########################

async def main():
    """Load the input JSON, filter it, save the filtered copy, then download.

    Reads ``EveryResults_Backup.json``, runs filter_json_data on it, writes
    the result to ``FilteredResults.json`` and finally hands the filtered data
    to process_json. A missing or malformed input file is reported with a
    printed message and ends the run early.
    """
    # Name of the input JSON
    json_file_path = 'EveryResults_Backup.json'

    output_file = 'FilteredResults.json'

    # Keep the try body limited to reading the input so these handlers cannot
    # mislabel errors raised by the filtering or download steps, and so the
    # input file is closed before the long-running work starts.
    try:
        with open(json_file_path, 'r') as file:
            json_data = json.load(file)
    except FileNotFoundError:
        print(f"File not found: {json_file_path}")
        return
    except json.JSONDecodeError:
        print(f"Error decoding JSON from file: {json_file_path}")
        return

    # Filter the data
    filtered_data = await filter_json_data(json_data)

    # Save the filtered data for viewing if needed
    with open(output_file, 'w') as f:
        json.dump(filtered_data, f, indent=2)
    print(f"Filtered data has been saved to '{output_file}'.")

    if not filtered_data:
        # Nothing survived the filter, so there is nothing to download.
        print("No publications left after filtering; nothing to download.")
        return

    # Semaphore passed down to the individual downloads as a concurrency cap
    semaphore = asyncio.Semaphore(10)

    async with aiohttp.ClientSession() as session:
        # Process the filtered data to download files in batches
        await process_json(session, semaphore, filtered_data)

if __name__ == "__main__":
    # Only the loop lookup is inside the try: wrapping asyncio.run(main()) in
    # `except RuntimeError` would re-run the whole pipeline whenever main()
    # itself raised a RuntimeError.
    try:
        running_loop = asyncio.get_running_loop()
    except RuntimeError:
        running_loop = None  # No event loop is running in this thread

    if running_loop is not None:
        # A loop is already running (e.g. in Jupyter): schedule main() on it.
        # Keep a reference so the task is not garbage-collected mid-run.
        _main_task = running_loop.create_task(main())
    else:
        # Plain script execution: let asyncio create and close the loop.
        asyncio.run(main())
