# In [None]:
# ----------------------- Package Installation -----------------------
import sys
import subprocess

def install_packages(packages=None):
    """
    Install every required package that cannot be imported yet.

    The pip distribution name and the importable module name are not always
    the same (``python-Levenshtein`` -> ``Levenshtein``, ``beautifulsoup4`` ->
    ``bs4``), so each requirement is described by both names. Checking the
    distribution name directly would fail on every run and trigger a needless
    ``pip install`` each time.

    Args:
        packages: Optional mapping of pip distribution name -> module name to
            import as the availability check. When omitted, the dependencies
            of this script are used.

    Raises:
        subprocess.CalledProcessError: If ``pip install`` exits with a
            non-zero status.
    """
    import importlib

    if packages is None:
        packages = {
            'requests': 'requests',
            'fuzzywuzzy': 'fuzzywuzzy.fuzz',
            'python-Levenshtein': 'Levenshtein',
            'nltk': 'nltk',
            'rich': 'rich',
            'tqdm': 'tqdm',
            'langdetect': 'langdetect',       # For language detection
            'beautifulsoup4': 'bs4',          # For HTML parsing if needed
            'psutil': 'psutil',               # For memory usage reporting
        }

    installed_any = False
    for package, module_name in packages.items():
        try:
            importlib.import_module(module_name)
        except ImportError:
            print(f"Installing package: {package}")
            subprocess.check_call([sys.executable, "-m", "pip", "install", package])
            installed_any = True

    if installed_any:
        # Packages installed while the interpreter is running are only
        # guaranteed to be importable after the finder caches are reset.
        importlib.invalidate_caches()

# Install necessary packages.
# This call runs before the third-party imports further down in the file.
install_packages()

# ----------------------- Imports -----------------------
import os
import re
import string
import requests
import logging
import psutil
from typing import List, Dict

from fuzzywuzzy import fuzz
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
from rich.logging import RichHandler
from rich.console import Console
from rich.traceback import install as install_rich_traceback
from langdetect import detect, LangDetectException
from bs4 import BeautifulSoup  # For HTML parsing
import json
import warnings

# Route uncaught exceptions through Rich for more readable tracebacks
install_rich_traceback()

# ----------------------- Logging Configuration -----------------------
console = Console()

# Console handler: accepts everything from DEBUG upwards
formatter = logging.Formatter(
    fmt='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)
rich_handler = RichHandler(console=console, rich_tracebacks=True)
rich_handler.setFormatter(formatter)
rich_handler.setLevel(logging.DEBUG)

logger = logging.getLogger('DataCollectorLogger')
logger.setLevel(logging.DEBUG)
# Only attach the handler when the logger has none yet, so running this
# configuration again does not produce duplicated log lines.
if not logger.handlers:
    logger.addHandler(rich_handler)

# ----------------------- Utility Functions -----------------------
def log_memory_usage(stage: str):
    """
    Log the resident set size (RSS) of the current process in megabytes.

    Args:
        stage: Label inserted into the log message to identify the point in
            the run at which the measurement was taken.
    """
    bytes_per_mb = 1024 ** 2
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    rss_mb = rss_bytes / bytes_per_mb
    logger.info(f'Memory Usage after {stage}: {rss_mb:.2f} MB')

def handle_exception(e: Exception, stage: str):
    """
    Log an exception at ERROR level together with the active traceback.

    Args:
        e: The exception to report.
        stage: Description of the step that failed; becomes part of the
            log message.
    """
    message = f'Exception in {stage}: {e}'
    # exc_info=True attaches the exception currently being handled.
    logger.error(message, exc_info=True)

# ----------------------- NLTK Resource Setup -----------------------
def setup_nltk():
    """
    Make sure the NLTK corpora used by this script are available locally.

    Each corpus is looked up first and only downloaded when the lookup
    raises ``LookupError``.
    """
    # (lookup path, download identifier, name shown in the log message)
    corpora = (
        ('corpora/stopwords', 'stopwords', 'stopwords'),
        ('corpora/wordnet', 'wordnet', 'WordNet'),
        ('corpora/omw-1.4', 'omw-1.4', 'omw-1.4'),
    )
    for resource_path, package_id, display_name in corpora:
        try:
            nltk.data.find(resource_path)
        except LookupError:
            logger.info(f"Downloading NLTK {display_name}...")
            nltk.download(package_id)

# Check for (and if necessary download) the NLTK corpora as soon as this code runs.
setup_nltk()

# ----------------------- Genre Keywords Definition -----------------------
# Maps each genre label to the keywords that identify it.
# DataCollector.get_genre_labels compares every keyword with a book's subject
# strings using fuzz.partial_ratio and assigns the genre on a score >= 85.
# A few keywords are listed under more than one genre ('supernatural' under
# Fantasy and Horror, 'travelogue' under Nonfiction and Adventure), so a
# single subject can map a book to several genres.
genre_keywords = {
    'Fiction': [
        'fiction', 'novel', 'narrative', 'story', 'narrative prose',
        'imaginative', 'literary', 'prose', 'literature', 'literary fiction',
        'realistic fiction', 'contemporary fiction', 'historical fiction'
    ],
    'Romance': [
        'romance', 'love stories', 'courtship', 'relationship', 'erotic', 'romantic fiction',
        'romantic novel', 'romantic drama', 'love affair', 'passion', 'heartwarming',
        'love triangle', 'erotica'
    ],
    'Mystery': [
        'mystery', 'detective', 'crime', 'whodunit', 'noir', 'suspense', 'police procedural',
        'forensic', 'investigation', 'thriller', 'espionage', 'conspiracy', 'cold case',
        'locked room mystery', 'legal thriller'
    ],
    'Science Fiction': [
        'science fiction', 'sci-fi', 'space', 'dystopian', 'cyberpunk', 'time travel', 'aliens',
        'futuristic', 'post-apocalyptic', 'robotics', 'space opera', 'military sci-fi',
        'biopunk', 'steampunk', 'hard science fiction'
    ],
    'Fantasy': [
        'fantasy', 'magic', 'dragons', 'sword and sorcery', 'high fantasy', 'epic fantasy',
        'urban fantasy', 'mythical', 'enchanted', 'supernatural', 'dark fantasy',
        'fairy tale', 'paranormal', 'magical realism', 'heroic fantasy'
    ],
    'Horror': [
        'horror', 'ghosts', 'monsters', 'supernatural', 'gothic', 'psychological horror',
        'slasher', 'terror', 'occult', 'dark horror', 'paranormal horror', 'creepy',
        'suspenseful', 'spooky', 'demonic', 'haunted'
    ],
    'Drama': [
        'drama', 'society', 'conflict', 'melodrama', 'tragic', 'play', 'theater',
        'emotional', 'intense relationships', 'character study', 'family drama',
        'coming-of-age', 'social issues', 'psychological drama'
    ],
    'Comedy': [
        'comedy', 'humor', 'satire', 'parody', 'slapstick', 'farce', 'irony',
        'witty', 'lighthearted', 'humorous fiction', 'black comedy', 'romantic comedy',
        'situational comedy', 'comedic drama'
    ],
    'Nonfiction': [
        'nonfiction', 'essay', 'memoir', 'informative', 'educational', 'documentary',
        'factual', 'realistic', 'instructional', 'geography', 'political science', 
        'economics', 'history', 'biography', 'autobiography', 'self-help', 'philosophy',
        'science methodology', 'social science', 'psychology', 'travelogue',
        'true crime', 'journalism', 'essay collection'
    ],
    'Adventure': [
        'adventure', 'journey', 'exploration', 'quest', 'expedition', 'voyage', 'trek',
        'travelogue', 'action-packed', 'escapade', 'survival', 'explorer', 'pioneering',
        'daring', 'bravery', 'odyssey', 'sailing'
    ],
}

# ----------------------- Data Cleaning Functions -----------------------
def extract_text_from_html(html_content: str) -> str:
    """
    Return the visible text of an HTML document with whitespace collapsed.

    ``<script>`` and ``<style>`` elements are removed before the text is
    extracted, and every run of whitespace becomes a single space.

    Args:
        html_content: HTML markup to convert.

    Returns:
        The extracted text, or an empty string if parsing fails (the error
        is logged through ``handle_exception``).
    """
    try:
        document = BeautifulSoup(html_content, 'html.parser')
        # Drop elements whose content is never shown to a reader.
        for hidden_tag in document.find_all(['script', 'style']):
            hidden_tag.decompose()
        visible_text = document.get_text(separator=' ')
        return re.sub(r'\s+', ' ', visible_text)
    except Exception as err:
        handle_exception(err, 'Extracting Text from HTML')
        return ""

# Patterns are compiled once here rather than once per scanned line.
_GUTENBERG_START_RE = re.compile(
    r'\*\*\*\s*START OF (THIS|THE) PROJECT GUTENBERG EBOOK', re.IGNORECASE)
_GUTENBERG_END_RE = re.compile(
    r'\*\*\*\s*END OF (THIS|THE) PROJECT GUTENBERG EBOOK', re.IGNORECASE)
# A line consisting solely of "CHAPTER"/"LETTER" followed by an Arabic or
# Roman numeral and an optional full stop. L and C are accepted so that
# numerals from 40 upwards (XL, L, XC, C, ...) are recognised as well.
_SECTION_HEADING_RE = re.compile(
    r'^(?:LETTER|CHAPTER)\s+(?:[IVXLC]+|\d+)\.?$', re.IGNORECASE)


def clean_text_line_based(raw_text: str) -> str:
    """
    Strip Project Gutenberg boilerplate and section headings from a book.

    Processing steps:
      1. Remove a leading byte-order mark.
      2. Keep only the lines between the ``*** START OF ...`` and
         ``*** END OF ...`` markers (a missing marker is logged and the
         corresponding end of the text is used instead).
      3. Discard everything up to and including the first CHAPTER/LETTER
         heading, if there is one.
      4. Drop all remaining CHAPTER/LETTER heading lines.
      5. Collapse every run of whitespace (including line breaks) into a
         single space and trim the result.

    Args:
        raw_text: Full text of the book as downloaded.

    Returns:
        The cleaned text on a single line. If an unexpected error occurs it
        is logged and the input is returned without the remaining steps
        applied.
    """
    try:
        # Equivalent to a utf-8-sig decode, without re-encoding the whole book.
        if raw_text.startswith('\ufeff'):
            raw_text = raw_text[1:]
        lines = raw_text.splitlines()

        start_idx = next(
            (i + 1 for i, line in enumerate(lines) if _GUTENBERG_START_RE.match(line)),
            None,
        )
        if start_idx is None:
            logger.warning("Start marker not found. Using entire text.")
            start_idx = 0

        # Only look for the end marker after the start marker so the slice
        # below can never be inverted.
        end_idx = next(
            (i for i in range(start_idx, len(lines)) if _GUTENBERG_END_RE.match(lines[i])),
            None,
        )
        if end_idx is None:
            logger.warning("End marker not found. Using entire text up to the end.")
            end_idx = len(lines)

        content_lines = lines[start_idx:end_idx]

        # Front matter (title page, preface, ...) ends at the first heading.
        first_heading = next(
            (i for i, line in enumerate(content_lines)
             if _SECTION_HEADING_RE.match(line.strip())),
            None,
        )
        if first_heading is not None:
            content_lines = content_lines[first_heading + 1:]

        body_lines = [
            line for line in content_lines
            if not _SECTION_HEADING_RE.match(line.strip())
        ]

        # Collapsing all whitespace also covers runs of blank lines, so no
        # separate blank-line normalisation is needed.
        cleaned_text = re.sub(r'\s+', ' ', '\n'.join(body_lines)).strip()

        if not cleaned_text:
            logger.warning("Cleaned text is empty after processing.")
        return cleaned_text

    except Exception as e:
        handle_exception(e, 'Cleaning Text Line-Based')
        return raw_text

# ----------------------- Data Collector Class -----------------------
class DataCollector:
    """
    Collects English Project Gutenberg books per genre through the Gutendex API.

    The catalogue is read page by page. Each English book is mapped to genres
    by fuzzy-matching its subjects against ``genre_keywords``; books that map
    to a genre still below ``target_per_genre`` are downloaded, cleaned with
    ``clean_text_line_based`` and kept in memory. Nothing is written to disk
    by this class.
    """

    # Stop collecting after this many listing pages in a row could not be
    # fetched; isolated failures are skipped instead.
    MAX_CONSECUTIVE_PAGE_FAILURES = 3

    # Exact Gutendex format keys, most preferred first.
    PREFERRED_FORMATS = (
        'text/plain; charset=utf-8',
        'text/plain; charset=us-ascii',
        'text/plain',
        'application/octet-stream',
        'text/html; charset=utf-8',
    )

    def __init__(self, genre_keywords: Dict[str, List[str]] = None,
                 target_per_genre=10, max_retries=5, backoff_factor=0.3):
        """
        Args:
            genre_keywords: Mapping of genre name -> keywords. When omitted,
                the module-level ``genre_keywords`` mapping is used.
            target_per_genre: Number of books wanted for every genre.
            max_retries: Total automatic retries per HTTP request.
            backoff_factor: Back-off factor between automatic retries.
        """
        if genre_keywords is None:
            # The parameter shadows the module-level mapping of the same
            # name, so the default is looked up through globals(). Resolving
            # it here (not in the signature) binds it at call time.
            genre_keywords = globals()['genre_keywords']

        self.api_url = 'https://gutendex.com/books/'
        self.genre_keywords = genre_keywords
        self.target_per_genre = target_per_genre
        self.max_retries = max_retries
        self.backoff_factor = backoff_factor

        # Books whose download or cleaning failed.
        self.blacklist = set()
        self.genre_counts = {genre: 0 for genre in self.genre_keywords}

        # One session for all requests, retrying idempotent calls on
        # rate-limit and server-error responses.
        self.session = requests.Session()
        retries = requests.adapters.Retry(
            total=self.max_retries,
            backoff_factor=self.backoff_factor,
            status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods=["HEAD", "GET", "OPTIONS"]
        )
        adapter = requests.adapters.HTTPAdapter(max_retries=retries)
        self.session.mount('http://', adapter)
        self.session.mount('https://', adapter)

        self.current_book_id = None

    def _fetch_page(self, page: int):
        """
        Fetch one listing page.

        Returns:
            The list of book dicts on the page, an empty list when there are
            no more books, or ``None`` when the request failed and the page
            should merely be skipped.
        """
        try:
            response = self.session.get(self.api_url, params={'page': page}, timeout=15)
        except requests.exceptions.ReadTimeout:
            logger.error(f'ReadTimeoutError: Timeout while fetching page {page}. Skipping to next page.')
            return None
        except requests.exceptions.RequestException as e:
            handle_exception(e, f'Fetching Books from Page {page}')
            return None

        if response.status_code != 200:
            logger.warning(f'Failed to fetch books from page {page}. Status Code: {response.status_code}')
            # NOTE(review): a 404 is treated as the end of the catalogue
            # (presumably the page number is past the last page); any other
            # status is treated as a transient failure.
            return [] if response.status_code == 404 else None

        try:
            data = response.json()
        except ValueError as e:
            # Covers both the standard-library and the requests JSON errors.
            handle_exception(e, f'Fetching Books from Page {page}')
            return None

        results = data.get('results', [])
        if not results:
            logger.info("No more books found in API.")
        logger.info(f'Fetched page {page} with {len(results)} books.')
        return results

    def fetch_books(self, page: int) -> List[Dict]:
        """
        Return the books listed on ``page``.

        Returns:
            A list of book dicts; empty both when the page has no books and
            when the request failed (failures are logged).
        """
        results = self._fetch_page(page)
        return [] if results is None else results

    @classmethod
    def _select_text_source(cls, formats: Dict[str, str]):
        """
        Choose the download URL for a book's text.

        Exact keys from ``PREFERRED_FORMATS`` win; otherwise any
        ``text/plain`` and then any ``text/html`` variant is accepted (this
        covers keys carrying other charsets). URLs ending in ``.zip`` are
        never chosen because an archive cannot be decoded as text.

        Returns:
            ``(url, format_key)`` or ``(None, None)`` if nothing is usable.
        """
        def usable(url):
            return isinstance(url, str) and bool(url) and not url.lower().endswith('.zip')

        for fmt in cls.PREFERRED_FORMATS:
            url = formats.get(fmt)
            if usable(url):
                return url, fmt

        for prefix in ('text/plain', 'text/html'):
            for fmt, url in formats.items():
                if fmt.lower().startswith(prefix) and usable(url):
                    return url, fmt

        return None, None

    def download_book(self, gutenberg_id: int, retries: int = 3,
                      formats: Dict[str, str] = None) -> str:
        """
        Downloads and returns the plain text of a book. No file saving is done here.

        Args:
            gutenberg_id: Gutendex/Project Gutenberg book id.
            retries: Number of attempts for the text download when the
                request itself raises.
            formats: The book's ``formats`` mapping if the caller already has
                it (it is part of every listing entry). When omitted, the
                book's metadata is requested from the API first.

        Returns:
            The book text (HTML sources are converted to plain text), or an
            empty string if the download fails.
        """
        try:
            if formats is None:
                metadata_url = f'https://gutendex.com/books/{gutenberg_id}'
                response = self.session.get(metadata_url, timeout=15)
                if response.status_code != 200:
                    logger.warning(f'Failed to fetch metadata for Book {gutenberg_id}. Status Code: {response.status_code}')
                    return ""
                formats = response.json().get('formats', {})

            text_url, chosen_format = self._select_text_source(formats or {})
            if not text_url:
                logger.warning(f'No preferred text format found for Book {gutenberg_id}.')
                return ""

            format_key = chosen_format.lower()
            for attempt in range(1, retries + 1):
                try:
                    text_response = self.session.get(text_url, timeout=15)
                except requests.exceptions.RequestException as e:
                    logger.warning(f'Attempt {attempt} failed for Book {gutenberg_id} from {text_url}: {e}')
                    continue

                if text_response.status_code != 200:
                    logger.warning(f'Non-200 status code ({text_response.status_code}) for Book {gutenberg_id} from {text_url}.')
                    return ""

                if 'charset=utf-8' in format_key:
                    # Without a charset in the HTTP header, requests decodes
                    # text/* as ISO-8859-1; trust the declared charset.
                    text_response.encoding = 'utf-8'

                text = text_response.text
                if format_key.startswith('text/html'):
                    # Reduce markup to readable text before it is cleaned.
                    text = extract_text_from_html(text)
                return text

            logger.error(f'All {retries} attempts failed for Book {gutenberg_id}.')
            return ""

        except Exception as e:
            handle_exception(e, f'Download Book {gutenberg_id}')
            return ""

    def get_genre_labels(self, subjects: List[str]) -> List[str]:
        """
        Map a book's subject strings to the configured genres.

        A genre is assigned when any of its keywords reaches a
        ``fuzz.partial_ratio`` score of at least 85 against any subject.

        Returns:
            The matching genres in the order of ``genre_keywords`` (so the
            result is the same on every run); empty on error or no match.
        """
        try:
            subjects_lower = [subject.lower() for subject in (subjects or [])]
            return [
                genre
                for genre, keywords in self.genre_keywords.items()
                if any(
                    fuzz.partial_ratio(keyword.lower(), subject) >= 85
                    for subject in subjects_lower
                    for keyword in keywords
                )
            ]
        except Exception as e:
            handle_exception(e, 'Mapping Genres')
            return []

    def collect_data(self) -> List[Dict]:
        """
        Run the collection until every genre is filled or no books remain.

        A listing page that cannot be fetched is skipped; the run only stops
        early after ``MAX_CONSECUTIVE_PAGE_FAILURES`` failed pages in a row.

        Returns:
            One dict per collected book with the keys ``id``, ``title``,
            ``text`` (cleaned) and ``genres`` (the genres the book was
            counted towards).
        """
        collected_data = []
        current_page = 1
        consecutive_failures = 0
        total_genres = len(self.genre_keywords)
        target_total = self.target_per_genre * total_genres
        logger.info(f"Starting data collection aiming for {self.target_per_genre} books per genre ({target_total} total).")

        with tqdm(total=target_total, desc="Collecting Books", unit="book") as pbar:
            while sum(self.genre_counts.values()) < target_total:
                books = self._fetch_page(current_page)

                if books is None:
                    consecutive_failures += 1
                    if consecutive_failures >= self.MAX_CONSECUTIVE_PAGE_FAILURES:
                        logger.error(f'{consecutive_failures} consecutive pages could not be fetched. Stopping data collection.')
                        break
                    current_page += 1
                    continue
                consecutive_failures = 0

                if not books:
                    logger.info("No more books to fetch from API.")
                    break

                for book in books:
                    book_id = book.get('id')
                    title = book.get('title', 'Unknown Title')
                    languages = book.get('languages', [])

                    if book_id in self.blacklist:
                        continue

                    self.current_book_id = book_id

                    if 'en' not in languages:
                        continue

                    subjects = book.get('subjects', [])
                    mapped_genres = self.get_genre_labels(subjects)

                    if not mapped_genres:
                        continue

                    # Only count the book towards genres that still need books.
                    genres_available = [g for g in mapped_genres if self.genre_counts[g] < self.target_per_genre]
                    if not genres_available:
                        continue

                    # The listing entry already carries the download links,
                    # which saves one metadata request per book.
                    book_text = self.download_book(book_id, formats=book.get('formats'))
                    if not book_text:
                        self.blacklist.add(book_id)
                        continue

                    cleaned_text = clean_text_line_based(book_text)
                    if not cleaned_text:
                        logger.warning(f"Cleaned text is empty for Book ID: {book_id}. Skipping this book.")
                        self.blacklist.add(book_id)
                        continue

                    collected_data.append({
                        "id": book_id,
                        "title": title,
                        "text": cleaned_text,
                        "genres": genres_available
                    })

                    for genre in genres_available:
                        self.genre_counts[genre] += 1
                        pbar.update(1)
                        if self.genre_counts[genre] >= self.target_per_genre:
                            logger.info(f'Target reached for genre: {genre}')

                    logger.info(f'Book {book_id} processed with genres: {genres_available}')

                current_page += 1

        log_memory_usage("After Data Compilation")
        logger.info(f'Total books successfully processed: {len(collected_data)}')
        return collected_data

# ----------------------- Data Compilation Execution -----------------------
# Build the collector (target: 100 books for every genre) and run it
collector = DataCollector(genre_keywords=genre_keywords, target_per_genre=100)
collected_data = collector.collect_data()

# ----------------------- Data Saving -----------------------
# The whole collection is written as one pretty-printed UTF-8 JSON document.
output_file = 'collected_books.json'
try:
    with open(output_file, mode='w', encoding='utf-8') as f:
        json.dump(collected_data, f, indent=4, ensure_ascii=False)
except Exception as e:
    handle_exception(e, f'Saving Data to {output_file}')
else:
    logger.info(f"Successfully saved collected data to '{output_file}'.")

# ----------------------- Data Summary -----------------------
for summary_line in (
    "=== Data Collection Summary ===",
    f"Total Books Processed: {len(collected_data)}",
    "Data collection process completed.",
):
    logger.info(summary_line)
