# Jupyter notebook cell (exported as a script)
import pandas as pd
import requests
import time
import re
from bs4 import BeautifulSoup
import numpy as np
import os
from tqdm import tqdm
import json
from datetime import datetime
import signal
import sys
import pickle

class MovieDataEnricher:
    """
    Enriches a movie dataset with detailed information from online sources.
    Supports pausing and resuming the enrichment process.
    """
    
    def __init__(self, api_key=None):
        """
        Prepare the HTTP session, the counters and the pause/resume state.

        Args:
            api_key: OMDb API key. When left as None, only web scraping
                is used. Free keys: http://www.omdbapi.com/apikey.aspx
        """
        self.api_key = api_key

        # Pause/resume bookkeeping; the paths are filled in by enrich_dataset()
        self.is_paused = False
        self.processed_indices = set()
        self.input_path = None
        self.output_path = None
        self.checkpoint_file = None

        # Running counters, all starting at zero
        self.stats = dict.fromkeys(
            ('total_movies', 'api_success', 'web_success', 'failed', 'processed'),
            0,
        )

        # Browser-like headers to avoid being blocked
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept-Language': 'en-US,en;q=0.9',
        }

        # One shared session so every request carries the headers above
        self.session = requests.Session()
        self.session.headers.update(self.headers)
    
    def _extract_year(self, date_value):
        """
        Extract year from various date formats.
        
        Args:
            date_value: Date in various formats (string, datetime, etc.)
            
        Returns:
            Integer year or None if extraction fails
        """
        if pd.isna(date_value):
            return None
            
        # If already an integer
        if isinstance(date_value, int) and 1900 <= date_value <= 2030:
            return date_value
            
        # If a string, try various formats
        if isinstance(date_value, str):
            # Try direct year extraction (e.g., "2015")
            if re.match(r'^(19|20)\d{2}$', date_value):
                return int(date_value)
                
            # Try format like "16-Dec-15"
            try:
                dt = datetime.strptime(date_value, '%d-%b-%y')
                return dt.year
            except ValueError:
                pass
                
            # Try format like "2015-12-16"
            try:
                dt = datetime.strptime(date_value, '%Y-%m-%d')
                return dt.year
            except ValueError:
                pass
                
            # Try format like "12/16/2015"
            try:
                dt = datetime.strptime(date_value, '%m/%d/%Y')
                return dt.year
            except ValueError:
                pass
                
            # Extract 4-digit number from string
            year_match = re.search(r'(19|20)\d{2}', date_value)
            if year_match:
                return int(year_match.group(0))
        
        # If datetime object
        if isinstance(date_value, (datetime, pd.Timestamp)):
            return date_value.year
            
        return None
    
    def _save_checkpoint(self, movies_df):
        """Save the current state to allow resuming later"""
        if self.checkpoint_file:
            checkpoint_data = {
                'processed_indices': self.processed_indices,
                'stats': self.stats,
                'last_processed_index': max(self.processed_indices) if self.processed_indices else -1
            }
            
            # Save the current DataFrame
            movies_df.to_csv(self.output_path + '.partial', index=False)
            
            # Save the checkpoint data
            with open(self.checkpoint_file, 'wb') as f:
                pickle.dump(checkpoint_data, f)
                
            print(f"\nCheckpoint saved: Processed {len(self.processed_indices)} of {self.stats['total_movies']} movies.")
    
    def _load_checkpoint(self):
        """Load the checkpoint data to resume processing"""
        if os.path.exists(self.checkpoint_file):
            try:
                with open(self.checkpoint_file, 'rb') as f:
                    checkpoint_data = pickle.load(f)
                
                self.processed_indices = checkpoint_data['processed_indices']
                self.stats = checkpoint_data['stats']
                
                print(f"Loaded checkpoint: Previously processed {len(self.processed_indices)} movies.")
                return True
            except Exception as e:
                print(f"Error loading checkpoint: {e}")
                return False
        return False
    
    def _handle_interrupt(self, signal, frame):
        """Handle keyboard interrupt (Ctrl+C)"""
        print("\n\nPausing the enrichment process...")
        self.is_paused = True
        
        # Let the main loop handle the actual saving
    
    def enrich_dataset(self, input_path, output_path, title_column='movie_title', year_column=None, limit=None, resume=True, checkpoint_interval=10):
        """
        Enriches movies dataset with detailed information.

        Each row is looked up through the OMDb API (when an API key is set),
        falling back to IMDb web scraping. Progress is checkpointed
        periodically and whenever the run is paused with Ctrl+C, so a later
        call with ``resume=True`` continues where this one stopped. The
        enriched data is written to ``output_path`` even when the run was
        paused; the checkpoint files are removed once every row is processed.

        Args:
            input_path: Path to the input CSV file
            output_path: Path to save the enriched CSV file
            title_column: Column name containing movie titles. If missing, the
                first column whose name contains "title" or "movie" is used.
            year_column: Optional column name containing release years
            limit: Optional limit on number of movies to process (for testing);
                only applied when starting a new run
            resume: Whether to try to resume from a previous run
            checkpoint_interval: How often to save checkpoints (number of
                movies); 0 or None disables periodic checkpoints

        Returns:
            Pandas DataFrame with enriched movie data

        Raises:
            ValueError: If no usable title column can be found
        """
        # Set up checkpoint file
        self.input_path = input_path
        self.output_path = output_path
        self.checkpoint_file = output_path + '.checkpoint'
        partial_path = output_path + '.partial'

        # A previous run on this instance may have been paused
        self.is_paused = False

        # Create the output directory up front: checkpoints are written there
        # long before the final save. dirname is '' for a bare file name.
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        # Check for resumable session
        resumed = bool(resume and os.path.exists(partial_path) and self._load_checkpoint())
        if resumed:
            print("Resuming from previous session...")
            movies_df = pd.read_csv(partial_path)
        else:
            print("Starting new enrichment process...")
            # Reset state
            self.processed_indices = set()
            self.stats = {
                'total_movies': 0,
                'api_success': 0,
                'web_success': 0,
                'failed': 0,
                'processed': 0
            }

            print("Loading dataset...")
            movies_df = pd.read_csv(input_path)

            # Limit dataset size if specified (for testing); copy so that
            # adding columns below does not write into a slice of the original
            if limit and limit < len(movies_df):
                movies_df = movies_df.head(limit).copy()

        # Ensure the title column exists (needed on resume as well)
        if title_column not in movies_df.columns:
            title_options = [col for col in movies_df.columns if 'title' in col.lower() or 'movie' in col.lower()]
            if title_options:
                title_column = title_options[0]
                print(f"Using '{title_column}' as the title column")
            else:
                raise ValueError("No title column found in dataset. Please specify the correct column name.")

        # Prepare new columns
        new_columns = [
            'imdb_id', 'certificate', 'director', 'imdb_rating',
            'runtime_minutes', 'main_stars', 'genres', 'country',
            'language', 'production_company', 'extracted_year'
        ]
        numeric_columns = {'runtime_minutes', 'extracted_year'}

        for col in new_columns:
            if col not in movies_df.columns:
                movies_df[col] = np.nan
            # Text results are written cell by cell; an all-NaN column is
            # float64 and pandas has deprecated silently upcasting it.
            if col not in numeric_columns:
                movies_df[col] = movies_df[col].astype(object)

        # Pre-process year data (a resumed partial file already contains it)
        if not resumed and year_column and year_column in movies_df.columns:
            movies_df['extracted_year'] = movies_df[year_column].apply(self._extract_year)
            print(f"Extracted years from {year_column} column")

        self.stats['total_movies'] = len(movies_df)

        # Process each movie
        print(f"Enriching data for {len(movies_df)} movies...")

        all_indices = set(movies_df.index.tolist())
        already_done = all_indices & self.processed_indices
        # Sorted for a consistent processing order
        remaining_indices = sorted(all_indices - self.processed_indices)

        # Turn Ctrl+C into a pause request for the duration of the loop only.
        # signal.signal() raises ValueError outside the main thread; in that
        # case simply run without pause support.
        try:
            previous_handler = signal.signal(signal.SIGINT, self._handle_interrupt)
        except ValueError:
            previous_handler = None

        try:
            with tqdm(total=len(all_indices)) as progress_bar:
                # Show already processed items
                progress_bar.update(len(already_done))

                for index in remaining_indices:
                    # Check if we should pause
                    if self.is_paused:
                        break

                    title = movies_df.at[index, title_column]

                    # Rows without a title are done but not counted in the
                    # statistics, which only cover attempted lookups
                    if pd.isna(title) or not str(title).strip():
                        self.processed_indices.add(index)
                        progress_bar.update(1)
                        continue

                    # The year column is float whenever it contains NaN;
                    # pass a clean int on to the lookups
                    raw_year = movies_df.at[index, 'extracted_year']
                    year = int(raw_year) if pd.notna(raw_year) else None

                    self._enrich_single_movie(movies_df, index, str(title).strip(), year)

                    # Mark as processed
                    self.processed_indices.add(index)
                    self.stats['processed'] += 1
                    progress_bar.update(1)

                    # Save checkpoint occasionally
                    if checkpoint_interval and self.stats['processed'] % checkpoint_interval == 0:
                        self._save_checkpoint(movies_df)
        finally:
            # Give Ctrl+C its previous meaning back
            if previous_handler is not None:
                signal.signal(signal.SIGINT, previous_handler)

        completed = all_indices <= self.processed_indices

        # Paused or otherwise unfinished: persist progress now, while the
        # DataFrame still has the extracted_year column needed on resume
        if not completed:
            self._save_checkpoint(movies_df)

        # Save final results
        print("Saving enriched dataset...")

        # Drop the temporary extracted_year column
        if 'extracted_year' in movies_df.columns:
            movies_df = movies_df.drop(columns=['extracted_year'])

        movies_df.to_csv(output_path, index=False)

        # If completed successfully, remove checkpoint files
        if completed:
            for leftover in (self.checkpoint_file, partial_path):
                if os.path.exists(leftover):
                    os.remove(leftover)

        self._print_statistics()

        if self.is_paused:
            print("\nProcess was paused. Run the script again with resume=True to continue.")

        return movies_df

    def _enrich_single_movie(self, movies_df, index, title, year):
        """Enrich one row (API first, scraping as fallback) and update the success/failure counters."""
        if self.api_key and self._enrich_with_api(movies_df, index, title, year):
            self.stats['api_success'] += 1
            # Wait to respect API rate limits
            time.sleep(1)
            return

        # No API key, or the API lookup failed: fall back to web scraping
        if self._enrich_with_web_scraping(movies_df, index, title, year):
            self.stats['web_success'] += 1
        else:
            self.stats['failed'] += 1

        # Wait between requests to avoid being blocked
        time.sleep(2)

    def _print_statistics(self):
        """Print the run summary; percentages are 0 when no movie was attempted."""
        processed = self.stats['processed']

        def share(count):
            return count / processed if processed else 0.0

        print("\nEnrichment Statistics:")
        print(f"Total movies processed: {processed} of {self.stats['total_movies']}")
        if self.api_key:
            print(f"Successfully enriched via API: {self.stats['api_success']} ({share(self.stats['api_success']):.1%})")
        print(f"Successfully enriched via web scraping: {self.stats['web_success']} ({share(self.stats['web_success']):.1%})")
        print(f"Failed to enrich: {self.stats['failed']} ({share(self.stats['failed']):.1%})")
    
    def _enrich_with_api(self, df, index, title, year=None):
        """
        Enriches a movie using the OMDb API.
        
        Args:
            df: DataFrame containing the movies
            index: Row index to update
            title: Movie title to search for
            year: Optional release year
            
        Returns:
            Boolean indicating success
        """
        # Build search URL
        params = {
            'apikey': self.api_key,
            't': title,
            'plot': 'short',
            'r': 'json'
        }
        
        # Only add year if it's a valid integer
        if year is not None and isinstance(year, (int, float)) and not np.isnan(year):
            params['y'] = int(year)
        
        try:
            response = requests.get('http://www.omdbapi.com/', params=params)
            data = response.json()
            
            # Check if we got a valid response
            if data.get('Response') == 'True':
                # Update DataFrame with API data
                df.at[index, 'imdb_id'] = data.get('imdbID', np.nan)
                df.at[index, 'certificate'] = data.get('Rated', np.nan)
                df.at[index, 'director'] = data.get('Director', np.nan)
                df.at[index, 'imdb_rating'] = data.get('imdbRating', np.nan)
                
                # Convert runtime to minutes
                if 'Runtime' in data and data['Runtime'] != 'N/A':
                    runtime_str = data['Runtime']
                    minutes = re.search(r'(\d+)', runtime_str)
                    if minutes:
                        df.at[index, 'runtime_minutes'] = int(minutes.group(1))
                
                df.at[index, 'main_stars'] = data.get('Actors', np.nan)
                df.at[index, 'genres'] = data.get('Genre', np.nan)
                df.at[index, 'country'] = data.get('Country', np.nan)
                df.at[index, 'language'] = data.get('Language', np.nan)
                df.at[index, 'production_company'] = data.get('Production', np.nan)
                
                return True
            return False
        except Exception as e:
            print(f"API error for '{title}': {e}")
            return False
    
    def _find_imdb_id(self, title, year=None):
        """
        Finds an IMDb ID by searching for a movie title.

        Args:
            title: Movie title to search for
            year: Optional release year; integral floats such as 2015.0 are
                sent as "2015", and NaN is ignored

        Returns:
            IMDb ID (e.g. "tt0133093") of the first title result, or None if
            nothing was found or the request failed
        """
        search_query = str(title)
        if year and pd.notna(year):
            try:
                year_text = str(int(year))
            except (TypeError, ValueError):
                year_text = str(year)
            search_query += f" {year_text}"

        try:
            # Pass the query as a parameter so titles containing '&', '#',
            # '?' and the like are URL-encoded instead of breaking the URL.
            response = self.session.get(
                "https://www.imdb.com/find/",
                params={'q': search_query},
                timeout=15,
            )
            # An error/blocked page holds no results; report it instead of
            # treating it as "movie not found"
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            # Look for search results
            search_results = soup.select('li.ipc-metadata-list-summary-item')

            for result in search_results:
                # Only entries that carry a title block
                if not result.select_one('.ipc-metadata-list-summary-item__tl'):
                    continue

                link = result.select_one('a')
                href = link.get('href') if link else None
                if not href:
                    continue

                imdb_id_match = re.search(r'/title/(tt\d+)/', href)
                if imdb_id_match:
                    return imdb_id_match.group(1)

            return None
        except Exception as e:
            # Best effort per movie: report and continue with the next one
            print(f"Error finding IMDb ID for '{title}': {e}")
            return None
    
    def _enrich_with_web_scraping(self, df, index, title, year=None):
        """
        Enriches a movie using web scraping from IMDb.

        Looks up the IMDb ID for the title, downloads the title page and
        copies whatever fields the selectors below can find into the row.
        Fields that are not found are left untouched. The IMDb ID is stored
        as soon as it is known, even if the page download then fails.

        NOTE(review): several selectors rely on generated class names
        (e.g. "sc-bde20123-1") that presumably change when IMDb updates its
        front end; verify them against the live page when fields come back
        empty.

        Args:
            df: DataFrame containing the movies (updated in place)
            index: Row index to update
            title: Movie title to search for
            year: Optional release year

        Returns:
            Boolean indicating success (True when the title page was fetched
            and parsed, regardless of how many fields were found)
        """
        # First, try to find the IMDb ID
        imdb_id = self._find_imdb_id(title, year)

        if not imdb_id:
            return False

        # Store the IMDb ID
        df.at[index, 'imdb_id'] = imdb_id

        # Get the movie page
        movie_url = f"https://www.imdb.com/title/{imdb_id}/"

        try:
            response = self.session.get(movie_url, timeout=15)
            # A blocked or error page would parse to "no fields found" and be
            # counted as a success; fail loudly instead
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            # Certificate
            certificate_elem = soup.select_one('a[href*="certificates"]')
            if certificate_elem:
                df.at[index, 'certificate'] = certificate_elem.text.strip()

            # Director
            director_elem = soup.select_one('a[href*="tt_ov_dr"]')
            if director_elem:
                df.at[index, 'director'] = director_elem.text.strip()
            else:
                # Try alternative selector. ":-soup-contains" is the current
                # name of the deprecated ":contains" pseudo-class.
                directors = soup.select('.ipc-metadata-list__item:-soup-contains("Director") a')
                if directors:
                    df.at[index, 'director'] = ', '.join([d.text.strip() for d in directors[:2]])

            # IMDb Rating
            rating_elem = soup.select_one('.ipc-button__text .sc-bde20123-1')
            if rating_elem:
                df.at[index, 'imdb_rating'] = rating_elem.text.strip()

            # Runtime, shown as e.g. "2h 16m"
            runtime_elem = soup.select_one('span[class*="sc-afe43def-4"]')
            if runtime_elem:
                runtime_text = runtime_elem.text.strip()
                hours_match = re.search(r'(\d+)h', runtime_text)
                minutes_match = re.search(r'(\d+)m', runtime_text)

                total_minutes = 0
                if hours_match:
                    total_minutes += int(hours_match.group(1)) * 60
                if minutes_match:
                    total_minutes += int(minutes_match.group(1))

                if total_minutes > 0:
                    df.at[index, 'runtime_minutes'] = total_minutes

            # Stars
            stars_elems = soup.select('.ipc-metadata-list__item:-soup-contains("Stars") a.ipc-metadata-list-item__list-content-item')
            if stars_elems:
                stars = [s.text.strip() for s in stars_elems if 'See full cast' not in s.text]
                df.at[index, 'main_stars'] = ', '.join(stars[:4])  # Get up to 4 stars

            # Genres; the same genre can be linked more than once on the
            # page, so de-duplicate while keeping the original order
            genre_elems = soup.select('a[href*="genres"]')
            if genre_elems:
                genres = list(dict.fromkeys(g.text.strip() for g in genre_elems))
                df.at[index, 'genres'] = ', '.join(genres)

            # Country
            country_elem = soup.select_one('a[href*="country_of_origin"]')
            if country_elem:
                df.at[index, 'country'] = country_elem.text.strip()

            # Language
            language_elem = soup.select_one('a[href*="primary_language"]')
            if language_elem:
                df.at[index, 'language'] = language_elem.text.strip()

            # Production company (harder to reliably extract)
            company_elems = soup.select('.ipc-metadata-list__item:-soup-contains("Production companies") a')
            if company_elems:
                companies = [c.text.strip() for c in company_elems if 'See more' not in c.text]
                df.at[index, 'production_company'] = ', '.join(companies)

            return True
        except Exception as e:
            # Best effort per movie: report and continue with the next one
            print(f"Web scraping error for '{title}' (IMDb ID: {imdb_id}): {e}")
            return False

# Example usage
if __name__ == "__main__":
    # Input and output file paths
    input_path = "./raw_data/numbers1.csv"
    output_path = "./processed_data/enriched_movies.csv"

    # Create the enricher
    # You can get a free API key at: http://www.omdbapi.com/apikey.aspx
    # The key is read from the OMDB_API_KEY environment variable when set.
    # NOTE(review): the literal fallback keeps the script working as before,
    # but a key committed to source should be rotated and removed.
    api_key = os.environ.get("OMDB_API_KEY", "ccc9c4b6")
    enricher = MovieDataEnricher(api_key=api_key)

    # Offer to resume only when both files a resume needs are present;
    # enrich_dataset() starts from scratch if the partial CSV is missing.
    checkpoint_file = output_path + '.checkpoint'
    partial_file = output_path + '.partial'
    resume = False

    if os.path.exists(checkpoint_file) and os.path.exists(partial_file):
        resume_input = input("Found a previous session. Do you want to resume? (y/n): ")
        resume = resume_input.strip().lower() in ('y', 'yes')

    # Enrich the dataset
    enriched_df = enricher.enrich_dataset(
        input_path=input_path,
        output_path=output_path,
        title_column='movie_title',  # Update this to match your column name
        year_column='release_date',  # Update this to match your column name
        resume=resume,
        checkpoint_interval=5  # Save checkpoint every 5 movies
    )