In [None]:
import re
import time
from pathlib import Path
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

_IMDB_ORIGIN = "https://www.imdb.com"
_SVU_EPISODES_URL = f"{_IMDB_ORIGIN}/title/tt0203259/episodes/"
_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
}

# Each selector tuple is tried in order; the first selector that matches wins.
_PRIMARY_EPISODE_SELECTOR = '.episode-item-wrapper'
_FALLBACK_EPISODE_SELECTORS = (
    '[data-testid="episodes-list"] .ipc-metadata-list-summary-item',
    '.eplist .list_item',
    '.episodes-container .episode-container',
)
_TITLE_SELECTORS = (
    '.ipc-title__text',
    'a[itemprop="name"]',
    'a.ipc-link--inherit-color',
    'a strong',
    '.title a',
)
_AIR_DATE_SELECTORS = ('.larLSC', '[class*="sc-"] span')
_DESCRIPTION_SELECTORS = ('.ipc-html-content-inner-div', '.item_description')
_RATING_SELECTORS = ('.ipc-rating-star--rating', '.ipc-rating-star')
_CAST_SELECTORS = (
    'a[data-testid="title-cast-item__actor"]',
    '.cast_list tr .primary_photo + td a',
)

_EPISODE_NUMBER_RE = re.compile(r'S\d+\.E(\d+)')
# Accepts both "8.2" and a bare integer such as "10".
_RATING_RE = re.compile(r'\d+(?:\.\d+)?')


def _select_first(node, selectors):
    """Return the first element under *node* matched by any selector, else None."""
    for selector in selectors:
        match = node.select_one(selector)
        if match is not None:
            return match
    return None


def _select_all_first(node, selectors):
    """Return the matches of the first selector that yields any, else an empty list."""
    for selector in selectors:
        matches = node.select(selector)
        if matches:
            return matches
    return []


def _text_or_none(elem):
    """Return the stripped text of *elem*, or None when *elem* is None."""
    return elem.text.strip() if elem is not None else None


def _clean_title(raw_title):
    """Strip an "S1.E1 ∙ " or "1. " style prefix from an episode title."""
    title = raw_title.strip()
    if "∙" in title:
        return title.split("∙", 1)[1].strip()
    if ". " in title and title[0].isdigit():
        return title.split(". ", 1)[1]
    return title


def _parse_episode_number(raw_title):
    """Return the episode number from an "S<season>.E<episode>" marker, or None."""
    match = _EPISODE_NUMBER_RE.search(raw_title)
    return int(match.group(1)) if match else None


def _parse_rating(rating_elem):
    """Return the first number in the rating element's text as a float, or None."""
    if rating_elem is None:
        return None
    match = _RATING_RE.search(rating_elem.text)
    return float(match.group()) if match else None


def _episode_url(episode):
    """Return the absolute episode page URL without its query string, or None."""
    link = episode.select_one('a[href*="/title/tt"]')
    if link is None:
        return None
    # urljoin leaves an already-absolute href untouched instead of
    # prefixing the origin a second time.
    return urljoin(_IMDB_ORIGIN, link['href'].split('?')[0])


def _image_url(episode):
    """Return the ``src`` of the episode thumbnail, or None when unavailable."""
    image = episode.select_one('.ipc-image')
    return image.get('src') if image is not None else None


def _parse_episode(episode, season):
    """Build the episode record from one list item.

    Returns None when no title element can be located; such items are
    skipped by the caller. "Main Cast" starts empty and is filled in
    separately because it needs an extra HTTP request.
    """
    title_elem = _select_first(episode, _TITLE_SELECTORS)
    if title_elem is None:
        return None

    raw_title = title_elem.text
    return {
        "Season": season,
        "Episode": _parse_episode_number(raw_title),
        "Title": _clean_title(raw_title),
        "Air Date": _text_or_none(_select_first(episode, _AIR_DATE_SELECTORS)),
        "Description": _text_or_none(_select_first(episode, _DESCRIPTION_SELECTORS)),
        "Rating": _parse_rating(_select_first(episode, _RATING_SELECTORS)),
        "Image URL": _image_url(episode),
        "Main Cast": [],
    }


def _fetch_main_cast(session, episode_url, timeout):
    """Fetch an episode page and return the cast names listed on it.

    Best-effort: any failure is reported and an empty list is returned so
    that a single bad episode page does not abort the whole scrape.
    """
    try:
        response = session.get(episode_url, timeout=timeout)
        cast = []
        if response.status_code == 200:
            page = BeautifulSoup(response.content, 'html.parser')
            cast = [
                actor.text.strip()
                for actor in _select_all_first(page, _CAST_SELECTORS)
            ]
        # Throttle between episode-page requests.
        time.sleep(0.5)
        return cast
    except Exception as e:
        print(f"Error fetching cast for episode: {episode_url} - {e}")
        return []


def scrape_svu_episodes(max_seasons=40, timeout=10):
    """Scrape Law & Order: SVU episode data from IMDb, season by season.

    Seasons are requested in order starting at 1. Scraping stops at the
    first season page that returns a non-200 status, whose page title does
    not contain "Law", that contains no recognisable episode items, or
    that raises an error; everything collected up to that point is
    returned.

    Args:
        max_seasons: Upper bound on the number of seasons to request.
        timeout: Seconds to wait on each HTTP request before giving up, so
            a stalled connection cannot hang the scrape indefinitely.

    Returns:
        A list of dicts with the keys "Season", "Episode", "Title",
        "Air Date", "Description", "Rating", "Image URL" and "Main Cast".
        Fields that could not be extracted are None ("Main Cast" is an
        empty list).
    """
    all_episodes = []

    # One session for the whole run so connections are reused across the
    # many per-episode requests.
    with requests.Session() as session:
        session.headers.update(_REQUEST_HEADERS)

        for season in range(1, max_seasons + 1):
            # Deliberately broad: whatever goes wrong, stop and return the
            # episodes gathered so far rather than losing them.
            try:
                print(f"Scraping Season {season}...")
                season_url = f"{_SVU_EPISODES_URL}?season={season}"
                response = session.get(season_url, timeout=timeout)

                if response.status_code != 200:
                    print(f"Received status code {response.status_code}. Stopping.")
                    break

                soup = BeautifulSoup(response.content, 'html.parser')

                # Check if we're on a valid season page; a page with no
                # <title> at all is treated the same as a wrong title.
                page_title = soup.title.text if soup.title is not None else ""
                if "Law" not in page_title:
                    print(f"Season {season} does not exist. Stopping.")
                    break

                episode_elements = soup.select(_PRIMARY_EPISODE_SELECTOR)
                if not episode_elements:
                    print(f"Trying alternative selectors for Season {season}...")
                    episode_elements = _select_all_first(
                        soup, _FALLBACK_EPISODE_SELECTORS
                    )

                if not episode_elements:
                    print(f"No episodes found for Season {season}. This might be the last season.")
                    break

                for episode in episode_elements:
                    episode_data = _parse_episode(episode, season)
                    if episode_data is None:
                        continue

                    episode_url = _episode_url(episode)
                    if episode_url:
                        episode_data["Main Cast"] = _fetch_main_cast(
                            session, episode_url, timeout
                        )
                    main_cast = episode_data["Main Cast"]
                    print(f"Season {season} Ep {episode_data.get('Episode')}: {episode_data['Title']} -> {main_cast}")

                    all_episodes.append(episode_data)

                print(f"Found {len(episode_elements)} episodes in Season {season}")

                # Be respectful to the server
                time.sleep(1)

            except Exception as e:
                print(f"Error scraping Season {season}: {e}")
                break

    return all_episodes

# Scrape the episodes (performs live HTTP requests; see scrape_svu_episodes)
episodes = scrape_svu_episodes()

# Convert to DataFrame for better display in Jupyter
df_episodes = pd.DataFrame(episodes)

# Display the results. A bare expression is only rendered when it is the
# last statement of a notebook cell, so the preview is printed explicitly.
print(f"\nFound {len(episodes)} episodes in total:")
print(df_episodes.head(10))  # Show first 10 episodes

# Save to CSV (optional). Create the target directory first so a missing
# folder does not discard the results of a long scrape.
output_path = Path("../output_data/law_and_order_svu_episodes.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df_episodes.to_csv(output_path, index=False)


In [28]:
# Render the DataFrame as this cell's output.
df_episodes

Unnamed: 0,Season,Episode,Title,Air Date,Description,Rating,Image URL,Main Cast
0,1,1,Payback,"Mon, Sep 20, 1999",Stabler and Benson investigate the murder and ...,8.2,https://m.media-amazon.com/images/M/MV5BNjkwNT...,"[Christopher Meloni, Mariska Hargitay, Richard..."
1,1,2,A Single Life,"Mon, Sep 27, 1999",A probable suicide becomes a murder investigat...,8.0,https://m.media-amazon.com/images/M/MV5BMTU3Nz...,"[Christopher Meloni, Mariska Hargitay, Richard..."
2,1,3,...Or Just Look Like One,"Mon, Oct 4, 1999","Two underage models are attacked, raped, and k...",7.7,https://m.media-amazon.com/images/M/MV5BMTY4ND...,"[Christopher Meloni, Mariska Hargitay, Richard..."
3,1,4,Hysteria,"Mon, Oct 11, 1999",Benson and Stabler investigate the murder of a...,8.1,https://m.media-amazon.com/images/M/MV5BMTUxMT...,"[Christopher Meloni, Mariska Hargitay, Richard..."
4,1,5,Wanderlust,"Mon, Oct 18, 1999","When a travel writer is found dead, gagged by ...",7.7,https://m.media-amazon.com/images/M/MV5BMTY4OD...,"[Christopher Meloni, Mariska Hargitay, Richard..."
...,...,...,...,...,...,...,...,...
569,26,19,Play with Fire Part 2,"Thu, Apr 17, 2025",SVU and 2-7 discover serial rapes and murders ...,8.1,https://m.media-amazon.com/images/M/MV5BM2M0OD...,"[Mariska Hargitay, Ice-T, Peter Scanavino, Oct..."
570,26,20,Shock Collar,"Thu, May 1, 2025",SVU investigates if a car theft involving a yo...,8.1,https://m.media-amazon.com/images/M/MV5BOTRhNz...,"[Mariska Hargitay, Ice-T, Peter Scanavino, Oct..."
571,26,21,Aperture,"Thu, May 8, 2025",When an assault at gunpoint is witnessed throu...,8.0,https://m.media-amazon.com/images/M/MV5BNjY2OT...,"[Mariska Hargitay, Ice-T, Peter Scanavino, Oct..."
572,26,22,Post-Rage,"Thu, May 15, 2025",SVU investigates a string of sexual assaults i...,7.4,https://m.media-amazon.com/images/M/MV5BN2UyMz...,"[Mariska Hargitay, Ice-T, Peter Scanavino, Oct..."
