In [None]:
import re
import time
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Patterns used while parsing each episode entry (compiled once, reused per episode).
_EPISODE_CODE_RE = re.compile(r'S\d+\.E(\d+)')  # "S1.E1 ..." -> captures the episode number
_RATING_RE = re.compile(r'(\d+\.\d+)')  # first decimal number in the rating text

# CSS selector fallbacks. Each tuple is tried in order and the first hit wins.
_FALLBACK_EPISODE_LIST_SELECTORS = (
    '[data-testid="episodes-list"] .ipc-metadata-list-summary-item',
    '.eplist .list_item',
    '.episodes-container .episode-container',
)
_TITLE_SELECTORS = (
    '.ipc-title__text',
    'a[itemprop="name"]',
    'a.ipc-link--inherit-color',
    'a strong',
    '.title a',
)
# NOTE(review): '.larLSC' looks like a build-generated class name and may stop
# matching when the site is redeployed; the second selector is the fallback.
_AIR_DATE_SELECTORS = ('.larLSC', '[class*="sc-"] span')
_DESCRIPTION_SELECTORS = ('.ipc-html-content-inner-div', '.item_description')
_RATING_SELECTORS = ('.ipc-rating-star--rating', '.ipc-rating-star')


def _select_first(node, selectors):
    """Return the first element under *node* matched by any selector, or None."""
    for selector in selectors:
        found = node.select_one(selector)
        if found is not None:
            return found
    return None


def _select_first_nonempty(node, selectors):
    """Return the first non-empty ``node.select`` result, or an empty list."""
    for selector in selectors:
        found = node.select(selector)
        if found:
            return found
    return []


def _clean_title(raw_title):
    """Strip the "S1.E1 ∙ " or "1. " style prefix from an episode title."""
    title_text = raw_title.strip()
    if "∙" in title_text:
        return title_text.split("∙", 1)[1].strip()
    if ". " in title_text and title_text[0].isdigit():
        return title_text.split(". ", 1)[1]
    return title_text


def _parse_episode(episode, season):
    """Build the row dict for one episode element, or None if it has no title."""
    title_elem = _select_first(episode, _TITLE_SELECTORS)
    if title_elem is None:
        return None

    air_date_elem = _select_first(episode, _AIR_DATE_SELECTORS)
    desc_elem = _select_first(episode, _DESCRIPTION_SELECTORS)
    rating_elem = _select_first(episode, _RATING_SELECTORS)
    image_elem = episode.select_one('.ipc-image')

    # The episode number is only available when the title carries an "S1.E1" prefix.
    ep_match = _EPISODE_CODE_RE.search(title_elem.text)
    rating_match = (
        _RATING_RE.search(rating_elem.text.strip())
        if rating_elem is not None
        else None
    )

    return {
        "Season": season,
        "Episode": int(ep_match.group(1)) if ep_match else None,
        "Title": _clean_title(title_elem.text),
        "Air Date": air_date_elem.text.strip() if air_date_elem is not None else None,
        "Description": desc_elem.text.strip() if desc_elem is not None else None,
        "Rating": float(rating_match.group(1)) if rating_match else None,
        # Tag.get returns None when the element has no src attribute.
        "Image URL": image_elem.get('src') if image_elem is not None else None,
    }


def scrape_svu_episodes(max_seasons=40, *, delay=1.0, timeout=10.0):
    """Scrape the per-season episode lists of title tt0203259 from IMDb.

    Seasons are requested one at a time starting at season 1. Scraping stops
    at the first season that returns a non-200 status, whose page title does
    not contain "Law", that yields no episode elements, or that raises an
    error; whatever was collected up to that point is returned.

    Args:
        max_seasons: Upper bound on the number of seasons to request.
            Values below 1 perform no requests.
        delay: Seconds to wait after each successfully scraped season.
            Values of 0 or less skip the wait.
        timeout: Per-request timeout in seconds passed to ``requests.get``,
            so a stalled connection cannot hang the scrape indefinitely.

    Returns:
        A list of dicts with the keys "Season", "Episode", "Title",
        "Air Date", "Description", "Rating" and "Image URL". Fields that
        could not be extracted are None.
    """
    base_url = "https://www.imdb.com/title/tt0203259/episodes/"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    }

    all_episodes = []

    for season in range(1, max_seasons + 1):
        # Deliberately broad: any failure ends the scrape but keeps the
        # episodes gathered so far instead of discarding them.
        try:
            print(f"Scraping Season {season}...")
            season_url = f"{base_url}?season={season}"
            response = requests.get(season_url, headers=headers, timeout=timeout)

            if response.status_code != 200:
                print(f"Received status code {response.status_code}. Stopping.")
                break

            soup = BeautifulSoup(response.content, 'html.parser')

            # A page without a <title> is treated like a page for a missing season.
            page_title = soup.title.text if soup.title is not None else ""
            if "Law" not in page_title:
                print(f"Season {season} does not exist. Stopping.")
                break

            episode_elements = soup.select('.episode-item-wrapper')
            if not episode_elements:
                print(f"Trying alternative selectors for Season {season}...")
                episode_elements = _select_first_nonempty(
                    soup, _FALLBACK_EPISODE_LIST_SELECTORS
                )

            if not episode_elements:
                print(f"No episodes found for Season {season}. This might be the last season.")
                break

            # Append row by row so a mid-season failure still keeps earlier rows.
            parsed_count = 0
            for episode in episode_elements:
                episode_data = _parse_episode(episode, season)
                if episode_data is not None:
                    all_episodes.append(episode_data)
                    parsed_count += 1

            # Report the rows actually stored; untitled elements are skipped.
            print(f"Found {parsed_count} episodes in Season {season}")

            # Be respectful to the server
            if delay > 0:
                time.sleep(delay)

        except Exception as e:
            print(f"Error scraping Season {season}: {e}")
            break

    return all_episodes

# Scrape the episodes (performs live HTTP requests)
episodes = scrape_svu_episodes()

# Convert to DataFrame for better display in Jupyter
df_episodes = pd.DataFrame(episodes)

print(f"\nFound {len(episodes)} episodes in total:")

# Save to CSV (optional). pandas does not create missing parent directories,
# so make sure the output folder exists before writing.
output_path = Path("output_data") / "law_and_order_svu_episodes.csv"
output_path.parent.mkdir(parents=True, exist_ok=True)
df_episodes.to_csv(output_path, index=False)

# Show first 10 episodes. This must stay the final expression: Jupyter only
# renders the value of the last expression in a cell.
df_episodes.head(10)

Scraping Season 1...
Found 22 episodes in Season 1
Scraping Season 2...
Found 21 episodes in Season 2
Scraping Season 3...
Found 23 episodes in Season 3
Scraping Season 4...
Found 25 episodes in Season 4
Scraping Season 5...
Found 25 episodes in Season 5
Scraping Season 6...
Found 23 episodes in Season 6
Scraping Season 7...
Found 22 episodes in Season 7
Scraping Season 8...
Found 22 episodes in Season 8
Scraping Season 9...
Found 19 episodes in Season 9
Scraping Season 10...
Found 22 episodes in Season 10
Scraping Season 11...
Found 24 episodes in Season 11
Scraping Season 12...
Found 24 episodes in Season 12
Scraping Season 13...
Found 23 episodes in Season 13
Scraping Season 14...
Found 24 episodes in Season 14
Scraping Season 15...
Found 24 episodes in Season 15
Scraping Season 16...
Found 23 episodes in Season 16
Scraping Season 17...
Found 23 episodes in Season 17
Scraping Season 18...
Found 21 episodes in Season 18
Scraping Season 19...
Found 24 episodes in Season 19
Scraping Se