## Data Analysis and Cleaning

In [1]:
import numpy as np
import pandas as pd

In [10]:
all_titles = pd.read_csv('data/title.basics.tsv', sep='\t')
all_ratings = pd.read_csv('data/title.ratings.tsv', sep='\t')

# Filter titles to only include movies and TV series
movies_and_series = all_titles[all_titles['titleType'].isin(['movie', 'tvSeries'])]

# Convert columns to correct data types
all_ratings['averageRating'] = pd.to_numeric(all_ratings['averageRating'], errors='coerce')
all_ratings['numVotes'] = pd.to_numeric(all_ratings['numVotes'], errors='coerce')

# Merge titles and ratings, keeping only movies and TV series
merged_data = movies_and_series.merge(all_ratings, on='tconst', how='inner')

# Filter movies/series with at least 1000 votes
df_filtered = merged_data[merged_data['numVotes'] >= 1000]

# Sort by rating (descending), then by numVotes (descending for tie-breaking)
df_sorted = df_filtered.sort_values(by=['averageRating', 'numVotes'], ascending=[False, False])

# Save to new TSV file
df_sorted.to_csv('data/filtered_sorted_with_ratings.tsv', sep='\t', index=False)

In [11]:
# IMDb-style weighted rating system for top 10,000 movies and series

# Calculate overall statistics for the weighted rating formula
overall_mean_rating = df_filtered['averageRating'].mean()
min_votes_required = df_filtered['numVotes'].quantile(0.75)  # Use 75th percentile as minimum

print(f"Overall mean rating: {overall_mean_rating:.2f}")
print(f"Minimum votes threshold (75th percentile): {min_votes_required:.0f}")

# Apply Bayesian weighted rating formula
# Weighted Rating = (v / (v + m)) * R + (m / (v + m)) * C
# Where: v = votes, m = min votes, R = average rating, C = overall mean
def calculate_weighted_rating(row):
    v = row['numVotes']
    R = row['averageRating']
    m = min_votes_required
    C = overall_mean_rating
    
    weighted_rating = (v / (v + m)) * R + (m / (v + m)) * C
    return weighted_rating

# Add weighted rating column
df_filtered['weightedRating'] = df_filtered.apply(calculate_weighted_rating, axis=1)

# Sort by weighted rating (descending), then by numVotes (descending for tie-breaking)
df_weighted_sorted = df_filtered.sort_values(by=['weightedRating', 'numVotes'], ascending=[False, False])

# Get top 10,000 based on weighted ratings
top_10000_weighted = df_weighted_sorted.head(10000)

# Save the weighted top 10,000 to file
top_10000_weighted.to_csv('data/top_10000_weighted_ratings.tsv', sep='\t', index=False)

print(f"\nTop 10 movies/series by weighted rating:")
print(top_10000_weighted[['primaryTitle', 'averageRating', 'numVotes', 'weightedRating']].head(10))

Overall mean rating: 6.39
Minimum votes threshold (75th percentile): 10202

Top 10 movies/series by weighted rating:
                      primaryTitle  averageRating  numVotes  weightedRating
175388                Breaking Bad            9.5   2405005        9.486874
67193     The Shawshank Redemption            9.3   3104334        9.290476
153336  Avatar: The Last Airbender            9.3    417720        9.230683
129412                    The Wire            9.3    413512        9.229994
176712             Game of Thrones            9.2   2485498        9.188523
39197                The Godfather            9.2   2163561        9.186824
80066                 The Sopranos            9.2    545115        9.148422
162488             The Dark Knight            9.1   3079402        9.091060
308327             Attack on Titan            9.1    649760        9.058146
234194                   Aspirants            9.1    316623        9.015484


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['weightedRating'] = df_filtered.apply(calculate_weighted_rating, axis=1)


## Adding Cast to data we already have

### Clean title.principals.tsv to only have movies/series in top 10000

In [33]:
# clean title.principals.tsv to only have movies/series in top 10000
all_principals = pd.read_csv('data/title.principals.tsv', sep='\t')
top_10000_df = pd.read_csv('data/top_10000_weighted_ratings.tsv', sep='\t')

# Filter all_principals to only include titles in top_10000_df
filtered_principals = all_principals[all_principals['tconst'].isin(top_10000_df['tconst'])]
filtered_principals.to_csv('data/filtered_title_principals_top_10000.tsv', sep='\t', index=False)

### Getting top 3 actor/actress names and their respective characters

In [3]:
principals = pd.read_csv('data/cast_data/filtered_title_principals_top_10000.tsv', sep='\t')
names = pd.read_csv('data/cast_data/name.basics.tsv', sep='\t')
top10000 = pd.read_csv('data/top_10000_weighted_ratings.tsv', sep='\t')

# Create a sample of top 10 entries for testing
top10_sample = top10000.head(10)

# Get each tconst from top10_sample and find the top 3 UNIQUE actors/actresses nconst in principals and their respective characters
def get_top_3_cast(tconst):
    cast = principals[principals['tconst'] == tconst]
    cast = cast[cast['category'].isin(['actor', 'actress'])]
    
    result = []
    seen_nconst = set()  # Track unique nconst values
    
    for _, row in cast.iterrows():
        nconst = row['nconst']
        
        # Skip if we've already seen this nconst
        if nconst in seen_nconst:
            continue
            
        character = row['characters']
        
        # Clean the character field - remove brackets and extra quotes
        if pd.notna(character) and character != '\\N':
            import json
            try:
                # Try to parse as JSON first
                character_list = json.loads(character)
                if isinstance(character_list, list) and len(character_list) > 0:
                    clean_character = character_list[0]  # Get first character name
                else:
                    clean_character = str(character)
            except (json.JSONDecodeError, ValueError):
                # If JSON parsing fails, try manual cleaning
                clean_character = character.strip('[]"').replace('""', '"')
        else:
            clean_character = "Unknown Character"
        
        name_row = names[names['nconst'] == nconst]
        
        if not name_row.empty:
            actor_name = name_row.iloc[0]['primaryName']
            result.append((actor_name, clean_character))
            seen_nconst.add(nconst)
            
            # Stop when we have 3 unique cast members
            if len(result) >= 3:
                break
    
    print(f"Processed {tconst}: Found {len(result)} unique cast members.")
    print(result)
    
    return result

# Apply function with progress bar
from tqdm import tqdm
tqdm.pandas(desc="Processing cast data")

print("Extracting top 3 unique cast members for each title (top 10 sample)...")
top10_sample['top_3_cast'] = top10_sample['tconst'].head(10).progress_apply(get_top_3_cast) # Remove .head(10) to process all 10,000 entries

# Save results
top10_sample.to_csv('data/cast_data/top_10_with_cast.tsv', sep='\t', index=False)
print(f"\nProcessing completed! Results saved to 'data/cast_data/top_10_with_cast.tsv'")

# Display sample results
print("\nSample results:")
print(top10_sample[['primaryTitle', 'top_3_cast']])

Extracting top 3 unique cast members for each title (top 10 sample)...


Processing cast data:  20%|██        | 2/10 [00:01<00:07,  1.09it/s]

Processed tt0903747: Found 3 unique cast members.
[('Bryan Cranston', 'Walter White'), ('Aaron Paul', 'Jesse Pinkman'), ('Anna Gunn', 'Skyler White')]


Processing cast data:  30%|███       | 3/10 [00:03<00:08,  1.25s/it]

Processed tt0111161: Found 3 unique cast members.
[('Tim Robbins', 'Andy Dufresne'), ('Morgan Freeman', "Ellis Boyd 'Red' Redding"), ('Bob Gunton', 'Warden Norton')]


Processing cast data:  40%|████      | 4/10 [00:05<00:08,  1.43s/it]

Processed tt0417299: Found 3 unique cast members.
[('Dee Bradley Baker', 'Appa'), ('Zach Tyler Eisen', 'Aang'), ('Mae Whitman', 'Katara')]


Processing cast data:  50%|█████     | 5/10 [00:06<00:07,  1.53s/it]

Processed tt0306414: Found 3 unique cast members.
[('Dominic West', "Detective James 'Jimmy' McNulty"), ('Lance Reddick', 'Lieutenant Cedric Daniels'), ('Sonja Sohn', "Detective Shakima 'Kima' Greggs")]


Processing cast data:  60%|██████    | 6/10 [00:08<00:06,  1.58s/it]

Processed tt0944947: Found 3 unique cast members.
[('Emilia Clarke', 'Daenerys Targaryen'), ('Peter Dinklage', 'Tyrion Lannister'), ('Kit Harington', 'Jon Snow')]


Processing cast data:  70%|███████   | 7/10 [00:10<00:04,  1.62s/it]

Processed tt0068646: Found 3 unique cast members.
[('Marlon Brando', 'Don Vito Corleone'), ('Al Pacino', 'Michael'), ('James Caan', 'Sonny')]


Processing cast data:  80%|████████  | 8/10 [00:12<00:03,  1.65s/it]

Processed tt0141842: Found 3 unique cast members.
[('James Gandolfini', 'Tony Soprano'), ('Lorraine Bracco', 'Dr. Jennifer Melfi'), ('Edie Falco', 'Carmela Soprano')]


Processing cast data:  90%|█████████ | 9/10 [00:13<00:01,  1.67s/it]

Processed tt0468569: Found 3 unique cast members.
[('Christian Bale', 'Bruce Wayne'), ('Heath Ledger', 'Joker'), ('Aaron Eckhart', 'Harvey Dent')]


Processing cast data: 100%|██████████| 10/10 [00:15<00:00,  1.68s/it]

Processed tt2560140: Found 3 unique cast members.
[('Jessie James Grelle', 'Armin Arlert'), ('Bryce Papenbrook', 'Eren Jaeger'), ('Trina Nishimura', 'Mikasa Ackermann')]


Processing cast data: 100%|██████████| 10/10 [00:17<00:00,  1.72s/it]

Processed tt14392248: Found 3 unique cast members.
[('Naveen Kasturia', 'Abhilash Sharma'), ('Shivankit Singh Parihar', 'Guri'), ('Abhilash Thapliyal', 'SK')]

Processing completed! Results saved to 'data/cast_data/top_10_with_cast.tsv'

Sample results:
                 primaryTitle  \
0                Breaking Bad   
1    The Shawshank Redemption   
2  Avatar: The Last Airbender   
3                    The Wire   
4             Game of Thrones   
5               The Godfather   
6                The Sopranos   
7             The Dark Knight   
8             Attack on Titan   
9                   Aspirants   

                                          top_3_cast  
0  [(Bryan Cranston, Walter White), (Aaron Paul, ...  
1  [(Tim Robbins, Andy Dufresne), (Morgan Freeman...  
2  [(Dee Bradley Baker, Appa), (Zach Tyler Eisen,...  
3  [(Dominic West, Detective James 'Jimmy' McNult...  
4  [(Emilia Clarke, Daenerys Targaryen), (Peter D...  
5  [(Marlon Brando, Don Vito Corleone), (Al Pacin...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top10_sample['top_3_cast'] = top10_sample['tconst'].head(10).progress_apply(get_top_3_cast) # Remove .head(10) to process all 10,000 entries


## Web Scraping for Movie/Series Description

In [1]:
# Web scraping script for StreamWithVPN data
%pip install beautifulsoup4
import requests
from bs4 import BeautifulSoup
import time
import re
from urllib.parse import quote
import pandas as pd
from tqdm import tqdm

# Load the top 10,000 weighted ratings data
top_10000_df = pd.read_csv('data/top_10000_weighted_ratings.tsv', sep='\t')

def clean_title_for_url(title):
    """
    Clean and format movie/series title for URL generation
    """
    # Remove special characters and replace spaces with hyphens
    cleaned = re.sub(r'[^\w\s-]', '', title)
    cleaned = re.sub(r'\s+', '-', cleaned.strip())
    return cleaned.lower()

def generate_streamwithvpn_url(title, year):
    """
    Generate StreamWithVPN URL based on title and year
    Example: "The Wolf's Call" (2019) -> "https://www.streamwithvpn.com/the-wolfs-call-2019"
    """
    clean_title = clean_title_for_url(title)
    # Handle cases where year might be NaN or missing
    if pd.isna(year):
        return f"https://www.streamwithvpn.com/{clean_title}"
    else:
        return f"https://www.streamwithvpn.com/{clean_title}-{int(year)}"

def scrape_movie_data(url, tconst, title, year, endYear, titleType, isAdult, runtime, genres, rating, numVotes):
    """
    Scrape movie/series data from StreamWithVPN
    Returns dictionary with description, cast, and streaming platforms
    """
    try:
        # Add delay to be respectful to the server
        time.sleep(1)
        
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        
        # Try the original URL first
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            # print(f"✓ Success with original URL: {url}")
        except requests.exceptions.HTTPError as e:
            if e.response.status_code in [404, 403]:  # Page not found or forbidden
                # Try without year
                url_without_year = generate_streamwithvpn_url(title, None)
                # print(f"⚠ Original URL failed ({e.response.status_code}), trying without year: {url_without_year}")
                
                try:
                    response = requests.get(url_without_year, headers=headers, timeout=10)
                    response.raise_for_status()
                    # print(f"✓ Success with URL without year: {url_without_year}")
                    # Update the URL in the data dictionary for accuracy
                    url = url_without_year
                except requests.exceptions.HTTPError:
                    # print(f"✗ Both URLs failed for {title}")
                    raise  # Re-raise the exception to be caught by outer try-catch
            else:
                raise  # Re-raise non-404/403 errors
        
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Initialize data dictionary
        movie_data = {
            'tconst': tconst,
            'titleType': titleType,
            'title': title,
            'year': year,
            'endYear': endYear,
            'isAdult': isAdult,
            'runtime': runtime,
            'genres': genres,
            'rating': rating,
            'numVotes': numVotes,
            'description': None,
            'cast': None,
            'streaming_platforms': None,
            'url': url,
            'scrape_status': 'success'
        }
        
        # Extract DESCRIPTION - multiple approaches
        description_element = soup.find('span', class_='rt-Text EntryDetailDescription_contentDescription__tXYGO EntryDetailDescription_expanded__3a0Gs')
        
        if not description_element:
            description_element = soup.find('span', class_=re.compile('EntryDetailDescription_contentDescription'))
        
        if not description_element:
            description_element = soup.find('span', class_=re.compile('contentDescription'))
        
        if not description_element:
            description_element = soup.select_one('span[class*="EntryDetailDescription_contentDescription"]')
        
        if description_element:
            movie_data['description'] = description_element.get_text(strip=True)
            # print(f"✓ Found description for {title}: {movie_data['description'][:100]}...")
        else:
            # print(f"✗ No description found for {title}")
            pass
        """
        # Extract CAST information
        cast_list = []
        # Target container div
        container_div = soup.find('div', class_='rt-Flex rt-r-fd-column rt-r-gap rt-r-px rt-r-pt rt-r-w', style='--gap: 2px; --pl: 16px; --pr: 16px; --pt: 8px; --width: 100%;')
        if container_div:
            cast_spans = container_div.find_all('span', {'data-accent-color': 'gray', 'class_': 'rt-Text rt-r-size-3 rt-r-weight-medium', 'style': 'min-width: 0px; overflow: hidden; text-overflow: ellipsis; white-space: nowrap;'})
            if cast_spans:
                for span in cast_spans:
                    cast_list.append(span.get_text(strip=True))
                movie_data['cast'] = ', '.join(cast_list)
                print(f"✓ Found cast for {title}: {movie_data['cast'][:100]}...")
            else:
                print(f"✗ Span not found for {title}")
        else:
            print(f"✗ Div not found for {title}")

        # Extract STREAMING PLATFORMS information
        platform_elements = soup.find_all('h2', class_='rt-Heading rt-r-size-5 rt-r-weight-medium rt-r-ta-left')
        if platform_elements:
            platforms = [elem.get_text(strip=True) for elem in platform_elements]
            movie_data['streaming_platforms'] = ', '.join(platforms)
        else:
            print(f"✗ No streaming platforms found for {title}")
        """
        
        return movie_data
        
    except requests.RequestException as e:
        print(f"Request error for {title}: {e}")
        return {
            'tconst': tconst,
            'title': title,
            'url': url,
            'description': None,
            'cast': None,
            'streaming_platforms': None,
            'scrape_status': f'request_error: {str(e)}'
        }
    except Exception as e:
        print(f"Parsing error for {title}: {e}")
        return {
            'tconst': tconst,
            'title': title,
            'url': url,
            'description': None,
            'cast': None,
            'streaming_platforms': None,
            'scrape_status': f'parsing_error: {str(e)}'
        }

# Initialize list to store scraped data
scraped_data = []

# Sample scraping for first 2 entries (for faster debugging)
print("Starting web scraping")
sample_df = top_10000_df.head(10) # Change to 10000 for full run (2 for testing)

for index, row in tqdm(sample_df.iterrows(), total=len(sample_df), desc="Scraping movies"):
    tconst = row['tconst']
    title = row['primaryTitle']
    year = row['startYear']
    endYear = row['endYear']
    titleType = row['titleType']
    isAdult = row['isAdult']
    runtime = row['runtimeMinutes']
    genres = row['genres']
    rating = row['averageRating']
    numVotes = row['numVotes']
    
    
    # Generate URL
    url = generate_streamwithvpn_url(title, year)
    # print(f"\nScraping: {title} ({year}) - {url}")
    
    # Scrape data
    movie_data = scrape_movie_data(url, tconst, title, year, endYear, titleType, isAdult, runtime, genres, rating, numVotes)
    scraped_data.append(movie_data)

# Convert to DataFrame
scraped_df = pd.DataFrame(scraped_data)

# Display results
print(f"\nScraping completed! Found data for {len(scraped_df)} entries")
print(f"Success rate: {len(scraped_df[scraped_df['scrape_status'] == 'success'])} / {len(scraped_df)}")

# Show sample results
print("\nSample scraped data:")
print(scraped_df[['title', 'url', 'scrape_status', 'description', 'cast']].head())

# Save scraped data
scraped_df.to_csv('data/top10000_final.tsv', sep='\t', index=False)
print("\nSample data saved to 'data/top10000_final.tsv'")

Collecting beautifulsoup4Note: you may need to restart the kernel to use updated packages.

  Downloading beautifulsoup4-4.14.2-py3-none-any.whl.metadata (3.8 kB)
Collecting soupsieve>1.2 (from beautifulsoup4)
  Downloading soupsieve-2.8-py3-none-any.whl.metadata (4.6 kB)
Downloading beautifulsoup4-4.14.2-py3-none-any.whl (106 kB)
Downloading soupsieve-2.8-py3-none-any.whl (36 kB)
Installing collected packages: soupsieve, beautifulsoup4

   ---------------------------------------- 0/2 [soupsieve]
   ---------------------------------------- 0/2 [soupsieve]
   -------------------- ------------------- 1/2 [beautifulsoup4]
   -------------------- ------------------- 1/2 [beautifulsoup4]
   -------------------- ------------------- 1/2 [beautifulsoup4]
   -------------------- ------------------- 1/2 [beautifulsoup4]
   -------------------- ------------------- 1/2 [beautifulsoup4]
   -------------------- ------------------- 1/2 [beautifulsoup4]
   ---------------------------------------- 2/2 



Starting web scraping


Scraping movies: 100%|██████████| 10/10 [00:22<00:00,  2.25s/it]


Scraping completed! Found data for 10 entries
Success rate: 10 / 10

Sample scraped data:
                        title  \
0                Breaking Bad   
1    The Shawshank Redemption   
2  Avatar: The Last Airbender   
3                    The Wire   
4             Game of Thrones   

                                                 url scrape_status  \
0    https://www.streamwithvpn.com/breaking-bad-2008       success   
1  https://www.streamwithvpn.com/the-shawshank-re...       success   
2  https://www.streamwithvpn.com/avatar-the-last-...       success   
3        https://www.streamwithvpn.com/the-wire-2002       success   
4  https://www.streamwithvpn.com/game-of-thrones-...       success   

                                         description  cast  
0  Walter White, a New Mexico chemistry teacher, ...  None  
1  Imprisoned in the 1940s for the double murder ...  None  
2  In a war-torn world of elemental magic, a youn...  None  
3  Told from the points of view of both the B




In [None]:
# Web scraping script for StreamWithVPN data
%pip install beautifulsoup4
import requests
from bs4 import BeautifulSoup
import time
import re
from urllib.parse import quote
import pandas as pd
from tqdm import tqdm

# Load the top 10,000 weighted ratings data
top_10000_df = pd.read_csv('data/top_10000_weighted_ratings.tsv', sep='\t')

def clean_title_for_url(title):
    """
    Clean and format movie/series title for URL generation
    """
    # Remove special characters and replace spaces with hyphens
    cleaned = re.sub(r'[^\w\s-]', '', title)
    cleaned = re.sub(r'\s+', '-', cleaned.strip())
    return cleaned.lower()

def generate_streamwithvpn_url(title, year):
    """
    Generate StreamWithVPN URL based on title and year
    Example: "The Wolf's Call" (2019) -> "https://www.streamwithvpn.com/the-wolfs-call-2019"
    """
    clean_title = clean_title_for_url(title)
    # Handle cases where year might be NaN or missing
    if pd.isna(year):
        return f"https://www.streamwithvpn.com/{clean_title}"
    else:
        return f"https://www.streamwithvpn.com/{clean_title}-{int(year)}"

def scrape_movie_data(url, tconst, title, year, endYear, titleType, isAdult, runtime, genres, rating, numVotes):
    """
    Scrape movie/series data from StreamWithVPN
    Returns dictionary with description, cast, and streaming platforms
    """
    try:
        # Add delay to be respectful to the server
        time.sleep(1)
        
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
        except requests.exceptions.HTTPError as e:
            if e.response.status_code in [404, 403]:
                url_without_year = generate_streamwithvpn_url(title, None)
                try:
                    response = requests.get(url_without_year, headers=headers, timeout=10)
                    response.raise_for_status()
                    url = url_without_year
                except requests.exceptions.HTTPError:
                    raise
            else:
                raise
        
        soup = BeautifulSoup(response.content, 'html.parser')
        
        movie_data = {
            'tconst': tconst,
            'titleType': titleType,
            'title': title,
            'year': year,
            'endYear': endYear,
            'isAdult': isAdult,
            'runtime': runtime,
            'genres': genres,
            'rating': rating,
            'numVotes': numVotes,
            'description': None,
            'cast': None,
            'streaming_platforms': None,
            'url': url,
            'scrape_status': 'success'
        }
        
        # Extract description
        description_element = soup.find('span', class_='rt-Text EntryDetailDescription_contentDescription__tXYGO EntryDetailDescription_expanded__3a0Gs')
        
        if not description_element:
            description_element = soup.find('span', class_=re.compile('EntryDetailDescription_contentDescription'))
        if not description_element:
            description_element = soup.find('span', class_=re.compile('contentDescription'))
        if not description_element:
            description_element = soup.select_one('span[class*="EntryDetailDescription_contentDescription"]')
        
        if description_element:
            movie_data['description'] = description_element.get_text(strip=True)
        
        return movie_data
        
    except requests.RequestException as e:
        return {
            'tconst': tconst,
            'title': title,
            'url': url,
            'description': None,
            'cast': None,
            'streaming_platforms': None,
            'scrape_status': f'request_error: {str(e)}'
        }
    except Exception as e:
        return {
            'tconst': tconst,
            'title': title,
            'url': url,
            'description': None,
            'cast': None,
            'streaming_platforms': None,
            'scrape_status': f'parsing_error: {str(e)}'
        }

# Initialize variables
chunk_size = 500  # Process 500 movies at a time
total_chunks = len(top_10000_df) // chunk_size + 1
start_time = time.time()

# Process in chunks
for chunk_start in tqdm(range(0, len(top_10000_df), chunk_size), desc="Processing chunks"):
    chunk_end = min(chunk_start + chunk_size, len(top_10000_df))
    chunk_df = top_10000_df.iloc[chunk_start:chunk_end]
    chunk_data = []
    
    # Process each movie in the chunk
    for _, row in tqdm(chunk_df.iterrows(), total=len(chunk_df), desc=f"Chunk {chunk_start//chunk_size + 1}/{total_chunks}", leave=False):
        try:
            tconst = row['tconst']
            title = row['primaryTitle']
            year = row['startYear']
            endYear = row['endYear']
            titleType = row['titleType']
            isAdult = row['isAdult']
            runtime = row['runtimeMinutes']
            genres = row['genres']
            rating = row['averageRating']
            numVotes = row['numVotes']
            
            url = generate_streamwithvpn_url(title, year)
            movie_data = scrape_movie_data(url, tconst, title, year, endYear, 
                                         titleType, isAdult, runtime, genres, 
                                         rating, numVotes)
            chunk_data.append(movie_data)
            
        except Exception as e:
            print(f"\nError processing {title}: {str(e)}")
            continue
    
    # Save chunk progress
    chunk_df = pd.DataFrame(chunk_data)
    chunk_df.to_csv(f'data/temp_scrape_chunk_{chunk_start}.csv', sep='\t', index=False)
    
    # Print progress stats
    elapsed = time.time() - start_time
    processed = chunk_end
    remaining = len(top_10000_df) - processed
    rate = processed / elapsed
    eta = remaining / rate if rate > 0 else 0
    
    print(f"\nChunk {chunk_start}-{chunk_end} completed")
    print(f"Processed {processed:,}/{len(top_10000_df):,} movies in {elapsed/3600:.2f} hours")
    print(f"Estimated time remaining: {eta/3600:.2f} hours")
    print(f"Success rate in chunk: {len(chunk_df[chunk_df['scrape_status'] == 'success'])} / {len(chunk_df)}")

# Combine all chunks
print("\nCombining chunks...")
all_chunks = []
for chunk_start in range(0, len(top_10000_df), chunk_size):
    try:
        chunk = pd.read_csv(f'data/temp_scrape_chunk_{chunk_start}.csv', sep='\t')
        all_chunks.append(chunk)
    except Exception as e:
        print(f"Error reading chunk {chunk_start}: {str(e)}")

scraped_df = pd.concat(all_chunks, ignore_index=True)

# Save final results
scraped_df.to_csv('data/top10000_final.tsv', sep='\t', index=False)

# Print final statistics
total_time = time.time() - start_time
print(f"\nProcessing completed in {total_time/3600:.2f} hours")
print(f"Total movies processed: {len(scraped_df):,}")
print(f"Overall success rate: {len(scraped_df[scraped_df['scrape_status'] == 'success']):,} / {len(scraped_df):,}")

# Clean up temporary files
import os
for chunk_start in range(0, len(top_10000_df), chunk_size):
    try:
        os.remove(f'data/temp_scrape_chunk_{chunk_start}.csv')
    except:
        pass



Note: you may need to restart the kernel to use updated packages.


Processing chunks:   5%|▌         | 1/20 [17:18<5:28:53, 1038.60s/it]


Chunk 0-500 completed
Processed 500/10,000 movies in 0.29 hours
Estimated time remaining: 5.48 hours
Success rate in chunk: 447 / 500


Processing chunks:  10%|█         | 2/20 [34:34<5:11:01, 1036.76s/it]


Chunk 500-1000 completed
Processed 1,000/10,000 movies in 0.58 hours
Estimated time remaining: 5.19 hours
Success rate in chunk: 448 / 500


Processing chunks:  15%|█▌        | 3/20 [51:44<4:52:53, 1033.73s/it]


Chunk 1000-1500 completed
Processed 1,500/10,000 movies in 0.86 hours
Estimated time remaining: 4.89 hours
Success rate in chunk: 438 / 500


Processing chunks:  20%|██        | 4/20 [1:08:45<4:34:22, 1028.88s/it]


Chunk 1500-2000 completed
Processed 2,000/10,000 movies in 1.15 hours
Estimated time remaining: 4.58 hours
Success rate in chunk: 427 / 500


Processing chunks:  25%|██▌       | 5/20 [1:26:21<4:19:39, 1038.66s/it]


Chunk 2000-2500 completed
Processed 2,500/10,000 movies in 1.44 hours
Estimated time remaining: 4.32 hours
Success rate in chunk: 449 / 500


Processing chunks:  30%|███       | 6/20 [1:43:15<4:00:21, 1030.09s/it]


Chunk 2500-3000 completed
Processed 3,000/10,000 movies in 1.72 hours
Estimated time remaining: 4.02 hours
Success rate in chunk: 412 / 500


Processing chunks:  35%|███▌      | 7/20 [2:00:20<3:42:50, 1028.52s/it]


Chunk 3000-3500 completed
Processed 3,500/10,000 movies in 2.01 hours
Estimated time remaining: 3.72 hours
Success rate in chunk: 418 / 500


Processing chunks:  40%|████      | 8/20 [2:17:11<3:24:36, 1023.04s/it]


Chunk 3500-4000 completed
Processed 4,000/10,000 movies in 2.29 hours
Estimated time remaining: 3.43 hours
Success rate in chunk: 410 / 500


Processing chunks:  45%|████▌     | 9/20 [2:34:08<3:07:11, 1021.02s/it]


Chunk 4000-4500 completed
Processed 4,500/10,000 movies in 2.57 hours
Estimated time remaining: 3.14 hours
Success rate in chunk: 408 / 500


Processing chunks:  50%|█████     | 10/20 [2:50:59<2:49:39, 1017.97s/it]


Chunk 4500-5000 completed
Processed 5,000/10,000 movies in 2.85 hours
Estimated time remaining: 2.85 hours
Success rate in chunk: 407 / 500


Processing chunks:  55%|█████▌    | 11/20 [3:07:34<2:31:37, 1010.86s/it]


Chunk 5000-5500 completed
Processed 5,500/10,000 movies in 3.13 hours
Estimated time remaining: 2.56 hours
Success rate in chunk: 393 / 500


Processing chunks:  60%|██████    | 12/20 [3:24:20<2:14:36, 1009.59s/it]


Chunk 5500-6000 completed
Processed 6,000/10,000 movies in 3.41 hours
Estimated time remaining: 2.27 hours
Success rate in chunk: 390 / 500


Processing chunks:  65%|██████▌   | 13/20 [3:41:09<1:57:45, 1009.43s/it]


Chunk 6000-6500 completed
Processed 6,500/10,000 movies in 3.69 hours
Estimated time remaining: 1.98 hours
Success rate in chunk: 387 / 500


Processing chunks:  70%|███████   | 14/20 [3:57:39<1:40:19, 1003.33s/it]


Chunk 6500-7000 completed
Processed 7,000/10,000 movies in 3.96 hours
Estimated time remaining: 1.70 hours
Success rate in chunk: 378 / 500


Processing chunks:  75%|███████▌  | 15/20 [4:14:12<1:23:22, 1000.46s/it]


Chunk 7000-7500 completed
Processed 7,500/10,000 movies in 4.24 hours
Estimated time remaining: 1.41 hours
Success rate in chunk: 379 / 500


Processing chunks:  80%|████████  | 16/20 [4:30:48<1:06:35, 998.88s/it] 


Chunk 7500-8000 completed
Processed 8,000/10,000 movies in 4.51 hours
Estimated time remaining: 1.13 hours
Success rate in chunk: 384 / 500


Processing chunks:  85%|████████▌ | 17/20 [4:47:15<49:45, 995.33s/it]  


Chunk 8000-8500 completed
Processed 8,500/10,000 movies in 4.79 hours
Estimated time remaining: 0.84 hours
Success rate in chunk: 363 / 500


Processing chunks:  90%|█████████ | 18/20 [5:03:36<33:02, 991.08s/it]


Chunk 8500-9000 completed
Processed 9,000/10,000 movies in 5.06 hours
Estimated time remaining: 0.56 hours
Success rate in chunk: 346 / 500


Processing chunks:  95%|█████████▌| 19/20 [5:19:53<16:26, 986.78s/it]


Chunk 9000-9500 completed
Processed 9,500/10,000 movies in 5.33 hours
Estimated time remaining: 0.28 hours
Success rate in chunk: 352 / 500


Processing chunks: 100%|██████████| 20/20 [5:36:08<00:00, 1008.43s/it]



Chunk 9500-10000 completed
Processed 10,000/10,000 movies in 5.60 hours
Estimated time remaining: 0.00 hours
Success rate in chunk: 352 / 500

Combining chunks...

Processing completed in 5.60 hours
Total movies processed: 10,000
Overall success rate: 7,988 / 10,000
