## Data Analysis and Cleaning

In [2]:
import numpy as np
import pandas as pd

In [10]:
all_titles = pd.read_csv('data/title.basics.tsv', sep='\t')
all_ratings = pd.read_csv('data/title.ratings.tsv', sep='\t')

# Filter titles to only include movies and TV series
movies_and_series = all_titles[all_titles['titleType'].isin(['movie', 'tvSeries'])]

# Convert columns to correct data types
all_ratings['averageRating'] = pd.to_numeric(all_ratings['averageRating'], errors='coerce')
all_ratings['numVotes'] = pd.to_numeric(all_ratings['numVotes'], errors='coerce')

# Merge titles and ratings, keeping only movies and TV series
merged_data = movies_and_series.merge(all_ratings, on='tconst', how='inner')

# Filter movies/series with at least 1000 votes
df_filtered = merged_data[merged_data['numVotes'] >= 1000]

# Sort by rating (descending), then by numVotes (descending for tie-breaking)
df_sorted = df_filtered.sort_values(by=['averageRating', 'numVotes'], ascending=[False, False])

# Save to new TSV file
df_sorted.to_csv('data/filtered_sorted_with_ratings.tsv', sep='\t', index=False)

In [11]:
# IMDb-style weighted rating system for top 10,000 movies and series

# Calculate overall statistics for the weighted rating formula
overall_mean_rating = df_filtered['averageRating'].mean()
min_votes_required = df_filtered['numVotes'].quantile(0.75)  # Use 75th percentile as minimum

print(f"Overall mean rating: {overall_mean_rating:.2f}")
print(f"Minimum votes threshold (75th percentile): {min_votes_required:.0f}")

# Apply Bayesian weighted rating formula
# Weighted Rating = (v / (v + m)) * R + (m / (v + m)) * C
# Where: v = votes, m = min votes, R = average rating, C = overall mean
def calculate_weighted_rating(row):
    v = row['numVotes']
    R = row['averageRating']
    m = min_votes_required
    C = overall_mean_rating
    
    weighted_rating = (v / (v + m)) * R + (m / (v + m)) * C
    return weighted_rating

# Add weighted rating column
df_filtered['weightedRating'] = df_filtered.apply(calculate_weighted_rating, axis=1)

# Sort by weighted rating (descending), then by numVotes (descending for tie-breaking)
df_weighted_sorted = df_filtered.sort_values(by=['weightedRating', 'numVotes'], ascending=[False, False])

# Get top 10,000 based on weighted ratings
top_10000_weighted = df_weighted_sorted.head(10000)

# Save the weighted top 10,000 to file
top_10000_weighted.to_csv('data/top_10000_weighted_ratings.tsv', sep='\t', index=False)

print(f"\nTop 10 movies/series by weighted rating:")
print(top_10000_weighted[['primaryTitle', 'averageRating', 'numVotes', 'weightedRating']].head(10))

Overall mean rating: 6.39
Minimum votes threshold (75th percentile): 10202

Top 10 movies/series by weighted rating:
                      primaryTitle  averageRating  numVotes  weightedRating
175388                Breaking Bad            9.5   2405005        9.486874
67193     The Shawshank Redemption            9.3   3104334        9.290476
153336  Avatar: The Last Airbender            9.3    417720        9.230683
129412                    The Wire            9.3    413512        9.229994
176712             Game of Thrones            9.2   2485498        9.188523
39197                The Godfather            9.2   2163561        9.186824
80066                 The Sopranos            9.2    545115        9.148422
162488             The Dark Knight            9.1   3079402        9.091060
308327             Attack on Titan            9.1    649760        9.058146
234194                   Aspirants            9.1    316623        9.015484


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['weightedRating'] = df_filtered.apply(calculate_weighted_rating, axis=1)


## Adding Cast to data we already have

### Clean title.principals.tsv to only have movies/series in top 10000

In [33]:
# clean title.principals.tsv to only have movies/series in top 10000
all_principals = pd.read_csv('data/title.principals.tsv', sep='\t')
top_10000_df = pd.read_csv('data/top_10000_weighted_ratings.tsv', sep='\t')

# Filter all_principals to only include titles in top_10000_df
filtered_principals = all_principals[all_principals['tconst'].isin(top_10000_df['tconst'])]
filtered_principals.to_csv('data/filtered_title_principals_top_10000.tsv', sep='\t', index=False)

### Getting top 3 actor/actress names and their respective characters

In [4]:
principals = pd.read_csv('data/cast_data/filtered_title_principals_top_10000.tsv', sep='\t')
names = pd.read_csv('data/cast_data/name.basics.tsv', sep='\t')
top10000 = pd.read_csv('data/top_10000_weighted_ratings.tsv', sep='\t')

# Sample of top 10 entries for testing - change to top10000 for full processing
top10_sample = top10000.head(10)  # Change to top10000 for full dataset

# Get each tconst from dataset and find the top 3 UNIQUE actors/actresses nconst in principals and their respective characters
def get_top_3_cast(tconst):
    cast = principals[principals['tconst'] == tconst]
    cast = cast[cast['category'].isin(['actor', 'actress'])]
    
    result = []
    seen_nconst = set()  # Track unique nconst values
    
    for _, row in cast.iterrows():
        nconst = row['nconst']
        
        # Skip if we've already seen this nconst
        if nconst in seen_nconst:
            continue
            
        character = row['characters']
        
        # Clean the character field - remove brackets and extra quotes
        if pd.notna(character) and character != '\\N':
            import json
            try:
                # Try to parse as JSON first
                character_list = json.loads(character)
                if isinstance(character_list, list) and len(character_list) > 0:
                    clean_character = character_list[0]  # Get first character name
                else:
                    clean_character = str(character)
            except (json.JSONDecodeError, ValueError):
                # If JSON parsing fails, try manual cleaning
                clean_character = character.strip('[]"').replace('""', '"')
        else:
            clean_character = "Unknown Character"
        
        name_row = names[names['nconst'] == nconst]
        
        if not name_row.empty:
            actor_name = name_row.iloc[0]['primaryName']
            result.append((actor_name, clean_character))
            seen_nconst.add(nconst)
            
            # Stop when we have 3 unique cast members
            if len(result) >= 3:
                break
    
    return result

# Chunk-based processing with automatic saving
from tqdm import tqdm
import os

# Configuration
chunk_size = 500
dataset = top10000  # Change to top10000 for full processing
output_dir = 'data/cast_data/chunks'
final_output = 'data/cast_data/top_10_with_cast.tsv'  # Change to top_10000_with_cast.tsv for full processing

# Create chunks directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

print(f"Processing {len(dataset)} entries in chunks of {chunk_size}...")

# Process in chunks
all_results = []
for chunk_start in tqdm(range(0, len(dataset), chunk_size), desc="Processing chunks"):
    chunk_end = min(chunk_start + chunk_size, len(dataset))
    chunk_data = dataset.iloc[chunk_start:chunk_end].copy()
    
    print(f"\nProcessing chunk {chunk_start//chunk_size + 1}: entries {chunk_start+1} to {chunk_end}")
    
    # Process each entry in the chunk
    chunk_results = []
    for idx, (_, row) in enumerate(tqdm(chunk_data.iterrows(), total=len(chunk_data), desc=f"Chunk {chunk_start//chunk_size + 1}")):
        try:
            cast_result = get_top_3_cast(row['tconst'])
            chunk_results.append(cast_result)
            
            # Print progress every 50 entries within chunk
            if (idx + 1) % 50 == 0:
                print(f"  Processed {idx + 1}/{len(chunk_data)} in current chunk")
                
        except Exception as e:
            print(f"  Error processing {row['tconst']}: {e}")
            chunk_results.append([])  # Empty result for failed entries
    
    # Add results to chunk data
    chunk_data['top_3_cast'] = chunk_results
    
    # Save chunk to file
    chunk_filename = f"{output_dir}/chunk_{chunk_start//chunk_size + 1:03d}_{chunk_start+1:05d}_to_{chunk_end:05d}.tsv"
    chunk_data.to_csv(chunk_filename, sep='\t', index=False)
    print(f"  Saved chunk to: {chunk_filename}")
    
    # Add to all results
    all_results.append(chunk_data)
    
    print(f"  Chunk {chunk_start//chunk_size + 1} completed successfully!")

# Combine all chunks
print("\nCombining all chunks...")
final_dataset = pd.concat(all_results, ignore_index=True)

# Save final combined result
final_dataset.to_csv(final_output, sep='\t', index=False)
print(f"\nProcessing completed! Final results saved to '{final_output}'")
print(f"Total entries processed: {len(final_dataset)}")

# Display sample results
print("\nSample results:")
print(final_dataset[['primaryTitle', 'top_3_cast']].head(10))

# Clean up chunk files (optional - comment out if you want to keep them)
print(f"\nChunk files saved in '{output_dir}' for backup")
print("You can delete the chunks folder once you've verified the final file is correct")

Processing 10000 entries in chunks of 500...


Processing chunks:   0%|          | 0/20 [00:00<?, ?it/s]


Processing chunk 1: entries 1 to 500


Chunk 1:   0%|          | 0/500 [00:00<?, ?it/s][A

  Processed 50/500 in current chunk


Chunk 1:  10%|█         | 51/500 [01:25<12:46,  1.71s/it][A

  Processed 100/500 in current chunk


Chunk 1:  20%|██        | 101/500 [02:48<11:20,  1.71s/it][A

  Processed 150/500 in current chunk


Chunk 1:  30%|███       | 151/500 [04:12<09:53,  1.70s/it][A

  Processed 200/500 in current chunk


Chunk 1:  40%|████      | 201/500 [05:44<08:54,  1.79s/it][A

  Processed 250/500 in current chunk


Chunk 1:  50%|█████     | 251/500 [07:10<07:12,  1.74s/it][A

  Processed 300/500 in current chunk


Chunk 1:  60%|██████    | 301/500 [08:33<05:48,  1.75s/it][A

  Processed 350/500 in current chunk


Chunk 1:  70%|███████   | 351/500 [10:02<04:40,  1.88s/it][A

  Processed 400/500 in current chunk


[A

  Processed 450/500 in current chunk


Chunk 1: 100%|██████████| 500/500 [14:12<00:00,  1.70s/it][A
Chunk 1: 100%|██████████| 500/500 [14:12<00:00,  1.70s/it]52.11s/it]
Processing chunks:   5%|▌         | 1/20 [14:12<4:29:50, 852.11s/it]

  Processed 500/500 in current chunk
  Saved chunk to: data/cast_data/chunks/chunk_001_00001_to_00500.tsv
  Chunk 1 completed successfully!

Processing chunk 2: entries 501 to 1000


Chunk 2:   0%|          | 0/500 [00:00<?, ?it/s][A

  Processed 50/500 in current chunk


[A

  Processed 100/500 in current chunk


Chunk 2:  20%|██        | 101/500 [02:54<09:30,  1.43s/it][A

  Processed 150/500 in current chunk


Chunk 2:  30%|███       | 151/500 [04:15<10:07,  1.74s/it][A

  Processed 200/500 in current chunk


[A

  Processed 250/500 in current chunk


Chunk 2:  50%|█████     | 251/500 [07:05<06:15,  1.51s/it][A

  Processed 300/500 in current chunk


Chunk 2:  60%|██████    | 301/500 [08:26<04:37,  1.40s/it][A

  Processed 350/500 in current chunk


Chunk 2:  70%|███████   | 351/500 [09:49<03:21,  1.35s/it][A

  Processed 400/500 in current chunk


Chunk 2:  80%|████████  | 401/500 [11:10<02:12,  1.33s/it][A

  Processed 450/500 in current chunk


Chunk 2: 100%|██████████| 500/500 [13:50<00:00,  1.66s/it][A
Processing chunks:  10%|█         | 2/20 [28:02<4:11:50, 839.49s/it]
Processing chunks:  10%|█         | 2/20 [28:02<4:11:50, 839.49s/it]

  Processed 500/500 in current chunk
  Saved chunk to: data/cast_data/chunks/chunk_002_00501_to_01000.tsv
  Chunk 2 completed successfully!

Processing chunk 3: entries 1001 to 1500


Chunk 3:   0%|          | 0/500 [00:00<?, ?it/s][A

  Processed 50/500 in current chunk


Chunk 3:  10%|█         | 51/500 [01:24<12:46,  1.71s/it][A

  Processed 100/500 in current chunk


Chunk 3:  20%|██        | 101/500 [02:50<11:16,  1.70s/it][A

  Processed 150/500 in current chunk


Chunk 3:  30%|███       | 151/500 [04:15<08:07,  1.40s/it][A

  Processed 200/500 in current chunk


Chunk 3:  40%|████      | 201/500 [05:40<08:29,  1.70s/it][A

  Processed 250/500 in current chunk


Chunk 3:  50%|█████     | 251/500 [07:04<07:04,  1.71s/it][A

  Processed 300/500 in current chunk


Chunk 3:  60%|██████    | 301/500 [08:23<05:40,  1.71s/it][A

  Processed 350/500 in current chunk


Chunk 3:  70%|███████   | 351/500 [09:44<04:12,  1.70s/it][A

  Processed 400/500 in current chunk


Chunk 3:  80%|████████  | 402/500 [11:07<02:09,  1.32s/it][A

  Processed 450/500 in current chunk


Chunk 3: 100%|██████████| 500/500 [13:51<00:00,  1.66s/it][A
Chunk 3: 100%|██████████| 500/500 [13:51<00:00,  1.66s/it]35.81s/it]
Processing chunks:  15%|█▌        | 3/20 [41:54<3:56:48, 835.81s/it]

  Processed 500/500 in current chunk
  Saved chunk to: data/cast_data/chunks/chunk_003_01001_to_01500.tsv
  Chunk 3 completed successfully!

Processing chunk 4: entries 1501 to 2000


Chunk 4:   0%|          | 0/500 [00:00<?, ?it/s][A

  Processed 50/500 in current chunk


Chunk 4:  10%|█         | 51/500 [01:25<12:45,  1.71s/it][A

  Processed 100/500 in current chunk


Chunk 4:  20%|██        | 101/500 [02:49<11:12,  1.69s/it][A

  Processed 150/500 in current chunk


Chunk 4:  30%|███       | 151/500 [04:10<08:10,  1.40s/it][A

  Processed 200/500 in current chunk


Chunk 4:  40%|████      | 201/500 [05:28<07:03,  1.42s/it][A

  Processed 250/500 in current chunk


Chunk 4:  50%|█████     | 251/500 [06:50<07:01,  1.69s/it][A

  Processed 300/500 in current chunk


Chunk 4:  60%|██████    | 301/500 [08:12<05:36,  1.69s/it][A

  Processed 350/500 in current chunk


Chunk 4:  70%|███████   | 351/500 [09:32<04:12,  1.69s/it][A

  Processed 400/500 in current chunk


Chunk 4:  80%|████████  | 401/500 [10:54<02:49,  1.71s/it][A

  Processed 450/500 in current chunk


Chunk 4: 100%|██████████| 500/500 [13:38<00:00,  1.64s/it][A
Processing chunks:  20%|██        | 4/20 [55:32<3:41:04, 829.01s/it]
Processing chunks:  20%|██        | 4/20 [55:32<3:41:04, 829.01s/it]

  Processed 500/500 in current chunk
  Saved chunk to: data/cast_data/chunks/chunk_004_01501_to_02000.tsv
  Chunk 4 completed successfully!

Processing chunk 5: entries 2001 to 2500


Chunk 5:   0%|          | 0/500 [00:00<?, ?it/s][A

  Processed 50/500 in current chunk


Chunk 5:  10%|█         | 51/500 [01:24<12:45,  1.70s/it][A

  Processed 100/500 in current chunk


Chunk 5:  20%|██        | 101/500 [02:44<10:58,  1.65s/it][A

  Processed 150/500 in current chunk


Chunk 5:  30%|███       | 151/500 [04:07<09:43,  1.67s/it][A

  Processed 200/500 in current chunk


Chunk 5:  40%|████      | 202/500 [05:24<06:27,  1.30s/it][A

  Processed 250/500 in current chunk


Chunk 5:  50%|█████     | 251/500 [06:45<06:10,  1.49s/it][A

  Processed 300/500 in current chunk


Chunk 5:  60%|██████    | 301/500 [08:09<05:37,  1.70s/it][A

  Processed 350/500 in current chunk


Chunk 5:  70%|███████   | 351/500 [09:23<03:25,  1.38s/it][A

  Processed 400/500 in current chunk


Chunk 5:  80%|████████  | 401/500 [10:47<02:47,  1.69s/it][A

  Processed 450/500 in current chunk


Chunk 5: 100%|██████████| 500/500 [13:31<00:00,  1.62s/it][A
Chunk 5: 100%|██████████| 500/500 [13:31<00:00,  1.62s/it] 822.59s/it]
Processing chunks:  25%|██▌       | 5/20 [1:09:03<3:25:38, 822.59s/it]

  Processed 500/500 in current chunk
  Saved chunk to: data/cast_data/chunks/chunk_005_02001_to_02500.tsv
  Chunk 5 completed successfully!

Processing chunk 6: entries 2501 to 3000


Chunk 6:   0%|          | 0/500 [00:00<?, ?it/s][A

  Processed 50/500 in current chunk


Chunk 6:  10%|█         | 51/500 [01:17<12:22,  1.65s/it][A

  Processed 100/500 in current chunk


Chunk 6:  20%|██        | 101/500 [02:40<08:39,  1.30s/it][A

  Processed 150/500 in current chunk


Chunk 6:  30%|███       | 151/500 [04:05<09:49,  1.69s/it][A

  Processed 200/500 in current chunk


Chunk 6:  40%|████      | 201/500 [05:28<08:27,  1.70s/it][A

  Processed 250/500 in current chunk


Chunk 6:  50%|█████     | 251/500 [06:49<07:01,  1.69s/it][A

  Processed 300/500 in current chunk


Chunk 6:  60%|██████    | 301/500 [08:11<05:34,  1.68s/it][A

  Processed 350/500 in current chunk


Chunk 6:  70%|███████   | 351/500 [09:30<04:11,  1.69s/it][A

  Processed 400/500 in current chunk


Chunk 6:  80%|████████  | 401/500 [10:52<02:46,  1.68s/it][A

  Processed 450/500 in current chunk


Chunk 6: 100%|██████████| 500/500 [13:33<00:00,  1.63s/it][A
Chunk 6: 100%|██████████| 500/500 [13:33<00:00,  1.63s/it] 819.56s/it]
Processing chunks:  30%|███       | 6/20 [1:22:37<3:11:13, 819.56s/it]

  Processed 500/500 in current chunk
  Saved chunk to: data/cast_data/chunks/chunk_006_02501_to_03000.tsv
  Chunk 6 completed successfully!

Processing chunk 7: entries 3001 to 3500


Chunk 7:   0%|          | 0/500 [00:00<?, ?it/s][A

  Processed 50/500 in current chunk


Chunk 7:  10%|█         | 51/500 [01:23<10:51,  1.45s/it][A

  Processed 100/500 in current chunk


Chunk 7:  20%|██        | 101/500 [02:43<10:28,  1.58s/it][A

  Processed 150/500 in current chunk


Chunk 7:  30%|███       | 151/500 [04:01<09:52,  1.70s/it][A

  Processed 200/500 in current chunk


Chunk 7:  40%|████      | 201/500 [05:22<08:25,  1.69s/it][A

  Processed 250/500 in current chunk


Chunk 7:  50%|█████     | 251/500 [06:45<07:02,  1.70s/it][A

  Processed 300/500 in current chunk


Chunk 7:  60%|██████    | 301/500 [08:06<05:35,  1.69s/it][A

  Processed 350/500 in current chunk


Chunk 7:  70%|███████   | 351/500 [09:26<04:11,  1.69s/it][A

  Processed 400/500 in current chunk


Chunk 7:  80%|████████  | 401/500 [10:50<02:47,  1.69s/it][A

  Processed 450/500 in current chunk


Chunk 7: 100%|██████████| 500/500 [13:28<00:00,  1.62s/it][A
Chunk 7: 100%|██████████| 500/500 [13:28<00:00,  1.62s/it] 815.99s/it]
Processing chunks:  35%|███▌      | 7/20 [1:36:06<2:56:47, 815.99s/it]

  Processed 500/500 in current chunk
  Saved chunk to: data/cast_data/chunks/chunk_007_03001_to_03500.tsv
  Chunk 7 completed successfully!

Processing chunk 8: entries 3501 to 4000


Chunk 8:   0%|          | 0/500 [00:00<?, ?it/s][A

  Processed 50/500 in current chunk


Chunk 8:  10%|█         | 51/500 [01:22<12:30,  1.67s/it][A

  Processed 100/500 in current chunk


Chunk 8:  20%|██        | 101/500 [02:39<09:03,  1.36s/it][A

  Processed 150/500 in current chunk


Chunk 8:  30%|███       | 151/500 [04:00<09:49,  1.69s/it][A

  Processed 200/500 in current chunk


[A

  Processed 250/500 in current chunk


Chunk 8:  50%|█████     | 251/500 [06:26<06:59,  1.68s/it][A

  Processed 300/500 in current chunk


Chunk 8:  60%|██████    | 301/500 [07:41<05:20,  1.61s/it][A

  Processed 350/500 in current chunk


Chunk 8:  70%|███████   | 351/500 [09:02<04:35,  1.85s/it][A

  Processed 400/500 in current chunk


Chunk 8:  80%|████████  | 401/500 [10:24<02:44,  1.66s/it][A

  Processed 450/500 in current chunk


Chunk 8: 100%|██████████| 500/500 [13:09<00:00,  1.58s/it][A
Chunk 8: 100%|██████████| 500/500 [13:09<00:00,  1.58s/it] 807.41s/it]
Processing chunks:  40%|████      | 8/20 [1:49:15<2:41:28, 807.41s/it]

  Processed 500/500 in current chunk
  Saved chunk to: data/cast_data/chunks/chunk_008_03501_to_04000.tsv
  Chunk 8 completed successfully!

Processing chunk 9: entries 4001 to 4500


Chunk 9:   0%|          | 0/500 [00:00<?, ?it/s][A

  Processed 50/500 in current chunk


Chunk 9:  10%|█         | 51/500 [01:20<12:39,  1.69s/it][A

  Processed 100/500 in current chunk


Chunk 9:  20%|██        | 101/500 [02:42<11:18,  1.70s/it][A

  Processed 150/500 in current chunk


Chunk 9:  30%|███       | 151/500 [04:05<09:52,  1.70s/it][A

  Processed 200/500 in current chunk


Chunk 9:  40%|████      | 201/500 [05:28<08:03,  1.62s/it][A

  Processed 250/500 in current chunk


Chunk 9:  50%|█████     | 252/500 [06:41<05:09,  1.25s/it][A

  Processed 300/500 in current chunk


Chunk 9:  60%|██████    | 301/500 [07:52<06:19,  1.91s/it][A

  Processed 350/500 in current chunk


Chunk 9:  70%|███████   | 351/500 [09:15<04:14,  1.71s/it][A

  Processed 400/500 in current chunk


Chunk 9:  80%|████████  | 401/500 [10:36<02:47,  1.69s/it][A

  Processed 450/500 in current chunk


Chunk 9: 100%|██████████| 500/500 [13:13<00:00,  1.59s/it][A
Chunk 9: 100%|██████████| 500/500 [13:13<00:00,  1.59s/it] 803.07s/it]
Processing chunks:  45%|████▌     | 9/20 [2:02:28<2:27:13, 803.07s/it]

  Processed 500/500 in current chunk
  Saved chunk to: data/cast_data/chunks/chunk_009_04001_to_04500.tsv
  Chunk 9 completed successfully!

Processing chunk 10: entries 4501 to 5000


Chunk 10:   0%|          | 0/500 [00:00<?, ?it/s][A

  Processed 50/500 in current chunk


Chunk 10:  10%|█         | 51/500 [01:21<09:45,  1.30s/it][A

  Processed 100/500 in current chunk


Chunk 10:  20%|██        | 101/500 [02:42<11:02,  1.66s/it][A

  Processed 150/500 in current chunk


Chunk 10:  30%|███       | 151/500 [04:00<07:03,  1.21s/it][A

  Processed 200/500 in current chunk


Chunk 10:  40%|████      | 202/500 [05:18<06:16,  1.26s/it][A

  Processed 250/500 in current chunk


Chunk 10:  50%|█████     | 251/500 [06:34<05:30,  1.33s/it][A

  Processed 300/500 in current chunk


Chunk 10:  60%|██████    | 301/500 [07:54<05:20,  1.61s/it][A

  Processed 350/500 in current chunk


Chunk 10:  70%|███████   | 352/500 [09:16<03:11,  1.30s/it][A

  Processed 400/500 in current chunk


Chunk 10:  80%|████████  | 401/500 [10:31<02:29,  1.51s/it][A

  Processed 450/500 in current chunk


Chunk 10: 100%|██████████| 500/500 [13:12<00:00,  1.59s/it][A
Chunk 10: 100%|██████████| 500/500 [13:12<00:00,  1.59s/it] 799.93s/it]
Processing chunks:  50%|█████     | 10/20 [2:15:41<2:13:19, 799.93s/it]

  Processed 500/500 in current chunk
  Saved chunk to: data/cast_data/chunks/chunk_010_04501_to_05000.tsv
  Chunk 10 completed successfully!

Processing chunk 11: entries 5001 to 5500


Chunk 11:   0%|          | 0/500 [00:00<?, ?it/s][A

  Processed 50/500 in current chunk


Chunk 11:  10%|█         | 51/500 [01:18<12:42,  1.70s/it][A

  Processed 100/500 in current chunk


Chunk 11:  20%|██        | 101/500 [02:33<09:21,  1.41s/it][A

  Processed 150/500 in current chunk


Chunk 11:  30%|███       | 151/500 [03:55<09:38,  1.66s/it][A

  Processed 200/500 in current chunk


Chunk 11:  40%|████      | 201/500 [05:08<08:21,  1.68s/it][A

  Processed 250/500 in current chunk


Chunk 11:  50%|█████     | 251/500 [06:22<05:42,  1.38s/it][A

  Processed 300/500 in current chunk


Chunk 11:  60%|██████    | 301/500 [07:42<05:28,  1.65s/it][A

  Processed 350/500 in current chunk


Chunk 11:  70%|███████   | 351/500 [09:01<04:11,  1.69s/it][A

  Processed 400/500 in current chunk


Chunk 11:  80%|████████  | 401/500 [10:22<02:47,  1.69s/it][A

  Processed 450/500 in current chunk


Chunk 11: 100%|██████████| 500/500 [12:59<00:00,  1.56s/it][A
Chunk 11: 100%|██████████| 500/500 [12:59<00:00,  1.56s/it] 793.60s/it]
Processing chunks:  55%|█████▌    | 11/20 [2:28:41<1:59:02, 793.60s/it]

  Processed 500/500 in current chunk
  Saved chunk to: data/cast_data/chunks/chunk_011_05001_to_05500.tsv
  Chunk 11 completed successfully!

Processing chunk 12: entries 5501 to 6000


Chunk 12:   0%|          | 0/500 [00:00<?, ?it/s][A

  Processed 50/500 in current chunk


Chunk 12:  10%|█         | 51/500 [01:23<12:34,  1.68s/it][A

  Processed 100/500 in current chunk


Chunk 12:  20%|██        | 102/500 [02:43<08:35,  1.29s/it][A

  Processed 150/500 in current chunk


Chunk 12:  30%|███       | 151/500 [04:02<09:51,  1.69s/it][A

  Processed 200/500 in current chunk


Chunk 12:  40%|████      | 201/500 [05:23<08:29,  1.70s/it][A

  Processed 250/500 in current chunk


Chunk 12:  50%|█████     | 251/500 [06:39<06:52,  1.66s/it][A

  Processed 300/500 in current chunk


Chunk 12:  60%|██████    | 301/500 [07:58<05:37,  1.70s/it][A

  Processed 350/500 in current chunk


Chunk 12:  70%|███████   | 351/500 [09:15<04:08,  1.66s/it][A

  Processed 400/500 in current chunk


Chunk 12:  80%|████████  | 401/500 [10:32<02:19,  1.41s/it][A

  Processed 450/500 in current chunk


Chunk 12: 100%|██████████| 500/500 [13:13<00:00,  1.59s/it][A
Chunk 12: 100%|██████████| 500/500 [13:13<00:00,  1.59s/it] 793.54s/it]
Processing chunks:  60%|██████    | 12/20 [2:41:54<1:45:48, 793.54s/it]

  Processed 500/500 in current chunk
  Saved chunk to: data/cast_data/chunks/chunk_012_05501_to_06000.tsv
  Chunk 12 completed successfully!

Processing chunk 13: entries 6001 to 6500


Chunk 13:   0%|          | 0/500 [00:00<?, ?it/s][A

  Processed 50/500 in current chunk


Chunk 13:  10%|█         | 51/500 [01:21<12:49,  1.71s/it][A

  Processed 100/500 in current chunk


Chunk 13:  20%|██        | 101/500 [02:43<11:17,  1.70s/it][A

  Processed 150/500 in current chunk


Chunk 13:  30%|███       | 151/500 [04:05<09:55,  1.71s/it][A

  Processed 200/500 in current chunk


Chunk 13:  40%|████      | 202/500 [05:21<04:41,  1.06it/s][A

  Processed 250/500 in current chunk


Chunk 13:  50%|█████     | 251/500 [06:42<07:04,  1.71s/it][A

  Processed 300/500 in current chunk


Chunk 13:  60%|██████    | 301/500 [08:05<04:40,  1.41s/it][A

  Processed 350/500 in current chunk


Chunk 13:  70%|███████   | 351/500 [09:26<03:27,  1.39s/it][A

  Processed 400/500 in current chunk


Chunk 13:  80%|████████  | 401/500 [10:37<01:21,  1.21it/s][A

  Processed 450/500 in current chunk


Chunk 13: 100%|██████████| 500/500 [13:04<00:00,  1.57s/it][A
Chunk 13: 100%|██████████| 500/500 [13:04<00:00,  1.57s/it] 790.71s/it]
Processing chunks:  65%|██████▌   | 13/20 [2:54:58<1:32:14, 790.71s/it]

  Processed 500/500 in current chunk
  Saved chunk to: data/cast_data/chunks/chunk_013_06001_to_06500.tsv
  Chunk 13 completed successfully!

Processing chunk 14: entries 6501 to 7000


Chunk 14:   0%|          | 0/500 [00:00<?, ?it/s][A

  Processed 50/500 in current chunk


Chunk 14:  10%|█         | 51/500 [01:18<10:35,  1.42s/it][A

  Processed 100/500 in current chunk


Chunk 14:  20%|██        | 101/500 [02:35<11:16,  1.69s/it][A

  Processed 150/500 in current chunk


Chunk 14:  30%|███       | 151/500 [03:57<09:59,  1.72s/it][A

  Processed 200/500 in current chunk


Chunk 14:  40%|████      | 201/500 [05:16<08:32,  1.71s/it][A

  Processed 250/500 in current chunk


Chunk 14:  50%|█████     | 252/500 [06:38<05:26,  1.32s/it][A

  Processed 300/500 in current chunk


Chunk 14:  60%|██████    | 301/500 [07:54<05:27,  1.65s/it][A

  Processed 350/500 in current chunk


Chunk 14:  70%|███████   | 352/500 [09:16<02:54,  1.18s/it][A

  Processed 400/500 in current chunk


Chunk 14:  80%|████████  | 401/500 [10:35<02:32,  1.54s/it][A

  Processed 450/500 in current chunk


Chunk 14: 100%|██████████| 500/500 [13:02<00:00,  1.57s/it][A
Processing chunks:  70%|███████   | 14/20 [3:08:01<1:18:49, 788.30s/it]
Processing chunks:  70%|███████   | 14/20 [3:08:01<1:18:49, 788.30s/it]

  Processed 500/500 in current chunk
  Saved chunk to: data/cast_data/chunks/chunk_014_06501_to_07000.tsv
  Chunk 14 completed successfully!

Processing chunk 15: entries 7001 to 7500


Chunk 15:   0%|          | 0/500 [00:00<?, ?it/s][A

  Processed 50/500 in current chunk


Chunk 15:  10%|█         | 51/500 [01:16<09:36,  1.28s/it][A

  Processed 100/500 in current chunk


Chunk 15:  20%|██        | 101/500 [02:39<11:19,  1.70s/it][A

  Processed 150/500 in current chunk


Chunk 15:  30%|███       | 151/500 [04:02<09:53,  1.70s/it][A

  Processed 200/500 in current chunk


Chunk 15:  40%|████      | 201/500 [05:20<06:15,  1.26s/it][A

  Processed 250/500 in current chunk


Chunk 15:  50%|█████     | 251/500 [06:45<07:04,  1.71s/it][A

  Processed 300/500 in current chunk


Chunk 15:  60%|██████    | 301/500 [08:05<05:06,  1.54s/it][A

  Processed 350/500 in current chunk


Chunk 15:  70%|███████   | 351/500 [09:24<04:07,  1.66s/it][A

  Processed 400/500 in current chunk


Chunk 15:  80%|████████  | 401/500 [10:47<02:48,  1.70s/it][A

  Processed 450/500 in current chunk


Chunk 15: 100%|██████████| 500/500 [13:14<00:00,  1.59s/it][A
Chunk 15: 100%|██████████| 500/500 [13:14<00:00,  1.59s/it] 790.17s/it]
Processing chunks:  75%|███████▌  | 15/20 [3:21:15<1:05:50, 790.17s/it]

  Processed 500/500 in current chunk
  Saved chunk to: data/cast_data/chunks/chunk_015_07001_to_07500.tsv
  Chunk 15 completed successfully!

Processing chunk 16: entries 7501 to 8000


Chunk 16:   0%|          | 0/500 [00:00<?, ?it/s][A

  Processed 50/500 in current chunk


Chunk 16:  10%|█         | 51/500 [01:21<11:08,  1.49s/it][A

  Processed 100/500 in current chunk


Chunk 16:  20%|██        | 101/500 [02:40<11:12,  1.69s/it][A

  Processed 150/500 in current chunk


Chunk 16:  30%|███       | 151/500 [03:58<07:44,  1.33s/it][A

  Processed 200/500 in current chunk


Chunk 16:  40%|████      | 201/500 [05:16<05:18,  1.06s/it][A

  Processed 250/500 in current chunk


Chunk 16:  50%|█████     | 251/500 [06:36<07:00,  1.69s/it][A

  Processed 300/500 in current chunk


Chunk 16:  60%|██████    | 301/500 [07:50<03:50,  1.16s/it][A

  Processed 350/500 in current chunk


Chunk 16:  70%|███████   | 351/500 [09:13<04:15,  1.72s/it][A

  Processed 400/500 in current chunk


Chunk 16:  80%|████████  | 401/500 [10:33<02:13,  1.35s/it][A

  Processed 450/500 in current chunk


Chunk 16: 100%|██████████| 500/500 [13:06<00:00,  1.57s/it][A
Processing chunks:  80%|████████  | 16/20 [3:34:22<52:35, 788.97s/it]  
Processing chunks:  80%|████████  | 16/20 [3:34:22<52:35, 788.97s/it]  

  Processed 500/500 in current chunk
  Saved chunk to: data/cast_data/chunks/chunk_016_07501_to_08000.tsv
  Chunk 16 completed successfully!

Processing chunk 17: entries 8001 to 8500


Chunk 17:   0%|          | 0/500 [00:00<?, ?it/s][A

  Processed 50/500 in current chunk


Chunk 17:  10%|█         | 51/500 [01:17<12:28,  1.67s/it][A

  Processed 100/500 in current chunk


Chunk 17:  20%|██        | 101/500 [02:40<11:21,  1.71s/it][A

  Processed 150/500 in current chunk


Chunk 17:  30%|███       | 151/500 [04:00<09:50,  1.69s/it][A

  Processed 200/500 in current chunk


Chunk 17:  40%|████      | 201/500 [05:23<07:53,  1.58s/it][A

  Processed 250/500 in current chunk


Chunk 17:  50%|█████     | 251/500 [06:39<07:01,  1.69s/it][A

  Processed 300/500 in current chunk


Chunk 17:  60%|██████    | 302/500 [08:01<03:53,  1.18s/it][A

  Processed 350/500 in current chunk


Chunk 17:  70%|███████   | 351/500 [09:15<03:30,  1.41s/it][A

  Processed 400/500 in current chunk


Chunk 17:  80%|████████  | 401/500 [10:33<01:57,  1.19s/it][A

  Processed 450/500 in current chunk


Chunk 17: 100%|██████████| 500/500 [13:09<00:00,  1.58s/it][A
Chunk 17: 100%|██████████| 500/500 [13:09<00:00,  1.58s/it]89.28s/it]
Processing chunks:  85%|████████▌ | 17/20 [3:47:32<39:27, 789.28s/it]

  Processed 500/500 in current chunk
  Saved chunk to: data/cast_data/chunks/chunk_017_08001_to_08500.tsv
  Chunk 17 completed successfully!

Processing chunk 18: entries 8501 to 9000


Chunk 18:   0%|          | 0/500 [00:00<?, ?it/s][A

  Processed 50/500 in current chunk


Chunk 18:  10%|█         | 51/500 [01:41<12:48,  1.71s/it][A

  Processed 100/500 in current chunk


Chunk 18:  20%|██        | 101/500 [03:02<10:16,  1.55s/it][A

  Processed 150/500 in current chunk


Chunk 18:  30%|███       | 151/500 [04:18<08:17,  1.42s/it][A

  Processed 200/500 in current chunk


Chunk 18:  40%|████      | 201/500 [05:37<08:31,  1.71s/it][A

  Processed 250/500 in current chunk


Chunk 18:  50%|█████     | 251/500 [06:55<07:05,  1.71s/it][A

  Processed 300/500 in current chunk


Chunk 18:  60%|██████    | 301/500 [08:11<05:41,  1.71s/it][A

  Processed 350/500 in current chunk


Chunk 18:  70%|███████   | 351/500 [09:32<03:50,  1.54s/it][A

  Processed 400/500 in current chunk


Chunk 18:  80%|████████  | 401/500 [10:48<02:20,  1.42s/it][A

  Processed 450/500 in current chunk


Chunk 18: 100%|██████████| 500/500 [13:19<00:00,  1.60s/it][A
Processing chunks:  90%|█████████ | 18/20 [4:00:51<26:24, 792.33s/it]
Processing chunks:  90%|█████████ | 18/20 [4:00:51<26:24, 792.33s/it]

  Processed 500/500 in current chunk
  Saved chunk to: data/cast_data/chunks/chunk_018_08501_to_09000.tsv
  Chunk 18 completed successfully!

Processing chunk 19: entries 9001 to 9500


Chunk 19:   0%|          | 0/500 [00:00<?, ?it/s][A

  Processed 50/500 in current chunk


Chunk 19:  10%|█         | 51/500 [01:20<11:11,  1.50s/it][A

  Processed 100/500 in current chunk


Chunk 19:  20%|██        | 101/500 [02:39<08:49,  1.33s/it][A

  Processed 150/500 in current chunk


Chunk 19:  30%|███       | 151/500 [03:53<07:31,  1.29s/it][A

  Processed 200/500 in current chunk


Chunk 19:  40%|████      | 201/500 [05:13<06:34,  1.32s/it][A

  Processed 250/500 in current chunk


Chunk 19:  51%|█████     | 253/500 [06:32<03:49,  1.08it/s][A

  Processed 300/500 in current chunk


Chunk 19:  60%|██████    | 302/500 [07:48<03:55,  1.19s/it][A

  Processed 350/500 in current chunk


Chunk 19:  70%|███████   | 351/500 [09:05<04:18,  1.74s/it][A

  Processed 400/500 in current chunk


Chunk 19:  80%|████████  | 401/500 [10:24<02:38,  1.60s/it][A

  Processed 450/500 in current chunk


Chunk 19: 100%|██████████| 500/500 [12:53<00:00,  1.55s/it][A
Chunk 19: 100%|██████████| 500/500 [12:53<00:00,  1.55s/it]86.81s/it]
Processing chunks:  95%|█████████▌| 19/20 [4:13:45<13:06, 786.81s/it]

  Processed 500/500 in current chunk
  Saved chunk to: data/cast_data/chunks/chunk_019_09001_to_09500.tsv
  Chunk 19 completed successfully!

Processing chunk 20: entries 9501 to 10000


Chunk 20:   0%|          | 0/500 [00:00<?, ?it/s][A

  Processed 50/500 in current chunk


Chunk 20:  10%|█         | 51/500 [01:25<12:28,  1.67s/it][A

  Processed 100/500 in current chunk


Chunk 20:  20%|██        | 101/500 [02:41<08:45,  1.32s/it][A

  Processed 150/500 in current chunk


Chunk 20:  30%|███       | 151/500 [03:57<09:24,  1.62s/it][A

  Processed 200/500 in current chunk


Chunk 20:  40%|████      | 201/500 [05:18<08:27,  1.70s/it][A

  Processed 250/500 in current chunk


Chunk 20:  50%|█████     | 251/500 [06:41<06:36,  1.59s/it][A

  Processed 300/500 in current chunk


[A

  Processed 350/500 in current chunk


Chunk 20:  70%|███████   | 351/500 [09:12<04:04,  1.64s/it][A

  Processed 400/500 in current chunk


Chunk 20:  80%|████████  | 401/500 [10:25<02:47,  1.69s/it][A

  Processed 450/500 in current chunk


Chunk 20: 100%|██████████| 500/500 [12:58<00:00,  1.56s/it][A
Chunk 20: 100%|██████████| 500/500 [12:58<00:00,  1.56s/it]00.19s/it]
Processing chunks: 100%|██████████| 20/20 [4:26:43<00:00, 800.19s/it]

  Processed 500/500 in current chunk
  Saved chunk to: data/cast_data/chunks/chunk_020_09501_to_10000.tsv
  Chunk 20 completed successfully!

Combining all chunks...

Processing completed! Final results saved to 'data/cast_data/top_10_with_cast.tsv'
Total entries processed: 10000

Sample results:
                 primaryTitle  \
0                Breaking Bad   
1    The Shawshank Redemption   
2  Avatar: The Last Airbender   
3                    The Wire   
4             Game of Thrones   
5               The Godfather   
6                The Sopranos   
7             The Dark Knight   
8             Attack on Titan   
9                   Aspirants   

                                          top_3_cast  
0  [(Bryan Cranston, Walter White), (Aaron Paul, ...  
1  [(Tim Robbins, Andy Dufresne), (Morgan Freeman...  
2  [(Dee Bradley Baker, Appa), (Zach Tyler Eisen,...  
3  [(Dominic West, Detective James 'Jimmy' McNult...  
4  [(Emilia Clarke, Daenerys Targaryen), (Peter D...  
5  [(Mar




## Web Scraping for Movie/Series Description

In [32]:
# Web scraping script for StreamWithVPN data
%pip install beautifulsoup4
import requests
from bs4 import BeautifulSoup
import time
import re
from urllib.parse import quote
import pandas as pd
from tqdm import tqdm

# Load the top 10,000 weighted ratings data
top_10000_df = pd.read_csv('data/top_10000_weighted_ratings.tsv', sep='\t')

def clean_title_for_url(title):
    """
    Clean and format movie/series title for URL generation
    """
    # Remove special characters and replace spaces with hyphens
    cleaned = re.sub(r'[^\w\s-]', '', title)
    cleaned = re.sub(r'\s+', '-', cleaned.strip())
    return cleaned.lower()

def generate_streamwithvpn_url(title, year):
    """
    Generate StreamWithVPN URL based on title and year
    Example: "The Wolf's Call" (2019) -> "https://www.streamwithvpn.com/the-wolfs-call-2019"
    """
    clean_title = clean_title_for_url(title)
    # Handle cases where year might be NaN or missing
    if pd.isna(year):
        return f"https://www.streamwithvpn.com/{clean_title}"
    else:
        return f"https://www.streamwithvpn.com/{clean_title}-{int(year)}"

def scrape_movie_data(url, tconst, title, year, endYear, titleType, isAdult, runtime, genres, rating, numVotes):
    """
    Scrape movie/series data from StreamWithVPN
    Returns dictionary with description, cast, and streaming platforms
    """
    try:
        # Add delay to be respectful to the server
        time.sleep(1)
        
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        
        # Try the original URL first
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            # print(f"✓ Success with original URL: {url}")
        except requests.exceptions.HTTPError as e:
            if e.response.status_code in [404, 403]:  # Page not found or forbidden
                # Try without year
                url_without_year = generate_streamwithvpn_url(title, None)
                # print(f"⚠ Original URL failed ({e.response.status_code}), trying without year: {url_without_year}")
                
                try:
                    response = requests.get(url_without_year, headers=headers, timeout=10)
                    response.raise_for_status()
                    # print(f"✓ Success with URL without year: {url_without_year}")
                    # Update the URL in the data dictionary for accuracy
                    url = url_without_year
                except requests.exceptions.HTTPError:
                    # print(f"✗ Both URLs failed for {title}")
                    raise  # Re-raise the exception to be caught by outer try-catch
            else:
                raise  # Re-raise non-404/403 errors
        
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Initialize data dictionary
        movie_data = {
            'tconst': tconst,
            'titleType': titleType,
            'title': title,
            'year': year,
            'endYear': endYear,
            'isAdult': isAdult,
            'runtime': runtime,
            'genres': genres,
            'rating': rating,
            'numVotes': numVotes,
            'description': None,
            'cast': None,
            'streaming_platforms': None,
            'url': url,
            'scrape_status': 'success'
        }
        
        # Extract DESCRIPTION - multiple approaches
        description_element = soup.find('span', class_='rt-Text EntryDetailDescription_contentDescription__tXYGO EntryDetailDescription_expanded__3a0Gs')
        
        if not description_element:
            description_element = soup.find('span', class_=re.compile('EntryDetailDescription_contentDescription'))
        
        if not description_element:
            description_element = soup.find('span', class_=re.compile('contentDescription'))
        
        if not description_element:
            description_element = soup.select_one('span[class*="EntryDetailDescription_contentDescription"]')
        
        if description_element:
            movie_data['description'] = description_element.get_text(strip=True)
            # print(f"✓ Found description for {title}: {movie_data['description'][:100]}...")
        else:
            # print(f"✗ No description found for {title}")
            pass
        """
        # Extract CAST information
        cast_list = []
        # Target container div
        container_div = soup.find('div', class_='rt-Flex rt-r-fd-column rt-r-gap rt-r-px rt-r-pt rt-r-w', style='--gap: 2px; --pl: 16px; --pr: 16px; --pt: 8px; --width: 100%;')
        if container_div:
            cast_spans = container_div.find_all('span', {'data-accent-color': 'gray', 'class_': 'rt-Text rt-r-size-3 rt-r-weight-medium', 'style': 'min-width: 0px; overflow: hidden; text-overflow: ellipsis; white-space: nowrap;'})
            if cast_spans:
                for span in cast_spans:
                    cast_list.append(span.get_text(strip=True))
                movie_data['cast'] = ', '.join(cast_list)
                print(f"✓ Found cast for {title}: {movie_data['cast'][:100]}...")
            else:
                print(f"✗ Span not found for {title}")
        else:
            print(f"✗ Div not found for {title}")

        # Extract STREAMING PLATFORMS information
        platform_elements = soup.find_all('h2', class_='rt-Heading rt-r-size-5 rt-r-weight-medium rt-r-ta-left')
        if platform_elements:
            platforms = [elem.get_text(strip=True) for elem in platform_elements]
            movie_data['streaming_platforms'] = ', '.join(platforms)
        else:
            print(f"✗ No streaming platforms found for {title}")
        """
        
        return movie_data
        
    except requests.RequestException as e:
        print(f"Request error for {title}: {e}")
        return {
            'tconst': tconst,
            'title': title,
            'url': url,
            'description': None,
            'cast': None,
            'streaming_platforms': None,
            'scrape_status': f'request_error: {str(e)}'
        }
    except Exception as e:
        print(f"Parsing error for {title}: {e}")
        return {
            'tconst': tconst,
            'title': title,
            'url': url,
            'description': None,
            'cast': None,
            'streaming_platforms': None,
            'scrape_status': f'parsing_error: {str(e)}'
        }

# Initialize list to store scraped data
scraped_data = []

# Sample scraping for first 2 entries (for faster debugging)
print("Starting web scraping")
sample_df = top_10000_df.head(10) # Change to 10000 for full run (2 for testing)

for index, row in tqdm(sample_df.iterrows(), total=len(sample_df), desc="Scraping movies"):
    tconst = row['tconst']
    title = row['primaryTitle']
    year = row['startYear']
    endYear = row['endYear']
    titleType = row['titleType']
    isAdult = row['isAdult']
    runtime = row['runtimeMinutes']
    genres = row['genres']
    rating = row['averageRating']
    numVotes = row['numVotes']
    
    
    # Generate URL
    url = generate_streamwithvpn_url(title, year)
    # print(f"\nScraping: {title} ({year}) - {url}")
    
    # Scrape data
    movie_data = scrape_movie_data(url, tconst, title, year, endYear, titleType, isAdult, runtime, genres, rating, numVotes)
    scraped_data.append(movie_data)

# Convert to DataFrame
scraped_df = pd.DataFrame(scraped_data)

# Display results
print(f"\nScraping completed! Found data for {len(scraped_df)} entries")
print(f"Success rate: {len(scraped_df[scraped_df['scrape_status'] == 'success'])} / {len(scraped_df)}")

# Show sample results
print("\nSample scraped data:")
print(scraped_df[['title', 'url', 'scrape_status', 'description', 'cast']].head())

# Save scraped data
scraped_df.to_csv('data/top10000_final.tsv', sep='\t', index=False)
print("\nSample data saved to 'data/top10000_final.tsv'")

Collecting beautifulsoup4Note: you may need to restart the kernel to use updated packages.

  Downloading beautifulsoup4-4.14.2-py3-none-any.whl.metadata (3.8 kB)
Collecting soupsieve>1.2 (from beautifulsoup4)
  Downloading soupsieve-2.8-py3-none-any.whl.metadata (4.6 kB)
Downloading beautifulsoup4-4.14.2-py3-none-any.whl (106 kB)
Downloading soupsieve-2.8-py3-none-any.whl (36 kB)
Installing collected packages: soupsieve, beautifulsoup4

   ---------------------------------------- 0/2 [soupsieve]
   ---------------------------------------- 0/2 [soupsieve]
   -------------------- ------------------- 1/2 [beautifulsoup4]
   -------------------- ------------------- 1/2 [beautifulsoup4]
   -------------------- ------------------- 1/2 [beautifulsoup4]
   -------------------- ------------------- 1/2 [beautifulsoup4]
   -------------------- ------------------- 1/2 [beautifulsoup4]
   -------------------- ------------------- 1/2 [beautifulsoup4]
   ---------------------------------------- 2/2 



Starting web scraping


Scraping movies: 100%|██████████| 10/10 [00:22<00:00,  2.25s/it]


Scraping completed! Found data for 10 entries
Success rate: 10 / 10

Sample scraped data:
                        title  \
0                Breaking Bad   
1    The Shawshank Redemption   
2  Avatar: The Last Airbender   
3                    The Wire   
4             Game of Thrones   

                                                 url scrape_status  \
0    https://www.streamwithvpn.com/breaking-bad-2008       success   
1  https://www.streamwithvpn.com/the-shawshank-re...       success   
2  https://www.streamwithvpn.com/avatar-the-last-...       success   
3        https://www.streamwithvpn.com/the-wire-2002       success   
4  https://www.streamwithvpn.com/game-of-thrones-...       success   

                                         description  cast  
0  Walter White, a New Mexico chemistry teacher, ...  None  
1  Imprisoned in the 1940s for the double murder ...  None  
2  In a war-torn world of elemental magic, a youn...  None  
3  Told from the points of view of both the B




In [None]:
# Web scraping script for StreamWithVPN data
%pip install beautifulsoup4
import requests
from bs4 import BeautifulSoup
import time
import re
from urllib.parse import quote
import pandas as pd
from tqdm import tqdm

# Load the top 10,000 weighted ratings data
top_10000_df = pd.read_csv('data/top_10000_weighted_ratings.tsv', sep='\t')

def clean_title_for_url(title):
    """
    Clean and format movie/series title for URL generation
    """
    # Remove special characters and replace spaces with hyphens
    cleaned = re.sub(r'[^\w\s-]', '', title)
    cleaned = re.sub(r'\s+', '-', cleaned.strip())
    return cleaned.lower()

def generate_streamwithvpn_url(title, year):
    """
    Generate StreamWithVPN URL based on title and year
    Example: "The Wolf's Call" (2019) -> "https://www.streamwithvpn.com/the-wolfs-call-2019"
    """
    clean_title = clean_title_for_url(title)
    # Handle cases where year might be NaN or missing
    if pd.isna(year):
        return f"https://www.streamwithvpn.com/{clean_title}"
    else:
        return f"https://www.streamwithvpn.com/{clean_title}-{int(year)}"

def scrape_movie_data(url, tconst, title, year, endYear, titleType, isAdult, runtime, genres, rating, numVotes):
    """
    Scrape movie/series data from StreamWithVPN
    Returns dictionary with description, cast, and streaming platforms
    """
    try:
        # Add delay to be respectful to the server
        time.sleep(1)
        
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
        except requests.exceptions.HTTPError as e:
            if e.response.status_code in [404, 403]:
                url_without_year = generate_streamwithvpn_url(title, None)
                try:
                    response = requests.get(url_without_year, headers=headers, timeout=10)
                    response.raise_for_status()
                    url = url_without_year
                except requests.exceptions.HTTPError:
                    raise
            else:
                raise
        
        soup = BeautifulSoup(response.content, 'html.parser')
        
        movie_data = {
            'tconst': tconst,
            'titleType': titleType,
            'title': title,
            'year': year,
            'endYear': endYear,
            'isAdult': isAdult,
            'runtime': runtime,
            'genres': genres,
            'rating': rating,
            'numVotes': numVotes,
            'description': None,
            'cast': None,
            'streaming_platforms': None,
            'url': url,
            'scrape_status': 'success'
        }
        
        # Extract description
        description_element = soup.find('span', class_='rt-Text EntryDetailDescription_contentDescription__tXYGO EntryDetailDescription_expanded__3a0Gs')
        
        if not description_element:
            description_element = soup.find('span', class_=re.compile('EntryDetailDescription_contentDescription'))
        if not description_element:
            description_element = soup.find('span', class_=re.compile('contentDescription'))
        if not description_element:
            description_element = soup.select_one('span[class*="EntryDetailDescription_contentDescription"]')
        
        if description_element:
            movie_data['description'] = description_element.get_text(strip=True)
            print(f"✓ Found description for {title}: {movie_data['description'][:100]}...")
        else:
            print(f"✗ No description found for {title}")
        
        """
        # Extract CAST information
        cast_list = []
        # Target container div
        container_div = soup.find('div', class_='rt-Flex rt-r-fd-column rt-r-gap rt-r-px rt-r-pt rt-r-w', style='--gap: 2px; --pl: 16px; --pr: 16px; --pt: 8px; --width: 100%;')
        if container_div:
            cast_spans = container_div.find_all('span', {'data-accent-color': 'gray', 'class_': 'rt-Text rt-r-size-3 rt-r-weight-medium', 'style': 'min-width: 0px; overflow: hidden; text-overflow: ellipsis; white-space: nowrap;'})
            if cast_spans:
                for span in cast_spans:
                    cast_list.append(span.get_text(strip=True))
                movie_data['cast'] = ', '.join(cast_list)
                print(f"✓ Found cast for {title}: {movie_data['cast'][:100]}...")
            else:
                print(f"✗ Span not found for {title}")
        else:
            print(f"✗ Div not found for {title}")

        # Extract STREAMING PLATFORMS information
        platform_elements = soup.find_all('h2', class_='rt-Heading rt-r-size-5 rt-r-weight-medium rt-r-ta-left')
        if platform_elements:
            platforms = [elem.get_text(strip=True) for elem in platform_elements]
            movie_data['streaming_platforms'] = ', '.join(platforms)
        else:
            print(f"✗ No streaming platforms found for {title}")
        """
        
        return movie_data
        
    except requests.RequestException as e:
        return {
            'tconst': tconst,
            'title': title,
            'url': url,
            'description': None,
            'cast': None,
            'streaming_platforms': None,
            'scrape_status': f'request_error: {str(e)}'
        }
    except Exception as e:
        return {
            'tconst': tconst,
            'title': title,
            'url': url,
            'description': None,
            'cast': None,
            'streaming_platforms': None,
            'scrape_status': f'parsing_error: {str(e)}'
        }

# Initialize variables
chunk_size = 500  # Process 500 movies at a time
total_chunks = len(top_10000_df) // chunk_size + 1
start_time = time.time()

# Process in chunks
for chunk_start in tqdm(range(0, len(top_10000_df), chunk_size), desc="Processing chunks"):
    chunk_end = min(chunk_start + chunk_size, len(top_10000_df))
    chunk_df = top_10000_df.iloc[chunk_start:chunk_end]
    chunk_data = []
    
    # Process each movie in the chunk
    for _, row in tqdm(chunk_df.iterrows(), total=len(chunk_df), desc=f"Chunk {chunk_start//chunk_size + 1}/{total_chunks}", leave=False):
        try:
            tconst = row['tconst']
            title = row['primaryTitle']
            year = row['startYear']
            endYear = row['endYear']
            titleType = row['titleType']
            isAdult = row['isAdult']
            runtime = row['runtimeMinutes']
            genres = row['genres']
            rating = row['averageRating']
            numVotes = row['numVotes']
            
            url = generate_streamwithvpn_url(title, year)
            movie_data = scrape_movie_data(url, tconst, title, year, endYear, 
                                         titleType, isAdult, runtime, genres, 
                                         rating, numVotes)
            chunk_data.append(movie_data)
            
        except Exception as e:
            print(f"\nError processing {title}: {str(e)}")
            continue
    
    # Save chunk progress
    chunk_df = pd.DataFrame(chunk_data)
    chunk_df.to_csv(f'data/temp_scrape_chunk_{chunk_start}.csv', sep='\t', index=False)
    
    # Print progress stats
    elapsed = time.time() - start_time
    processed = chunk_end
    remaining = len(top_10000_df) - processed
    rate = processed / elapsed
    eta = remaining / rate if rate > 0 else 0
    
    print(f"\nChunk {chunk_start}-{chunk_end} completed")
    print(f"Processed {processed:,}/{len(top_10000_df):,} movies in {elapsed/3600:.2f} hours")
    print(f"Estimated time remaining: {eta/3600:.2f} hours")
    print(f"Success rate in chunk: {len(chunk_df[chunk_df['scrape_status'] == 'success'])} / {len(chunk_df)}")

# Combine all chunks
print("\nCombining chunks...")
all_chunks = []
for chunk_start in range(0, len(top_10000_df), chunk_size):
    try:
        chunk = pd.read_csv(f'data/temp_scrape_chunk_{chunk_start}.csv', sep='\t')
        all_chunks.append(chunk)
    except Exception as e:
        print(f"Error reading chunk {chunk_start}: {str(e)}")

scraped_df = pd.concat(all_chunks, ignore_index=True)

# Save final results
scraped_df.to_csv('data/top10000_final.tsv', sep='\t', index=False)

# Print final statistics
total_time = time.time() - start_time
print(f"\nProcessing completed in {total_time/3600:.2f} hours")
print(f"Total movies processed: {len(scraped_df):,}")
print(f"Overall success rate: {len(scraped_df[scraped_df['scrape_status'] == 'success']):,} / {len(scraped_df):,}")

# Clean up temporary files
import os
for chunk_start in range(0, len(top_10000_df), chunk_size):
    try:
        os.remove(f'data/temp_scrape_chunk_{chunk_start}.csv')
    except:
        pass



Note: you may need to restart the kernel to use updated packages.


Processing chunks:   5%|▌         | 1/20 [17:18<5:28:53, 1038.60s/it]


Chunk 0-500 completed
Processed 500/10,000 movies in 0.29 hours
Estimated time remaining: 5.48 hours
Success rate in chunk: 447 / 500


Processing chunks:  10%|█         | 2/20 [34:34<5:11:01, 1036.76s/it]


Chunk 500-1000 completed
Processed 1,000/10,000 movies in 0.58 hours
Estimated time remaining: 5.19 hours
Success rate in chunk: 448 / 500


Processing chunks:  15%|█▌        | 3/20 [51:44<4:52:53, 1033.73s/it]


Chunk 1000-1500 completed
Processed 1,500/10,000 movies in 0.86 hours
Estimated time remaining: 4.89 hours
Success rate in chunk: 438 / 500


Processing chunks:  20%|██        | 4/20 [1:08:45<4:34:22, 1028.88s/it]


Chunk 1500-2000 completed
Processed 2,000/10,000 movies in 1.15 hours
Estimated time remaining: 4.58 hours
Success rate in chunk: 427 / 500


Processing chunks:  25%|██▌       | 5/20 [1:26:21<4:19:39, 1038.66s/it]


Chunk 2000-2500 completed
Processed 2,500/10,000 movies in 1.44 hours
Estimated time remaining: 4.32 hours
Success rate in chunk: 449 / 500


Processing chunks:  30%|███       | 6/20 [1:43:15<4:00:21, 1030.09s/it]


Chunk 2500-3000 completed
Processed 3,000/10,000 movies in 1.72 hours
Estimated time remaining: 4.02 hours
Success rate in chunk: 412 / 500


Processing chunks:  35%|███▌      | 7/20 [2:00:20<3:42:50, 1028.52s/it]


Chunk 3000-3500 completed
Processed 3,500/10,000 movies in 2.01 hours
Estimated time remaining: 3.72 hours
Success rate in chunk: 418 / 500


Processing chunks:  40%|████      | 8/20 [2:17:11<3:24:36, 1023.04s/it]


Chunk 3500-4000 completed
Processed 4,000/10,000 movies in 2.29 hours
Estimated time remaining: 3.43 hours
Success rate in chunk: 410 / 500


Processing chunks:  45%|████▌     | 9/20 [2:34:08<3:07:11, 1021.02s/it]


Chunk 4000-4500 completed
Processed 4,500/10,000 movies in 2.57 hours
Estimated time remaining: 3.14 hours
Success rate in chunk: 408 / 500


Processing chunks:  50%|█████     | 10/20 [2:50:59<2:49:39, 1017.97s/it]


Chunk 4500-5000 completed
Processed 5,000/10,000 movies in 2.85 hours
Estimated time remaining: 2.85 hours
Success rate in chunk: 407 / 500


Processing chunks:  55%|█████▌    | 11/20 [3:07:34<2:31:37, 1010.86s/it]


Chunk 5000-5500 completed
Processed 5,500/10,000 movies in 3.13 hours
Estimated time remaining: 2.56 hours
Success rate in chunk: 393 / 500


Processing chunks:  60%|██████    | 12/20 [3:24:20<2:14:36, 1009.59s/it]


Chunk 5500-6000 completed
Processed 6,000/10,000 movies in 3.41 hours
Estimated time remaining: 2.27 hours
Success rate in chunk: 390 / 500


Processing chunks:  65%|██████▌   | 13/20 [3:41:09<1:57:45, 1009.43s/it]


Chunk 6000-6500 completed
Processed 6,500/10,000 movies in 3.69 hours
Estimated time remaining: 1.98 hours
Success rate in chunk: 387 / 500


Processing chunks:  70%|███████   | 14/20 [3:57:39<1:40:19, 1003.33s/it]


Chunk 6500-7000 completed
Processed 7,000/10,000 movies in 3.96 hours
Estimated time remaining: 1.70 hours
Success rate in chunk: 378 / 500


Processing chunks:  75%|███████▌  | 15/20 [4:14:12<1:23:22, 1000.46s/it]


Chunk 7000-7500 completed
Processed 7,500/10,000 movies in 4.24 hours
Estimated time remaining: 1.41 hours
Success rate in chunk: 379 / 500


Processing chunks:  80%|████████  | 16/20 [4:30:48<1:06:35, 998.88s/it] 


Chunk 7500-8000 completed
Processed 8,000/10,000 movies in 4.51 hours
Estimated time remaining: 1.13 hours
Success rate in chunk: 384 / 500


Processing chunks:  85%|████████▌ | 17/20 [4:47:15<49:45, 995.33s/it]  


Chunk 8000-8500 completed
Processed 8,500/10,000 movies in 4.79 hours
Estimated time remaining: 0.84 hours
Success rate in chunk: 363 / 500


Processing chunks:  90%|█████████ | 18/20 [5:03:36<33:02, 991.08s/it]


Chunk 8500-9000 completed
Processed 9,000/10,000 movies in 5.06 hours
Estimated time remaining: 0.56 hours
Success rate in chunk: 346 / 500


Processing chunks:  95%|█████████▌| 19/20 [5:19:53<16:26, 986.78s/it]


Chunk 9000-9500 completed
Processed 9,500/10,000 movies in 5.33 hours
Estimated time remaining: 0.28 hours
Success rate in chunk: 352 / 500


Processing chunks: 100%|██████████| 20/20 [5:36:08<00:00, 1008.43s/it]



Chunk 9500-10000 completed
Processed 10,000/10,000 movies in 5.60 hours
Estimated time remaining: 0.00 hours
Success rate in chunk: 352 / 500

Combining chunks...

Processing completed in 5.60 hours
Total movies processed: 10,000
Overall success rate: 7,988 / 10,000


In [4]:
# Load the previously scraped data
scraped_df = pd.read_csv('data/top10000_final.tsv', sep='\t')

# Identify failed scraping attempts
failed_scrapes = scraped_df[scraped_df['scrape_status'] != 'success']
print(f"\nFound {len(failed_scrapes)} failed scraping attempts")

# Initialize variables for retry
chunk_size = 500
total_chunks = len(failed_scrapes) // chunk_size + 1
start_time = time.time()
retry_data = []

# Process failed scrapes in chunks
for chunk_start in tqdm(range(0, len(failed_scrapes), chunk_size), desc="Retrying failed scrapes"):
    chunk_end = min(chunk_start + chunk_size, len(failed_scrapes))
    chunk_df = failed_scrapes.iloc[chunk_start:chunk_end]
    chunk_data = []
    
    # Process each failed movie in the chunk
    for _, row in tqdm(chunk_df.iterrows(), total=len(chunk_df), desc=f"Retry Chunk {chunk_start//chunk_size + 1}/{total_chunks}", leave=False):
        try:
            tconst = row['tconst']
            title = row['title']
            year = row['year']
            endYear = row['endYear']
            titleType = row['titleType']
            isAdult = row['isAdult']
            runtime = row['runtime']
            genres = row['genres']
            rating = row['rating']
            numVotes = row['numVotes']
            
            # Try scraping again with increased delay
            time.sleep(2)  # Increased delay for retry attempts
            url = generate_streamwithvpn_url(title, year)
            movie_data = scrape_movie_data(url, tconst, title, year, endYear, 
                                         titleType, isAdult, runtime, genres, 
                                         rating, numVotes)
            chunk_data.append(movie_data)
            
        except Exception as e:
            print(f"\nError processing {title}: {str(e)}")
            continue
    
    # Save retry progress
    retry_chunk_df = pd.DataFrame(chunk_data)
    retry_data.extend(chunk_data)
    
    # Print progress stats
    elapsed = time.time() - start_time
    processed = chunk_end
    remaining = len(failed_scrapes) - processed
    rate = processed / elapsed
    eta = remaining / rate if rate > 0 else 0
    
    print(f"\nRetry Chunk {chunk_start}-{chunk_end} completed")
    print(f"Processed {processed:,}/{len(failed_scrapes):,} movies in {elapsed/3600:.2f} hours")
    print(f"Estimated time remaining: {eta/3600:.2f} hours")
    print(f"Success rate in retry chunk: {len(retry_chunk_df[retry_chunk_df['scrape_status'] == 'success'])} / {len(retry_chunk_df)}")

# Convert retry results to DataFrame
retry_df = pd.DataFrame(retry_data)

# Update original DataFrame with successful retries
successful_retries = retry_df[retry_df['scrape_status'] == 'success']
for _, retry_row in successful_retries.iterrows():
    scraped_df.loc[scraped_df['tconst'] == retry_row['tconst']] = retry_row

# Save updated results
scraped_df.to_csv('data/top10000_final_with_retries.tsv', sep='\t', index=False)

# Print final statistics
total_time = time.time() - start_time
print(f"\nRetry processing completed in {total_time/3600:.2f} hours")
print(f"Initial failed scrapes: {len(failed_scrapes):,}")
print(f"Successful retries: {len(successful_retries):,}")
print(f"Final success rate: {len(scraped_df[scraped_df['scrape_status'] == 'success']):,} / {len(scraped_df):,}")


Found 2012 failed scraping attempts


Retrying failed scrapes:  20%|██        | 1/5 [31:14<2:04:57, 1874.31s/it]


Retry Chunk 0-500 completed
Processed 500/2,012 movies in 0.52 hours
Estimated time remaining: 1.57 hours
Success rate in retry chunk: 0 / 500


Retrying failed scrapes:  40%|████      | 2/5 [1:02:29<1:33:43, 1874.54s/it]


Retry Chunk 500-1000 completed
Processed 1,000/2,012 movies in 1.04 hours
Estimated time remaining: 1.05 hours
Success rate in retry chunk: 0 / 500


Retrying failed scrapes:  60%|██████    | 3/5 [1:33:46<1:02:31, 1875.78s/it]


Retry Chunk 1000-1500 completed
Processed 1,500/2,012 movies in 1.56 hours
Estimated time remaining: 0.53 hours
Success rate in retry chunk: 0 / 500


Retrying failed scrapes:  80%|████████  | 4/5 [2:05:03<31:16, 1876.44s/it]  


Retry Chunk 1500-2000 completed
Processed 2,000/2,012 movies in 2.08 hours
Estimated time remaining: 0.01 hours
Success rate in retry chunk: 0 / 500


Retrying failed scrapes: 100%|██████████| 5/5 [2:05:49<00:00, 1509.84s/it]


Retry Chunk 2000-2012 completed
Processed 2,012/2,012 movies in 2.10 hours
Estimated time remaining: 0.00 hours
Success rate in retry chunk: 0 / 12

Retry processing completed in 2.10 hours
Initial failed scrapes: 2,012
Successful retries: 0
Final success rate: 7,988 / 10,000





## Unify Description and Cast data in 1 file

In [6]:
import numpy as np
import pandas as pd

top10000_cast = pd.read_csv('data/cast_data/top_10000_with_cast.tsv', sep='\t')
top10000_description = pd.read_csv('data/top10000_final.tsv', sep='\t')

# Merge cast and reviews data on the 'tconst' column by getting only the top10000_cast columns and adding the description from top10000_description
merged_df = top10000_cast.merge(top10000_description[['tconst', 'description']], on='tconst', how='inner')
# Save the merged dataframe to a new TSV file
merged_df.to_csv('data/top_10000_with_cast_and_description.tsv', sep='\t', index=False)



In [None]:
top10000 = pd.read_csv('data/top_10000_with_cast_and_description.tsv', sep='\t')

# Try to read the reviews file with error handling
try:
    reviews = pd.read_csv('data/reviews_data/filtered_reviews2.csv', sep='\t')
    print("Reviews loaded successfully")
except pd.errors.ParserError as e:
    print(f"Parser error: {e}")
    # Try with different options to handle malformed data
    try:
        reviews = pd.read_csv('data/reviews_data/filtered_reviews2.csv', 
                            sep='\t', 
                            on_bad_lines='skip',  # Skip bad lines
                            engine='python')     # Use Python engine for better error handling
        print("Reviews loaded with some lines skipped due to parsing errors")
    except Exception as e2:
        print(f"Still failed: {e2}")
        # Try with comma separator instead
        try:
            reviews = pd.read_csv('data/reviews_data/filtered_reviews2.csv', 
                                sep=',',  # Try comma separator
                                on_bad_lines='skip')
            print("Reviews loaded using comma separator")
        except Exception as e3:
            print(f"All attempts failed: {e3}")
            reviews = None

# Get different values of isAdult
#print(top10000['isAdult'].value_counts())
# Remove isAdult column
#top10000 = top10000.drop(columns=['isAdult'])
#top10000.to_csv('data/top_10000_with_cast_and_description.tsv', sep='\t', index=False)

#print((top10000[top10000['primaryTitle'] != top10000['originalTitle']]).shape[0])

# Total null descriptions
total_nulls = top10000[top10000['description'].isna()].shape[0]
print(f"Total null descriptions: {total_nulls}")

# Analyze null descriptions in chunks of 500
chunk_size = 500
total_entries = len(top10000)

print("\nNull descriptions by 500-entry chunks:")
print("-" * 40)

for chunk_start in range(0, total_entries, chunk_size):
    chunk_end = min(chunk_start + chunk_size, total_entries)
    chunk_data = top10000.iloc[chunk_start:chunk_end]
    
    null_count = chunk_data[chunk_data['description'].isna()].shape[0]
    chunk_total = len(chunk_data)
    percentage = (null_count / chunk_total) * 100 if chunk_total > 0 else 0
    
    print(f"Entries {chunk_start+1:4d}-{chunk_end:4d}: {null_count:3d} nulls out of {chunk_total:3d} ({percentage:5.1f}%)")

print("-" * 40)
print(f"Overall: {total_nulls} nulls out of {total_entries} ({(total_nulls/total_entries)*100:.1f}%)")

'''
# Only show reviews if they loaded successfully
if reviews is not None:
    print(f"\nReviews data shape: {reviews.shape}")
    print(reviews[reviews['tconst']=='tt0903747'])
else:
    print("\nCould not load reviews data")

# Remove lines with no description
top10000_cleaned = top10000.dropna(subset=['description'])
print(f"\nAfter removing null descriptions, new shape: {top10000_cleaned.shape}")
top10000_cleaned.to_csv('data/FinalData.tsv', sep='\t', index=False)
'''

Parser error: Error tokenizing data. C error: Expected 1 fields in line 5967, saw 2

Reviews loaded with some lines skipped due to parsing errors
Total null descriptions: 2024

Null descriptions by 500-entry chunks:
----------------------------------------
Entries    1- 500:  54 nulls out of 500 ( 10.8%)
Entries  501-1000:  52 nulls out of 500 ( 10.4%)
Entries 1001-1500:  62 nulls out of 500 ( 12.4%)
Entries 1501-2000:  75 nulls out of 500 ( 15.0%)
Entries 2001-2500:  51 nulls out of 500 ( 10.2%)
Entries 2501-3000:  88 nulls out of 500 ( 17.6%)
Entries 3001-3500:  82 nulls out of 500 ( 16.4%)
Entries 3501-4000:  90 nulls out of 500 ( 18.0%)
Entries 4001-4500:  92 nulls out of 500 ( 18.4%)
Entries 4501-5000:  94 nulls out of 500 ( 18.8%)
Entries 5001-5500: 108 nulls out of 500 ( 21.6%)
Entries 5501-6000: 111 nulls out of 500 ( 22.2%)
Entries 6001-6500: 113 nulls out of 500 ( 22.6%)
Entries 6501-7000: 122 nulls out of 500 ( 24.4%)
Entries 7001-7500: 122 nulls out of 500 ( 24.4%)
Entries 