# REVIEWS

In [None]:
from fuzzywuzzy import fuzz
import pandas as pd
from tqdm import tqdm
import time
import numpy as np

# Locations of the raw reviews export and of the trimmed copy written below
RAW_REVIEWS_CSV = 'data/reviews_data/movies_reviews2.csv'
FILTERED_REVIEWS_CSV = 'data/reviews_data/filtered_reviews2.csv'

# Columns carried over into the trimmed copy
KEPT_COLUMNS = ['title', 'quote', 'author']

# Matches a parenthesised group together with any whitespace in front of it
PARENTHESISED_SUFFIX = r'\s*\([^)]*\)'

# Load the comma-separated export, ignoring spaces that follow a delimiter
df = pd.read_csv(RAW_REVIEWS_CSV, sep=',', skipinitialspace=True)

# Trim stray commas from both ends of every column name
df = df.rename(columns=lambda name: name.strip(','))

# Drop parenthesised content from the movie titles
df = df.assign(title=df['title'].str.replace(PARENTHESISED_SUFFIX, '', regex=True))

# Keep only the columns of interest and persist them without the index
filtered_df = df.loc[:, KEPT_COLUMNS]
filtered_df.to_csv(FILTERED_REVIEWS_CSV, index=False)

# Show a small sample so the result can be checked by eye
print("\nFirst few rows of filtered data:")
print(filtered_df.head())

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [None]:
# Movie list (tab-delimited); its 'primaryTitle' and 'tconst' columns drive the matching below
movies_df = pd.read_csv(
    'data/top_10000_weighted_ratings.tsv',
    delimiter='\t',
)

# Filtered reviews CSV (default comma delimiter); titles are read from its 'title' column
reviews_df = pd.read_csv(
    'data/reviews_data/filtered_reviews2.csv',
)

# Function to find best matching movie title with error handling
def find_best_match(title, movie_titles_dict, threshold=80):
    """Return the ID of the movie whose title is the closest fuzzy match to ``title``.

    Titles are compared case-insensitively with ``fuzz.ratio``. A candidate is
    accepted only when its score is strictly greater than ``threshold``; among
    accepted candidates the highest score wins, and on a tie the candidate that
    comes first in ``movie_titles_dict`` is kept.

    Args:
        title: Review title to look up. NaN, None and any other non-string
            value yield ``None``.
        movie_titles_dict: Mapping of movie title -> movie ID. Keys that are
            not strings (e.g. NaN from a missing title) are skipped instead of
            aborting the whole lookup.
        threshold: Score a candidate must exceed to count as a match.

    Returns:
        The ID mapped to the best-scoring title, or ``None`` when no candidate
        scores above ``threshold`` or an unexpected error occurs while matching
        (the error is printed, not raised).
    """
    # NaN/None are not str instances, so this single check covers missing values too
    if not isinstance(title, str):
        return None

    try:
        # Lower-case the query once; it does not change between candidates
        needle = title.lower()

        best_match = None
        best_score = 0

        for movie_title, movie_id in movie_titles_dict.items():
            # A non-string key would raise on .lower() and discard every match
            # found so far; skip it and keep scanning
            if not isinstance(movie_title, str):
                continue

            candidate = movie_title.lower()

            # Identical (non-empty) strings get the maximum score of 100, so the
            # scorer call can be skipped for them
            if needle and candidate == needle:
                score = 100
            else:
                score = fuzz.ratio(needle, candidate)

            if score > threshold and score > best_score:
                best_score = score
                best_match = movie_id
                # 100 is the highest possible score and ties keep the earlier
                # entry, so nothing later can replace this match
                if best_score >= 100:
                    break

        return best_match
    except Exception as e:
        # Best-effort: report the failure and treat the title as unmatched
        print(f"Error processing title: {title}")
        print(f"Error details: {str(e)}")
        return None

# Create dictionary of movie titles to tconst
# NOTE(review): if 'primaryTitle' contains duplicates, the last row's tconst wins — confirm this is intended
title_to_tconst = dict(zip(movies_df['primaryTitle'], movies_df['tconst']))

# Many reviews share the same title, and every lookup scans the whole movie
# list, so remember the result per distinct title instead of re-scanning per row
match_cache = {}


def _match_with_cache(title):
    """Return find_best_match(title, title_to_tconst), computed once per distinct string title."""
    if not isinstance(title, str):
        # NaN / non-string titles are passed straight through and not cached
        # (NaN is unreliable as a dict key)
        return find_best_match(title, title_to_tconst)
    if title not in match_cache:
        match_cache[title] = find_best_match(title, title_to_tconst)
    return match_cache[title]


# Start time measurement
start_time = time.time()

# Process in chunks to allow saving progress
chunk_size = 10000
total_reviews = len(reviews_df)
# Ceiling division: an exact multiple of chunk_size must not count an extra, empty chunk
total_chunks = -(-total_reviews // chunk_size)

for chunk_start in tqdm(range(0, total_reviews, chunk_size), desc="Processing chunks"):
    chunk_end = min(chunk_start + chunk_size, total_reviews)
    chunk = reviews_df.iloc[chunk_start:chunk_end].copy()

    # Attach the matched movie ID (None when no title scores above the threshold)
    chunk['tconst'] = chunk['title'].apply(_match_with_cache)

    # Save progress to temporary file
    chunk.to_csv(f'data/temp_chunk_{chunk_start}.csv', index=False)

    # Print progress on every 10th chunk
    if chunk_start % (chunk_size * 10) == 0:
        elapsed = time.time() - start_time
        print(f"\nProcessed {chunk_end:,} / {total_reviews:,} reviews in {elapsed:.2f} seconds")

# Combine all chunks by reading back the temporary files written above
all_chunks = [
    pd.read_csv(f'data/temp_chunk_{chunk_start}.csv')
    for chunk_start in range(0, total_reviews, chunk_size)
]

if all_chunks:
    reviews_df = pd.concat(all_chunks, ignore_index=True)
else:
    # No reviews at all: pd.concat([]) would raise, so just add an empty 'tconst' column
    reviews_df = reviews_df.assign(tconst=None)

# Remove rows where no match was found
reviews_df = reviews_df.dropna(subset=['tconst'])

# Save the final filtered data
reviews_df.to_csv('data/filtered_reviews2.csv', index=False)

# Print final statistics
elapsed_time = time.time() - start_time
print(f"\nProcessing completed in {elapsed_time/3600:.2f} hours")
print(f"Number of reviews after fuzzy matching: {len(reviews_df):,}")
# Count distinct matched movie IDs; several review-title spellings can map to the same movie
print(f"Number of unique movies in filtered reviews: {reviews_df['tconst'].nunique():,}")

Processing chunks:   1%|▏         | 1/67 [02:55<3:12:55, 175.38s/it]


Processed 10,000 / 667,536 reviews in 175.38 seconds


Processing chunks:  16%|█▋        | 11/67 [32:48<2:47:36, 179.59s/it]


Processed 110,000 / 667,536 reviews in 1968.97 seconds


Processing chunks:  31%|███▏      | 21/67 [1:02:33<2:16:42, 178.31s/it]


Processed 210,000 / 667,536 reviews in 3753.94 seconds


Processing chunks:  46%|████▋     | 31/67 [1:32:32<1:48:09, 180.26s/it]


Processed 310,000 / 667,536 reviews in 5552.18 seconds


Processing chunks:  61%|██████    | 41/67 [2:02:55<1:18:16, 180.62s/it]


Processed 410,000 / 667,536 reviews in 7375.83 seconds


Processing chunks:  76%|███████▌  | 51/67 [2:32:34<47:22, 177.68s/it]  


Processed 510,000 / 667,536 reviews in 9154.18 seconds


Processing chunks:  91%|█████████ | 61/67 [3:01:58<17:33, 175.55s/it]


Processed 610,000 / 667,536 reviews in 10918.34 seconds


Processing chunks: 100%|██████████| 67/67 [3:18:53<00:00, 178.11s/it]



Processing completed in 3.32 hours
Number of reviews after fuzzy matching: 358,152
Number of unique movies in filtered reviews: 5,532
