# REVIEWS

## Movies

In [1]:
from fuzzywuzzy import fuzz
import pandas as pd
from tqdm import tqdm
import time
import numpy as np

# Read the CSV file with the correct separator
df = pd.read_csv('data/reviews_data/movies_reviews2.csv', sep=',', skipinitialspace=True)

# Clean up column names by removing any leading/trailing commas
df.columns = df.columns.str.strip(',')

# Clean movie titles by removing content within parentheses
df['title'] = df['title'].str.replace(r'\s*\([^)]*\)', '', regex=True)

# Select only the desired columns
filtered_df = df[[
    'title',
    'quote',
    'author'
]]

# Save the filtered data to a new CSV
filtered_df.to_csv('data/reviews_data/filtered_reviews2.csv', index=False)

# Print the first few rows to verify the changes
print("\nFirst few rows of filtered data:")
print(filtered_df.head())

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [None]:
# Read the TSV file
movies_df = pd.read_csv('data/top_10000_weighted_ratings.tsv', sep='\t')

# Read the filtered reviews CSV
reviews_df = pd.read_csv('data/reviews_data/filtered_reviews2.csv')

# Function to find best matching movie title with error handling
def find_best_match(title, movie_titles_dict, threshold=80):
    try:
        # Handle NaN or non-string values
        if pd.isna(title) or not isinstance(title, str):
            return None
            
        best_match = None
        best_score = 0
        
        for movie_title, movie_id in movie_titles_dict.items():
            score = fuzz.ratio(title.lower(), movie_title.lower())
            if score > threshold and score > best_score:
                best_score = score
                best_match = movie_id
        
        return best_match
    except Exception as e:
        print(f"Error processing title: {title}")
        print(f"Error details: {str(e)}")
        return None

# Create dictionary of movie titles to tconst
title_to_tconst = dict(zip(movies_df['primaryTitle'], movies_df['tconst']))

# Start time measurement
start_time = time.time()

# Process in chunks to allow saving progress
chunk_size = 10000
total_chunks = len(reviews_df) // chunk_size + 1

for chunk_start in tqdm(range(0, len(reviews_df), chunk_size), desc="Processing chunks"):
    chunk_end = min(chunk_start + chunk_size, len(reviews_df))
    chunk = reviews_df.iloc[chunk_start:chunk_end].copy()
    
    # Process chunk
    chunk['tconst'] = chunk['title'].apply(lambda x: find_best_match(x, title_to_tconst))
    
    # Save progress to temporary file
    chunk.to_csv(f'data/temp_chunk_{chunk_start}.csv', index=False)
    
    # Print progress
    if chunk_start % (chunk_size * 10) == 0:
        elapsed = time.time() - start_time
        print(f"\nProcessed {chunk_end:,} / {len(reviews_df):,} reviews in {elapsed:.2f} seconds")

# Combine all chunks
all_chunks = []
for chunk_start in range(0, len(reviews_df), chunk_size):
    chunk = pd.read_csv(f'data/temp_chunk_{chunk_start}.csv')
    all_chunks.append(chunk)

reviews_df = pd.concat(all_chunks, ignore_index=True)

# Remove rows where no match was found
reviews_df = reviews_df.dropna(subset=['tconst'])

# Save the final filtered data
reviews_df.to_csv('data/filtered_reviews2.csv', index=False)

# Print final statistics
elapsed_time = time.time() - start_time
print(f"\nProcessing completed in {elapsed_time/3600:.2f} hours")
print(f"Number of reviews after fuzzy matching: {len(reviews_df):,}")
print(f"Number of unique movies in filtered reviews: {reviews_df['title'].nunique():,}")

Processing chunks:   1%|▏         | 1/67 [02:55<3:12:55, 175.38s/it]


Processed 10,000 / 667,536 reviews in 175.38 seconds


Processing chunks:  16%|█▋        | 11/67 [32:48<2:47:36, 179.59s/it]


Processed 110,000 / 667,536 reviews in 1968.97 seconds


Processing chunks:  31%|███▏      | 21/67 [1:02:33<2:16:42, 178.31s/it]


Processed 210,000 / 667,536 reviews in 3753.94 seconds


Processing chunks:  46%|████▋     | 31/67 [1:32:32<1:48:09, 180.26s/it]


Processed 310,000 / 667,536 reviews in 5552.18 seconds


Processing chunks:  61%|██████    | 41/67 [2:02:55<1:18:16, 180.62s/it]


Processed 410,000 / 667,536 reviews in 7375.83 seconds


Processing chunks:  76%|███████▌  | 51/67 [2:32:34<47:22, 177.68s/it]  


Processed 510,000 / 667,536 reviews in 9154.18 seconds


Processing chunks:  91%|█████████ | 61/67 [3:01:58<17:33, 175.55s/it]


Processed 610,000 / 667,536 reviews in 10918.34 seconds


Processing chunks: 100%|██████████| 67/67 [3:18:53<00:00, 178.11s/it]



Processing completed in 3.32 hours
Number of reviews after fuzzy matching: 358,152
Number of unique movies in filtered reviews: 5,532


## Series

In [4]:
import pandas as pd

# Read the CSV file
df = pd.read_csv('data/reviews_data/series_reviews1.csv')

# Keep only needed columns
df = df[['title', 'review_content', 'author']]

# Clean the title by removing season number and any text in parentheses 
def clean_title(title):
    try:
        if pd.isna(title) or not isinstance(title, str):
            return None
        # Remove season part
        title = title.split(':')[0]
        # Remove text in parentheses
        title = title.split('(')[0]
        # Remove extra whitespace and convert to title case
        title = ' '.join(title.split()).strip().title()
        return title if title else None
    except:
        return None

# Clean review content
def clean_review(review):
    try:
        if pd.isna(review) or not isinstance(review, str):
            return None
        # Remove extra whitespace and newlines
        review = ' '.join(review.split())
        # Remove empty reviews
        return review if len(review.strip()) > 0 else None
    except:
        return None

# Clean author name
def clean_author(author):
    try:
        if pd.isna(author) or not isinstance(author, str):
            return None
        # Remove extra whitespace and convert to title case
        author = ' '.join(author.split()).strip().title()
        return author if author else None
    except:
        return None

# Apply cleaning functions
df['title'] = df['title'].apply(clean_title)
df['review_content'] = df['review_content'].apply(clean_review)
df['author'] = df['author'].apply(clean_author)

# Remove rows with any null values
df = df.dropna()

# Remove duplicates
df = df.drop_duplicates()

# Reset index after cleaning
df = df.reset_index(drop=True)

# Save cleaned dataset
df.to_csv('data/reviews_data/cleaned_series_reviews.csv', index=False)

# Display statistics
print("\nCleaning Statistics:")
print(f"Total rows after cleaning: {len(df)}")
print(f"Unique series titles: {df['title'].nunique()}")
print(f"Unique authors: {df['author'].nunique()}")
print("\nSample of cleaned data:")
print(df.head())
print("\nDataset Info:")
print(df.info())


Cleaning Statistics:
Total rows after cleaning: 52675
Unique series titles: 2519
Unique authors: 1500

Sample of cleaned data:
     title                                     review_content  \
0  Rectify  It allows us to know and care for these charac...   
1  Rectify  Rectify, a drama entering its final season on ...   
2  Rectify  No other series so poignantly probes the human...   
3  Rectify  None of these characters is particularly happy...   
4  Rectify  Rectify is the best series I have ever seen on...   

              author  
0      Allison Keene  
1   James Poniewozik  
2  Melanie Mcfarland  
3         Ken Tucker  
4      Malcolm Jones  

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52675 entries, 0 to 52674
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   title           52675 non-null  object
 1   review_content  52675 non-null  object
 2   author          52675 non-null  object


In [None]:
import pandas as pd
from fuzzywuzzy import fuzz
import numpy as np
from tqdm import tqdm
import time
import os

# Read the datasets
series_reviews = pd.read_csv('data/reviews_data/cleaned_series_reviews.csv')
final_data = pd.read_csv('data/FinalData.tsv', sep='\t')

# Filter only TV series from final_data
tv_series = final_data[final_data['titleType'] == 'tvSeries']

def find_best_match(title, series_dict, threshold=80):
    try:
        if pd.isna(title) or not isinstance(title, str):
            return None
            
        best_match = None
        best_score = 0
        
        for series_title, series_id in series_dict.items():
            score = fuzz.ratio(title.lower(), series_title.lower())
            if score > threshold and score > best_score:
                best_score = score
                best_match = series_id
        
        return best_match
    except Exception as e:
        print(f"Error processing title: {title}")
        print(f"Error details: {str(e)}")
        return None

# Create dictionary of series titles to tconst
title_to_tconst = dict(zip(tv_series['primaryTitle'], tv_series['tconst']))

# Start time measurement
start_time = time.time()

# Process in chunks
chunk_size = 1000  # Reduced chunk size for better tracking
total_chunks = len(series_reviews) // chunk_size + 1

# Dictionary to store chunk processing status
chunk_status = {}

# First pass: Process chunks with verification
for chunk_start in tqdm(range(0, len(series_reviews), chunk_size), desc="Processing chunks"):
    chunk_end = min(chunk_start + chunk_size, len(series_reviews))
    chunk = series_reviews.iloc[chunk_start:chunk_end].copy()
    
    try:
        # Process chunk
        chunk['tconst'] = chunk['title'].apply(lambda x: find_best_match(x, title_to_tconst))
        
        # Verify chunk has expected number of rows
        if len(chunk) == (chunk_end - chunk_start):
            chunk.to_csv(f'data/temp_series_chunk_{chunk_start}.csv', index=False)
            chunk_status[chunk_start] = 'success'
        else:
            chunk_status[chunk_start] = 'size_mismatch'
    except Exception as e:
        print(f"\nError processing chunk {chunk_start}-{chunk_end}: {str(e)}")
        chunk_status[chunk_start] = 'error'
    
    # Print progress
    if chunk_start % (chunk_size * 2) == 0:
        elapsed = time.time() - start_time
        print(f"\nProcessed {chunk_end:,} / {len(series_reviews):,} reviews in {elapsed:.2f} seconds")

# Verify and reprocess failed chunks
failed_chunks = [start for start, status in chunk_status.items() if status != 'success']
if failed_chunks:
    print(f"\nReprocessing {len(failed_chunks)} failed chunks...")
    for chunk_start in tqdm(failed_chunks):
        chunk_end = min(chunk_start + chunk_size, len(series_reviews))
        chunk = series_reviews.iloc[chunk_start:chunk_end].copy()
        
        try:
            chunk['tconst'] = chunk['title'].apply(lambda x: find_best_match(x, title_to_tconst))
            chunk.to_csv(f'data/temp_series_chunk_{chunk_start}.csv', index=False)
            chunk_status[chunk_start] = 'success'
        except Exception as e:
            print(f"\nFailed to reprocess chunk {chunk_start}-{chunk_end}: {str(e)}")

# Combine all successful chunks
print("\nCombining successful chunks...")
all_chunks = []
for chunk_start in range(0, len(series_reviews), chunk_size):
    if chunk_status.get(chunk_start) == 'success':
        try:
            chunk = pd.read_csv(f'data/temp_series_chunk_{chunk_start}.csv')
            all_chunks.append(chunk)
        except Exception as e:
            print(f"Error reading chunk {chunk_start}: {str(e)}")

# Combine and verify
series_reviews = pd.concat(all_chunks, ignore_index=True)

# Verify data integrity
original_count = len(series_reviews)
processed_count = sum(len(chunk) for chunk in all_chunks)
print(f"\nData integrity check:")
print(f"Original records: {original_count}")
print(f"Processed records: {processed_count}")

# Remove rows where no match was found
series_reviews = series_reviews.dropna(subset=['tconst'])

# Save the final filtered data
series_reviews.to_csv('data/reviews_data/filtered_reviews1.csv', index=False)

# Print final statistics
elapsed_time = time.time() - start_time
print(f"\nProcessing completed in {elapsed_time/3600:.2f} hours")
print(f"Number of reviews after fuzzy matching: {len(series_reviews):,}")
print(f"Number of unique series in filtered reviews: {series_reviews['title'].nunique():,}")
print(f"Success rate: {(len(series_reviews)/original_count)*100:.2f}%")

# Display sample of matched reviews
print("\nSample of matched reviews:")
matched_reviews_df = series_reviews.merge(
    tv_series[['tconst', 'primaryTitle']], 
    on='tconst', 
    how='left'
)
print(matched_reviews_df[['title', 'primaryTitle', 'tconst']].head())

# Clean up temporary files
print("\nCleaning up temporary files...")
for chunk_start in range(0, len(series_reviews), chunk_size):
    try:
        os.remove(f'data/temp_series_chunk_{chunk_start}.csv')
    except:
        pass

Processing chunks:   2%|▏         | 1/53 [00:05<04:20,  5.01s/it]


Processed 1,000 / 52,675 reviews in 5.01 seconds


Processing chunks:   6%|▌         | 3/53 [00:14<03:52,  4.65s/it]


Processed 3,000 / 52,675 reviews in 14.14 seconds


Processing chunks:   9%|▉         | 5/53 [00:22<03:35,  4.48s/it]


Processed 5,000 / 52,675 reviews in 22.89 seconds


Processing chunks:  13%|█▎        | 7/53 [00:31<03:23,  4.43s/it]


Processed 7,000 / 52,675 reviews in 31.64 seconds


Processing chunks:  17%|█▋        | 9/53 [00:40<03:15,  4.45s/it]


Processed 9,000 / 52,675 reviews in 40.57 seconds


Processing chunks:  21%|██        | 11/53 [00:49<03:07,  4.46s/it]


Processed 11,000 / 52,675 reviews in 49.54 seconds


Processing chunks:  25%|██▍       | 13/53 [00:58<02:56,  4.42s/it]


Processed 13,000 / 52,675 reviews in 58.29 seconds


Processing chunks:  28%|██▊       | 15/53 [01:07<02:49,  4.47s/it]


Processed 15,000 / 52,675 reviews in 67.27 seconds


Processing chunks:  32%|███▏      | 17/53 [01:16<02:39,  4.43s/it]


Processed 17,000 / 52,675 reviews in 76.08 seconds


Processing chunks:  36%|███▌      | 19/53 [01:25<02:34,  4.55s/it]


Processed 19,000 / 52,675 reviews in 85.41 seconds


Processing chunks:  40%|███▉      | 21/53 [01:34<02:27,  4.61s/it]


Processed 21,000 / 52,675 reviews in 94.76 seconds


Processing chunks:  43%|████▎     | 23/53 [01:43<02:15,  4.52s/it]


Processed 23,000 / 52,675 reviews in 103.66 seconds


Processing chunks:  47%|████▋     | 25/53 [01:52<02:05,  4.47s/it]


Processed 25,000 / 52,675 reviews in 112.50 seconds


Processing chunks:  51%|█████     | 27/53 [02:01<01:56,  4.48s/it]


Processed 27,000 / 52,675 reviews in 121.52 seconds


Processing chunks:  55%|█████▍    | 29/53 [02:10<01:47,  4.46s/it]


Processed 29,000 / 52,675 reviews in 130.41 seconds


Processing chunks:  58%|█████▊    | 31/53 [02:19<01:37,  4.43s/it]


Processed 31,000 / 52,675 reviews in 139.21 seconds


Processing chunks:  62%|██████▏   | 33/53 [02:27<01:27,  4.39s/it]


Processed 33,000 / 52,675 reviews in 147.92 seconds


Processing chunks:  66%|██████▌   | 35/53 [02:36<01:19,  4.44s/it]


Processed 35,000 / 52,675 reviews in 156.87 seconds


Processing chunks:  70%|██████▉   | 37/53 [02:45<01:11,  4.47s/it]


Processed 37,000 / 52,675 reviews in 165.90 seconds


Processing chunks:  74%|███████▎  | 39/53 [02:54<01:02,  4.46s/it]


Processed 39,000 / 52,675 reviews in 174.80 seconds


Processing chunks:  77%|███████▋  | 41/53 [03:04<00:56,  4.68s/it]


Processed 41,000 / 52,675 reviews in 184.45 seconds


Processing chunks:  81%|████████  | 43/53 [03:14<00:47,  4.75s/it]


Processed 43,000 / 52,675 reviews in 194.15 seconds


Processing chunks:  85%|████████▍ | 45/53 [03:23<00:36,  4.60s/it]


Processed 45,000 / 52,675 reviews in 203.10 seconds


Processing chunks:  89%|████████▊ | 47/53 [03:31<00:27,  4.51s/it]


Processed 47,000 / 52,675 reviews in 211.97 seconds


Processing chunks:  92%|█████████▏| 49/53 [03:41<00:18,  4.57s/it]


Processed 49,000 / 52,675 reviews in 221.20 seconds


Processing chunks:  96%|█████████▌| 51/53 [03:50<00:08,  4.48s/it]


Processed 51,000 / 52,675 reviews in 230.01 seconds


Processing chunks: 100%|██████████| 53/53 [03:57<00:00,  4.48s/it]


Processed 52,675 / 52,675 reviews in 237.52 seconds

Combining successful chunks...






Data integrity check:
Original records: 52675
Processed records: 52675

Processing completed in 0.07 hours
Number of reviews after fuzzy matching: 27,298
Number of unique series in filtered reviews: 960
Success rate: 51.82%

Sample of matched reviews:
     title primaryTitle     tconst
0  Rectify      Rectify  tt2183404
1  Rectify      Rectify  tt2183404
2  Rectify      Rectify  tt2183404
3  Rectify      Rectify  tt2183404
4  Rectify      Rectify  tt2183404

Cleaning up temporary files...


In [None]:
import pandas as pd

# Read both review files
movies_reviews = pd.read_csv('data/reviews_data/filtered_reviews2.csv')
series_reviews = pd.read_csv('data/reviews_data/filtered_reviews1.csv')

# Rename columns in movies_reviews to match series_reviews
movies_reviews = movies_reviews.rename(columns={
    'quote': 'review_content'
})

# Combine the dataframes
combined_reviews = pd.concat([
    movies_reviews[['title', 'review_content', 'author', 'tconst']],
    series_reviews[['title', 'review_content', 'author', 'tconst']]
], ignore_index=True)

# Remove any duplicates
combined_reviews = combined_reviews.drop_duplicates()

# Remove rows where tconst is null
combined_reviews = combined_reviews.dropna(subset=['tconst'])

# Save combined dataset
combined_reviews.to_csv('data/FinalReviews.tsv', sep='\t', index=False)

# Print statistics
print("\nCombined Reviews Statistics:")
print(f"Total reviews: {len(combined_reviews)}")
print(f"Unique titles: {combined_reviews['title'].nunique()}")
print(f"Unique authors: {combined_reviews['author'].nunique()}")
print(f"Reviews with tconst: {combined_reviews['tconst'].notna().sum()}")

# Display sample
print("\nSample of combined reviews:")
print(combined_reviews.head())


Combined Reviews Statistics:
Total reviews: 385435
Unique titles: 6355
Unique authors: 75359
Reviews with tconst: 385435

Sample of combined reviews:
         title                                     review_content  \
0  The Leopard  Superb direction.\r\nPainfully accurate for th...   
1  The Leopard  "The Leopard" is the epic war drama that "Gone...   
2  The Leopard  The Leopard, or better "Il Gattopardo", is a m...   
3  The Leopard  You see, people, not only the Leopard is a suc...   
4  The Leopard  Excellent characterization. Passionate story. ...   

            author     tconst  
0           OldFrt  tt0057091  
1    ryancarroll88  tt0057091  
2            Lumax  tt0057091  
3   EpicLadySponge  tt0057091  
4  Serrao_Brochado  tt0057091  
