# Data Preparation


## Converting the raw Rotten Tomatoes datasets into the final dataset

Import Libraries.

In [1]:
import random

import pandas as pd

Merge the two datasets.

In [2]:
# Joins every critic review to the metadata of the movie it reviews.
def merge_datasets(df_movie, df_critic):
    """Attach movie metadata to each critic review.

    Args:
        df_movie: Movie table containing a 'rotten_tomatoes_link' column.
        df_critic: Review table containing a 'rotten_tomatoes_link' column.

    Returns:
        The inner join of ``df_critic`` (left) with the de-duplicated movie
        table (right) on 'rotten_tomatoes_link'. Reviews whose link has no
        matching movie are dropped.
    """
    join_key = 'rotten_tomatoes_link'

    # Keep only the first row per link so one review never matches several movie rows.
    unique_movies = df_movie.drop_duplicates(subset=join_key, keep='first')

    return df_critic.merge(unique_movies, on=join_key, how='inner')

Clean the dataset.

In [3]:
# Selects the columns of interest and normalises their contents.
def fix_filter_dataset(df: pd.DataFrame) -> pd.DataFrame:
    """Reduce the merged frame to the cleaned columns used downstream.

    Steps, in order:
      1. Drop every row that has a missing value in any kept column.
      2. Keep only those columns.
      3. Lowercase all object-dtype columns.
      4. Replace 'original_release_date' with the text before its first '-'
         (the year, assuming dates look like YYYY-MM-DD).

    Args:
        df: Frame that contains at least the kept columns listed below.

    Returns:
        A new frame; ``df`` itself is not modified.
    """
    keep = [
        'critic_name',
        'rotten_tomatoes_link',
        'movie_title',
        'content_rating',
        'genres',
        'original_release_date',
        'review_content',
        'review_type',
        'movie_info'
    ]

    # Drop incomplete rows, then project onto the kept columns. The explicit
    # copy lets the assignments below write without SettingWithCopyWarning.
    cleaned = df.dropna(subset=keep)[keep].copy()

    # Lowercase every text (object-dtype) column.
    text_columns = cleaned.select_dtypes(include=['object']).columns
    for name in text_columns:
        cleaned.loc[:, name] = cleaned[name].str.lower()

    # Keep only the leading component of the release date.
    date_parts = cleaned['original_release_date'].str.split('-')
    cleaned['original_release_date'] = date_parts.str.get(0)

    return cleaned

Save the dataset as a csv file.

In [4]:
def create_csv(df: pd.DataFrame, path: str = '../datasets/final_dataset.csv') -> None:
    """Write ``df`` to a CSV file without its index.

    Args:
        df: Frame to save.
        path: Destination file. Defaults to '../datasets/final_dataset.csv',
            so existing ``create_csv(df)`` calls behave as before.
    """
    # index=False keeps the row index out of the file so it does not come back
    # as an extra unnamed column when the CSV is read again.
    df.to_csv(path, index=False)

Load data.

In [5]:
# Read the two raw Rotten Tomatoes CSV exports: critic reviews and movie metadata.
df_critic_reviews = pd.read_csv('../datasets/rotten_tomatoes_critic_reviews.csv')
df_movies = pd.read_csv('../datasets/rotten_tomatoes_movies.csv')

# Report the row counts as a quick sanity check on the load.
print(f"Loaded {df_movies.shape[0]} movies")
print(f"Loaded {df_critic_reviews.shape[0]} reviews")

Loaded 17712 movies
Loaded 1130017 reviews


Run the code.

In [6]:
# Data preparation pipeline: join reviews to movies, then clean the result.
# Keyword arguments make the (movie, critic) argument order explicit.
merged = merge_datasets(df_movie=df_movies, df_critic=df_critic_reviews)
print(f"After merge: {merged.shape[0]} rows")

df_final = fix_filter_dataset(df=merged)
print(f"After cleaning: {df_final.shape[0]} rows")
print(f"Unique movies: {df_final['rotten_tomatoes_link'].nunique()}")


After merge: 1129887 rows
After cleaning: 1024931 rows
Unique movies: 16355


## Optional: Create Sampled Dataset

For faster processing during development, you can create a smaller sampled dataset. This reduces processing time from ~45 minutes to ~5-10 minutes.

**Configuration:**
- `CREATE_SAMPLE = True`: Creates a sampled dataset
- `CREATE_SAMPLE = False`: Uses the full dataset
- `SAMPLE_SIZE`: Number of movies to include in the sample (e.g. `2000`, the value used for the recorded output below)
- `DEMO_MOVIES`: Ensures specific movies are included for demonstrations

In [None]:
# Configuration
CREATE_SAMPLE = True  # Set to False for full dataset
SAMPLE_SIZE = 50      # Number of movies to include
SAMPLE_SEED = 42      # Fixed seed so the same movies are drawn on every run

# Movies to ensure are in the sample (for demonstrations).
# Titles must be lowercase: fix_filter_dataset lowercases every text column,
# so 'movie_title' in df_final never contains capitals.
DEMO_MOVIES = ['aliens', 'toy story', 'the godfather']

# NOTE: when CREATE_SAMPLE is True this cell replaces df_final with the sampled
# subset, so it is not idempotent. Re-run the pipeline cell that builds df_final
# before re-running this cell with different settings.
if CREATE_SAMPLE:
    print(f"Creating sampled dataset with {SAMPLE_SIZE} movies...")

    # One title per movie link, computed in a single grouped pass instead of a
    # get_group() call per movie. first() returns the first non-null title of
    # each group; 'movie_title' has no nulls after cleaning, so this is the
    # title of the group's first row. The result is indexed by link, sorted.
    titles_by_link = df_final.groupby('rotten_tomatoes_link')['movie_title'].first()
    unique_movies = titles_by_link.index.tolist()

    print(f"Total unique movies: {len(unique_movies)}")

    # Find demo movies
    is_demo = titles_by_link.isin(DEMO_MOVIES)
    demo_titles = titles_by_link[is_demo]
    demo_links = demo_titles.index.tolist()
    for movie_title in demo_titles:
        print(f"  Found demo movie: {movie_title}")

    # Report requested demo titles that are absent instead of ignoring them.
    missing_demo_titles = sorted(set(DEMO_MOVIES) - set(demo_titles))
    if missing_demo_titles:
        print(f"  Demo movies not found: {missing_demo_titles}")

    # Separate demo movies from others
    other_links = titles_by_link[~is_demo].index.tolist()

    # Sample from other movies. Clamp at zero: if more demo movies were found
    # than SAMPLE_SIZE, a negative size would make sample() raise ValueError.
    # All demo movies are kept in that case, so the sample may exceed SAMPLE_SIZE.
    remaining_sample_size = max(0, SAMPLE_SIZE - len(demo_links))
    # A local generator gives a reproducible draw without reseeding the
    # module-level random state.
    sampled_links = random.Random(SAMPLE_SEED).sample(
        other_links, min(remaining_sample_size, len(other_links))
    )

    # Combine demo movies with sampled movies
    final_links = demo_links + sampled_links

    # Filter dataframe to include only selected movies. copy() makes the result
    # an independent frame rather than a slice of the full dataset.
    df_final = df_final[df_final['rotten_tomatoes_link'].isin(final_links)].copy()

    print(f"\nSampled dataset created:")
    print(f"  Total movies: {df_final['rotten_tomatoes_link'].nunique()}")
    print(f"  Total reviews: {len(df_final)}")
    print(f"  Demo movies included: {len(demo_links)}")
else:
    print(f"Using full dataset: {df_final['rotten_tomatoes_link'].nunique()} movies")


Creating sampled dataset with 2000 movies...
Total unique movies: 16355
  Found demo movie: aliens
  Found demo movie: the godfather
  Found demo movie: toy story

Sampled dataset created:
  Total movies: 2000
  Total reviews: 128569
  Demo movies included: 3


In [8]:
# Persist the prepared frame as CSV via the create_csv helper defined above.
create_csv(df=df_final)
print("Final dataset saved to ../datasets/final_dataset.csv")


Final dataset saved to ../datasets/final_dataset.csv


Verification

In [9]:
# Quick verification: print the dimensions and distinct-movie count of the
# in-memory frame, then show its first rows as the cell's output.
print(f"\nFinal dataset: {df_final.shape[0]} rows, {len(df_final.columns)} columns")
print(f"Unique movies: {df_final['rotten_tomatoes_link'].nunique()}")
df_final.head(5)



Final dataset: 128569 rows, 9 columns
Unique movies: 2000


Unnamed: 0,critic_name,rotten_tomatoes_link,movie_title,content_rating,genres,original_release_date,review_content,review_type,movie_info
545,mike barnard,m/1000121-39_steps,the 39 steps,nr,"action & adventure, classics, mystery & suspense",1935,"if you haven't seen it, do.",fresh,"while on vacation in london, canadian richard ..."
546,marjorie baumgarten,m/1000121-39_steps,the 39 steps,nr,"action & adventure, classics, mystery & suspense",1935,laced with comic romantic touches.,fresh,"while on vacation in london, canadian richard ..."
548,andre sennwald,m/1000121-39_steps,the 39 steps,nr,"action & adventure, classics, mystery & suspense",1935,if you can imagine anatole france writing a de...,fresh,"while on vacation in london, canadian richard ..."
549,tim dirks,m/1000121-39_steps,the 39 steps,nr,"action & adventure, classics, mystery & suspense",1935,the 39 steps (1935) is one of the earlier alfr...,fresh,"while on vacation in london, canadian richard ..."
550,ted prigge,m/1000121-39_steps,the 39 steps,nr,"action & adventure, classics, mystery & suspense",1935,it's almost cleansing to see a film like this ...,fresh,"while on vacation in london, canadian richard ..."
