# Data Preparation


## Converting raw Rotten Tomatoes datasets into final dataset.

Import Libraries.

In [48]:
import pandas as pd

Merge the two datasets.

In [49]:
#combines the two datasets
def merge_datasets(df_movie, df_critic):
    """Join critic reviews with the metadata of the movie they refer to.

    Args:
        df_movie: Movie table; must contain a ``rotten_tomatoes_link`` column.
            When a link occurs more than once only its first row is used.
        df_critic: Critic-review table; must contain a
            ``rotten_tomatoes_link`` column.

    Returns:
        A new DataFrame holding the critic columns followed by the movie
        columns, with one row per review whose link is present in both tables
        (inner join).
    """
    # Keep a single row per link so the join cannot multiply review rows.
    unique_movies = df_movie.drop_duplicates(subset='rotten_tomatoes_link', keep='first')

    return df_critic.merge(unique_movies, on='rotten_tomatoes_link', how='inner')

Clean the dataset.

In [50]:
# chooses the columns we want and filters out anything we don't want
def fix_filter_dataset(df: pd.DataFrame) -> pd.DataFrame:
    """Reduce the merged frame to the columns of interest and normalise their text.

    Rows with a missing value in any kept column are discarded, every
    object-dtype column is lowercased, and ``original_release_date`` is cut
    down to the part before its first ``-`` (the year for ``YYYY-MM-DD`` values).

    Args:
        df: Frame that contains every column listed in ``keep`` below.

    Returns:
        A new DataFrame with exactly the kept columns; ``df`` is not modified.
    """
    keep = [
        'critic_name',
        'rotten_tomatoes_link',
        'movie_title',
        'content_rating',
        'genres',
        'original_release_date',
        'review_content',
        'review_type',
        'movie_info'
    ]

    # Drop incomplete rows, project onto the kept columns, and copy so the
    # assignments below never write into the caller's frame.
    result = df.dropna(subset=keep)[keep].copy()

    # Lowercase every object-dtype column.
    text_columns = result.select_dtypes(include=['object']).columns
    for name in text_columns:
        result.loc[:, name] = result[name].str.lower()

    # Keep only the leading segment of the release date.
    release = result['original_release_date']
    result['original_release_date'] = release.str.split('-').str[0]

    return result

Save the dataset as a csv file.

In [51]:
def create_csv(df: pd.DataFrame, path: str = '../datasets/final_dataset.csv') -> None:
    """Write ``df`` to a CSV file, omitting the index.

    Args:
        df: Frame to save.
        path: Destination file. Defaults to the notebook's final dataset
            location, so existing ``create_csv(df)`` calls behave as before.
    """
    df.to_csv(path, index=False)

Load data.

In [52]:
# Load datasets
# Read the critic-review table and the movie table from the raw CSV files.
# Both paths are relative, so they resolve against the kernel's working directory.
df_critic_reviews = pd.read_csv('../datasets/rotten_tomatoes_critic_reviews.csv')
df_movies = pd.read_csv('../datasets/rotten_tomatoes_movies.csv')

# Print the raw row counts as a baseline for the counts reported after merging and cleaning.
print(f"Loaded {len(df_movies)} movies")
print(f"Loaded {len(df_critic_reviews)} reviews")

Loaded 17712 movies
Loaded 1130017 reviews


Run the code.

In [53]:
# Run the data preparation pipeline
# Step 1: inner-join the reviews with the de-duplicated movie table on rotten_tomatoes_link.
merged = merge_datasets(df_movies, df_critic_reviews)
print(f"After merge: {len(merged)} rows")

# Step 2: keep the wanted columns, drop rows with missing values, lowercase the text
# and reduce the release date to its year.
cleaned = fix_filter_dataset(merged)
print(f"After cleaning: {len(cleaned)} rows")

# Step 3: write the cleaned frame to disk as a CSV file.
create_csv(cleaned)
print("Final dataset saved")

After merge: 1129887 rows
After cleaning: 1024931 rows
Final dataset saved


Verification

In [54]:
# Quick verification
# Re-read the file that was just written and report its shape and the number of
# distinct movie links it contains.
df_final = pd.read_csv('../datasets/final_dataset.csv')
print(f"\nFinal dataset: {len(df_final)} rows, {df_final.shape[1]} columns")
print(f"Unique movies: {df_final['rotten_tomatoes_link'].nunique()}")
# Last expression of the cell, so the notebook renders the first five rows.
df_final.head()


Final dataset: 1024931 rows, 9 columns
Unique movies: 16355


Unnamed: 0,critic_name,rotten_tomatoes_link,movie_title,content_rating,genres,original_release_date,review_content,review_type,movie_info
0,andrew l. urban,m/0814255,percy jackson & the olympians: the lightning t...,pg,"action & adventure, comedy, drama, science fic...",2010,a fantasy adventure that fuses greek mythology...,fresh,"always trouble-prone, the life of teenager per..."
1,louise keller,m/0814255,percy jackson & the olympians: the lightning t...,pg,"action & adventure, comedy, drama, science fic...",2010,"uma thurman as medusa, the gorgon with a coiff...",fresh,"always trouble-prone, the life of teenager per..."
2,ben mceachen,m/0814255,percy jackson & the olympians: the lightning t...,pg,"action & adventure, comedy, drama, science fic...",2010,whether audiences will get behind the lightnin...,fresh,"always trouble-prone, the life of teenager per..."
3,ethan alter,m/0814255,percy jackson & the olympians: the lightning t...,pg,"action & adventure, comedy, drama, science fic...",2010,what's really lacking in the lightning thief i...,rotten,"always trouble-prone, the life of teenager per..."
4,david germain,m/0814255,percy jackson & the olympians: the lightning t...,pg,"action & adventure, comedy, drama, science fic...",2010,it's more a list of ingredients than a movie-m...,rotten,"always trouble-prone, the life of teenager per..."
