# Import Required Libraries and Dataset

In [60]:
import pandas as pd
import numpy as np
import re
import requests
import zipfile
import tarfile
import os

## Downloading Datasets

In [None]:
link_ml = "https://files.grouplens.org/datasets/movielens/ml-latest-small.zip"

# The local filename is the last path component of the URL
filename = os.path.basename(link_ml)

# Stream the zip file to disk.
# - timeout: without it a stalled connection would block this cell indefinitely
# - with-block: a streamed response keeps its connection open until it is closed
print(f"Downloading from: {link_ml}")
with requests.get(link_ml, stream=True, timeout=60) as response:
    response.raise_for_status()  # Raise an exception for bad status codes
    with open(filename, 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):
            if chunk:  # skip empty chunks
                f.write(chunk)
print(f"Downloaded: {filename}")

# Extract the zip file into the current working directory
# (later cells read from the extracted "ml-latest-small/" folder)
with zipfile.ZipFile(filename, 'r') as zip_ref:
    zip_ref.extractall()
print("Extraction complete!")

Downloading from: https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
Downloaded: ml-latest-small.zip
Extraction complete!


In [None]:
link_cmu = "https://www.cs.cmu.edu/~ark/personas/data/MovieSummaries.tar.gz"

# The local filename is the last path component of the URL
filename = os.path.basename(link_cmu)

# Stream the tar file to disk.
# - timeout: without it a stalled connection would block this cell indefinitely
# - with-block: a streamed response keeps its connection open until it is closed
print(f"Downloading from: {link_cmu}")
with requests.get(link_cmu, stream=True, timeout=60) as response:
    response.raise_for_status()  # Raise an exception for bad status codes
    with open(filename, 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):
            if chunk:  # skip empty chunks
                f.write(chunk)
print(f"Downloaded: {filename}")

# Extract the tar.gz archive that was just downloaded into the current working
# directory (later cells read from the extracted "MovieSummaries/" folder)
with tarfile.open(filename, 'r:gz') as tar_ref:
    tar_ref.extractall()
print("Extraction complete!")

Downloading from: https://www.cs.cmu.edu/~ark/personas/data/MovieSummaries.tar.gz
Downloaded: MovieSummaries.tar.gz



## Using MovieLens small dataset for efficient testing

Dataset Source: https://grouplens.org/datasets/movielens/

In [5]:
# Read the MovieLens movie table and report how many rows it contains
ml_movies = pd.read_csv("ml-latest-small/movies.csv")
n_movies = len(ml_movies)
print(f"Loaded {n_movies} movies from MovieLens dataset.")
ml_movies

Loaded 9742 movies from MovieLens dataset.


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [24]:
# Read the MovieLens user ratings and report how many rows they contain
ml_ratings = pd.read_csv("ml-latest-small/ratings.csv")
n_ratings = len(ml_ratings)
print(f"Loaded {n_ratings} ratings from MovieLens dataset.")
ml_ratings

Loaded 100836 ratings from MovieLens dataset.


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [6]:
# Read the MovieLens user-supplied tags and report how many rows they contain
ml_tags = pd.read_csv("ml-latest-small/tags.csv")
n_tags = len(ml_tags)
print(f"Loaded {n_tags} tags from MovieLens.")
ml_tags

Loaded 3683 tags from MovieLens.


Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200
...,...,...,...,...
3678,606,7382,for katie,1171234019
3679,606,7936,austere,1173392334
3680,610,3265,gun fu,1493843984
3681,610,3265,heroic bloodshed,1493843978


## Importing CMU Movie Summary Dataset
This dataset is used to get plot summaries as well as actors in a movie.

Dataset Source: https://www.cs.cmu.edu/~ark/personas/

In [8]:
# Column names for movie.metadata.tsv (the file is read without a header row)
cmu_colnames = [
    'wikipedia_id',
    'freebase_id',
    'movie_name',
    'release_date',
    'box_office_revenue',
    'runtime',
    'languages',
    'countries',
    'genres',
]

# Load the CMU movie metadata as tab-separated text.
# quoting=3 is csv.QUOTE_NONE: quote characters inside fields are kept as ordinary text.
cmu_movies = pd.read_csv(
    "MovieSummaries/movie.metadata.tsv",
    sep='\t',
    header=None,
    names=cmu_colnames,
    quoting=3,
)
n_cmu_movies = len(cmu_movies)
print(f"Loaded {n_cmu_movies} movies from CMU Corpus.")
cmu_movies

Loaded 81741 movies from CMU Corpus.


Unnamed: 0,wikipedia_id,freebase_id,movie_name,release_date,box_office_revenue,runtime,languages,countries,genres
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp..."
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D..."
3,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic..."
4,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}"
...,...,...,...,...,...,...,...,...,...
81736,35228177,/m/0j7hxnt,Mermaids: The Body Found,2011-03-19,,120.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/07s9rl0"": ""Drama""}"
81737,34980460,/m/0g4pl34,Knuckle,2011-01-21,,96.0,"{""/m/02h40lc"": ""English Language""}","{""/m/03rt9"": ""Ireland"", ""/m/07ssc"": ""United Ki...","{""/m/03bxz7"": ""Biographical film"", ""/m/07s9rl0..."
81738,9971909,/m/02pygw1,Another Nice Mess,1972-09-22,,66.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/06nbt"": ""Satire"", ""/m/01z4y"": ""Comedy""}"
81739,913762,/m/03pcrp,The Super Dimension Fortress Macross II: Lover...,1992-05-21,,150.0,"{""/m/03_9r"": ""Japanese Language""}","{""/m/03_3d"": ""Japan""}","{""/m/06n90"": ""Science Fiction"", ""/m/0gw5n2f"": ..."


In [9]:
# Column names for character.metadata.tsv (the file is read without a header row)
char_colnames = [
    'wikipedia_id',
    'freebase_movie_id',
    'release_date',
    'character_name',
    'actor_dob',
    'actor_gender',
    'actor_height',
    'actor_ethnicity',
    'actor_name',
    'actor_age',
    'map_id',
    'char_id',
    'actor_id',
]

# Only two fields are needed: the Wikipedia movie ID (column 0) and the actor name
# (column 8), so usecols restricts the load to those positions
cmu_chars = pd.read_csv(
    "MovieSummaries/character.metadata.tsv",
    sep='\t',
    header=None,
    names=char_colnames,
    usecols=[0, 8],
)
n_actor_rows = len(cmu_chars)
print(f"Loaded {n_actor_rows} actor entries from CMU.")
cmu_chars

Loaded 450669 actor entries from CMU.


Unnamed: 0,wikipedia_id,actor_name
0,975900,Wanda De Jesus
1,975900,Natasha Henstridge
2,975900,Ice Cube
3,975900,Jason Statham
4,975900,Clea DuVall
...,...,...
450664,913762,Dorothy Elias-Fahn
450665,913762,Jonathan Fahn
450666,28308153,David Hemmings
450667,28308153,Roberta Paterson


In [10]:
# Each line of plot_summaries.txt is "<Wikipedia ID><tab><summary>" with no header row
cmu_plots = pd.read_csv(
    "MovieSummaries/plot_summaries.txt",
    sep='\t',
    header=None,
    names=['wikipedia_id', 'plot_summary'],
)
n_plots = len(cmu_plots)
print(f"Loaded {n_plots} plot summaries from CMU.")
cmu_plots

Loaded 42303 plot summaries from CMU.


Unnamed: 0,wikipedia_id,plot_summary
0,23890098,"Shlykov, a hard-working taxi driver and Lyosha..."
1,31186339,The nation of Panem consists of a wealthy Capi...
2,20663735,Poovalli Induchoodan is sentenced for six yea...
3,2231378,"The Lemon Drop Kid , a New York City swindler,..."
4,595909,Seventh-day Adventist Church pastor Michael Ch...
...,...,...
42298,34808485,"The story is about Reema , a young Muslim scho..."
42299,1096473,"In 1928 Hollywood, director Leo Andreyev look..."
42300,35102018,American Luthier focuses on Randy Parsons’ tra...
42301,8628195,"Abdur Rehman Khan , a middle-aged dry fruit se..."


# Linking MovieLens and CMU Movie Summary Corpus

## Preprocessing MovieLens Data
Preprocessing steps: 
1. **Separate the year suffix**: MovieLens stores the release year inside the title (e.g. "Toy Story (1995)") instead of in a separate column, so the year is split off into its own column.
2. **Remove parenthetical suffixes**: Some MovieLens titles carry additional trailing parenthetical information (e.g. a translated or alternative title). These suffixes are dropped.
3. **Move suffixed articles (The, A, An, Les) to the beginning**: Some titles have the article at the end rather than at the beginning (e.g. "Usual Suspects, The"), so it is moved to the start.

In [11]:
# --- Preprocess MovieLens Data ---
print("\nPreprocessing MovieLens data...")

def extract_year_title_ml(title):
    """
    Split a MovieLens title into a cleaned title and a release year.

    Steps:
    1. Remove a trailing four-digit year suffix (e.g., "(1995)") and keep it as the year.
    2. Iteratively remove other *trailing* parenthetical suffixes
       (e.g., "(a.k.a. ...)", "(Subtitle)", "(Postino, Il)"). A suffix is kept when
       removing it would leave an empty title (e.g., "(Untitled) (2009)").
    3. Move ", The", ", A", ", An", or ", Les" from the end to the beginning.

    Parameters
    ----------
    title : str
        Raw MovieLens title. Non-string values (e.g., NaN/None) are returned
        unchanged with a missing year instead of raising.

    Returns
    -------
    tuple
        (cleaned_title, year), where year is an int, or np.nan when the title
        does not end with a "(YYYY)" suffix.
    """
    # Missing titles cannot be parsed; pass them through instead of failing on .strip()
    if not isinstance(title, str):
        return title, np.nan

    title = title.strip()
    year = np.nan
    cleaned_title = title

    # 1. Extract year from the very end
    year_match = re.search(r'\s*\((\d{4})\)$', title)
    if year_match:
        year = int(year_match.group(1))
        # Get the part before the year for further cleaning
        cleaned_title = title[:year_match.start()].strip()

    # 2. Iteratively remove any OTHER trailing parenthetical content
    #    Loop as long as a pattern like ' (anything)' exists at the end
    paren_pattern = r'\s*\([^)]*\)$'  # Matches ' (<non-closing-parentheses>)' at the end
    while True:
        match = re.search(paren_pattern, cleaned_title)
        if not match:
            # No more trailing parentheses found
            break
        remainder = cleaned_title[:match.start()].strip()
        if not remainder:
            # The parenthetical IS the title; stripping it would leave nothing
            break
        cleaned_title = remainder

    # 3. Handle ", The", ", A", ", An", ", Les" at the end
    #    Regex: ^(.*?)      - Start, capture anything non-greedily
    #           ,\s*        - Followed by a comma and optional space
    #           (The|A|An|Les) - Capture one of the articles
    #           $           - End of the string
    article_match = re.search(r'^(.*?),\s*(The|A|An|Les)$', cleaned_title, re.IGNORECASE)
    if article_match:
        base_title = article_match.group(1).strip()
        article = article_match.group(2)
        # Reconstruct title: e.g., "The " + "Usual Suspects"
        cleaned_title = f"{article.capitalize()} {base_title}"

    return cleaned_title, year


# --- Applying the preprocessing to MovieLens DataFrame ---

print("\nPreprocessing MovieLens data with extract_year_title_ml function...")

# Parse every title once into (cleaned_title, year) tuples. Building the two columns
# from plain lists avoids constructing one pandas Series per row.
parsed_titles = [extract_year_title_ml(raw_title) for raw_title in ml_movies['title']]
ml_movies['cleaned_title'] = [cleaned for cleaned, _ in parsed_titles]

# Nullable Int64 keeps years as integers while allowing <NA> for titles without a year
ml_movies['year'] = pd.Series(
    [year for _, year in parsed_titles], index=ml_movies.index
).astype('Int64')

print("\n--- Preprocessed MovieLens head ---")
print(ml_movies[['movieId', 'title', 'cleaned_title', 'year']].head())

# Display titles that earlier versions of the cleaning function handled incorrectly,
# to confirm they are now cleaned as expected
print("\n--- Checking Previously Failed Examples for Debugging---")
example_ids = [11, 29, 30, 32, 47, 49, 50, 52, 58, 60, 65, 73, 74, 85]
print(ml_movies[ml_movies['movieId'].isin(example_ids)][['movieId', 'title', 'cleaned_title', 'year']])


Preprocessing MovieLens data...

Preprocessing MovieLens data with extract_year_title_ml function...

--- Preprocessed MovieLens head ---
   movieId                               title                cleaned_title  \
0        1                    Toy Story (1995)                    Toy Story   
1        2                      Jumanji (1995)                      Jumanji   
2        3             Grumpier Old Men (1995)             Grumpier Old Men   
3        4            Waiting to Exhale (1995)            Waiting to Exhale   
4        5  Father of the Bride Part II (1995)  Father of the Bride Part II   

   year  
0  1995  
1  1995  
2  1995  
3  1995  
4  1995  

--- Checking Previously Failed Examples for Debugging---
    movieId                                              title  \
10       11                     American President, The (1995)   
28       29  City of Lost Children, The (Cité des enfants p...   
29       30  Shanghai Triad (Yao a yao yao dao waipo qiao) ...   
31  

## Preprocessing CMU Corpus data

Preprocessing steps:
1. Extract year from release date.
2. Trim Whitespace in Movie title

In [12]:
# --- Preprocess CMU Corpus Data ---
print("\nPreprocessing CMU Corpus data...")

def extract_year_cmu(date_str):
    """
    Return the four-digit year that a CMU release date starts with.

    Accepts values such as "YYYY-MM-DD", "YYYY-MM" or "YYYY"; the value is
    converted to str before matching. Returns np.nan when the value is missing
    or does not begin with four digits.
    """
    if pd.isna(date_str):
        return np.nan
    leading_year = re.match(r'^(\d{4})', str(date_str))
    return int(leading_year.group(1)) if leading_year else np.nan

# Derive the release year per movie and store it as nullable Int64,
# so rows without a usable release date hold <NA> instead of a float NaN
cmu_movies['year'] = cmu_movies['release_date'].apply(extract_year_cmu).astype('Int64')

# The only title cleanup applied to CMU names is trimming surrounding whitespace
cmu_movies['cleaned_title'] = cmu_movies['movie_name'].str.strip()

print("\n--- Preprocessed CMU Corpus head ---")
cmu_preview_cols = ['wikipedia_id', 'movie_name', 'release_date', 'cleaned_title', 'year']
print(cmu_movies[cmu_preview_cols].head())



Preprocessing CMU Corpus data...

--- Preprocessed CMU Corpus head ---
   wikipedia_id                                         movie_name  \
0        975900                                     Ghosts of Mars   
1       3196793  Getting Away with Murder: The JonBenét Ramsey ...   
2      28463795                                        Brun bitter   
3       9363483                                   White Of The Eye   
4        261236                                  A Woman in Flames   

  release_date                                      cleaned_title  year  
0   2001-08-24                                     Ghosts of Mars  2001  
1   2000-02-16  Getting Away with Murder: The JonBenét Ramsey ...  2000  
2         1988                                        Brun bitter  1988  
3         1987                                   White Of The Eye  1987  
4         1983                                  A Woman in Flames  1983  


## Linking MovieLens rows to Wikipedia ID in CMU

Movies are matched on cleaned title (case-insensitive) and release year. This links each MovieLens movie to its CMU entry (via the Wikipedia ID) so that plot summaries and actors can be attached.

In [13]:
# Lower-cased copies of the cleaned titles serve as the case-insensitive join key
for frame in (ml_movies, cmu_movies):
    frame['match_title'] = frame['cleaned_title'].str.lower()

# Only the CMU columns needed for linking and for inspecting the match are carried over
cmu_cols_to_keep = ['wikipedia_id', 'match_title', 'year', 'movie_name', 'release_date']
cmu_movies_subset = cmu_movies.loc[:, cmu_cols_to_keep].copy()

# Left join on lowercase title and exact year: every MovieLens row is kept, and rows
# without a CMU counterpart get NaN in the CMU columns. A MovieLens movie appears more
# than once when several CMU rows share its title and year.
merged_df = ml_movies.merge(
    cmu_movies_subset,
    how='left',
    on=['match_title', 'year'],
)

# Check the merge result
merged_df

Unnamed: 0,movieId,title,genres,cleaned_title,year,match_title,wikipedia_id,movie_name,release_date
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story,1995,toy story,53085.0,Toy Story,1995-11-19
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji,1995,jumanji,3700174.0,Jumanji,1995-12-15
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men,1995,grumpier old men,1934035.0,Grumpier Old Men,1995-12-22
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale,1995,waiting to exhale,972970.0,Waiting to Exhale,1995-12-22
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II,1995,father of the bride part ii,3303622.0,Father of the Bride Part II,1995-12-08
...,...,...,...,...,...,...,...,...,...
9764,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,Black Butler: Book of the Atlantic,2017,black butler: book of the atlantic,,,
9765,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,No Game No Life: Zero,2017,no game no life: zero,,,
9766,193585,Flint (2017),Drama,Flint,2017,flint,,,
9767,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,Bungo Stray Dogs: Dead Apple,2018,bungo stray dogs: dead apple,,,


In [14]:
# A row counts as matched when the merge supplied a Wikipedia ID for it
has_wikipedia_id = merged_df['wikipedia_id'].notna()
matched_count = has_wikipedia_id.sum()
unmatched_count = (~has_wikipedia_id).sum()
print("\n--- Merge Results ---")
print(f"Matched rows: {matched_count}")
print(f"Unmatched rows: {unmatched_count}")

# Show a few unmatched rows to help debug the title cleaning
print("\n--- Unmatched Rows in Merged DataFrame ---")
print(merged_df[~has_wikipedia_id].head())


--- Merge Results ---
Matched rows: 7153
Unmatched rows: 2616

--- Unmatched Rows in Merged DataFrame ---
    movieId                              title                  genres  \
45       49       When Night Is Falling (1995)           Drama|Romance   
46       50         Usual Suspects, The (1995)  Crime|Mystery|Thriller   
52       58  Postman, The (Postino, Il) (1994)    Comedy|Drama|Romance   
68       76                   Screamers (1995)  Action|Sci-Fi|Thriller   
69       77                   Nico Icon (1995)             Documentary   

            cleaned_title  year            match_title  wikipedia_id  \
45  When Night Is Falling  1995  when night is falling           NaN   
46     The Usual Suspects  1995     the usual suspects           NaN   
52            The Postman  1994            the postman           NaN   
68              Screamers  1995              screamers           NaN   
69              Nico Icon  1995              nico icon           NaN   

   movie_name r

In [27]:
# Build the linked table in one chain:
#   1. drop the helper/inspection columns that are no longer needed
#   2. keep only the first row per movieId (a movie can match several CMU rows)
#   3. discard movies that found no CMU match
#   4. store the Wikipedia ID as a nullable integer instead of a float
final_merged_df = (
    merged_df
    .drop(columns=['match_title', 'movie_name', 'release_date'])
    .drop_duplicates(subset=['movieId'], keep='first')
    .dropna(subset=['wikipedia_id'])
    .astype({'wikipedia_id': 'Int64'})
)

final_merged_df


Unnamed: 0,movieId,title,genres,cleaned_title,year,wikipedia_id
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story,1995,53085
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji,1995,3700174
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men,1995,1934035
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale,1995,972970
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II,1995,3303622
...,...,...,...,...,...,...
9697,182731,Pixel Perfect (2004),Children|Comedy|Sci-Fi,Pixel Perfect,2004,1956661
9706,183301,The Tale of the Bunny Picnic (1986),Children,The Tale of the Bunny Picnic,1986,8554766
9707,183317,Patti Rocks (1988),Comedy|Drama,Patti Rocks,1988,34053640
9714,184053,Battle Planet (2008),Action|Sci-Fi,Battle Planet,2008,20869935


# Merging average ratings, tags, plot summaries, and actors

## Calculating and Merging average Ratings from MovieLens

In [34]:
# --- Calculate and Merge Average Ratings ---
print("\nCalculating and Merging Average Ratings...")

# Group by movieId and take the average rating across all users
ratings_avg = ml_ratings.groupby('movieId')['rating'].mean().reset_index()

print(f"Calculated Average Ratings for {len(ratings_avg)} movies.")
print(ratings_avg.head())

# Merge average ratings into the main dataframe.
# A 'rating' column left over from a previous run of this cell is dropped first;
# otherwise the merge would produce rating_x / rating_y and the fillna below would fail.
final_merged_df = pd.merge(
    final_merged_df.drop(columns=['rating'], errors='ignore'),
    ratings_avg,
    on='movieId',
    how='left' # Keep all movies, even if they have no ratings
)

# Replace missing ratings with 0 (0 therefore means "no ratings available",
# not an actual average)
final_merged_df['rating'] = final_merged_df['rating'].fillna(0)

# Display the final merged DataFrame
final_merged_df[['movieId', 'wikipedia_id', 'title', 'rating']]


Calculating and Merging Average Ratings...
Calculated Average Ratings for 9724 movies.
   movieId    rating
0        1  3.920930
1        2  3.431818
2        3  3.259615
3        4  2.357143
4        5  3.071429


Unnamed: 0,movieId,wikipedia_id,title,rating
0,1,53085,Toy Story (1995),3.920930
1,2,3700174,Jumanji (1995),3.431818
2,3,1934035,Grumpier Old Men (1995),3.259615
3,4,972970,Waiting to Exhale (1995),2.357143
4,5,3303622,Father of the Bride Part II (1995),3.071429
...,...,...,...,...
7121,182731,1956661,Pixel Perfect (2004),4.500000
7122,183301,8554766,The Tale of the Bunny Picnic (1986),3.000000
7123,183317,34053640,Patti Rocks (1988),4.500000
7124,184053,20869935,Battle Planet (2008),2.000000


## Merging Tags from MovieLens

In [35]:
# --- Process and Merge Tags ---
print("\nProcessing and merging tags...")

# Drop rows with missing tags and convert tag to string just in case.
# Chaining produces a fresh DataFrame, so no value is ever set on a slice
# (avoids SettingWithCopyWarning).
ml_tags = ml_tags.dropna(subset=['tag']).astype({'tag': str})

# Group by movieId and aggregate tags into a pipe-separated string
# Using unique() within the aggregation to avoid duplicate tags per movie
tags_agg = ml_tags.groupby('movieId')['tag'].agg(lambda x: '|'.join(x.unique())).reset_index()
tags_agg = tags_agg.rename(columns={'tag': 'tags'}) # Rename column for clarity

print(f"Aggregated tags for {len(tags_agg)} movies.")
print(tags_agg.head())

# Merge tags into the main dataframe.
# A 'tags' column left over from a previous run of this cell is dropped first;
# otherwise the merge would produce tags_x / tags_y and the fillna below would fail.
final_merged_df = pd.merge(
    final_merged_df.drop(columns=['tags'], errors='ignore'),
    tags_agg,
    on='movieId',
    how='left' # Keep all movies, even if they have no tags
)

# Replace NaN tags with empty string
final_merged_df['tags'] = final_merged_df['tags'].fillna('')

# Display the final merged DataFrame
final_merged_df[['movieId', 'wikipedia_id', 'title', 'tags']]


Processing and merging tags...
Aggregated tags for 1572 movies.
   movieId                                          tags
0        1                                     pixar|fun
1        2  fantasy|magic board game|Robin Williams|game
2        3                                     moldy|old
3        5                              pregnancy|remake
4        7                                        remake


Unnamed: 0,movieId,wikipedia_id,title,tags
0,1,53085,Toy Story (1995),pixar|fun
1,2,3700174,Jumanji (1995),fantasy|magic board game|Robin Williams|game
2,3,1934035,Grumpier Old Men (1995),moldy|old
3,4,972970,Waiting to Exhale (1995),
4,5,3303622,Father of the Bride Part II (1995),pregnancy|remake
...,...,...,...,...
7121,182731,1956661,Pixel Perfect (2004),
7122,183301,8554766,The Tale of the Bunny Picnic (1986),
7123,183317,34053640,Patti Rocks (1988),
7124,184053,20869935,Battle Planet (2008),


## Merging Actors from CMU corpus

In [None]:
# --- Process and Merge actors ---
print("\nProcessing and merging actors...")

# Drop rows with missing actor names or missing wikipedia_id, then fix the data types.
# Chaining produces a fresh DataFrame, so no value is ever set on a slice
# (this is what triggered SettingWithCopyWarning before).
cmu_chars = (
    cmu_chars
    .dropna(subset=['wikipedia_id', 'actor_name'])
    .astype({'wikipedia_id': int, 'actor_name': str})
)

# Group by wikipedia_id and aggregate actor names into a pipe-separated string
# Using unique() to avoid duplicate actor names per movie
chars_agg = cmu_chars.groupby('wikipedia_id')['actor_name'].agg(lambda x: '|'.join(x.unique())).reset_index()
chars_agg = chars_agg.rename(columns={'actor_name': 'actors'}) # Rename column

print(f"Aggregated actors for {len(chars_agg)} movies.")
print(chars_agg.head())

# Merge actors into the main dataframe.
# An 'actors' column left over from a previous run of this cell is dropped first;
# otherwise the merge would produce actors_x / actors_y and the fillna below would fail.
final_merged_df = pd.merge(
    final_merged_df.drop(columns=['actors'], errors='ignore'),
    chars_agg,
    on='wikipedia_id',
    how='left' # Keep all movies, even if no actors are listed for their wikipedia_id
)

# Replace NaN actors with empty string
final_merged_df['actors'] = final_merged_df['actors'].fillna('')

# Display the final merged DataFrame with actors
final_merged_df[['movieId', 'wikipedia_id', 'title', 'actors']]


Processing and merging actors...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cmu_chars['wikipedia_id'] = cmu_chars['wikipedia_id'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cmu_chars['actor_name'] = cmu_chars['actor_name'].astype(str)


Aggregated actors for 64258 movies.
   wikipedia_id                                             actors
0           330  Rosa Maria Sardà|Mercè Pons|Anna Lizaran|Núria...
1          3217  Ted Raimi|Ivan Raimi|Bruce Campbell|Bridget Fo...
2          3333  Lillian Gish|Henry B. Walthall|Mae Marsh|Spott...
3          3746  William Sanderson|Harrison Ford|Rutger Hauer|S...
4          3837  Jack Starrett|Slim Pickens|Dave Sharpe|Liam Du...


Unnamed: 0,movieId,wikipedia_id,title,actors
0,1,53085,Toy Story (1995),John Ratzenberger|Tim Allen|Annie Potts|Don Ri...
1,2,3700174,Jumanji (1995),David Alan Grier|Robin Williams|Jonathan Hyde|...
2,3,1934035,Grumpier Old Men (1995),Katie Sagona|Walter Matthau|Kevin Pollak|Daryl...
3,4,972970,Waiting to Exhale (1995),Whitney Houston|Angela Bassett|Loretta Devine|...
4,5,3303622,Father of the Bride Part II (1995),Steve Martin|Diane Keaton|Martin Short|Kimberl...
...,...,...,...,...
7121,182731,1956661,Pixel Perfect (2004),Ricky Ullman|Leah Pipes|Spencer Redford
7122,183301,8554766,The Tale of the Bunny Picnic (1986),Steve Whitmire|Martin P. Robinson|Richard Hunt...
7123,183317,34053640,Patti Rocks (1988),Chris Mulkey
7124,184053,20869935,Battle Planet (2008),John Duerler|Zack Ward|Monica May|Colleen Smit...


## Merging Plot Summaries from CMU Corpus

In [None]:
# --- Process and Merge Plot Summaries ---
print("\nProcessing and merging plot summaries...")

# Prepare the plot table in one chain:
#   - rename first: renaming is a no-op when 'plot_summary' is already gone, so this
#     cell can be re-run without a KeyError on the old column name
#   - drop rows with missing plot summaries or wikipedia_id
#   - make wikipedia_id an integer so it lines up with the merge key
# Chaining produces a fresh DataFrame, so no value is ever set on a slice.
cmu_plots = (
    cmu_plots
    .rename(columns={'plot_summary': 'plot'})
    .dropna(subset=['wikipedia_id', 'plot'])
    .astype({'wikipedia_id': int})
)

print(f"Loaded plot summaries for {len(cmu_plots)} movies.")
print(cmu_plots.head())

# Merge plots into the main dataframe.
# A 'plot' column left over from a previous run of this cell is dropped first;
# otherwise the merge would produce plot_x / plot_y and the fillna below would fail.
final_merged_df = pd.merge(
    final_merged_df.drop(columns=['plot'], errors='ignore'),
    cmu_plots[['wikipedia_id', 'plot']], # Select only needed columns
    on='wikipedia_id',
    how='left' # Keep all movies
)

# Replace NaN plots with empty string
final_merged_df['plot'] = final_merged_df['plot'].fillna('')

# Display the final merged DataFrame with plots
final_merged_df[['movieId', 'wikipedia_id', 'title', 'plot']]


Processing and merging plot summaries...
Loaded plot summaries for 42303 movies.
   wikipedia_id                                               plot
0      23890098  Shlykov, a hard-working taxi driver and Lyosha...
1      31186339  The nation of Panem consists of a wealthy Capi...
2      20663735  Poovalli Induchoodan  is sentenced for six yea...
3       2231378  The Lemon Drop Kid , a New York City swindler,...
4        595909  Seventh-day Adventist Church pastor Michael Ch...


Unnamed: 0,movieId,wikipedia_id,title,plot
0,1,53085,Toy Story (1995),Woody is a pull-string cowboy doll and leader...
1,2,3700174,Jumanji (1995),"In 1869, two boys bury a game board in a fore..."
2,3,1934035,Grumpier Old Men (1995),The lifelong feud between Max and John has c...
3,4,972970,Waiting to Exhale (1995),Waiting to Exhale is a story about four Africa...
4,5,3303622,Father of the Bride Part II (1995),George Banks must accept the reality of what h...
...,...,...,...,...
7121,182731,1956661,Pixel Perfect (2004),The movie starts with sixteen-year old Roscoe ...
7122,183301,8554766,The Tale of the Bunny Picnic (1986),The story is actually told by an elderly Bean ...
7123,183317,34053640,Patti Rocks (1988),
7124,184053,20869935,Battle Planet (2008),"In the not-so-distant future, Captain Jordan S..."


## Final Dataframe after Merge

In [42]:
# --- Final Review ---
# Print a short structural summary (shape, column names, dtypes) of the merged table
for summary_line in (
    "\n--- Final Merged DataFrame ---",
    f"\nFinal DataFrame shape: {final_merged_df.shape}",
    f"\nFinal DataFrame columns: {final_merged_df.columns.tolist()}",
    f"\nFinal DataFrame data types:\n{final_merged_df.dtypes}",
):
    print(summary_line)

final_merged_df


--- Final Merged DataFrame ---

Final DataFrame shape: (7126, 10)

Final DataFrame columns: ['movieId', 'title', 'genres', 'cleaned_title', 'year', 'wikipedia_id', 'rating', 'tags', 'actors', 'plot']

Final DataFrame data types:
movieId            int64
title             object
genres            object
cleaned_title     object
year               Int64
wikipedia_id       Int64
rating           float64
tags              object
actors            object
plot              object
dtype: object


Unnamed: 0,movieId,title,genres,cleaned_title,year,wikipedia_id,rating,tags,actors,plot
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story,1995,53085,3.920930,pixar|fun,John Ratzenberger|Tim Allen|Annie Potts|Don Ri...,Woody is a pull-string cowboy doll and leader...
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji,1995,3700174,3.431818,fantasy|magic board game|Robin Williams|game,David Alan Grier|Robin Williams|Jonathan Hyde|...,"In 1869, two boys bury a game board in a fore..."
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men,1995,1934035,3.259615,moldy|old,Katie Sagona|Walter Matthau|Kevin Pollak|Daryl...,The lifelong feud between Max and John has c...
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale,1995,972970,2.357143,,Whitney Houston|Angela Bassett|Loretta Devine|...,Waiting to Exhale is a story about four Africa...
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II,1995,3303622,3.071429,pregnancy|remake,Steve Martin|Diane Keaton|Martin Short|Kimberl...,George Banks must accept the reality of what h...
...,...,...,...,...,...,...,...,...,...,...
7121,182731,Pixel Perfect (2004),Children|Comedy|Sci-Fi,Pixel Perfect,2004,1956661,4.500000,,Ricky Ullman|Leah Pipes|Spencer Redford,The movie starts with sixteen-year old Roscoe ...
7122,183301,The Tale of the Bunny Picnic (1986),Children,The Tale of the Bunny Picnic,1986,8554766,3.000000,,Steve Whitmire|Martin P. Robinson|Richard Hunt...,The story is actually told by an elderly Bean ...
7123,183317,Patti Rocks (1988),Comedy|Drama,Patti Rocks,1988,34053640,4.500000,,Chris Mulkey,
7124,184053,Battle Planet (2008),Action|Sci-Fi,Battle Planet,2008,20869935,2.000000,,John Duerler|Zack Ward|Monica May|Colleen Smit...,"In the not-so-distant future, Captain Jordan S..."


# RAG - Retrieval Augmented Generation

For semantic recommendations, all text features of a movie are combined into a single text column, which is then embedded. The resulting embeddings are used for the retrieval step (semantic search).

## Combine Text Features into a single column

In [32]:
# --- 1. Combine Text Features ---
print("Combining text features...")

def _has_value(value):
    """Return True when `value` is usable text, i.e. not None, NaN/NA, or empty."""
    # NaN is truthy in Python, so a plain truthiness test would let a missing
    # field through and render it as the literal text "nan".
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return False
    return bool(value)


def combine_movie_text(row):
    """Combine a movie's text fields into a single descriptive string.

    Each available field is rendered as "<Label>: <value>" and the parts are
    joined with ". ". In the genres, tags and actors fields the "|" separator
    is replaced with ", " for better encoding. Fields that are missing
    (None, NaN or empty) are skipped, except the plot, which always gets a
    label and falls back to "Plot: N/A".

    Args:
        row: A mapping-like row (e.g. a pandas Series) providing the keys
            'cleaned_title', 'genres', 'tags', 'actors' and 'plot'.

    Returns:
        The combined string.
    """
    parts = []

    # The title is used verbatim; no separator replacement is applied to it.
    if _has_value(row['cleaned_title']):
        parts.append(f"Title: {row['cleaned_title']}")

    # Pipe-separated list fields share the same formatting rule.
    for label, column in (("Genres", "genres"), ("Tags", "tags"), ("actors/Actors", "actors")):
        value = row[column]
        if _has_value(value):
            parts.append(f"{label}: {value}".replace("|", ", "))

    # Ensure plot always has a label
    plot = row['plot']
    parts.append(f"Plot: {plot}" if _has_value(plot) else "Plot: N/A")

    return ". ".join(parts)

# Apply the function to create the new column
# axis=1 passes each row to combine_movie_text, producing one combined string per row.
final_merged_df['combined_text'] = final_merged_df.apply(combine_movie_text, axis=1)

# Display the DataFrame, which now includes the 'combined_text' column.
final_merged_df

Combining text features...


Unnamed: 0,movieId,title,genres,cleaned_title,year,wikipedia_id,tags,actors,plot,combined_text
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story,1995,53085,pixar|fun,John Ratzenberger|Tim Allen|Annie Potts|Don Ri...,Woody is a pull-string cowboy doll and leader...,"Title: Toy Story. Genres: Adventure, Animation..."
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji,1995,3700174,fantasy|magic board game|Robin Williams|game,David Alan Grier|Robin Williams|Jonathan Hyde|...,"In 1869, two boys bury a game board in a fore...","Title: Jumanji. Genres: Adventure, Children, F..."
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men,1995,1934035,moldy|old,Katie Sagona|Walter Matthau|Kevin Pollak|Daryl...,The lifelong feud between Max and John has c...,"Title: Grumpier Old Men. Genres: Comedy, Roman..."
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale,1995,972970,,Whitney Houston|Angela Bassett|Loretta Devine|...,Waiting to Exhale is a story about four Africa...,"Title: Waiting to Exhale. Genres: Comedy, Dram..."
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II,1995,3303622,pregnancy|remake,Steve Martin|Diane Keaton|Martin Short|Kimberl...,George Banks must accept the reality of what h...,Title: Father of the Bride Part II. Genres: Co...
...,...,...,...,...,...,...,...,...,...,...
7121,182731,Pixel Perfect (2004),Children|Comedy|Sci-Fi,Pixel Perfect,2004,1956661,,Ricky Ullman|Leah Pipes|Spencer Redford,The movie starts with sixteen-year old Roscoe ...,"Title: Pixel Perfect. Genres: Children, Comedy..."
7122,183301,The Tale of the Bunny Picnic (1986),Children,The Tale of the Bunny Picnic,1986,8554766,,Steve Whitmire|Martin P. Robinson|Richard Hunt...,The story is actually told by an elderly Bean ...,Title: The Tale of the Bunny Picnic. Genres: C...
7123,183317,Patti Rocks (1988),Comedy|Drama,Patti Rocks,1988,34053640,,Chris Mulkey,,"Title: Patti Rocks. Genres: Comedy, Drama. act..."
7124,184053,Battle Planet (2008),Action|Sci-Fi,Battle Planet,2008,20869935,,John Duerler|Zack Ward|Monica May|Colleen Smit...,"In the not-so-distant future, Captain Jordan S...","Title: Battle Planet. Genres: Action, Sci-Fi. ..."


## Embed the Combined Text using Gemini API

In [33]:
from google import genai
from google.genai import types
import time
# libraries to import environment variables
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()


# Google's text-embedding-004 allows batching up to 100 documents per request
BATCH_SIZE = 100 # Maximize batch size to reduce number of API calls

# Pause between batches to avoid rate limiting
DELAY_BETWEEN_BATCHES = 1.0 # seconds
print(f"Batch Size: {BATCH_SIZE}")
print(f"Delay between batches: {DELAY_BETWEEN_BATCHES:.4f} seconds")

print("\nStarting embedding generation...")

# Initialize the Google GenAI client
client = genai.Client(api_key=os.getenv("API_KEY"))

# Embeddings are accumulated in DataFrame row order, one vector per row
embeddings_list = []

# Calculate the number of batches needed
total_batches = int(np.ceil(len(final_merged_df) / BATCH_SIZE))
print(f"Total rows: {len(final_merged_df)}, Total batches: {total_batches}")

# Process data in batches
for i in range(0, len(final_merged_df), BATCH_SIZE):
    # 1-based batch number derived from BATCH_SIZE, so the progress output
    # stays correct if the batch size is ever changed
    batch_number = i // BATCH_SIZE + 1

    # Get the current batch of texts; .iloc makes the positional slicing
    # explicit regardless of the DataFrame's index labels
    batch_texts = final_merged_df['combined_text'].iloc[i:i + BATCH_SIZE].tolist()

    # Get the embeddings for the current batch using the Google GenAI API
    # The model used is "text-embedding-004" and task_type is "RETRIEVAL_DOCUMENT"
    # Task type is set to "RETRIEVAL_DOCUMENT" for better retrieval performance
    response = client.models.embed_content(
        model="text-embedding-004",
        contents=batch_texts,
        config=types.EmbedContentConfig(task_type="RETRIEVAL_DOCUMENT"))

    embeddings_list.extend(embedding.values for embedding in response.embeddings)

    print(f"Batch {batch_number} Complete")

    # --- Rate Limiting Delay ---
    # No request follows the final batch, so there is nothing to wait for
    if batch_number < total_batches:
        time.sleep(DELAY_BETWEEN_BATCHES)


# --- 3. Add Embeddings to DataFrame ---
print("\nEmbedding generation complete.")

# Check if the number of embeddings matches the dataframe length
if len(embeddings_list) == len(final_merged_df):
    final_merged_df['embeddings'] = embeddings_list
    print("Embeddings column added successfully.")

else:
    print(f"Error: Number of generated embeddings ({len(embeddings_list)}) does not match DataFrame length ({len(final_merged_df)}).")
    print("Embeddings column was not added. Please check errors.")


Batch Size: 100
Delay between batches: 1.0000 seconds

Starting embedding generation...
Total rows: 7126, Total batches: 72
Batch 1 Complete
Batch 2 Complete
Batch 3 Complete
Batch 4 Complete
Batch 5 Complete
Batch 6 Complete
Batch 7 Complete
Batch 8 Complete
Batch 9 Complete
Batch 10 Complete
Batch 11 Complete
Batch 12 Complete
Batch 13 Complete
Batch 14 Complete
Batch 15 Complete
Batch 16 Complete
Batch 17 Complete
Batch 18 Complete
Batch 19 Complete
Batch 20 Complete
Batch 21 Complete
Batch 22 Complete
Batch 23 Complete
Batch 24 Complete
Batch 25 Complete
Batch 26 Complete
Batch 27 Complete
Batch 28 Complete
Batch 29 Complete
Batch 30 Complete
Batch 31 Complete
Batch 32 Complete
Batch 33 Complete
Batch 34 Complete
Batch 35 Complete
Batch 36 Complete
Batch 37 Complete
Batch 38 Complete
Batch 39 Complete
Batch 40 Complete
Batch 41 Complete
Batch 42 Complete
Batch 43 Complete
Batch 44 Complete
Batch 45 Complete
Batch 46 Complete
Batch 47 Complete
Batch 48 Complete
Batch 49 Complete
Bat

In [34]:
# Inspect the 'embeddings' column to confirm each row holds an embedding vector.
final_merged_df['embeddings']

0       [0.016334996, -0.007906361, 0.02554022, 0.0667...
1       [-0.035171103, -0.0016925373, 0.0070137545, 0....
2       [-0.01428898, 0.008715143, -0.0026058434, 0.03...
3       [0.02996565, -0.03595311, 0.061581515, 0.05286...
4       [-0.025408199, 0.0093763005, -0.01168014, 0.06...
                              ...                        
7121    [0.022956638, -0.0015231156, -0.017472545, 0.0...
7122    [-0.013111466, 0.0050408226, 0.015124239, 0.05...
7123    [0.0023729715, 0.011117007, -0.038275216, 0.05...
7124    [-0.01165242, -0.004822535, -0.029260308, 0.05...
7125    [0.0035788808, -0.029340807, -0.019439552, 0.0...
Name: embeddings, Length: 7126, dtype: object

## Write the embeddings to CSV for later use

In [35]:
# Persist the DataFrame without the index column.
# NOTE: CSV stores the list-valued 'embeddings' column as text, so it must be parsed back into arrays on load.
final_merged_df.to_csv("df_with_embeddings.csv", index=False)

## Test Semantic Retrieval 

In [50]:
import ast

import pandas as pd
import numpy as np

# import dataframe from csv
df = pd.read_csv("df_with_embeddings.csv")

# The CSV stores each embedding as the text of a Python list, so convert that
# text back into a numpy array.
# ast.literal_eval only accepts Python literals; unlike eval(), it cannot
# execute arbitrary code that might be present in the file.
try:
    df['embeddings'] = df['embeddings'].apply(lambda x: np.array(ast.literal_eval(x))) #convert from string to numpy
except (SyntaxError, ValueError, NameError, TypeError) as e:
    # literal_eval reports malformed or non-literal input as ValueError/SyntaxError
    print(f"Error converting embeddings to numpy arrays.  Check that 'embeddings' is a list of numbers: {e}")

df.head(5)

Unnamed: 0,movieId,title,genres,cleaned_title,year,wikipedia_id,rating,tags,actors,plot,combined_text,embeddings
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story,1995.0,53085,3.92093,pixar|fun,John Ratzenberger|Tim Allen|Annie Potts|Don Ri...,Woody is a pull-string cowboy doll and leader...,"Title: Toy Story. Genres: Adventure, Animation...","[0.016334996, -0.007906361, 0.02554022, 0.0667..."
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji,1995.0,3700174,3.431818,fantasy|magic board game|Robin Williams|game,David Alan Grier|Robin Williams|Jonathan Hyde|...,"In 1869, two boys bury a game board in a fore...","Title: Jumanji. Genres: Adventure, Children, F...","[-0.035171103, -0.0016925373, 0.0070137545, 0...."
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men,1995.0,1934035,3.259615,moldy|old,Katie Sagona|Walter Matthau|Kevin Pollak|Daryl...,The lifelong feud between Max and John has c...,"Title: Grumpier Old Men. Genres: Comedy, Roman...","[-0.01428898, 0.008715143, -0.0026058434, 0.03..."
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale,1995.0,972970,2.357143,,Whitney Houston|Angela Bassett|Loretta Devine|...,Waiting to Exhale is a story about four Africa...,"Title: Waiting to Exhale. Genres: Comedy, Dram...","[0.02996565, -0.03595311, 0.061581515, 0.05286..."
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II,1995.0,3303622,3.071429,pregnancy|remake,Steve Martin|Diane Keaton|Martin Short|Kimberl...,George Banks must accept the reality of what h...,Title: Father of the Bride Part II. Genres: Co...,"[-0.025408199, 0.0093763005, -0.01168014, 0.06..."


In [51]:
# Returns RAG-based recommendations sorted by similarity scores.
from google import genai
from google.genai import types
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Initialize the Google GenAI client
# The key is read from the API_KEY environment variable; os.getenv returns None when it is unset.
client = genai.Client(api_key=os.getenv("API_KEY"))

def get_embedding(text):
    """
    Embed `text` with the "text-embedding-004" model via the Google GenAI API.

    The request goes through the module-level `client` and uses the
    "RETRIEVAL_QUERY" task type. Returns the `values` of the first
    embedding in the response.
    """
    query_config = types.EmbedContentConfig(task_type="RETRIEVAL_QUERY")
    api_result = client.models.embed_content(
        model="text-embedding-004",
        contents=text,
        config=query_config,
    )

    # A single input is sent, so only the first embedding is read.
    first_embedding = api_result.embeddings[0]
    return first_embedding.values

# Number of recommendations to retrieve
TOP_K = 20
query_text = "An action movie with a man who fights in an iron suit"

# Calculate dot product between input embedding and movie embeddings.
try:
    # NOTE: despite its name, movie_embeddings holds one similarity score per movie
    # (the dot product of each movie embedding with the query embedding).
    movie_embeddings = np.dot(np.stack(df['embeddings']), get_embedding(query_text))
    sorted_indices = np.argsort(movie_embeddings)[::-1]  # Sort in descending order
    top_indices = sorted_indices[:TOP_K]  # Get the TOP_K highest-scoring indices

except ValueError as e:
    print(f"Error calculating dot product.  Ensure embedding dimensions match: {e}")
    # Re-raise: continuing would fail on an undefined top_indices, or silently
    # reuse a stale one left over from an earlier run of this cell.
    raise

# Display the TOP_K recommended movies
recommended_movies = df.iloc[top_indices][['movieId', 'title', 'wikipedia_id', 'tags', 'genres', 'actors', 'plot']]
recommended_movies

Unnamed: 0,movieId,title,wikipedia_id,tags,genres,actors,plot
1058,1599,Steel (1997),2101749,,Action,Shaquille O'Neal|Richard Roundtree|Judd Nelson...,John Henry Irons is a weapons designer who in...
6867,97950,"Man with the Iron Fists, The (2012)",30840944,,Action|Adventure|Crime,Jamie Chung|David Bautista|RZA|Lucy Liu|Cung L...,
5855,59315,Iron Man (2008),5676692,,Action|Adventure|Sci-Fi,Bill Smitrovich|Robert Downey Jr.|Terrence How...,"Playboy and genius Tony Stark, who has inherit..."
1769,2625,Black Mask (Hak hap) (1996),3315039,,Action|Adventure|Crime|Sci-Fi|Thriller,Francoise Yip|Jet Li|Karen Mok|Anthony Wong|La...,"Tsui Chik , tries to lead a quiet life as a li..."
6585,88140,Captain America: The First Avenger (2011),26999426,,Action|Adventure|Sci-Fi|Thriller|War,Christian Black|Chris Evans|Hayley Atwell|Seba...,"In the present day, scientists in the Arctic ..."
6321,77561,Iron Man 2 (2010),22144721,,Action|Adventure|Sci-Fi|Thriller|IMAX,Robert Downey Jr.|Don Cheadle|Scarlett Johanss...,"In Russia, the news media covers Tony Stark's ..."
3150,4846,Iron Monkey (Siu nin Wong Fei-hung ji: Tit Ma ...,2048766,,Action|Comedy,Hsiao Ho|Yu Rongguang|Donnie Yen|Jean Wang|Shi...,The plot centers on a masked pugilist known as...
7124,184053,Battle Planet (2008),20869935,,Action|Sci-Fi,John Duerler|Zack Ward|Monica May|Colleen Smit...,"In the not-so-distant future, Captain Jordan S..."
6914,102007,"Invincible Iron Man, The (2007)",25124848,animation,Animation,Marc Worden|Elisa Gabrielli|Fred Tatasciore|Ro...,
4512,8136,Indestructible Man (1956),6015713,,Crime|Horror|Sci-Fi,"Lon Chaney, Jr.|Max Showalter|Madge Cleveland|...",Told in flashback by police detective Dick Cha...
