In [None]:
import pandas as pd
import requests
import tarfile
import json
import regex as re
import string
from wordcloud import WordCloud
from textwrap import wrap
from matplotlib import pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
import seaborn as sns
from fuzzywuzzy import process



In [None]:
open('MovieSummaries.tar.gz')

<_io.TextIOWrapper name='MovieSummaries.tar.gz' mode='r' encoding='cp1252'>

In [None]:
# Read data - files are eighter saved as .txt or as tsv-format (tab separated)
df_character_metadata = pd.read_csv('MovieSummaries/character.metadata.tsv', delimiter='\t', header=None)
df_movie_metadata = pd.read_csv('MovieSummaries/movie.metadata.tsv', delimiter='\t', header=None)
df_name_cluster = pd.read_csv('MovieSummaries/name.clusters.txt', delimiter='\t', header=None)
df_plot_summary = pd.read_csv('MovieSummaries/plot_summaries.txt', delimiter='\t', header=None)
df_tvtrope_cluster = pd.read_csv('MovieSummaries/tvtropes.clusters.txt', delimiter='\t', header=None)

In [None]:
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove numbers
    text = re.sub(r'\w*\d\w*', '', text)
    # Remove punctuation
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    # Remove extra spaces
    text = re.sub(r' +', ' ', text)
    #remove leading and trailing spaces
    text = text.strip()
    return text

In [None]:
# If plot_summaries.csv exists, read it, else create it
try:
    df_plot_summary = pd.read_csv('MovieSummaries/plot_summaries.csv').iloc[: , 1:]
except FileNotFoundError:
    print('Creating plot_summaries.csv')
    # Preprocess the text
    df_plot_summary[1] = df_plot_summary[1].apply(preprocess_text)
    # Save the preprocessed data
    df_plot_summary.to_csv('MovieSummaries/plot_summaries.csv')

In [None]:
# Load movie summaries and only keep ID and text columns
plot = pd.read_csv('MovieSummaries/plot_summaries.csv')
plot_df = plot.drop('Unnamed: 0', axis=1).rename(columns={"0": "id", "1": "plot"})

# Load movie metadata and only keep ID, movie title and genre json information
df_movie_metadata = pd.read_csv('MovieSummaries/movie.metadata.tsv', delimiter='\t', header=None)
genre_df = df_movie_metadata[[0,2,8]]

In [None]:
# Restructuring genre jsons to genre lists
genre_df['genre'] = genre_df[8].apply(lambda x: list(json.loads(x).values()))
genre_df.columns = ['movie_id','title','genre_json','genre']
genre_df = genre_df.drop(columns=['genre_json'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genre_df['genre'] = genre_df[8].apply(lambda x: list(json.loads(x).values()))


In [None]:
# Merging movie summaries and genres
df = plot_df.merge(genre_df, left_on='id', right_on='movie_id', how='left')
df = df.drop('movie_id', axis=1)

In [None]:
# There exsist 99 movies with movie summaries but no metadata available. These are removed.
df_length_old = len(df)
df.dropna(inplace=True)

# There furthermore exsist 411 movies with empty genre lists. These are also removed.
df = df[~df['genre'].apply(lambda x: len(x) == 0)]

print("Number of removed rows: ", df_length_old-len(df))

Number of removed rows:  510


In [None]:
# Final dataset
df.head()

Unnamed: 0,id,plot,title,genre
0,23890098,shlykov a hardworking taxi driver and lyosha a...,Taxi Blues,"[Drama, World cinema]"
1,31186339,the nation of panem consists of a wealthy capi...,The Hunger Games,"[Action/Adventure, Science Fiction, Action, Dr..."
2,20663735,poovalli induchoodan is sentenced for six year...,Narasimham,"[Musical, Action, Drama, Bollywood]"
3,2231378,the lemon drop kid a new york city swindler is...,The Lemon Drop Kid,"[Screwball comedy, Comedy]"
4,595909,seventhday adventist church pastor michael cha...,A Cry in the Dark,"[Crime Fiction, Drama, Docudrama, World cinema..."


In [None]:
df_character_metadata.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,975900,/m/03vyhn,2001-08-24,Akooshay,1958-08-26,F,1.62,,Wanda De Jesus,42.0,/m/0bgchxw,/m/0bgcj3x,/m/03wcfv7
1,975900,/m/03vyhn,2001-08-24,Lieutenant Melanie Ballard,1974-08-15,F,1.78,/m/044038p,Natasha Henstridge,27.0,/m/0jys3m,/m/0bgchn4,/m/0346l4
2,975900,/m/03vyhn,2001-08-24,Desolation Williams,1969-06-15,M,1.727,/m/0x67,Ice Cube,32.0,/m/0jys3g,/m/0bgchn_,/m/01vw26l
3,975900,/m/03vyhn,2001-08-24,Sgt Jericho Butler,1967-09-12,M,1.75,,Jason Statham,33.0,/m/02vchl6,/m/0bgchnq,/m/034hyc
4,975900,/m/03vyhn,2001-08-24,Bashira Kincaid,1977-09-25,F,1.65,,Clea DuVall,23.0,/m/02vbb3r,/m/0bgchp9,/m/01y9xg


In [None]:
actors_df = df_character_metadata[[0,8]].copy()
actors_df.columns = ["id", "actors"]

In [None]:
actors_df.head()

Unnamed: 0,id,actors
0,975900,Wanda De Jesus
1,975900,Natasha Henstridge
2,975900,Ice Cube
3,975900,Jason Statham
4,975900,Clea DuVall


In [None]:
actors_df['actors'] = actors_df['actors'].fillna('').astype(str)
# Group actors by movie and aggregate their names into a single string
actors_grouped = actors_df.groupby('id')['actors'].apply(', '.join).reset_index()

# Merge actors back into the movies DataFrame
movies_with_actors = pd.merge(df, actors_grouped, on='id', how='left')

# Fill NaN values in the relevant columns with an empty string
movies_with_actors['plot'] = movies_with_actors['plot'].fillna("")
movies_with_actors['actors'] = movies_with_actors['actors'].fillna("")

movies_with_actors.head()

Unnamed: 0,id,plot,title,genre,actors
0,23890098,shlykov a hardworking taxi driver and lyosha a...,Taxi Blues,"[Drama, World cinema]","Natalia Koliakanova, Pyotr Mamonov, Hal Singer..."
1,31186339,the nation of panem consists of a wealthy capi...,The Hunger Games,"[Action/Adventure, Science Fiction, Action, Dr...","Jacqueline Emerson, Jennifer Lawrence, Josh Hu..."
2,20663735,poovalli induchoodan is sentenced for six year...,Narasimham,"[Musical, Action, Drama, Bollywood]","Thilakan, Sai Kumar, Kalabhavan Mani, , Bharat..."
3,2231378,the lemon drop kid a new york city swindler is...,The Lemon Drop Kid,"[Screwball comedy, Comedy]","Jane Darwell, Bob Hope, Marilyn Maxwell, Ann S..."
4,595909,seventhday adventist church pastor michael cha...,A Cry in the Dark,"[Crime Fiction, Drama, Docudrama, World cinema...","Frank Holden, Sam Neill, Meryl Streep, Deborra..."


In [None]:
# additional preprocessing for genre, remove words, which confuse the clustering
def preprocess_genre(genre):
    genre = [preprocess_text(genre) for genre in genre]
    genre = [re.sub(r'\b(film|films|movie|movies|cinema|new|\'s)\b', '', genre) for genre in genre]
    genre = [genre.strip() for genre in genre]
    return genre

In [None]:
movies_with_actors['genre'] = movies_with_actors['genre'].apply(lambda x: preprocess_genre(x))

In [None]:
def recommend_movies(user_input, movies_df, top_n=5, num_genre_matches=3):
    """
    Recommend movies based on user input with improved matching for actors, genres, and plot similarity.
    Now considers multiple close genre matches for each input genre.

    Args:
        user_input (dict): Dictionary containing 'actors', 'genres', and/or 'plot'
        movies_df (DataFrame): DataFrame containing movie data
        top_n (int): Number of recommendations to return
        num_genre_matches (int): Number of similar genres to consider for each input genre

    Returns:
        DataFrame: Top recommended movies with similarity scores
    """
    # Create a copy of the dataframe to avoid modifications to original
    df = movies_df.copy()

    # Initialize similarity scores
    df['actor_score'] = 0.0
    df['genre_score'] = 0.0
    df['plot_score'] = 0.0

    # 1. Actor Matching
    if user_input.get("actors"):
        # Ensure actors column is properly formatted
        df['actors_list'] = df['actors'].apply(
            lambda x: [name.strip().lower() for name in str(x).split(',') if name.strip()]
        )

        for actor in user_input["actors"]:
            actor = actor.lower()
            # Get closest matching actor name from the dataset
            all_actors = set([a for actors in df['actors_list'] for a in actors])
            if all_actors:
                closest_match = process.extractOne(actor, all_actors)
                if closest_match and closest_match[1] >= 80:  # 80% similarity threshold
                    print(f"Matching actor '{actor}' to '{closest_match[0]}' ({closest_match[1]}% match)")
                    # Add score for movies containing this actor
                    df['actor_score'] += df['actors_list'].apply(
                        lambda x: 1 if closest_match[0] in x else 0
                    )

    # 2. Genre Matching with multiple similar genres
    if user_input.get("genres"):
        # Convert genre lists from string to actual lists if needed
        df['genre_list'] = df['genre'].apply(
            lambda x: x if isinstance(x, list) else ast.literal_eval(x) if isinstance(x, str) else []
        )

        for genre in user_input["genres"]:
            genre = genre.lower()
            # Get all unique genres from the dataset
            all_genres = set([g.lower() for genres in df['genre_list'] for g in genres])

            if all_genres:
                # Get multiple close matches for each genre
                closest_matches = process.extract(genre, all_genres, limit=num_genre_matches)

                print(f"\nFor genre '{genre}', found similar genres:")
                for match, score in closest_matches:
                    if score >= 70:  # 70% similarity threshold for extended matches
                        print(f"- '{match}' ({score}% match)")
                        # Add weighted score based on match quality
                        weight = score / 100.0  # Convert percentage to decimal
                        df['genre_score'] += df['genre_list'].apply(
                            lambda x: weight if match in [g.lower() for g in x] else 0
                        )

    # 3. Plot Similarity
    if user_input.get("plot"):
        # Prepare TF-IDF vectorizer
        tfidf = TfidfVectorizer(
            stop_words='english',
            max_features=5000,
            ngram_range=(1, 2)
        )

        # Combine user query with movie plots
        all_plots = [user_input["plot"]] + df['plot'].tolist()
        tfidf_matrix = tfidf.fit_transform(all_plots)

        # Calculate cosine similarity
        similarity_scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()
        df['plot_score'] = similarity_scores

    # Calculate final weighted score
    weights = {
        'actor': 0.3,
        'genre': 0.3,
        'plot': 0.4
    }

    # Normalize scores
    if user_input.get("actors"):
        df['actor_score'] = df['actor_score'] / df['actor_score'].max() if df['actor_score'].max() > 0 else 0
    if user_input.get("genres"):
        df['genre_score'] = df['genre_score'] / df['genre_score'].max() if df['genre_score'].max() > 0 else 0
    if user_input.get("plot"):
        df['plot_score'] = (df['plot_score'] - df['plot_score'].min()) / (df['plot_score'].max() - df['plot_score'].min()) if df['plot_score'].max() > df['plot_score'].min() else 0

    # Calculate weighted score based on provided criteria
    df['final_score'] = 0
    if user_input.get("actors"):
        df['final_score'] += weights['actor'] * df['actor_score']
    if user_input.get("genres"):
        df['final_score'] += weights['genre'] * df['genre_score']
    if user_input.get("plot"):
        df['final_score'] += weights['plot'] * df['plot_score']

    # Normalize final score
    if df['final_score'].max() > 0:
        df['final_score'] = df['final_score'] / df['final_score'].max()

    # Sort and select top recommendations
    results = df.nlargest(top_n, 'final_score')[
        ['title', 'actors', 'genre', 'plot', 'final_score', 'actor_score', 'genre_score', 'plot_score']
    ]

    return results


In [None]:
# Example usage
user_input = {
    "actors": ["Keanu Reeves", "Sandra Bullock"],  # Optional: List of actors
    "genres": ["Action", "Thriller"],             # Optional: List of genres
    "plot": "A thrilling action movie involving a high-speed chase."  # Optional: Plot description
}

# Recommend movies
recommendations = recommend_movies(user_input, movies_with_actors, top_n=3)
recommendations

Matching actor 'keanu reeves' to 'keanu reeves' (100% match)
Matching actor 'sandra bullock' to 'sandra bullock' (100% match)

For genre 'action', found similar genres:
- 'action' (100% match)
- 'live action' (90% match)
- 'action thrillers' (90% match)

For genre 'thriller', found similar genres:
- 'thriller' (100% match)
- 'action thrillers' (90% match)
- 'crime thriller' (90% match)


Unnamed: 0,title,actors,genre,plot,final_score,actor_score,genre_score,plot_score
31096,Speed,"Eddie Yansick, Keanu Reeves, Dennis Hopper, Sa...","[thriller, actionadventure, action, crime fict...",an unidentified man traps several businesspeop...,1.0,1.0,0.425532,0.05231


In [None]:
user_input2 = {
    "actors": ["Keanu Reeves", "Laurence Fishburne"],
    "genres": ["Science Fiction", "Action"],
    "plot": "A computer programmer discovers that reality is a simulation and joins a rebellion to free humanity"
}

recommendations2 = recommend_movies(user_input2, movies_with_actors, top_n=3)
recommendations2

Matching actor 'keanu reeves' to 'keanu reeves' (100% match)
Matching actor 'laurence fishburne' to 'laurence fishburne' (100% match)

For genre 'science fiction', found similar genres:
- 'science fiction' (100% match)
- 'science fiction western' (90% match)
- 'apocalyptic and postapocalyptic fiction' (86% match)

For genre 'action', found similar genres:
- 'action' (100% match)
- 'live action' (90% match)
- 'action thrillers' (90% match)


Unnamed: 0,title,actors,genre,plot,final_score,actor_score,genre_score,plot_score
23006,The Matrix,"Belinda McClory, Keanu Reeves, Laurence Fishbu...","[thriller, science fiction, adventure, doomsda...",computer programmer thomas anderson is secretl...,1.0,1.0,0.689655,0.374905
16635,The Matrix Revolutions,"Mary Alice, Keanu Reeves, Laurence Fishburne, ...","[thriller, science fiction, adventure, actiona...",neo and banes bodies lie unconscious in the me...,0.785873,1.0,0.689655,0.023277
7954,The Matrix Reloaded,"Roy Jones Jr., Keanu Reeves, Laurence Fishburn...","[actionadventure, thriller, science fiction, a...",six months after the events of the first movie...,0.771698,1.0,0.689655,0.0


In [None]:
user_input3 = {
    "actors": ["matt damon", "ben affleck"],
    "genres": ["drama"],
    "plot": "A professor from prestigious university helps a genius young janitor realise his full potential"
}

recommendations3 = recommend_movies(user_input3, movies_with_actors, top_n=1)
print(recommendations3)

Matching actor 'matt damon' to 'matt damon' (100% match)
Matching actor 'ben affleck' to 'ben affleck' (100% match)

For genre 'drama', found similar genres:
- 'drama' (100% match)
- 'historical drama' (90% match)
- 'inspirational drama' (90% match)
       title                                             actors  \
6120  Aladin  Riteish Deshmukh, Amitabh Bachchan, Sanjay Dut...   

                                                  genre  \
6120  [world, musical, drama, romantic drama, romanc...   

                                                   plot  final_score  \
6120  the film opens with the family of chatterjee w...          1.0   

      actor_score  genre_score  plot_score  
6120          0.0     0.526316         1.0  
