In [1]:
from imdb import IMDb
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch.nn.functional as F
import difflib

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# IMDb data preprocessing and attribute extraction
def preprocess_imdb_datasource():
    # class to access the data
    imdb_obj = IMDb()
    movies = imdb_obj.get_top250_movies()
    movie_ids = []
    movie_titles = []
    movie_genres = []
    movie_actors = []
    movie_directors = []
    movie_plot_summaries = []
    # attributes extractions
    for movie in movies:
        movie_ids.append(movie.movieID)
        movie_titles.append(movie['title'])
        if 'genres' in movie:
            movie_genres.append(movie['genres'])
        else:
            movie_genres.append([])
        if 'cast' in movie:
            movie_actors.append([actor['name'] for actor in movie['cast']])
        else:
            movie_actors.append([])
        if 'director' in movie:
            movie_directors.append([director['name'] for director in movie['director']])
        else:
            movie_directors.append([])
        if 'plot outline' in movie:
            movie_plot_summaries.append(movie['plot outline'])
        else:
            movie_plot_summaries.append("")

    return movie_ids, movie_titles, movie_genres, movie_actors, movie_directors, movie_plot_summaries

In [3]:
# load the pre-trained gpt2 model and tokenizer
model_name = 'gpt2'
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

def preprocess_text_with_gpt2(text):
    # tokenize the input using gpt2 tokenizer
    encoded_input = tokenizer.encode_plus(text, add_special_tokens=True, return_tensors='pt')
    return encoded_input

def preprocess_movie_attributes(movie_genres, movie_actors, movie_directors, movie_plot_summaries):
    preprocessed_attributes = []
    for genre, actors, directors, summary in zip(movie_genres, movie_actors, movie_directors, movie_plot_summaries):
        # convert genre, actors, and directors to strings
        genre_str = ', '.join(genre)
        actors_str = ', '.join(actors)
        directors_str = ', '.join(directors)
        # concatenate the attributes into a single string
        combined_attr = f"{genre_str}, {actors_str}, {directors_str}, {summary}"
        preprocessed_attributes.append(combined_attr)
    return preprocessed_attributes


In [4]:
def encoding_movie_attributes_with_gpt(movie_attributes):
    model.eval()
    encoded_attributes = []
    for attr in movie_attributes:
        if not attr or (isinstance(attr, (list, tuple)) and all(value == '' for value in attr)):
            continue
        preprocessed_attr = preprocess_text_with_gpt2(attr)
        with torch.no_grad():
            outputs = model(input_ids=preprocessed_attr['input_ids'], attention_mask=preprocessed_attr['attention_mask'])
        if hasattr(outputs, 'last_hidden_state') and outputs.last_hidden_state.size(1) > 0:
            encoded_attr = outputs.last_hidden_state
            encoded_attributes.append(encoded_attr)
        else:
            continue
    return encoded_attributes

In [5]:
# compute similarity scores
def movie_similarity_scores(encoded_attributes):
    similarity_matrix = []
    for attr in encoded_attributes:
        attr = attr.squeeze(0)
        # Calculate cosine similarity between attribute and all other attributes
        similarity_scores = F.cosine_similarity(attr, torch.stack(encoded_attributes).squeeze(1), dim=1)
        similarity_matrix.append(similarity_scores)
    similarity_matrix = torch.stack(similarity_matrix)
    return similarity_matrix

In [6]:
def get_content_based_recommendations(movie_id, similarity_matrix, movie_ids, top_n=5):
    # Get the index of the movie in the movie_ids list
    movie_index = movie_ids.index(movie_id)
    # Get the similarity scores of the given movie with other movies
    movie_scores = similarity_matrix[movie_index]
    # Sort the movie scores in descending order and get the top N movies
    top_movie_indices = sorted(range(len(movie_scores)), key=lambda i: movie_scores[i], reverse=True)[:top_n]
    # Retrieve the movie IDs of the top recommended movies
    recommended_movie_ids = [movie_ids[index] for index in top_movie_indices]
    return recommended_movie_ids

In [7]:
def preprocess_user_input(user_input):
    movie_preferences = []
    keywords = []
    # tokenization and preprocessing using gpt2 
    tokenized_input = preprocess_text_with_gpt2(user_input)
    # logic for extracting preferences and keywords
    for index, token in enumerate(tokenized_input):
        if token in ['like', 'love', 'enjoy']:
            # i the next token is 'movie' or 'movies' to capture preferences
            if index + 1 < len(tokenized_input) and tokenized_input[index + 1] in ['movie', 'movies']:
                movie_preferences.append(token)
            else:
                keywords.append(token)
        elif token in ['recommend', 'similar', 'recommendation']:
            # if the previous token is 'with' to capture recommendation request
            if index - 1 >= 0 and tokenized_input[index - 1] == 'with':
                keywords.append(token)
        elif token == 'not':
            # if the next token is 'like' to capture preferences to avoid
            if index + 1 < len(tokenized_input) and tokenized_input[index + 1] == 'like':
                index += 1  # skip the 'like' token after 'not'
            else:
                keywords.append(token)
        else:
            keywords.append(token)
    
    return movie_preferences, keywords

In [8]:
def extract_specific_movie(user_input):
    # tokenization and preprocessing using gpt2 
    encoded_input = preprocess_text_with_gpt2(user_input)
    # predictions using gpt2 model
    with torch.no_grad():
        outputs = model.generate(input_ids=encoded_input['input_ids'], attention_mask=encoded_input['attention_mask'])
    # decode the generated output to get the movie title
    movie_title = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return movie_title

def get_movie_id_from_title(movie_title):
    # retrieve the movie_id from IMDb data source
    imdb_obj = IMDb()
    movies = imdb_obj.search_movie(movie_title)
    if movies:
        first_movie = movies[0]
        movie_id = first_movie.movieID
        return movie_id
    return None

In [9]:
def match_movies_by_keywords(keywords, movie_ids, movie_attributes):
    matched_movies = []
    for movie_id, attributes in zip(movie_ids, movie_attributes):
        # if any of the keywords match the movie attributes
        for keyword in keywords:
            if keyword in attributes:
                matched_movies.append(movie_id)
                break

    return matched_movies

def get_movie_details(movie_id):
    # retrieve the details of a movie based on its ID using IMDbPy
    ia = IMDb()
    movie = ia.get_movie(movie_id)
    title = movie['title']
    genre = ", ".join(movie['genres'])
    actors = ", ".join([actor['name'] for actor in movie['cast']])
    director = ", ".join([director['name'] for director in movie['directors']])
    summary = movie['plot'][0] if movie['plot'] else None

    return {"title": title, "genre": genre, "actors": actors, "director": director, "summary": summary}

In [10]:
movie_ids, movie_titles, movie_genres, movie_actors, movie_directors, movie_plot_summaries = preprocess_imdb_datasource()
preprocessed_attributes = preprocess_movie_attributes(movie_genres, movie_actors, movie_directors, movie_plot_summaries)
encoded_attributes = encoding_movie_attributes_with_gpt(preprocessed_attributes)
if encoded_attributes:
    similarity_matrix = movie_similarity_scores(encoded_attributes)

In [13]:
while True:
    user_input = input("User: ")
    movie_preferences, keywords = preprocess_user_input(user_input)
    if movie_preferences:
        print("Thank you for sharing your movie preferences!")
        response = input("Would you like to some movie recommendations based on your preferences? (yes/no): ")
        if response.lower() in ['yes', 'y']:
            matched_movies = match_movies_by_keywords(movie_preferences, movie_titles)
            if matched_movies:
                recommended_movie_ids = get_content_based_recommendations(matched_movies, similarity_matrix, movie_ids)
                if recommended_movie_ids:
                    print("Here are some recommended movies that may match your taste:")
                    for movie_id in recommended_movie_ids:
                        movie_details = get_movie_details(movie_id)
                        print(f"Title: {movie_details['title']}")
                        print(f"Genre: {movie_details['genre']}")
                        print(f"Actors: {movie_details['actors']}")
                        print(f"Director: {movie_details['director']}")
                        print(f"Summary: {movie_details['summary']}")
                        print()
                else:
                    print("No recommendations found matching your preferences.")
            else:
                print("No movies found matching your preferences.")
        else:
            print("OKAY.")

    elif keywords:
        # handle user keywords
        matched_movies = match_movies_by_keywords(keywords, movie_titles)
        if matched_movies:
            recommended_movie_ids = get_content_based_recommendations(matched_movies, similarity_matrix, movie_ids)
            if recommended_movie_ids:
                # display recommended movie details to the user
                print("Here are some recommended movies:")
                for movie_id in recommended_movie_ids:
                    movie_details = get_movie_details(movie_id)
                    print(f"Title: {movie_details['title']}")
                    print(f"Genre: {movie_details['genre']}")
                    print(f"Actors: {movie_details['actors']}")
                    print(f"Director: {movie_details['director']}")
                    print(f"Summary: {movie_details['summary']}")
                    print()
            else:
                print("No recommendations found for the given keywords.")
        else:
            # fuzzy matching to find close matches
            matched_movies = match_movies_by_keywords(keywords, preprocessed_attributes, movie_titles)
            if matched_movies:
                recommended_movie_ids = get_content_based_recommendations(matched_movies, similarity_matrix, movie_ids)
                if recommended_movie_ids:
                    print("Here are some recommended movies based on similar titles:")
                    for movie_id in recommended_movie_ids:
                        movie_details = get_movie_details(movie_id)
                        print(f"Title: {movie_details['title']}")
                        print(f"Genre: {movie_details['genre']}")
                        print(f"Actors: {movie_details['actors']}")
                        print(f"Director: {movie_details['director']}")
                        print(f"Summary: {movie_details['summary']}")
                        print()
                else:
                    print("No recommendations found for similar movie titles.")
            else:
                print("No movies found matching the given keywords.")
    else:
        # no movie preference or keywords detected
        print("I'm sorry, I couldn't understand your request. Could you please provide more information?")

TypeError: match_movies_by_keywords() missing 1 required positional argument: 'movie_attributes'