In [1]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch.nn.functional as F
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Ratings table: path (relative to the working directory) and the loaded frame.
ratings_file = 'ml-latest-small/ratings.csv'
ratings_data = pd.read_csv(ratings_file)

# Movies table: path (relative to the working directory) and the loaded frame.
movies_file = 'ml-latest-small/movies.csv'
movies_data = pd.read_csv(movies_file)

In [3]:
def preprocess_data(ratings_data, movies_data):
    """Join ratings with movie metadata and derive scaled / one-hot features.

    Args:
        ratings_data: DataFrame with at least ``movieId`` and ``rating`` columns.
        movies_data: DataFrame with at least ``movieId`` and ``genres`` columns.

    Returns:
        A new DataFrame (the inputs are not modified) holding the merged rows
        without missing values, a ``total_ratings`` column (number of ratings
        per movie), ``rating`` and ``total_ratings`` min-max scaled, and one
        0/1 ``genre_<name>`` indicator column per individual genre.
    """
    # merge ratings and movies data based on movieId, then drop incomplete rows
    merged_data = pd.merge(ratings_data, movies_data, on='movieId').dropna()

    # feature engineering: how many ratings each movie received
    merged_data = merged_data.assign(
        total_ratings=merged_data.groupby('movieId')['rating'].transform('count')
    )

    # normalize the numeric features; MinMaxScaler rejects zero-row input,
    # so an empty merge is passed through untouched
    if not merged_data.empty:
        scaled_columns = ['rating', 'total_ratings']
        scaler = MinMaxScaler()
        merged_data[scaled_columns] = scaler.fit_transform(merged_data[scaled_columns])

    # handle categorical feature (genres): a movie can carry several
    # pipe-separated genres, so split before encoding instead of creating one
    # column per unique *combination* of genres
    encoded_genres = merged_data['genres'].str.get_dummies(sep='|').add_prefix('genre_')
    return pd.concat([merged_data, encoded_genres], axis=1)

In [30]:
# Load the pre-trained GPT-2 language model and its tokenizer.
# Both are created from the same checkpoint name and are kept as module-level
# objects (`model`, `tokenizer`) that the functions defined below read directly.
model_name = 'gpt2'
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

def preprocess_text_with_gpt2(text, tokenizer):
    """Tokenize ``text`` with the supplied tokenizer.

    Args:
        text: Input handed straight to ``tokenizer.encode_plus``.
        tokenizer: Object exposing ``encode_plus`` (e.g. the GPT-2 tokenizer
            loaded in this notebook).

    Returns:
        Whatever ``encode_plus`` produces for ``text`` when called with special
        tokens enabled and ``return_tensors='pt'``.
    """
    return tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        return_tensors='pt',
    )

def encoding_movie_attributes_with_gpt(movie_attributes):
    """Tokenize attribute strings and stack them into one right-padded id tensor.

    Uses the module-level ``tokenizer``.

    Args:
        movie_attributes: Sequence of strings to encode. ``None`` or an empty
            sequence yields ``None``.

    Returns:
        A 2-D tensor with one row per input string and as many columns as the
        longest tokenized string, or ``None`` when there is nothing to encode.
    """
    if movie_attributes is None or len(movie_attributes) == 0:
        return None

    # Tokenize every attribute string; squeeze(0) drops the batch dimension
    # that return_tensors='pt' adds.
    token_rows = [
        tokenizer.encode_plus(attr, add_special_tokens=True, return_tensors='pt')['input_ids'].squeeze(0)
        for attr in movie_attributes
    ]

    # Pad to the longest *tokenized* row. The character length of the raw
    # string is not the token count: using it over-pads every row, and a
    # negative pad amount would silently truncate a row.
    max_length = max(row.size(0) for row in token_rows)

    # NOTE(review): the fill value 0 is kept from the original code; if 0 is
    # also a real token id for this tokenizer, padding is indistinguishable
    # from content — confirm whether a dedicated pad id should be used.
    padded_rows = [
        torch.nn.functional.pad(row, (0, max_length - row.size(0)), value=0)
        for row in token_rows
    ]

    # Stack the padded sequences into a (num_strings, max_length) tensor
    return torch.stack(padded_rows, dim=0)

In [5]:
def preprocess_movie_attributes(movie_ids, movie_titles, movie_genres):
    """Build one descriptive string per movie from its id, title and genres.

    Args:
        movie_ids: Iterable of movie identifiers.
        movie_titles: Iterable of titles, aligned with ``movie_ids``.
        movie_genres: Iterable of genre strings, aligned with ``movie_ids``.
            Individual genres may be separated by ``|`` or ``,``.

    Returns:
        List of strings shaped like ``"<id>, <title>, <genre>, <genre>, ..."``,
        one per zipped input triple.
    """
    preprocessed_attributes = []
    for movie_id, title, genres in zip(movie_ids, movie_titles, movie_genres):
        # convert the genre string to a list; pipes are normalized to commas
        # first so both delimiters split the same way
        genre_list = [part.strip() for part in genres.replace('|', ',').split(',')]
        # join the attributes into a single string
        preprocessed_attributes.append(f"{movie_id}, {title}, {', '.join(genre_list)}")

    return preprocessed_attributes

In [35]:
def movie_similarity_scores(encoded_attributes):
    """Compute the pairwise cosine-similarity matrix between encoded rows.

    Args:
        encoded_attributes: Tensor whose first dimension indexes movies. Any
            trailing dimensions (e.g. ``(N, L)`` or ``(N, 1, L)``) are
            flattened into a single feature vector per movie.

    Returns:
        Float tensor of shape ``(N, N)`` where entry ``[i, j]`` is the cosine
        similarity between rows ``i`` and ``j``. An input with zero rows gives
        an empty ``(0, 0)`` tensor.
    """
    # Flatten once up front. Unlike squeeze(), this cannot collapse a
    # genuine size-1 feature dimension, and the float conversion is no longer
    # repeated on every loop iteration.
    features = encoded_attributes.flatten(start_dim=1).float()
    if features.size(0) == 0:
        # torch.stack() refuses an empty list, so handle "no movies" explicitly
        return features.new_zeros((0, 0))

    # one row of the matrix at a time: similarity of this movie to all movies
    similarity_rows = [
        torch.cosine_similarity(row.unsqueeze(0), features, dim=1)
        for row in features
    ]
    return torch.stack(similarity_rows)

In [7]:
def get_content_based_recommendations(movie_id, similarity_matrix, movie_ids, top_n=5):
    """Return the ids of the movies most similar to ``movie_id``.

    Args:
        movie_id: Identifier of the query movie; must occur in ``movie_ids``.
        similarity_matrix: Indexable of score rows; row ``i`` holds the
            similarity of ``movie_ids[i]`` to every movie.
        movie_ids: Sequence of identifiers aligned with the matrix rows. Any
            iterable is accepted (list, pandas Series, array, ...).
        top_n: Maximum number of ids to return.

    Returns:
        Up to ``top_n`` distinct ids ordered by descending similarity. The
        query movie itself is never part of the result.

    Raises:
        ValueError: If ``movie_id`` is not present in ``movie_ids``.
    """
    # Materialize as a list: a pandas Series has no callable ``.index`` method.
    id_list = list(movie_ids)
    # get the index of the movie in the movie_ids list
    movie_index = id_list.index(movie_id)
    # get the similarity scores of the given movie with other movies
    movie_scores = similarity_matrix[movie_index]
    # rank all positions by score, best first (float() also accepts 0-d tensors)
    ranked_indices = sorted(
        range(len(movie_scores)),
        key=lambda i: float(movie_scores[i]),
        reverse=True,
    )

    # walk the ranking, skipping the query movie (it is always its own best
    # match) and ids that were already collected
    recommended_movie_ids = []
    seen = {movie_id}
    for index in ranked_indices:
        if len(recommended_movie_ids) >= top_n:
            break
        candidate = id_list[index]
        if candidate in seen:
            continue
        seen.add(candidate)
        recommended_movie_ids.append(candidate)
    return recommended_movie_ids

In [8]:
def preprocess_user_input(user_input):
    """Split free-text input into preference verbs and remaining keywords.

    The text is lower-cased and split on whitespace, then each token is
    classified:

    * ``like`` / ``love`` / ``enjoy`` directly followed by ``movie(s)`` goes to
      the preferences list, otherwise to the keywords.
    * ``recommend`` / ``similar`` / ``recommendation`` is kept as a keyword
      only when preceded by ``with``; otherwise it is dropped.
    * ``not like`` is dropped as a pair; a lone ``not`` is a keyword.
    * every other token is a keyword.

    Both lists are printed before being returned.

    Args:
        user_input: Raw text typed by the user.

    Returns:
        Tuple ``(movie_preferences, keywords)`` of token lists.
    """
    movie_preferences = []
    keywords = []
    # Split the user input into individual tokens
    tokens = user_input.lower().split()
    # Reassigning the loop variable of a ``for`` loop has no effect on the next
    # iteration, so skipping a token needs an explicit flag.
    skip_next = False
    for index, token in enumerate(tokens):
        if skip_next:
            skip_next = False
            continue
        next_token = tokens[index + 1] if index + 1 < len(tokens) else None

        if token in ('like', 'love', 'enjoy'):
            # If the next token is 'movie' or 'movies', capture preferences
            if next_token in ('movie', 'movies'):
                movie_preferences.append(token)
            else:
                keywords.append(token)
        elif token in ('recommend', 'similar', 'recommendation'):
            # If the previous token is 'with', capture recommendation request
            if index >= 1 and tokens[index - 1] == 'with':
                keywords.append(token)
        elif token == 'not':
            # 'not like' is discarded as a unit: skip the following 'like' too
            if next_token == 'like':
                skip_next = True
            else:
                keywords.append(token)
        else:
            keywords.append(token)

    print("Movie Preferences:", movie_preferences)
    print("Keywords:", keywords)

    return movie_preferences, keywords

In [9]:
def extract_specific_movie(user_input, movies_data):
    """Generate a GPT-2 continuation of the input and look for a known title in it.

    Uses the module-level ``model`` and ``tokenizer``.

    Args:
        user_input: Prompt text from the user.
        movies_data: Mapping/DataFrame whose ``'title'`` entry iterates over
            movie titles.

    Returns:
        The first title (in ``movies_data`` order, original casing) that occurs
        case-insensitively in the generated text, or ``None`` if none does.
    """
    # preprocess_text_with_gpt2 requires the tokenizer as its second argument;
    # calling it with the text alone raises TypeError.
    encoded_input = preprocess_text_with_gpt2(user_input, tokenizer)
    generated_output = model.generate(
        encoded_input['input_ids'],
        # forward the mask produced by the tokenizer (None if it is absent)
        attention_mask=encoded_input.get('attention_mask'),
        # give generate() an explicit pad id instead of letting it guess
        pad_token_id=tokenizer.eos_token_id,
        max_length=100,
    )
    # lower-case once here rather than once per title in the loop below
    generated_text = tokenizer.decode(generated_output[0], skip_special_tokens=True).lower()

    # Search for a movie title in the generated text
    for movie_title in movies_data['title']:
        # skip missing values and empty strings ('' is a substring of anything)
        if isinstance(movie_title, str) and movie_title and movie_title.lower() in generated_text:
            return movie_title

    return None

def get_movie_id_from_title(movie_title, movies_data):
    """Look up the id of a movie by title, ignoring case and outer whitespace.

    Args:
        movie_title: Title to search for; ``None`` or an empty string never
            matches.
        movies_data: DataFrame with ``title`` and ``movieId`` columns.

    Returns:
        The ``movieId`` of the first matching row, or ``None`` if no row
        matches.
    """
    if not movie_title:
        return None
    # Normalize BOTH sides: comparing lower-cased titles against a query in
    # its original casing can never match a title containing capitals.
    wanted_title = movie_title.strip().lower()
    normalized_titles = movies_data['title'].str.strip().str.lower()
    movie = movies_data[normalized_titles == wanted_title]
    if movie.empty:
        return None
    return movie['movieId'].values[0]

In [10]:
def match_movies_by_keywords(keywords, movie_ids, movie_attributes):
    """Return the ids of movies whose attributes contain any of the keywords.

    Matching is a case-insensitive substring test.

    Args:
        keywords: Iterable of keyword strings. Surrounding whitespace is
            ignored and blank keywords are discarded.
        movie_ids: Iterable of movie identifiers.
        movie_attributes: Iterable aligned with ``movie_ids``. Each entry is
            either a single attribute string or an iterable of attribute
            strings.

    Returns:
        List of matching ids in input order; each input position contributes
        its id at most once.
    """
    # Normalize the keywords once instead of on every movie. Blank keywords
    # are dropped because '' is a substring of every attribute.
    needles = [keyword.strip().lower() for keyword in keywords]
    needles = [needle for needle in needles if needle]

    matched_movies = []
    if not needles:
        return matched_movies

    for movie_id, attributes in zip(movie_ids, movie_attributes):
        # A bare string must be treated as ONE attribute; iterating over it
        # would compare the keywords against single characters.
        if isinstance(attributes, str):
            attributes = [attributes]
        haystacks = [attribute.lower() for attribute in attributes]
        # perform partial matching; any() stops at the first hit so a movie
        # matching several keywords is still appended only once
        if any(needle in haystack for needle in needles for haystack in haystacks):
            matched_movies.append(movie_id)
    return matched_movies

def get_movie_details(movie_id, movies_data):
    """Fetch the title and genre string of one movie.

    Args:
        movie_id: Value compared against the ``movieId`` column.
        movies_data: DataFrame with ``movieId``, ``title`` and ``genres``
            columns.

    Returns:
        ``{"title": ..., "genre": ...}`` taken from the first matching row, or
        ``None`` when no row has that id.
    """
    matching_rows = movies_data[movies_data['movieId'] == movie_id]
    if matching_rows.empty:
        return None
    return {
        "title": matching_rows['title'].values[0],
        "genre": matching_rows['genres'].values[0],
    }

In [36]:
def recommend_movies(movies_data, ratings_data, top_k=5):
    """Prompt for preferences, then print and return content-based picks.

    Pipeline: read comma-separated keywords from stdin, build one attribute
    string per movie, encode the strings with the tokenizer, compute pairwise
    similarities, find the movies matching the keywords and collect the
    neighbours of those matches.

    Args:
        movies_data: DataFrame of movies (``movieId``, ``title``, ``genres``).
        ratings_data: DataFrame of ratings (``movieId``, ``rating``).
        top_k: Maximum number of recommendations to print and return.

    Returns:
        List of up to ``top_k`` ``{"title", "genre"}`` dicts (empty when no
        movie matches the keywords), or ``None`` when no attributes could be
        encoded.
    """
    raw_preferences = input("Enter your movie preferences (comma-separated): ")
    # trim whitespace around each entry and ignore blanks such as "a,,b"
    user_preferences = [
        preference.strip()
        for preference in raw_preferences.lower().split(',')
        if preference.strip()
    ]

    # preprocess the data
    data = preprocess_data(ratings_data, movies_data)
    # The merged frame has one row per rating, but similarity is a per-movie
    # notion: keep a single row per movieId so the matrix is movies x movies.
    catalog = data.drop_duplicates(subset='movieId')
    # plain list: the helpers below need positional lookup, not a Series
    movie_ids = catalog['movieId'].tolist()

    # preprocess movie attributes
    movie_attributes = preprocess_movie_attributes(movie_ids, catalog['title'], catalog['genres'])
    # encode movie attributes with GPT-2
    encoded_attributes = encoding_movie_attributes_with_gpt(movie_attributes)
    if encoded_attributes is None or encoded_attributes.size(0) == 0:
        return None

    # compute similarity scores
    similarity_matrix = movie_similarity_scores(encoded_attributes)

    # match user preferences with movie attributes; each attribute string is
    # wrapped in a list because the matcher iterates over its attribute entry
    matched_movies = match_movies_by_keywords(
        user_preferences, movie_ids, [[attribute] for attribute in movie_attributes]
    )
    # de-duplicate while keeping the original order
    seed_movie_ids = list(dict.fromkeys(matched_movies))

    # get content-based recommendations: one lookup per matched movie, since
    # the helper expects a single movie id rather than the whole list
    recommended_movie_ids = []
    for seed_id in seed_movie_ids:
        # ask for one extra in case the seed comes back as its own best match
        neighbours = get_content_based_recommendations(
            seed_id, similarity_matrix, movie_ids, top_n=top_k + 1
        )
        for candidate in neighbours:
            if candidate != seed_id and candidate not in recommended_movie_ids:
                recommended_movie_ids.append(candidate)
        if len(recommended_movie_ids) >= top_k:
            break

    # retrieve movie details for recommended movies
    recommendations = []
    for movie_id in recommended_movie_ids:
        movie_details = get_movie_details(movie_id, movies_data)
        if movie_details:
            recommendations.append(movie_details)

    # display top-k recommendations
    print(f"\nTop {top_k} Recommendations:")
    for i, movie in enumerate(recommendations[:top_k], start=1):
        print(f"\nMovie {i}:")
        print(f"Title: {movie['title']}")
        print(f"Genre: {movie['genre']}")
    return recommendations[:top_k]

# Run the interactive recommender once and keep whatever it returns.
recommended_movies = recommend_movies(
    movies_data=movies_data,
    ratings_data=ratings_data,
    top_k=5,
)