In [10]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [11]:
# Reading the dataset
userbase1_dataframe = pd.read_csv("dataset/ml-100k/u1.base", names=['user id', 'item id', 'rating', 'timestamp'], delimiter="\t")

In [12]:
# Let's assume you have another dataset containing movie information like movie_id, title, genres
movies_dataframe = pd.read_csv("dataset/ml-100k/u.item", delimiter="|", encoding="latin1",
                               names=["item id", "title", "release date", 
                                      "video release date", "IMDb URL", "unknown", 
                                      "Action", "Adventure", "Animation",
                                      "Children's", "Comedy", "Crime",
                                      "Documentary", "Drama", "Fantasy",
                                      "Film-Noir", "Horror", "Musical",
                                      "Mystery", "Romance", "Sci-Fi",
                                      "Thriller", "War", "Western"
                                      ])
movies_dataframe = movies_dataframe.drop(["unknown"], axis=1)

In [13]:
# Feature Extraction: Using movie genres for simplicity
# Created a string containing all genres for each movie
genres_columns = ["Action", "Adventure", "Animation", "Children's", 
                  "Comedy", "Crime", "Documentary", "Drama", "Fantasy",
                  "Film-Noir", "Horror", "Musical", "Mystery", "Romance",
                  "Sci-Fi", "Thriller", "War", "Western"]

In [14]:
def extract_genres(row):
    genres = [col for col in genres_columns if row[col] == 1]
    return ' '.join(genres)

In [15]:
movies_dataframe['genres'] = movies_dataframe.apply(extract_genres, axis=1)

In [16]:
# Vectorization
vectorizer = CountVectorizer(binary=True, lowercase=False)
genre_matrix = vectorizer.fit_transform(movies_dataframe['genres'])

In [17]:
def get_recommendations_genre(genres):
    # Convert the list of genres into a single string
    genres_string = ' '.join(genres)
    
    # Vectorize the provided genres
    genres_vector = vectorizer.transform([genres_string])
    
    # Calculate similarity with all movies
    genre_similarity = cosine_similarity(genres_vector, genre_matrix).flatten()
    
    # Get indices of top similar movies
    top_indices = genre_similarity.argsort()[::-1][1:51]  # Exclude the first, as it's the input itself
    
    # Get the movie titles and similarity scores
    recommended_movies = movies_dataframe.iloc[top_indices][['title']].copy()
    recommended_movies['similarity_score'] = genre_similarity[top_indices]
    
    return recommended_movies.reset_index()

In [18]:
# Example usage with a genre
genre_input = ["Action", "Thriller", "Adventure"]
recommended_movies_df = get_recommendations_genre(genre_input)
print(f"Recommended movies for the genre '{genre_input}':")
recommended_movies_df

Recommended movies for the genre '['Action', 'Thriller', 'Adventure']':


Unnamed: 0,index,title,similarity_score
0,117,Twister (1996),1.0
1,929,Chain Reaction (1996),1.0
2,1313,Surviving the Game (1994),1.0
3,1015,Con Air (1997),1.0
4,1012,Anaconda (1997),1.0
5,826,Daylight (1996),1.0
6,116,"Rock, The (1996)",1.0
7,981,Maximum Risk (1996),1.0
8,565,Clear and Present Danger (1994),1.0
9,1,GoldenEye (1995),1.0
