In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Reading the dataset
userbase1_dataframe = pd.read_csv("dataset/ml-100k/u1.base", names=['user id', 'item id', 'rating', 'timestamp'], delimiter="\t")
userbase1_dataframe

Unnamed: 0,user id,item id,rating,timestamp
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712
...,...,...,...,...
79995,943,1067,2,875501756
79996,943,1074,4,888640250
79997,943,1188,3,888640250
79998,943,1228,3,888640275


In [3]:
# Let's assume you have another dataset containing movie information like movie_id, title, genres
movies_dataframe = pd.read_csv("dataset/ml-100k/u.item", delimiter="|", encoding="latin1",
                               names=["item id", "title", "release date", 
                                      "video release date", "IMDb URL", "unknown", 
                                      "Action", "Adventure", "Animation",
                                      "Children's", "Comedy", "Crime",
                                      "Documentary", "Drama", "Fantasy",
                                      "Film-Noir", "Horror", "Musical",
                                      "Mystery", "Romance", "Sci-Fi",
                                      "Thriller", "War", "Western"
                                      ])
movies_dataframe

Unnamed: 0,item id,title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,Mat' i syn (1997),06-Feb-1998,,http://us.imdb.com/M/title-exact?Mat%27+i+syn+...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1678,1679,B. Monkey (1998),06-Feb-1998,,http://us.imdb.com/M/title-exact?B%2E+Monkey+(...,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1679,1680,Sliding Doors (1998),01-Jan-1998,,http://us.imdb.com/Title?Sliding+Doors+(1998),0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1680,1681,You So Crazy (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?You%20So%20Cr...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Merge the datasets on 'item id'
merged_data = pd.merge(userbase1_dataframe, movies_dataframe, on='item id')

In [5]:
# Feature Extraction: Using movie genres for simplicity
# Created a string containing all genres for each movie
genres_columns = ["unknown", "Action", "Adventure", "Animation", "Children's", 
                  "Comedy", "Crime", "Documentary", "Drama", "Fantasy",
                  "Film-Noir", "Horror", "Musical", "Mystery", "Romance",
                  "Sci-Fi", "Thriller", "War", "Western"]

In [6]:
def extract_genres(row):
    genres = [col for col in genres_columns if row[col] == 1]
    return ' '.join(genres)

In [7]:
movies_dataframe['genres'] = movies_dataframe.apply(extract_genres, axis=1)
movies_dataframe

Unnamed: 0,item id,title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,genres
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,Animation Children's Comedy
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,1,0,0,Action Adventure Thriller
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,Thriller
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,Action Comedy Drama
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,Crime Drama Thriller
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,Mat' i syn (1997),06-Feb-1998,,http://us.imdb.com/M/title-exact?Mat%27+i+syn+...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Drama
1678,1679,B. Monkey (1998),06-Feb-1998,,http://us.imdb.com/M/title-exact?B%2E+Monkey+(...,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,Romance Thriller
1679,1680,Sliding Doors (1998),01-Jan-1998,,http://us.imdb.com/Title?Sliding+Doors+(1998),0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,Drama Romance
1680,1681,You So Crazy (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?You%20So%20Cr...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Comedy


In [8]:
# Vectorization
vectorizer = CountVectorizer(binary=True, lowercase=False)
genre_matrix = vectorizer.fit_transform(movies_dataframe['genres'])

In [9]:
# Compute Cosine Similarity
cosine_sim = cosine_similarity(genre_matrix, genre_matrix)

In [10]:
def get_recommendations_multiple(movie_titles, cosine_sim=cosine_sim):
    movie_indices = []
    for title in movie_titles:
        # Get the index of the movie that matches the title
        idx = movies_dataframe[movies_dataframe['title'] == title].index
        if not idx.empty:
            movie_indices.append(idx[0])

    if not movie_indices:
        return pd.DataFrame()  # Return an empty DataFrame if no movies found
    
    # Initialize an empty array to store the similarity scores
    sim_scores_sum = [0] * len(cosine_sim)

    # Calculate the sum of similarity scores for all selected movies
    for idx in movie_indices:
        sim_scores_sum += cosine_sim[idx]

    # Normalize the sum of similarity scores
    sim_scores_sum /= len(movie_indices)

    # Sort the movies based on the similarity scores
    sim_scores = list(enumerate(sim_scores_sum))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the top 50 most similar movies
    sim_scores = sim_scores[1:51]

    # Get the movie indices and similarity scores
    movie_indices = [i[0] for i in sim_scores]
    similarity_scores = [i[1] for i in sim_scores]

    # Get the movie titles
    recommended_movies = movies_dataframe['title'].iloc[movie_indices]

    # Create a DataFrame with movie titles and similarity scores
    recommendations_df = pd.DataFrame({
        'title': recommended_movies.values,
        'similarity_score': similarity_scores
    })

    return recommendations_df



In [11]:
# Example usage with three movies
movie_titles = ["Batman Forever (1995)", "Batman Returns (1992)", "Jaws (1975)"]
recommendations_df = get_recommendations_multiple(movie_titles)
print(f"For {movie_titles}")
recommendations_df

For ['Batman Forever (1995)', 'Batman Returns (1992)', 'Jaws (1975)']


Unnamed: 0,title,similarity_score
0,Batman Returns (1992),0.784518
1,Evil Dead II (1987),0.735702
2,Rumble in the Bronx (1995),0.713433
3,Batman & Robin (1997),0.713433
4,"Three Musketeers, The (1993)",0.713433
5,Cliffhanger (1993),0.713433
6,From Dusk Till Dawn (1996),0.658032
7,"Ghost and the Darkness, The (1996)",0.638071
8,Raiders of the Lost Ark (1981),0.638071
9,Indiana Jones and the Last Crusade (1989),0.638071
