In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load the data
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

In [3]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [4]:
movies.shape

(9742, 3)

In [5]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [6]:
ratings.shape

(100836, 4)

In [7]:
movies.isnull().sum()

movieId    0
title      0
genres     0
dtype: int64

In [8]:
ratings.isnull().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [9]:
# Preprocess the data
df1 = movies.drop(['genres'], axis=1)
ratings.dropna(inplace=True)
df2 = ratings.drop(['timestamp'], axis=1)


In [10]:
# Create a pivot table for the ratings data
user_movie_matrix = df2.pivot_table(index='userId', columns='movieId', values='rating')

In [11]:
user_movie_matrix.shape

(610, 9724)

In [12]:
user_movie_matrix.sum()

movieId
1         843.0
2         377.5
3         169.5
4          16.5
5         150.5
          ...  
193581      4.0
193583      3.5
193585      3.5
193587      3.5
193609      4.0
Length: 9724, dtype: float64

In [13]:
# Fill missing values with 0
user_movie_matrix.fillna(0, inplace=True)

In [14]:
user_movie_matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,0.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,2.5,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
609,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Create a TF-IDF vectorizer object for movie genres
tfidf = TfidfVectorizer(stop_words='english')
movies['genres'] = movies['genres'].fillna('')
tfidf_matrix = tfidf.fit_transform(movies['genres'])
cosine_sim_genres = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [16]:
tfidf_matrix

<9742x23 sparse matrix of type '<class 'numpy.float64'>'
	with 23185 stored elements in Compressed Sparse Row format>

In [17]:
# Create a function to recommend movies based on movie genres and release year
import re
def content_based_recommendations(movie_title, cosine_sim, user_movie_matrix, movies, num_recommendations):
    movie_index = movies[movies['title'] == movie_title].index[0]
    sim_scores = list(enumerate(cosine_sim[movie_index]))
    sorted_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:num_recommendations+1]
    recommended_movies = []
    for i in range(len(sorted_scores)):
        movie_id = movies.iloc[sorted_scores[i][0]]['movieId']
        title = movies.loc[movies['movieId'] == movie_id]['title']
        if len(title) > 0:
            title = title.values[0]
            year = re.search('\((.*?)\)', title).group(1)
            title = re.sub('\(.*?\)', '', title).strip()
            recommended_movies.append((title, year))
    return recommended_movies

In [18]:
# Test the recommendation system
movie_title = 'Toy Story (1995)'
num_recommendations = 10
recommended_movies = content_based_recommendations(movie_title, cosine_sim_genres, user_movie_matrix, movies, num_recommendations)
print("Recommendations for movie '", movie_title, "': ")
for movie in recommended_movies:
    print(movie[0], "(", movie[1], ")")

Recommendations for movie ' Toy Story (1995) ': 
Antz ( 1998 )
Toy Story 2 ( 1999 )
Adventures of Rocky and Bullwinkle, The ( 2000 )
Emperor's New Groove, The ( 2000 )
Monsters, Inc. ( 2001 )
Wild, The ( 2006 )
Shrek the Third ( 2007 )
Tale of Despereaux, The ( 2008 )
Asterix and the Vikings ( Astérix et les Vikings )
Turbo ( 2013 )


In [95]:
# Define the number of recommendations
'''num_recommendations = 10

# Split the dataset into training and test sets
from sklearn.model_selection import train_test_split
train, test = train_test_split(ratings, test_size=0.2)

# Create a dictionary of the movies that each user has rated
user_ratings = {}
for index, row in train.iterrows():
    if row['userId'] not in user_ratings:
        user_ratings[row['userId']] = []
    user_ratings[row['userId']].append(row['movieId'])

# Calculate precision@k
from sklearn.metrics import precision_score
precision = 0.0
for user_id in user_ratings:
    # Get the movies that the user has rated
    user_movies = user_ratings[user_id]
    
    # Generate recommendations for the user
    recommendations = content_based_recommendations(user_movies, cosine_sim_genres, user_movie_matrix, movies, num_recommendations)
    
    # Calculate the precision of the recommendations
    relevant_recommendations = set(user_movies) & set([r[0] for r in recommendations])
    precision += len(relevant_recommendations) / num_recommendations
    
precision /= len(user_ratings)
print("Precision@%d: %.2f%%" % (num_recommendations, precision * 100))'''


'num_recommendations = 10\n\n# Split the dataset into training and test sets\nfrom sklearn.model_selection import train_test_split\ntrain, test = train_test_split(ratings, test_size=0.2)\n\n# Create a dictionary of the movies that each user has rated\nuser_ratings = {}\nfor index, row in train.iterrows():\n    if row[\'userId\'] not in user_ratings:\n        user_ratings[row[\'userId\']] = []\n    user_ratings[row[\'userId\']].append(row[\'movieId\'])\n\n# Calculate precision@k\nfrom sklearn.metrics import precision_score\nprecision = 0.0\nfor user_id in user_ratings:\n    # Get the movies that the user has rated\n    user_movies = user_ratings[user_id]\n    \n    # Generate recommendations for the user\n    recommendations = content_based_recommendations(user_movies, cosine_sim_genres, user_movie_matrix, movies, num_recommendations)\n    \n    # Calculate the precision of the recommendations\n    relevant_recommendations = set(user_movies) & set([r[0] for r in recommendations])\n    p