# The goal of the first assignment is to implement a user-based collaborative filtering approach.

In [85]:
# Import Pandas
# The Pandas library to load and manipulate data.
import pandas as pd
# the code imports the Pandas library using the import statement and assigns it the alias pd

# Load Movies data
movies_data = pd.read_csv("ml-latest-small/movies.csv")
# it loads a CSV file called movies.csv using the pd.read_csv() function and assigns it to a variable called movies_data.


# Load Links data
links_data = pd.read_csv("ml-latest-small/links.csv")

# Load ratings data
ratings_data = pd.read_csv("ml-latest-small/ratings.csv")

# Load tags data
tags_data = pd.read_csv("ml-latest-small/tags.csv")


In [86]:
# Print the first few rows
movies_data.head()



Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [87]:
# Print the first few rows
links_data.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [88]:
# Print the first few rows
ratings_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [89]:
# Print the first few rows
tags_data.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [90]:
ratings_data.shape[0]
#The Pandas .shape attribute can be used to return a tuple that contains the number of rows and columns, 
#in the following format (rows, columns). 
#The interest here is in the rows so I can get the first index of that tuple.

100836

In [91]:
# convert long to wide
ratings_data_raw = ratings_data.copy()
ratings_data = ratings_data.pivot(index='userId', columns='movieId', values='rating')

In [92]:
ratings_data.shape

(610, 9724)

In [93]:
ratings_data.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,


In [94]:
def pearson_similarity(v1, v2):
    
    pearson = v1.corr(v2)
    
    return pearson

In [95]:
def compute_similarities(user_id, ratings_matrix):
    
    # get ratings of user to re-use in the similarity computation
    ratings_user = ratings_matrix.loc[user_id,:]
    
    # calculate the similarity between the given user and the other users
    similarities = ratings_matrix.apply(
        lambda row: pearson_similarity(ratings_user, row), 
        axis=1)

    similarities = similarities.to_frame(name='similarity')

    # find most similar users to the given user
    similarities = similarities.sort_values(by='similarity', ascending=False)
    
    # drop the similarity of the user (should be ~1 anyways)
    similarities = similarities.drop(user_id)
    
    return similarities

In [96]:
user_id=10
similarities = compute_similarities(user_id, ratings_data)
similarities.head(10)

  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)


Unnamed: 0_level_0,similarity
userId,Unnamed: 1_level_1
574,1.0
146,1.0
136,1.0
225,1.0
478,1.0
179,1.0
384,1.0
56,1.0
192,1.0
568,1.0


In [97]:
def read_names(path_to_names):
    data = {}
    with open(path_to_names, 'r', encoding='utf-8') as file:
        # Skip the first line (header)
        next(file)
        for line in file:
            pieces = line.strip().split(',')
            movie_id = int(pieces[0])
            title = pieces[1]
            data[movie_id] = title
    return data

# Usage example
movie_names = read_names("ml-latest-small/movies.csv")


In [98]:
def predict_rating(item_id, ratings_data, similarities, N=10):
    """
    Predict the rating of a given item by a user, given the ratings of similar users.
    Takes the N users with the highest similarity measure, AND who have rated the given item.
    Returns the average rating of the most similar users who previously rated the item.
    
    parameters:
    - item_id: int, item that needs a rating prediction
    - ratings: pd.DataFrame
    - similarities: pd.DataFrame
    - N: int, number of neighbors to use for rating prediction
    
    returns:
    - a float representing the predicted rating for the given item
    
    """
    
    # get the ratings of all users for the specific item
    users_ratings = ratings_data.loc[:, item_id]
    
    # only keep users who rated the given item, otherwise you won't be able to generate a prediction based on the users ratings
    most_similar_users_who_rated_item = similarities.loc[~users_ratings.isnull()]
    
    # keep N users with highest similarities to given user who also rated the given item
    N_most_similar_users = most_similar_users_who_rated_item.head(N)
    
    # find ratings item for most similar users:
    ratings_for_item = ratings_data.loc[N_most_similar_users.index, item_id]
    
    # predict the rating of the item by averaging the ratings of that item of the most similar users
    return ratings_for_item.mean()

In [99]:
# let us test
item_id=5
N_NEIGHBORS=5
predict_rating(item_id, ratings_data, similarities, N=N_NEIGHBORS)

3.8

In [100]:

def recommend(user_id, ratings_data, movie_names, n_neighbors=10, n_recomm=5):
    """
    
    Recommend N movies for a given user based on ratings data.
    
    1. get the ratings of the user
    2. get the movies that the user has not rated
    3. compute the similarities between the user and the other users
    4. generate movie ratings predictions for the user based on the similarities with other users
    5. find the N movies with the highest predicted ratings
    
    parameters:
    - user_id: int, user to generate recommendations for
    - ratings: pd.DataFrame, user-movie ratings
    - movie_names: dict, mapping of (movie id -> movie name)
    - n_neighbors: int: the number of neighbors to use to generate rating predictions
    - n_recomm: int, number of movies to recommend
    
    returns:
    - pd.DataFrame with [movie_id, rating, movie name]
    
    """
    
    # get the ratings of the user
    ratings_user = ratings_data.loc[user_id, :]
    
    # all the items a user has not rated, that can be recommended
    all_items = ratings_data.loc[user_id,:]
    unrated_items = all_items.loc[all_items.isnull()]
    
    # convert the index with item ids into Series values
    unrated_items = unrated_items.index.to_series(name='item_ids').reset_index(drop=True)
    print('User {} has {} unrated items.'.format(user_id, len(unrated_items)))
    
    # compute user similarities
    similarities = compute_similarities(user_id, ratings_data)
        
    # generate predictions for unseen items based on the user similarity data
    predictions = unrated_items.apply(lambda d: predict_rating(d, ratings_data, similarities, N=n_neighbors))
    
    # sort items by highest predicted rating
    predictions = predictions.sort_values(ascending=False)
    
    # recommend top N items
    recommends = predictions.head(n_recomm)
    
    # reformat the result
    recommends = recommends.to_frame(name='predicted_rating')
    recommends = recommends.rename_axis('movie_id')
    recommends = recommends.reset_index()
    
    recommends['name'] = recommends['movie_id'].apply(lambda d: movie_names.get(d, "Unknown Movie"))

    return recommends

In [101]:
# let us test
user_id = 5
N_RECOMMENDATIONS=10
recommends = recommend(user_id, ratings_data, movie_names, n_neighbors=N_NEIGHBORS, n_recomm=N_RECOMMENDATIONS)
recommends

User 5 has 9680 unrated items.


  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)


Unnamed: 0,movie_id,predicted_rating,name
0,3271,5.0,Of Mice and Men (1992)
1,8763,5.0,Unknown Movie
2,4201,5.0,Unknown Movie
3,4196,5.0,Unknown Movie
4,8690,5.0,Slaughterhouse-Five (1972)
5,2899,5.0,Gulliver's Travels (1939)
6,8713,5.0,"""New Adventures of Pippi Longstocking"
7,8726,5.0,Unknown Movie
8,4156,5.0,Company Man (2000)
9,4128,5.0,"""Lost Boys"


In [102]:
def jaccard_similarity(user1, user2):
    # Find the set of items each user has interacted with
    items_user1 = set(user1.keys())
    items_user2 = set(user2.keys())
    
    # Calculate the size of the intersection and union of the two sets
    intersection_size = len(items_user1.intersection(items_user2))
    union_size = len(items_user1.union(items_user2))
    
    # Calculate Jaccard similarity
    if union_size == 0:
        return 0
    else:
        return intersection_size / union_size
