In [53]:
import numpy as np
import pandas as pd

movie_df = pd.read_csv('F:\\archive\\movies.csv')
rating_df = pd.read_csv('F:\\archive\\ratings.csv')
movie_df.head(10)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [54]:
rating_df.head(10)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
5,1,70,3.0,964982400
6,1,101,5.0,964980868
7,1,110,4.0,964982176
8,1,151,5.0,964984041
9,1,157,5.0,964984100


In [55]:
global_mean_rating = round(rating_df['rating'].mean(),3)
mean_rating_per_user = round(rating_df.groupby('userId')['rating'].mean().mean(),3)
print(global_mean_rating,mean_rating_per_user)

3.502 3.657


In [56]:
#Bayesian Average is Taken

In [57]:
movie_stats = rating_df.groupby('movieId')['rating'].agg(['count', 'mean'])
print(movie_stats['mean'])
C = movie_stats['count'].mean() #Average number of ratings for a given movie
m = movie_stats['mean'].mean() #Average rating for a given movie

def bayesian_avg(ratings):
    bayesian_avg = (C*m+ratings.sum())/(C+ratings.count())
    return round(bayesian_avg, 3)

movieId
1         3.920930
2         3.431818
3         3.259615
4         2.357143
5         3.071429
            ...   
193581    4.000000
193583    3.500000
193585    3.500000
193587    3.500000
193609    4.000000
Name: mean, Length: 9724, dtype: float64


In [58]:
bayesian_avg_ratings = rating_df.groupby('movieId')['rating'].agg(bayesian_avg).reset_index()
bayesian_avg_ratings.columns = ['movieId', 'bayesian_avg']
movie_stats = movie_stats.merge(bayesian_avg_ratings, on='movieId')
movie_stats = movie_stats.merge(movie_df[['movieId', 'title']])
movie_stats.sort_values('bayesian_avg', ascending=False).head()

Unnamed: 0,movieId,count,mean,bayesian_avg,title
277,318,317,4.429022,4.392,"Shawshank Redemption, The (1994)"
659,858,192,4.289062,4.236,"Godfather, The (1972)"
2224,2959,218,4.272936,4.227,Fight Club (1999)
224,260,251,4.231076,4.193,Star Wars: Episode IV - A New Hope (1977)
46,50,204,4.237745,4.191,"Usual Suspects, The (1995)"


In [59]:
movie_df['genres'] = movie_df['genres'].apply(lambda x: x.split("|"))
movie_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II (1995),[Comedy]


In [60]:
from scipy.sparse import csr_matrix

def create_X(df):
    """
    Generates a sparse matrix from ratings dataframe.
    
    Args:
        df: pandas dataframe containing 3 columns (userId, movieId, rating)
    
    Returns:
        X: sparse matrix
        user_mapper: dict that maps user id's to user indices
        user_inv_mapper: dict that maps user indices to user id's
        movie_mapper: dict that maps movie id's to movie indices
        movie_inv_mapper: dict that maps movie indices to movie id's
    """
    M = df['userId'].nunique()
    N = df['movieId'].nunique()

    user_mapper = dict(zip(np.unique(df["userId"]), list(range(M))))
    movie_mapper = dict(zip(np.unique(df["movieId"]), list(range(N))))
    
    user_inv_mapper = dict(zip(list(range(M)), np.unique(df["userId"])))
    movie_inv_mapper = dict(zip(list(range(N)), np.unique(df["movieId"])))
    
    user_index = [user_mapper[i] for i in df['userId']]
    item_index = [movie_mapper[i] for i in df['movieId']]

    X = csr_matrix((df["rating"], (user_index,item_index)), shape=(M,N))
    
    return X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper

X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper = create_X(rating_df)

In [61]:
from sklearn.neighbors import NearestNeighbors

def find_similar_movies(movie_id, X, movie_mapper, movie_inv_mapper, k=50, metric='cosine'):
    """
    Finds k-nearest neighbours for a given movie id.
    
    Args:
        movie_id: id of the movie of interest
        X: user-item utility matrix
        k: number of similar movies to retrieve
        metric: distance metric for kNN calculations
    
    Output: returns list of k similar movie ID's
    """
    X = X.T
    neighbour_ids = []
    
    movie_ind = movie_mapper[movie_id]
    movie_vec = X[movie_ind]
    if isinstance(movie_vec, (np.ndarray)):
        movie_vec = movie_vec.reshape(1,-1)
    # use k+1 since kNN output includes the movieId of interest
    kNN = NearestNeighbors(n_neighbors=k+1, algorithm="brute", metric=metric)
    kNN.fit(X)
    neighbour = kNN.kneighbors(movie_vec, return_distance=False)
    for i in range(0,k):
        n = neighbour.item(i)
        neighbour_ids.append(movie_inv_mapper[n])
    neighbour_ids.pop(0)
    return neighbour_ids

similar_movies = find_similar_movies(1, X, movie_mapper, movie_inv_mapper,k=50)
movie_titles = dict(zip(movie_df['movieId'], movie_df['title']))

movie_id = 74228
similar_movies = find_similar_movies(movie_id, X, movie_mapper, movie_inv_mapper, metric='cosine', k=200)

collabaration_filter_list = similar_movies

movie_title = movie_titles[movie_id]

print(f"Because you watched {movie_title}:")
for i in similar_movies:
    print(movie_titles[i])

Because you watched Triangle (2009):
Get on the Bus (1996)
Dead Heat (1988)
Stuck (2007)
Battles Without Honor & Humanity (Jingi naki tatakai) (1973)
Little Big Soldier (Da bing xiao jiang) (2010)
One-Armed Swordsman, The (Dubei dao) (1967)
The Gallows (2015)
Hatchet II (2010)
Strange Circus (Kimyô na sâkasu) (2005)
Tokyo Tribe (2014)
Rogue (2007)
The Boy and the Beast (2015)
Undisputed III: Redemption (2010)
Exte: Hair Extensions (2007)
Accidental Spy, The (Dak miu mai shing) (2001)
Bloodsport: The Dark Kumite (1999)
Goodnight Mommy (Ich seh ich seh) (2014)
Ruins, The (2008)
Inside (À l'intérieur) (2007)
Shepherd: Border Patrol, The (2008)
The Magnificent Ruffians (1979)
Lifted (2006)
Last Winter, The (2006)
The Green Inferno (2014)
Who's That Knocking at My Door? (1967)
Kill List (2011)
Birdemic: Shock and Terror (2010)
Stake Land (2010)
As Above, So Below (2014)
Dobermann (1997)
Hitchcock/Truffaut (2015)
Return of the One-Armed Swordsman (1969)
Masked Avengers (1981)
New One-Armed S

In [62]:
# #cold start problem
# !pip install fuzzywuzzy

In [63]:
genres = set(g for G in movie_df['genres'] for g in G)

for g in genres:
    movie_df[g] = movie_df.genres.transform(lambda x: int(g in x))
    
movie_genres = movie_df.drop(columns=['movieId', 'title','genres'])
movie_genres.head()

Unnamed: 0,Drama,Western,War,Romance,Fantasy,Action,Thriller,Crime,Film-Noir,Horror,IMAX,Animation,Documentary,Children,Mystery,Sci-Fi,Adventure,(no genres listed),Comedy,Musical
0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,1,0,1,0
1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0
2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
3,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [64]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(movie_genres, movie_genres)
print(f"Dimensions of our genres cosine similarity matrix: {cosine_sim.shape}")
print(cosine_sim)

Dimensions of our genres cosine similarity matrix: (9742, 9742)
[[1.         0.77459667 0.31622777 ... 0.         0.31622777 0.4472136 ]
 [0.77459667 1.         0.         ... 0.         0.         0.        ]
 [0.31622777 0.         1.         ... 0.         0.         0.70710678]
 ...
 [0.         0.         0.         ... 1.         0.         0.        ]
 [0.31622777 0.         0.         ... 0.         1.         0.        ]
 [0.4472136  0.         0.70710678 ... 0.         0.         1.        ]]


In [65]:
movie_idx = dict(zip(movie_df['movieId'], list(movie_df.index)))
idx = movie_idx[movie_id]
print(f"Movie index for {movie_df.loc[idx, 'title']} with movie ID {movie_id}: has index{idx}")

Movie index for Triangle (2009) with movie ID 74228: has index7250


In [66]:
def get_content_based_recommendations(movie_id, n_recommendations=50):
    idx = movie_idx[movie_id]
    title = movie_df.loc[idx, 'title']
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:(n_recommendations+1)]
    similar_movies_idx = [i[0] for i in sim_scores]
    similar_movie_ids = movie_df.loc[similar_movies_idx, 'movieId'].tolist()
    print(f"Because you watched {title}:")
    print(movie_df['title'].iloc[similar_movies_idx])
    return similar_movie_ids

In [67]:
content_based_list = get_content_based_recommendations(movie_id, 200)

Because you watched Triangle (2009):
1016                        Amityville Horror, The (1979)
3421                                  Santa Sangre (1989)
3461                                   Others, The (2001)
3482                               Wicker Man, The (1973)
4420                   Tenant, The (Locataire, Le) (1976)
                              ...                        
4050                                        Looker (1981)
4178    American Friend, The (Amerikanische Freund, De...
4258                   Irreversible (Irréversible) (2002)
4281                                  Dreamcatcher (2003)
4327                                      Identity (2003)
Name: title, Length: 200, dtype: object


In [68]:
import random

# Find common elements
common_elements = list(set(content_based_list) & set(collabaration_filter_list))
movie_id_to_title = dict(zip(movie_df['movieId'], movie_df['title']))
print([movie_id_to_title[movie_id] for movie_id in common_elements])

num_common_elements = len(common_elements)

num_random_from_content_based = max(0, 10 - num_common_elements)

random_elements_from_list2 = random.sample(content_based_list, num_random_from_content_based)

final_elements = common_elements + random_elements_from_list2
if len(final_elements) > 10:
    final_elements = final_elements[:10]

print("Final Elements (at most 10 elements):", final_elements)


['Digging Up the Marrow (2014)', 'Pulse (Kairo) (2001)', 'Book of Shadows: Blair Witch 2 (2000)', 'Split (2017)', 'Frontière(s) (2007)', 'Texas Chainsaw 3D (2013)', 'Haunter (2013)', 'Kill List (2011)', 'Mirrors (2008)', 'Strange Circus (Kimyô na sâkasu) (2005)', 'Abandoned, The (2006)', 'Pact, The (2012)', 'We Are What We Are (2013)']
Final Elements (at most 10 elements): [130050, 27491, 3973, 166534, 66310, 99721, 107436, 89837, 61262, 60303]


In [69]:
hybrid = [movie_id_to_title[movie_id] for movie_id in final_elements]

movie_idx = dict(zip(movie_df['movieId'], list(movie_df.index)))
idx = movie_idx[movie_id]
print(f"Movie {movie_df.loc[idx, 'title']}\n")
for element in hybrid:
    print(element)

Movie Triangle (2009)

Digging Up the Marrow (2014)
Pulse (Kairo) (2001)
Book of Shadows: Blair Witch 2 (2000)
Split (2017)
Frontière(s) (2007)
Texas Chainsaw 3D (2013)
Haunter (2013)
Kill List (2011)
Mirrors (2008)
Strange Circus (Kimyô na sâkasu) (2005)
