In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
movies=pd.read_csv("C:/Users/adham/Downloads/movies.csv")
ratings=pd.read_csv("C:/Users/adham/Downloads/ratings.csv")

In [None]:
movies.tail()

In [None]:
ratings.head()

In [None]:
n_ratings =len(ratings)
n_movies=ratings['movieId'].nunique()
n_users=ratings['userId'].nunique()
print("number of unique movies",n_movies)
print("number of unique users",n_users)
print(f"Average number of ratings per user: {round(n_ratings/n_users, 2)}")
print(f"Average number of ratings per movie: {round(n_ratings/n_movies, 2)}")
print("movies shape",movies.shape)
print("ratings shape",ratings.shape)

## dense data set

In [None]:
sns.countplot(x="rating",data=ratings)
plt.title("Distribution of ratings",fontsize=16)
plt.show()

In [None]:
print(f"Mean global rating: {round(ratings['rating'].mean(),2)}.")

In [None]:
movie_ratings = ratings.merge(movies,on="movieId")
movie_ratings['title'].value_counts()[0:10]

In [None]:
mean_ratings = ratings.groupby('movieId')[['rating']].mean()
lowest_rated = mean_ratings['rating'].idxmin()

movies[movies['movieId'] == lowest_rated ]

In [None]:
highest_rated = mean_ratings['rating'].idxmax()

print(movies[movies['movieId'] == highest_rated ])

ratings[ratings['movieId']==highest_rated]

### it has only 2 ratings this isnt a correct answer so we will use bayesian average

In [None]:
movie_stats = ratings.groupby('movieId')['rating'].agg(['count', 'mean'])
movie_stats.head()

In [117]:
C = movie_stats['count'].mean()
m = movie_stats['mean'].mean()

print(f"Average number of ratings for a given movie: {C:.2f}")
print(f"Average rating for a given movie: {m:.2f}")
def bayesian_avg(ratings):
    bayesian_avg = (C*m+ratings.sum())/(C+ratings.count())
    return round(bayesian_avg, 3)

Average number of ratings for a given movie: 10.37
Average rating for a given movie: 3.26


In [113]:
bayesian_avg_ratings = ratings.groupby('movieId')['rating'].agg(bayesian_avg).reset_index()
bayesian_avg_ratings.columns = ['movieId', 'bayesian_avg']
movie_stats = movie_stats.merge(bayesian_avg_ratings, on='movieId')

In [None]:
movie_stats = movie_stats.merge(movies[['movieId', 'title']])
movie_stats.sort_values('bayesian_avg', ascending=False).head()

In [None]:
movie_stats.sort_values('bayesian_avg', ascending=True).tail()

In [None]:
movies['genres'] = movies['genres'].apply(lambda x: x.split("|"))
movies.head()

In [None]:
from collections import Counter

genre_frequency = Counter(g for genres in movies['genres'] for g in genres)

print(f"There are {len(genre_frequency)} genres.")

print(genre_frequency)
print("the most common 7\n",genre_frequency.most_common(7))

In [None]:
genre_frequency_df = pd.DataFrame([genre_frequency]).T.reset_index()
genre_frequency_df.columns = ['genre', 'count']

sns.barplot(x='genre', y='count', data=genre_frequency_df.sort_values(by='count', ascending=True))
plt.xticks(rotation=90)
plt.show()

## now we will start creating the collabrative filter
#### the first step is that we need to create our user movies matrix we different user rate different movies


In [None]:
from scipy.sparse import csr_matrix

def create_X(df):
    """
    Generates a sparse matrix from ratings dataframe.
    
    Returns:
        X: sparse matrix
        user_mapper:  maps user id's to user indices 
        user_inv_mapper: dict that maps user indices to user id's
        movie_mapper: dict that maps movie id's to movie indices
        movie_inv_mapper: dict that maps movie indices to movie id's
    """
#     M represents the number of unique users, and N represents the number of unique movies
    M = df['userId'].nunique()
    N = df['movieId'].nunique()

    user_mapper = dict(zip(np.unique(df["userId"]), list(range(M))))  # index lel users kol user bel id elly hsntgh beeh
    movie_mapper = dict(zip(np.unique(df["movieId"]), list(range(N)))) # index lel movies kol movie bel id elly hsntgh beeh
#     user_inv_mapper is the reverse mapping, converting indices back to original user IDs.
#     movie_inv_mapper is the same for movie indices back to original movie IDs.
    user_inv_mapper = dict(zip(list(range(M)), np.unique(df["userId"])))
    movie_inv_mapper = dict(zip(list(range(N)), np.unique(df["movieId"])))
    
    user_index = [user_mapper[i] for i in df['userId']]
    item_index = [movie_mapper[i] for i in df['movieId']]

    X = csr_matrix((df["rating"], (user_index,item_index)), shape=(M,N))
    
    return X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper

X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper = create_X(ratings)

In [None]:
X.shape

### calculates how sparse is the matrix

In [None]:
n_total = X.shape[0]*X.shape[1] ##multiply m*n to get all elemnts
n_ratings = X.nnz #nnz counts stored values in matrix
sparsity = n_ratings/n_total
print(f"Matrix sparsity: {round(sparsity*100,2)}%")

## lets explore most rated movies and most rating users

In [None]:
n_ratings_per_user = X.getnnz(axis=1)
len(n_ratings_per_user)
n_ratings_per_user = X.getnnz(axis=1)

n_ratings_per_movie = X.getnnz(axis=0)
len(n_ratings_per_movie)
print(f"Most active user rated {n_ratings_per_user.max()} movies.")
print(f"Least active user rated {n_ratings_per_user.min()} movies.")
len(n_ratings_per_user)
print(f"Most rated movie has {n_ratings_per_movie.max()} ratings.")
print(f"Least rated movie has {n_ratings_per_movie.min()} ratings.")

In [None]:
pip install fuzzywuzzy
pip install python-levenshtein

In [75]:
from sklearn.neighbors import NearestNeighbors

def find_similar_movies(movie_id, X, movie_mapper, movie_inv_mapper, k, metric='cosine'):
    
    X = X.T #transpose x
    neighbour_ids = []
    
    movie_ind = movie_mapper[movie_id] #get real index of movie
    movie_vec = X[movie_ind] # retrieve all user ratings for movies
    if isinstance(movie_vec, (np.ndarray)):
        movie_vec = movie_vec.reshape(1,-1)
    # use k+1 since kNN output includes the movieId of interest
    kNN = NearestNeighbors(n_neighbors=k+1, algorithm="brute", metric=metric)
    kNN.fit(X)
    neighbour = kNN.kneighbors(movie_vec, return_distance=False)
    for i in range(0,k):
        n = neighbour.item(i)
        neighbour_ids.append(movie_inv_mapper[n])
    neighbour_ids.pop(0)
    return neighbour_ids

In [93]:
movie_titles = dict(zip(movies['movieId'], movies['title']))

In [96]:

def get_similar_movies(movie_id,k):
    similar_movies = find_similar_movies(movie_id, X, movie_mapper, movie_inv_mapper, metric='cosine', k=k)
    movie_title = movie_titles[movie_id]

    print(f"Because you watched {movie_title}:")
    for i in similar_movies:
        print(movie_titles[i])

In [98]:
movie_id = 54001
get_similar_movies(movie_id,5)


Because you watched Harry Potter and the Order of the Phoenix (2007):
Harry Potter and the Goblet of Fire (2005)
Harry Potter and the Half-Blood Prince (2009)
Harry Potter and the Deathly Hallows: Part 1 (2010)
Harry Potter and the Chamber of Secrets (2002)


## now we will switch to content based model

In [100]:
genres = set(g for G in movies['genres'] for g in G)

for g in genres:
    movies[g] = movies.genres.transform(lambda x: int(g in x))
    
movie_genres = movies.drop(columns=['movieId', 'title','genres'])

In [103]:
movie_genres

Unnamed: 0,Children,War,IMAX,Adventure,Musical,Animation,(no genres listed),Mystery,Crime,Film-Noir,Documentary,Sci-Fi,Action,Thriller,Horror,Romance,Comedy,Drama,Western,Fantasy
0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1
1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,1
9738,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1
9739,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
9740,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0


In [104]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(movie_genres, movie_genres)
print(f"Dimensions of our genres cosine similarity matrix: {cosine_sim.shape}")

Dimensions of our genres cosine similarity matrix: (9742, 9742)


In [105]:
from fuzzywuzzy import process

def movie_finder(title):
    all_titles = movies['title'].tolist()
    closest_match = process.extractOne(title,all_titles)
    return closest_match[0]

In [110]:
def get_content_based_recommendations(title_string, n_recommendations=10):
    title = movie_finder(title_string)
    idx = movie_idx[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:(n_recommendations+1)]
    similar_movies = [i[0] for i in sim_scores]
    print(f"Because you watched {title}:")
    print(movies['title'].iloc[similar_movies])

In [111]:
get_content_based_recommendations('toy story', 5)

Because you watched Toy Story (1995):
1706                                       Antz (1998)
2355                                Toy Story 2 (1999)
2809    Adventures of Rocky and Bullwinkle, The (2000)
3000                  Emperor's New Groove, The (2000)
3568                             Monsters, Inc. (2001)
Name: title, dtype: object
