<p style="font-size:32px;text-align:center"><font color=MediumVioletRed><b>Netflix Movie Recommendations</b></font></p>
<p style="font-size:20px;text-align:center"><b><font color=DarkGoldenRod>Part 2</font></b></p>

In [1]:
import warnings
warnings.filterwarnings('ignore')

from datetime import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import sparse
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
import random

import os
os.chdir("D:/Applied_Ai/Case Studies/Netflix Movie Recommendation")

<h3><font color=DarkMagenta>Creating Sparse Matrix From Dataset</font></h3>

<table>
<tr>
<td>
<img src='data_c.jpg' width='250px' align=left/>
</td>
<td>
<img src='arrow.jpg' width='60px' align=left/>
</td>
<td>
<img src='data_sparse_c.jpg' width='400px' align=left/>
</td>
</tr>
</table>

<p style="font-size:15px"><b><font color=SaddleBrown>Creating Sparse Matrix From Train Dataset</font></b></p>

In [7]:
if os.path.isfile('train_sparse_matrix.npz'):
    print("File is present, getting it from disk...")
    #just get it from the disk instead of computing it
    train_sparse_matrix = sparse.load_npz('train_sparse_matrix.npz')
    print("Done...")
else:
    print("We are creating sparse_matrix from the dataframe...")
    start = datetime.now()
    
    #create sparse_matrix and store it for after usage.
    #csr_matrix(data_values, (row_index, col_index), shape_of_matrix)
    #it should be in such a way that, MATRIX[row, col] = data
    train_sparse_matrix = sparse.csr_matrix((train_data.rating.values, (train_data.user.values,
                                               train_data.movie.values)),)
    
    print('Done... It\'s shape is : (user, movie) - ',train_sparse_matrix.shape)
    print('Saving it into disk for furthur usage...')
    #save it into disk
    sparse.save_npz("train_sparse_matrix.npz", train_sparse_matrix)
    print('Done...\n')
    print(datetime.now() - start)

File is present, getting it from disk...
Done...


<p style="font-size:15px"><b><font color=SaddleBrown>The Sparsity of Train Sparse Matrix</font></b></p>

In [8]:
user, movie = train_sparse_matrix.shape

ele = train_sparse_matrix.count_nonzero()

print("Sparsity of Train Matrix - {}%".format((1-(ele/(user*movie)))*100))

Sparsity of Train Matrix - 99.8292709259195%


<p style="font-size:15px"><b><font color=SaddleBrown>Creating Sparse Matrix From Test Dataset</font></b></p>

In [9]:
if os.path.isfile('test_sparse_matrix.npz'):
    print("File is present, getting it from disk...")
    #just get it from the disk instead of computing it
    test_sparse_matrix = sparse.load_npz('test_sparse_matrix.npz')
    print("Done...")
else: 
    print("We are creating sparse_matrix from the dataframe...")
    start = datetime.now()
    
    #create sparse_matrix and store it for after usage.
    #csr_matrix(data_values, (row_index, col_index), shape_of_matrix)
    #it should be in such a way that, MATRIX[row, col] = data
    test_sparse_matrix = sparse.csr_matrix((test_data.rating.values, (test_data.user.values,
                                               test_data.movie.values)))
    
    print('Done... It\'s shape is : (user, movie) - ',test_sparse_matrix.shape)
    print('Saving it into disk for furthur usage...')
    #save it into disk
    sparse.save_npz("test_sparse_matrix.npz", test_sparse_matrix)
    print('Done...\n')
    print(datetime.now() - start)

File is present, getting it from disk...
Done...


<p style="font-size:15px"><b><font color=SaddleBrown>The Sparsity of Test Sparse Matrix</font></b></p>

In [10]:
user, movie = test_sparse_matrix.shape

ele = test_sparse_matrix.count_nonzero()

print("Sparsity of Test Matrix - {}%".format((1-(ele/(user*movie)))*100))

Sparsity of Test Matrix - 99.95731772988694%


<h3><font color=DarkMagenta>Finding Global Average of All Movie Ratings,<br>
Average Rating Per User and Average Rating Per Movie</font></h3>

In [11]:
#get the user averages in dictionary (key: user_id/movie_id, value: avg rating)

def get_average_ratings(sparse_matrix, of_users):
    
    #average ratings of user/axes
    ax = 1 if of_users else 0    #1 - User axes, 0 - Movie axes

    #".A1" is for converting Column_Matrix to 1-D numpy array 
    sum_of_ratings = sparse_matrix.sum(axis=ax).A1
    #boolean matrix of ratings (whether a user rated that movie or not)
    is_rated = sparse_matrix != 0
    #no of ratings that each user OR movie
    no_of_ratings = is_rated.sum(axis=ax).A1
    
    #max_user and max_movie ids in sparse matrix 
    u, m = sparse_matrix.shape
    #creae a dictonary of users and their average ratigns..
    average_ratings = {i : sum_of_ratings[i]/no_of_ratings[i]
                                 for i in range(u if of_users else m) 
                                    if no_of_ratings[i] != 0}
    #return that dictionary of average ratings
    return average_ratings

<p style="font-size:15px"><b><font color=SaddleBrown>Finding Global Average of All Movie Ratings</font></b></p>

In [12]:
train_averages = dict()
#get the global average of ratings in our train set

train_global_average = train_sparse_matrix.sum()/train_sparse_matrix.count_nonzero()
train_averages['global'] = train_global_average

print(train_averages)

{'global': 3.582890686321557}


<p style="font-size:15px"><b><font color=SaddleBrown>Finding Average Rating Per User</font></b></p>

In [13]:
train_averages['user'] = get_average_ratings(train_sparse_matrix, of_users=True)

print('Average Rating of User 10 -',train_averages['user'][10])

Average Rating of User 10 - 3.3781094527363185


<p style="font-size:15px"><b><font color=SaddleBrown>Finding Average Rating Per Movie</font></b></p>

In [14]:
train_averages['movie'] = get_average_ratings(train_sparse_matrix, of_users=False)

print('Average Rating of Movie 15 -',train_averages['movie'][15])

Average Rating of Movie 15 - 3.3038461538461537


<h3><font color=DarkMagenta>Cold Start Problem</font></h3>

<p style="font-size:15px"><b><font color=SaddleBrown>Cold Start Problem With Users</font></b></p>

In [16]:
ratings_data = pd.read_csv("ratings_data.csv")

total_users = len(np.unique(ratings_data.user))
users_train = len(train_averages['user'])
new_users = total_users - users_train

print('TOTAL NUMBER OF USERS -', total_users)
print('NUMBER OF USERS IN TRAIN DATA -', users_train)
print("NUMBER OF USERS THAT DIDN'T APPEAR IN TRAIN DATA - {} ({}%)".format(new_users, np.round((new_users/total_users)*100,2)))

TOTAL NUMBER OF USERS - 480189
NUMBER OF USERS IN TRAIN DATA - 405041
NUMBER OF USERS THAT DIDN'T APPEAR IN TRAIN DATA - 75148 (15.65%)


> We might have to handle __new users__ (75148) who didn't appear in train data.

<p style="font-size:15px"><b><font color=SaddleBrown>Cold Start Problem With Movies</font></b></p>

In [17]:
total_movies = len(np.unique(ratings_data.movie))
movies_train = len(train_averages['movie'])
new_movies = total_movies - movies_train

print('TOTAL NUMBER OF MOVIES -', total_movies)
print('NUMBER OF MOVIES IN TRAIN DATA -', movies_train)
print("NUMBER OF MOVIES THAT DIDN'T APPEAR IN TRAIN DATA - {} ({}%)".format(new_movies, np.round((new_movies/total_movies)*100,2)))

TOTAL NUMBER OF MOVIES - 17770
NUMBER OF MOVIES IN TRAIN DATA - 17424
NUMBER OF MOVIES THAT DIDN'T APPEAR IN TRAIN DATA - 346 (1.95%)


> We might have to handle __346 movies__ (small comparatively) in test data.

<h3><font color=DarkMagenta>Computing Similarity Matrices</font></h3>

<p style="font-size:15px"><b><font color=SaddleBrown>Computing User-User Similarity Matrix</font></b></p>

- Calculating User-User Similarity Matrix is **not very easy** (_unless we have huge Computing Power and lots of time_), because of number of users.

    * You can try if you want to. Your system could crash or the program stops with **Memory Error**

* <p style="font-size:15px"><b><font color=DarkOliveGreen>Trying With All Dimensions (17k Dimensions Per User)</font></b></p>

In [18]:
from sklearn.metrics.pairwise import cosine_similarity

def compute_user_similarity(sparse_matrix, compute_for_few=False, top = 100, verbose=False, verb_for_n_rows = 20,
                            draw_time_taken=True):
    no_of_users = sparse_matrix.shape
    #get the indices of  non zero rows(users) from our sparse matrix
    row_ind, col_ind = sparse_matrix.nonzero()
    
    row_ind = sorted(set(row_ind)) #we don't have to
    time_taken = list() #time taken for finding similar users for an user
    
    #we create rows, cols, and data lists, which can be used to create sparse matrices
    rows, cols, data = list(), list(), list()
    if verbose: print("Computing Top",top,"Similarities For Each User..")
    
    start = datetime.now()
    temp = 0
    
    for row in row_ind[:top] if compute_for_few else row_ind:
        temp = temp+1
        prev = datetime.now()
        
        #get the similarity row for this user with all other users
        sim = cosine_similarity(sparse_matrix.getrow(row), sparse_matrix).ravel()
        #we will get only the top ''top'' most similar users and ignore rest of them..
        top_sim_ind = sim.argsort()[-top:]
        top_sim_val = sim[top_sim_ind]
        
        #add them to our rows, cols and data
        rows.extend([row]*top)
        cols.extend(top_sim_ind)
        data.extend(top_sim_val)
        time_taken.append(datetime.now().timestamp() - prev.timestamp())
        if verbose:
            if temp%verb_for_n_rows == 0:
                print("Computing Done For {} Users [Time Elapsed : {}]"
                      .format(temp, datetime.now()-start))
            
    #lets create sparse matrix out of these and return it
    if verbose: print('Creating Sparse Matrix From The Computed Similarities')
    #return rows, cols, data
    
    if draw_time_taken:
        plt.plot(time_taken, label = 'Time Taken For Each User')
        plt.plot(np.cumsum(time_taken), label='Total Time')
        plt.legend(loc='best')
        plt.xlabel('User')
        plt.ylabel('Time (Seconds)')
        plt.show()
        
    return sparse.csr_matrix((data, (rows, cols)), shape=(no_of_users, no_of_users)), time_taken      

In [23]:
start = datetime.now()
u_u_sim_sparse = compute_user_similarity(train_sparse_matrix, compute_for_few=True, top = 100, verbose=True)
print("-"*100)
print("Time Taken :",datetime.now()-start)

Computing Top 100 Similarities For Each User..

Computing Done For 20 Users [Time Elapsed : 0:03:20.300488]
Computing Done For 40 Users [Time Elapsed : 0:06:38.518391]
Computing Done For 60 Users [Time Elapsed : 0:09:53.143126]
Computing Done For 80 Users [Time Elapsed : 0:13:10.080447]
Computing Done For 100 Users [Time Elapsed : 0:16:24.711032]

Time Taken : 0:16:24.711032


* <p style="font-size:15px"><b><font color=DarkOliveGreen>Trying With Reduced Dimensions (Using TruncatedSVD For Dimensionality Reduction of User Vector)</font></b></p>

* We have  **405,041 users** in out training set and computing similarities between them (**17K dimensional vector**) is time consuming.

- From above plot, It took roughly __8.88 sec__ for computing simlilar users for __one user__
    
    
- We have __405,041 users__ with us in training set.


- ${ 405041 \times 8.88 = 3596764.08  \sec } =  59946.068 \min = 999.101133333 \text{ hours}
= 41.629213889 \text{ days}...$

    - Even if we run on 4 cores parallelly (a typical system now a days), It will still take almost __10 and 1/2__ days.
    
**IDEA:** Instead, we will try to reduce the dimentsions using SVD, so that __it might__ speed up the process.

**This is taking more time for each user than Original one.**

- It took almost __12.18__ for computing simlilar users for __one user__.
    
    
- We have __405041 users__ with us in training set.


- ${ 405041 \times 12.18 ==== 4933399.38 \sec } ====  82223.323 \min ==== 1370.388716667 \text{ hours}
==== 57.099529861 \text{ days}...$

    - Even we run on 4 cores parallelly (a typical system now a days), It will still take almost __(14 - 15) __ days.


<p style="font-size:15px"><font color=red>Is there any other way to compute user user similarity ?</font></p>

- An alternative is to compute similar users for a particular user,  whenenver required (**ie., Run time**)
    - We maintain a binary Vector for users, which tells us whether we already computed or not..
    - ***If not*** : 
        - Compute top (let's just say, 1000) most similar users for this given user, and add this to our datastructure, so that we can just access it(similar users) without recomputing it again.
        - 
    - ***If It is already Computed***:
        - Just get it directly from our datastructure, which has that information.
        - In production time, We might have to recompute similarities, if it is computed a long time ago. Because user preferences changes over time. If we could maintain some kind of Timer, which when expires, we have to update it ( recompute it ). 
        - 
    - ***Which datastructure to use:***
        - It is purely implementation dependant. 
        - One simple method is to maintain a **Dictionary Of Dictionaries**.
            - 
            - **key    :** _userid_ 
            - __value__: _Again a dictionary_
                - __key__  : _Similar User_
                - __value__: _Similarity Value_

<p style="font-size:15px"><b><font color=SaddleBrown>Computing Movie-Movie Similarity Matrix</font></b></p>

In [2]:
if not os.path.isfile('m_m_sim_sparse.npz'):
    print("It Seems You Don't Have That File. Computing Movie-Movie Similarity...")
    start = datetime.now()
    
    m_m_sim_sparse = cosine_similarity(X=train_sparse_matrix.T, dense_output=False)
    print("Done...")
    
    #store this sparse matrix in disk before using it. For future purposes.
    print("Saving File To Disk...")
    sparse.save_npz("m_m_sim_sparse.npz", m_m_sim_sparse)
    print("Done...")
    print(datetime.now() - start)
else:
    print("File Is There, We Will Get It...")
    m_m_sim_sparse = sparse.load_npz("m_m_sim_sparse.npz")
    print("Done...")

print("It's a",m_m_sim_sparse.shape,"Dimensional Matrix.")

File Is There, We Will Get It...
Done...
It's a (17771, 17771) Dimensional Matrix.


- Even though we have similarity measure of each movie, with all other movies, We generally don't care much about least similar movies.

- Most of the times, only top_xxx similar items matters. It may be 10 or 100.

- We take only those top similar movie ratings and store them  in a saperate dictionary.

In [4]:
movie_ids = np.unique(m_m_sim_sparse.nonzero()[1])

In [6]:
start = datetime.now()
similar_movies = dict()
for movie in movie_ids:
    #get the top similar movies and store them in the dictionary
    sim_movies = m_m_sim_sparse[movie].toarray().ravel().argsort()[::-1][1:]
    similar_movies[movie] = sim_movies[:100]
print(datetime.now() - start)

#just testing similar movies for movie_15
similar_movies[15]

0:00:33.411700


array([ 8279,  8013, 16528,  5927, 13105, 12049,  4424, 10193, 17590,
        4549,  3755,   590, 14059, 15144, 15054,  9584,  9071,  6349,
       16402,  3973,  1720,  5370, 16309,  9376,  6116,  4706,  2818,
         778, 15331,  1416, 12979, 17139, 17710,  5452,  2534,   164,
       15188,  8323,  2450, 16331,  9566, 15301, 13213, 14308, 15984,
       10597,  6426,  5500,  7068,  7328,  5720,  9802,   376, 13013,
        8003, 10199,  3338, 15390,  9688, 16455, 11730,  4513,   598,
       12762,  2187,   509,  5865,  9166, 17115, 16334,  1942,  7282,
       17584,  4376,  8988,  8873,  5921,  2716, 14679, 11947, 11981,
        4649,   565, 12954, 10788, 10220, 10963,  9427,  1690,  5107,
        7859,  5969,  1510,  2429,   847,  7845,  6410, 13931,  9840,
        3706])

<p style="font-size:15px"><b><font color=SaddleBrown>Finding Most Similar Movies Using Similarity Matrix</font></b></p>

**Does Similarity really works as the way we expected ?**<br>
_Let's pick some random movie and check for its similar movies..._

In [7]:
#first Let's load the movie details into see dataframe

movie_titles = pd.read_csv("movie_titles.csv", sep=',', header = None,
                           names=['movie_id', 'year_of_release', 'title'], verbose=True,
                      index_col = 'movie_id', encoding = "ISO-8859-1")

movie_titles.head()

Tokenization took: 15.57 ms
Type conversion took: 15.62 ms
Parser memory cleanup took: 0.00 ms


Unnamed: 0_level_0,year_of_release,title
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2003.0,Dinosaur Planet
2,2004.0,Isle of Man TT 2004 Review
3,1997.0,Character
4,1994.0,Paula Abdul's Get Up & Dance
5,2004.0,The Rise and Fall of ECW


<p style="font-size:15px"><b><font color=DarkOliveGreen>Similar Movies for 'Vampire Journals'</font></b></p>

In [9]:
mv_id = 67

print("Movie ----->",movie_titles.loc[mv_id].values[1])

print("It Has {} Ratings From Users.".format(train_sparse_matrix[:,mv_id].getnnz()))

print("We Have {} Movies Which Are Similar To This Movie & We Will Get Only Top Most.".format(m_m_sim_sparse[:,mv_id].getnnz()))

Movie -----> Vampire Journals

It Has 270 Ratings From Users.
We Have 17284 Movies Which Are Similar To This Movie & We Will Get Only Top Most.


In [None]:
similarities = m_m_sim_sparse[mv_id].toarray().ravel()

similar_indices = similarities.argsort()[::-1][1:]

similarities[similar_indices]

sim_indices = similarities.argsort()[::-1][1:] #it will sort and reverse the array and ignore its similarity (ie.,1)
                                               #and return its indices(movie_ids)

In [None]:
plt.plot(similarities[sim_indices], label='All The Ratings')
plt.plot(similarities[sim_indices[:100]], label='Top 100 Similar Movies')
plt.title("Similar Movies of {}(movie_id)".format(mv_id), fontsize=20)
plt.xlabel("Movies (Not Movie_Ids)", fontsize=15)
plt.ylabel("Cosine Similarity",fontsize=15)
plt.legend()
plt.show()

<p style="font-size:15px"><b><font color=DarkOliveGreen>Top 10 Similar Movies</font></b></p>

In [16]:
movie_titles.loc[sim_indices[:10]]

Unnamed: 0_level_0,year_of_release,title
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
323,1999,Modern Vampires
4044,1998,Subspecies 4: Bloodstorm
1688,1993,To Sleep With a Vampire
13962,2001,Dracula: The Dark Prince
12053,1993,Dracula Rising
16279,2002,Vampires: Los Muertos
4667,1996,Vampirella
1900,1997,Club Vampire
13873,2001,The Breed
15867,2003,Dracula II: Ascension


> Similarly, we can ___find similar users___ and compare how similar they are. 

<p style="font-size:20px;text-align:center"><b>For Machine Learning Models - Refer <a href="https://gitlab.com/akashbangalkar/netflix-movie-recommendation/-/blob/main/3_Netflix_Machine_Learning.ipynb">Part 3</a> Notebook</b></p>