In [1]:
from datetime import datetime 
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
import os

In [6]:
# Flatten the raw rating files into a single CSV, data.csv, whose rows are
# "<movieId>,<rest of the rating line>". In the raw files a line ending in ':'
# announces the movie id that applies to every following line until the next
# such header.
#
# The extraction is skipped when data.csv already exists. The output is first
# written to a temporary file and renamed only once every input was processed,
# so an interrupted run is never mistaken for a finished cache.
startTime=datetime.now()
if not os.path.isfile('data.csv'):
    files=['data/combined_data_1.txt','data/combined_data_2.txt','data/combined_data_3.txt','data/combined_data_4.txt']
    partialPath='data.csv.part'
    movieId=None  # no movie header seen yet
    with open(partialPath,mode='w') as data:
        for file in files:
            print("Reading file {}......".format(file))
            with open(file) as fileOpened:
                for line in fileOpened:
                    line=line.strip()
                    if not line:
                        # blank lines carry no rating; skip instead of emitting a broken row
                        continue
                    if line.endswith(':'):
                        movieId=line.replace(":","")
                    else:
                        if movieId is None:
                            raise ValueError("rating line found before any 'movieId:' header in {}".format(file))
                        # prefix the rating line with the movie it belongs to
                        data.write(movieId+','+line)
                        data.write("\n")
            print("file done ")
    # publish the finished file under the name the guard (and the loader) look for
    os.replace(partialPath,'data.csv')
print("total time taken is {} ".format(datetime.now()-startTime))

Reading file data/combined_data_1.txt......
file done 
Reading file data/combined_data_2.txt......
file done 
Reading file data/combined_data_3.txt......
file done 
Reading file data/combined_data_4.txt......
file done 
total time taken is 0:10:43.349587 


In [8]:
# Load the flattened ratings file. It has no header row, so the column names
# are supplied explicitly.
df = pd.read_csv(
    "data.csv",
    names=['movieId', 'user', 'rating', 'date'],
    sep=',',
)

In [9]:
# Preview the first rows to confirm the columns were parsed as expected.
df.head()

Unnamed: 0,movieId,user,rating,date
0,1,1488844,3,2005-09-06
1,1,822109,5,2005-05-13
2,1,885013,4,2005-10-19
3,1,30878,4,2005-12-26
4,1,823519,3,2004-05-03


In [12]:
# Headline sizes of the dataset: total ratings, distinct users, distinct movies.
n_ratings = len(df)
n_unique_users = len(np.unique(df.user))
n_unique_movies = len(np.unique(df.movieId))
summary_template = "No of ratings {} and no of unique users {} and no. of unique movies {}"
print(summary_template.format(n_ratings, n_unique_users, n_unique_movies))

No of ratings 100480507 and no of unique users 480189 and no. of unique movies 17770


Roughly 100M ratings from about 480K users across about 17.8K movies.

Checking for NaN values in the dataframe

In [13]:
# Per-column flag: True if the column contains at least one missing value.
df.isnull().any()

movieId    False
user       False
rating     False
date       False
dtype: bool

Creating the sparse matrix

https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.html

In [15]:
from scipy import sparse

In [17]:
# Build (or reload) the user x movie rating matrix in CSR form.
# The matrix is cached in sparse_matrix.npz; it is only recomputed when that
# file is absent.
start = datetime.now()
if not os.path.isfile('sparse_matrix.npz'):
    print("We are creating sparse_matrix from the dataframe..")
    # csr_matrix((values, (row_index, col_index))) places values[k] at
    # MATRIX[row_index[k], col_index[k]]; rows are user ids, columns movie ids.
    rating_values = df.rating.values
    user_rows = df.user.values
    movie_cols = df.movieId.values
    sparse_matrix = sparse.csr_matrix((rating_values, (user_rows, movie_cols)))

    print("Done. It's shape is : (user, movie) : ", sparse_matrix.shape)
    print('Saving it into disk for furthur usage..')
    # persist the matrix so later runs can skip the construction above
    sparse.save_npz("sparse_matrix.npz", sparse_matrix)
    print('Done..\n')
else:
    print("It is present in your pwd, getting it from disk....")
    # reuse the cached copy instead of rebuilding it
    sparse_matrix = sparse.load_npz('sparse_matrix.npz')
    print("DONE..")

print(datetime.now() - start)

We are creating sparse_matrix from the dataframe..
Done. It's shape is : (user, movie) :  (2649430, 17771)
Saving it into disk for furthur usage..
Done..

0:02:41.969563


In [18]:
# Sparsity = percentage of cells in the (rows x columns) matrix that hold no rating.
us, mv = sparse_matrix.shape
elem = sparse_matrix.count_nonzero()

filled_fraction = elem/(us*mv)
sparsity_percent = (1-(filled_fraction)) * 100
print("Sparsity Of Train matrix : {} % ".format(sparsity_percent))

Sparsity Of Train matrix : 99.78658865580644 % 


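### Why is the number of rows in the sparse matrix not the same as the number of unique users?

Because user ids are not contiguous. The matrix gets one row for every integer from 0 up to the largest user id, so every id that never occurs in the data still has a row, filled entirely with zeros.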

In [19]:
from sklearn.metrics.pairwise import cosine_similarity
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.cosine_similarity.html

In [20]:
# Movie-movie cosine similarity, cached in m_m_sim_sparse.npz.
# Transposing the user x movie matrix makes each row a movie, so the result is
# a (movies x movies) matrix; dense_output=False keeps it in sparse form.
start = datetime.now()
if os.path.isfile('m_m_sim_sparse.npz'):
    print("It is there, We will get it.")
    m_m_sim_sparse = sparse.load_npz("m_m_sim_sparse.npz")
    print("Done ...")
else:
    print("It seems you don't have that file. Computing movie_movie similarity...")
    # restart the clock right before the expensive computation
    start = datetime.now()
    movie_rows = sparse_matrix.T
    m_m_sim_sparse = cosine_similarity(X=movie_rows, dense_output=False)
    print("Done..")
    # keep a copy on disk so the similarity does not have to be recomputed
    print("Saving it to disk without the need of re-computing it again.. ")
    sparse.save_npz("m_m_sim_sparse.npz", m_m_sim_sparse)
    print("Done..")

print("It's a ",m_m_sim_sparse.shape," dimensional matrix")

print(datetime.now() - start)

It seems you don't have that file. Computing movie_movie similarity...
Done..
Saving it to disk without the need of re-computing it again.. 
Done..
It's a  (17771, 17771)  dimensional matrix
0:12:11.102959


Reading the movie titles from the movie CSV

In [23]:
# Load the movie metadata. The file has no header row, so column names are
# given here; movie_id becomes the index so titles can be looked up by id.
movieTitles = pd.read_csv(
    "movie_titles.csv",
    sep=",",
    header=None,
    names=['movie_id', 'year_of_release', 'title'],
    verbose=True,
    index_col='movie_id',
    encoding="ISO-8859-1",
)
movieTitles.head()

Tokenization took: 12.98 ms
Type conversion took: 686.16 ms
Parser memory cleanup took: 0.00 ms


Unnamed: 0_level_0,year_of_release,title
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2003.0,Dinosaur Planet
2,2004.0,Isle of Man TT 2004 Review
3,1997.0,Character
4,1994.0,Paula Abdul's Get Up & Dance
5,2004.0,The Rise and Fall of ECW


In [24]:
# Sanity check: the similarity matrix should be square (movies x movies).
m_m_sim_sparse.shape

(17771, 17771)

In [55]:
# Read the movie id to look up and convert it to an int right away: input()
# returns a string, while the sparse-matrix lookups that use mv_id need an
# integer index. A non-numeric entry raises ValueError here, at the source.
mv_id=int(input())

4342


In [43]:
# movieTitles.loc

In [56]:
# Look the title up by its movie_id label (the index of movieTitles) and by
# column name, rather than by row/column position, so the result does not
# depend on the ids being contiguous and sorted. Unknown ids raise KeyError.
movieName=movieTitles.loc[int(mv_id),'title']
print(movieName)

Batman: The Animated Series: Out of the Shadows


In [57]:
# mv_id may still be the raw string returned by input(); cast it so the column
# is selected with a real integer index.
print("\nIt has {} similar movies.".format(m_m_sim_sparse[:,int(mv_id)].getnnz()))
# getnnz() gives the count of stored non-zero entries (nnz) in that column.
# NOTE(review): this count presumably includes the movie's own self-similarity entry.


It has 17764 similar movies.


In [58]:
# Rank every movie by its similarity to the chosen one, most similar first.
movie_index = int(mv_id)  # mv_id may be the raw string from input()
similarities = m_m_sim_sparse[movie_index].toarray().ravel()

# argsort is ascending, so reverse it to get descending similarity.
ranked_indices = similarities.argsort()[::-1]

# Drop the movie itself by id instead of assuming it sorts first: ties or
# floating-point rounding can place another movie ahead of it.
sim_indices = ranked_indices[ranked_indices != movie_index]

# Same ranking under the second name this cell has always defined.
similar_indices = sim_indices

In [59]:
# Show the metadata of the ten highest-ranked similar movies.
top_similar_ids = sim_indices[:10]
movieTitles.loc[top_similar_ids]

Unnamed: 0_level_0,year_of_release,title
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
11967,1992.0,Batman: The Animated Series: The Legend Begins
2346,1992.0,Batman: The Animated Series: Tales of the Dark...
13650,1992.0,Batman the Animated Series: Secrets of the Cap...
7903,1992.0,Adventures of Batman & Robin: Poison Ivy/The P...
11088,1992.0,Adventures of Batman & Robin: The Joker/Fire &...
14017,1999.0,Batman Beyond: School Dayz / Spellbound
12770,1998.0,The Batman Superman Movie
3495,1993.0,Batman: Mask of the Phantasm
13835,1992.0,Batman the Animated Series: Vol. 2
7595,1992.0,Batman the Animated Series: Vol. 1


It is remarkable that the similarity matrix was never given any movie titles, yet all of the most similar movies turned out to be Batman titles.