## CROSS DOMAIN RECOMMENDER SYSTEM 

## A) Movie Recommender System Based on Matrix Factorization

In [1]:
import numpy as np
import pandas as pd
import warnings
import itertools

warnings.filterwarnings('ignore')

In [2]:
#reading csv file 
movies=pd.read_csv("ml-latest-small\\movies.csv")
user_ratings=pd.read_csv("ml-latest-small\\ratings.csv")
user_ratings = user_ratings.sort_values('userId').reset_index(drop= True)

In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
user_ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,2329,5.0,964983263
2,1,2338,2.0,964983546
3,1,2353,5.0,964983861
4,1,2366,4.0,964982462
...,...,...,...,...
100831,610,6387,3.5,1479542038
100832,610,6383,2.5,1493846084
100833,610,6378,3.5,1493844983
100834,610,6708,3.5,1493847441


In [5]:
def genre_array(str):
    return str.split('|')

movies['genre'] = movies['genres'].apply(genre_array)
del movies['genres']

movie_col = list(movies.columns)
movie_tags = movies['genre']
tag_table = [[token, idx] for idx, token in enumerate(set(itertools.chain.from_iterable(movie_tags)))]
tag_table = pd.DataFrame(tag_table)
tag_table.columns = ['Tag', 'Index']

tag_dummy = np.zeros([len(movies),len(tag_table)])


for i in range(len(movies)):
    for j in range(len(tag_table)):
        if tag_table['Tag'][j] in list(movie_tags[i]):
            tag_dummy[i, j] = 1

movies = pd.concat([movies, pd.DataFrame(tag_dummy)], 1)
movie_col.extend([string for string in tag_table['Tag']])
movies.columns = movie_col
del movies['genre']


In [6]:
movielens = pd.merge(user_ratings, movies, on = 'movieId')
movielens.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,Comedy,Adventure,(no genres listed),Film-Noir,Animation,...,Action,Fantasy,Thriller,Western,War,Mystery,Crime,Drama,Horror,Musical
0,1,1,4.0,964982703,Toy Story (1995),1.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5,1,4.0,847434962,Toy Story (1995),1.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,7,1,4.5,1106635946,Toy Story (1995),1.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,15,1,2.5,1510577970,Toy Story (1995),1.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,17,1,4.5,1305696483,Toy Story (1995),1.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
#creating a matrix where rows denote user_ids and columns are movie_titles
#value inside matrix tells rating given by ith user to jth movie
movie_matrix = movielens.pivot_table(index='userId', columns= 'title', values= 'rating')

In [8]:
movie_matrix.head()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,4.0,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [9]:
movie_matrix.fillna(0, inplace=True)
movie_matrix

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.5,3.5,0.0,0.0,0.0
609,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
#Normalize
matrix = movie_matrix.to_numpy()

In [11]:
matrix

array([[0. , 0. , 0. , ..., 0. , 4. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       ...,
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [4. , 0. , 0. , ..., 1.5, 0. , 0. ]])

user_mean = np.mean(matrix, axis=1)

In [12]:
user_mean = np.mean(matrix, axis=1)
#user_mean

In [13]:
user_mean.reshape(-1,1)
#print(user_mean)
matrix_normalized = matrix - user_mean.reshape(-1,1)
#matrix_normalized

In [14]:
from scipy.sparse.linalg import svds

In [15]:
# applying SVD 
#Number of latent factors are 50
U , sigma, Vt = svds(matrix_normalized, k=50) 

In [16]:
U

array([[ 1.79702754e-02, -2.02289781e-04,  1.27344112e-02, ...,
         3.36548791e-03,  6.21341567e-02, -5.96394935e-02],
       [ 5.05792483e-03,  1.14989528e-03,  1.52169080e-02, ...,
        -1.23387110e-03, -1.76739229e-02, -6.26340462e-03],
       [-8.03699918e-04, -4.97606323e-03, -6.71152809e-03, ...,
         6.77060830e-04,  2.03920206e-03, -6.52562721e-04],
       ...,
       [ 1.59151504e-01, -1.34247572e-01, -5.56697938e-02, ...,
        -1.46532716e-02,  1.22795675e-02, -1.18548986e-01],
       [-9.14476346e-03,  6.54592782e-03, -5.42432984e-03, ...,
        -4.09777679e-02,  1.40025038e-02, -8.56755529e-03],
       [-1.01612266e-02,  8.91566544e-03,  6.35516696e-02, ...,
         6.17956006e-02, -2.03171926e-01, -1.21424493e-01]])

In [17]:
sigma

array([ 67.86100092,  68.19375558,  69.02327882,  69.4115474 ,
        69.91738044,  70.01667134,  70.19162241,  71.67220718,
        72.43256726,  73.2171113 ,  73.43426562,  74.02458721,
        74.28860874,  74.91899813,  75.17387751,  75.58659454,
        76.69219203,  77.34671188,  78.38669451,  79.03824855,
        79.21035912,  80.55439795,  81.54451391,  82.19784757,
        83.04467757,  85.11359231,  85.74572954,  86.50266192,
        87.91501355,  90.33563347,  90.93545926,  92.25501877,
        93.40073261,  97.09241651,  99.30680373,  99.81069083,
       101.82703734, 105.97218685, 107.04965303, 109.2072465 ,
       112.81943799, 120.61017176, 122.64526202, 134.58585563,
       139.63194433, 153.93101542, 163.7303269 , 184.85931669,
       231.22391483, 474.18051734])

In [18]:
Vt

array([[-7.28835225e-04, -5.32529172e-03, -6.14803704e-03, ...,
         2.29985422e-03, -5.05860563e-03,  5.12905077e-04],
       [ 1.83956646e-03,  1.72666920e-03,  4.55698564e-06, ...,
         4.92095858e-03, -4.21453841e-03,  7.26613242e-04],
       [ 3.10717896e-03,  7.14727886e-04, -9.39768602e-04, ...,
        -1.65631653e-03, -3.34452997e-03, -1.13900950e-03],
       ...,
       [-8.99230900e-04, -1.68777954e-03, -1.57395371e-03, ...,
        -2.92995191e-03,  6.37146121e-03, -2.24975235e-03],
       [-3.61700664e-03, -3.02621359e-04, -7.38672679e-05, ...,
        -4.13277570e-03,  1.16128017e-02,  8.69309766e-05],
       [ 4.13098745e-03,  4.61167317e-03,  4.59998848e-03, ...,
         3.28534752e-03, -8.01192041e-03,  5.08737289e-03]])

In [19]:
#converting sigma to diagonal matrix
sigma = np.diag(sigma) 

In [20]:
sigma

array([[ 67.86100092,   0.        ,   0.        , ...,   0.        ,
          0.        ,   0.        ],
       [  0.        ,  68.19375558,   0.        , ...,   0.        ,
          0.        ,   0.        ],
       [  0.        ,   0.        ,  69.02327882, ...,   0.        ,
          0.        ,   0.        ],
       ...,
       [  0.        ,   0.        ,   0.        , ..., 184.85931669,
          0.        ,   0.        ],
       [  0.        ,   0.        ,   0.        , ...,   0.        ,
        231.22391483,   0.        ],
       [  0.        ,   0.        ,   0.        , ...,   0.        ,
          0.        , 474.18051734]])

### Making Predictions

In [21]:
all_user_predicted_ratings = np.dot(np.dot(U, sigma),Vt) + user_mean.reshape(-1,1)

In [22]:
col=movie_matrix.columns
print(col)
preds_df = pd.DataFrame(all_user_predicted_ratings, columns=col)

Index([''71 (2014)', ''Hellboy': The Seeds of Creation (2004)',
       ''Round Midnight (1986)', ''Salem's Lot (2004)',
       ''Til There Was You (1997)', ''Tis the Season for Love (2015)',
       ''burbs, The (1989)', ''night Mother (1986)',
       '(500) Days of Summer (2009)', '*batteries not included (1987)',
       ...
       'Zulu (2013)', '[REC] (2007)', '[REC]² (2009)',
       '[REC]³ 3 Génesis (2012)',
       'anohana: The Flower We Saw That Day - The Movie (2013)',
       'eXistenZ (1999)', 'xXx (2002)', 'xXx: State of the Union (2005)',
       '¡Three Amigos! (1986)', 'À nous la liberté (Freedom for Us) (1931)'],
      dtype='object', name='title', length=9719)


In [23]:
#610 users and predicted rating given by each user to each movie
#unrated movies have been rated now
preds_df

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
0,-0.067955,0.018620,-0.041533,-0.037173,-0.047273,-0.007202,0.549227,-0.008900,-0.607781,0.221628,...,0.014309,-0.459805,0.022586,-0.052623,-0.020094,0.346683,-0.284519,-0.186760,1.499991,0.034606
1,-0.028293,-0.011688,-0.010462,0.001095,-0.002724,-0.007396,0.004290,0.008886,0.150959,-0.005892,...,0.004586,-0.014558,-0.027255,-0.034335,0.016768,0.059210,-0.104489,-0.009522,0.057070,0.000111
2,0.023213,0.009783,0.013288,0.010796,0.010376,0.006465,0.091815,-0.002024,0.016746,-0.003368,...,0.007332,0.045168,0.027881,0.027486,0.006450,0.019643,0.000363,0.013496,0.052682,0.011861
3,-0.008667,0.006796,-0.014741,-0.005001,0.014988,-0.033562,-0.372983,0.009115,-0.204434,0.045343,...,-0.004028,-0.133861,-0.069741,-0.057557,0.004191,0.086672,-0.199954,-0.035476,0.019514,-0.005279
4,0.011838,-0.000451,-0.002750,-0.010783,-0.012690,-0.013753,-0.105477,0.000415,0.022140,-0.093820,...,0.001723,0.047886,0.011493,0.003101,-0.007398,-0.074235,-0.004574,0.044573,-0.091330,-0.001727
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
605,0.026390,0.004616,-0.014601,0.033709,0.089045,0.007444,-0.110581,0.006924,-0.574718,0.094196,...,0.013098,0.043257,0.029398,0.041606,-0.032459,0.055028,0.197673,-0.019672,0.220797,0.016067
606,0.009921,0.018065,0.006437,0.002908,-0.008254,-0.001745,0.233638,0.012677,0.095262,0.159115,...,-0.006811,0.470752,0.054922,0.015173,0.006059,0.416739,0.162553,0.065994,0.180711,0.032730
607,-0.013574,-0.106825,-0.078976,-0.027913,-0.045479,-0.065509,0.450852,-0.053325,-0.195999,0.084456,...,0.001211,0.314695,0.011948,0.000250,-0.041151,3.030528,2.409733,-0.000113,0.521413,-0.006187
608,0.000452,0.008797,0.009090,0.002931,-0.004531,0.007059,0.028937,0.000872,-0.072144,0.014957,...,0.004234,0.019807,0.012671,-0.002075,0.004310,0.011158,-0.005069,0.026851,0.023596,0.000213


In [24]:
# Recommendation for user id 1 in a sorted manner
preds_1 = preds_df.iloc[1,:].sort_values(ascending=False)

print(preds_1)
# Recommendations if user has not rated the movie previously
recommendations_1 = [i for i in preds_1.index if movie_matrix.loc[1,i]==0]



#Recommend only top 10 items
final_recommendations_1 = recommendations_1[:10]
print("Recommendations for user with id 1 are:" + ','.join(final_recommendations_1))



title
Inception (2010)                                           2.109118
Dark Knight, The (2008)                                    1.856802
Shawshank Redemption, The (1994)                           1.630202
Fight Club (1999)                                          1.517147
Shutter Island (2010)                                      1.462124
                                                             ...   
Lost in Translation (2003)                                -0.406932
Army of Darkness (1993)                                   -0.409852
Harry Potter and the Chamber of Secrets (2002)            -0.440486
Shrek 2 (2004)                                            -0.475239
Crouching Tiger, Hidden Dragon (Wo hu cang long) (2000)   -0.483155
Name: 1, Length: 9719, dtype: float64
Recommendations for user with id 1 are:Inception (2010),Dark Knight, The (2008),Shawshank Redemption, The (1994),Shutter Island (2010),Inglourious Basterds (2009),Dark Knight Rises, The (2012),Django Unchaine

### Evaluation Metrics

In [25]:
from sklearn.metrics import mean_squared_error
from math import sqrt

mse =  mean_squared_error(movie_matrix, preds_df, squared = True)
print("Mean Squared Error : " , mse)
rmse = mean_squared_error(movie_matrix, preds_df, squared = False)
print("Root Mean Squared Error : " , rmse)


Mean Squared Error :  0.09369380623776534
Root Mean Squared Error :  0.21498502535538283


#### Extracting genres

In [26]:
#reading dataset
all_movies=pd.read_csv("ml-latest-small\\movies.csv")

#extracting genres of top recommendations for user id =1
genres=[]      
for ele in final_recommendations_1:
    genres.append(all_movies[all_movies.eq(ele).any(1)].genres.astype(str))

print(genres)
    
unique_genres_1=[]

for genre in genres:
    ele=genre.to_numpy()
    f=ele[0].split('|')
    for value in f:
        unique_genres_1.append(value)
    
unique_genres_1 = list(dict.fromkeys(unique_genres_1)) 
    

print(unique_genres_1)

['Action', 'Crime', 'Drama', 'Mystery', 'Sci-Fi', 'Thriller', 'IMAX', 'War', 'Adventure', 'Western', 'Comedy']


## B) Book Recommender System using extracted genres

In [27]:
# Now, we have the top genres that should be recommended to the given user
# Using these genres in books dataset, we will recommend books to this user (here, userId=1)

books = pd.read_csv("booksummaries.txt", sep='\t', names = ['Wikipedia_ID', 'Freebase_ID', 'Title', 'Author', 'Pub_Date', 'Genre', 'Plot'])
#print(books)
books.dropna(subset= ['Genre'], inplace = True)
del books['Freebase_ID']
del books['Plot']
import ast
def getGenre(str):
    return list(ast.literal_eval(str).values())

books['Genre'] = books['Genre'].apply(getGenre)
books.reset_index(inplace = True, drop= True)


In [28]:
books = pd.DataFrame(books)
books_new = books.filter(['Title','Genre'], axis=1)
books_new



Unnamed: 0,Title,Genre
0,Animal Farm,"[Roman à clef, Satire, Children's literature, ..."
1,A Clockwork Orange,"[Science Fiction, Novella, Speculative fiction..."
2,The Plague,"[Existentialism, Fiction, Absurdist fiction, N..."
3,A Fire Upon the Deep,"[Hard science fiction, Science Fiction, Specul..."
4,All Quiet on the Western Front,"[War novel, Roman à clef]"
...,...,...
12836,The Third Lynx,[Science Fiction]
12837,Remote Control,"[Thriller, Fiction, Suspense]"
12838,Transfer of Power,"[Thriller, Fiction]"
12839,Decoded,[Autobiography]


In [29]:
import math
import re
from collections import Counter
from numpy import dot
from numpy.linalg import norm

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize



def findSimilarity(X_List, Y_List):
    # sw contains the list of stopwords
    sw = stopwords.words('english') 
    l1 =[];l2 =[]
  
    # remove stop words from the string
    X_set = {w for w in X_List if not w in sw} 
    Y_set = {w for w in Y_List if not w in sw}
  

    #print(X_set)
    #print(Y_set)

    # form a set containing keywords of both strings 
    rvector = X_set.union(Y_set) 

    #print(rvector)
    for w in rvector:
        if w in X_set: l1.append(1) # create a vector
        else: l1.append(0)
        if w in Y_set: l2.append(1)
        else: l2.append(0)
    c = 0

    # cosine formula 
    for i in range(len(rvector)):
            c+= l1[i]*l2[i]
    cosine = c / float((sum(l1)*sum(l2))**0.5)

    return cosine


X_list = np.array(unique_genres_1)

similar_books=[]
for i in range(12840):
    Y_list = books_new.iloc[i].to_numpy()[1]
    cosine_of_X_Y = findSimilarity(X_list, Y_list)
    #print(cosine_of_X_Y)
    similar_books.append([books_new.iloc[i].to_numpy()[0],cosine_of_X_Y])

#print(similar_books)

df = pd.DataFrame(similar_books, columns = ['Name', 'Similarity Score'])

#dataframe containing similarity score
similarity_values = df.filter(['Similarity Score'], axis=1)

actual_similarity_values=[]

for i in range(10):
    actual_similarity_values.append(1);

#dataframe containing ideal similarity values    
actual_similarity_values_df=  pd.DataFrame(actual_similarity_values, columns = ['Similarity Score'])

df.head()


Unnamed: 0,Name,Similarity Score
0,Animal Farm,0.0
1,A Clockwork Orange,0.0
2,The Plague,0.0
3,A Fire Upon the Deep,0.0
4,All Quiet on the Western Front,0.0


In [30]:
top_book_recommendations = df.sort_values(by='Similarity Score',ascending=False)
top_book_recommendations

Unnamed: 0,Name,Similarity Score
10803,Hold Tight,0.522233
6139,The Heralds,0.426401
3510,The Incredible Journey,0.426401
9688,The 5th Horseman,0.426401
2588,The Gun Seller,0.426401
...,...,...
4540,Amber and Iron,0.000000
4542,Boba Fett: Maze Of Deception,0.000000
4543,Boba Fett: Hunted,0.000000
4544,Boba Fett: A New Threat,0.000000


In [31]:
#Recommend only top 10 items
final_top_10_book_recommendations = top_book_recommendations.head(10)
final_top_10_book_recommendations

Unnamed: 0,Name,Similarity Score
10803,Hold Tight,0.522233
6139,The Heralds,0.426401
3510,The Incredible Journey,0.426401
9688,The 5th Horseman,0.426401
2588,The Gun Seller,0.426401
10073,Noite,0.426401
9946,No Second Chance,0.426401
11943,Miracle Cure,0.426401
7423,The Good Guy,0.426401
10192,This Can't Be Happening at Macdonald Hall,0.426401


In [32]:
print("Book Recommendations for user with id 1 are:" );
final_top_10_book_recommendations.Name

Book Recommendations for user with id 1 are:


10803                                   Hold Tight
6139                                   The Heralds
3510                        The Incredible Journey
9688                              The 5th Horseman
2588                                The Gun Seller
10073                                        Noite
9946                              No Second Chance
11943                                 Miracle Cure
7423                                  The Good Guy
10192    This Can't Be Happening at Macdonald Hall
Name: Name, dtype: object

### Evaluation Metrics

In [33]:
similarity_values = similarity_values.sort_values(by='Similarity Score',ascending=False).head(10)
mse =  mean_squared_error(actual_similarity_values_df,similarity_values , squared = True)
print("Mean Squared Error : " , mse)
rmse = mean_squared_error(actual_similarity_values_df, similarity_values, squared = False)
print("Root Mean Squared Error : " , rmse)

Mean Squared Error :  0.31893991845547454
Root Mean Squared Error :  0.5647476590969409


## General Function for Movie and Book Recommendations

In [34]:
#first, it recommends movies
#then, creates a list of genres of those top movie recommendations
#the, finds books of same genres(based on cosine similarity) and recommends top 10 such books

def recommend_movies_and_books(user_id):
    preds = preds_df.iloc[user_id,:].sort_values(ascending=False)

    # Recommendations if user has not rated the movie previuosly
    recommendations = [i for i in preds.index if movie_matrix.loc[user_id,i]==0]

    #Recommend only top 10 items
    final_recommendations = recommendations[:10]
    print("Movie Recommendations for given user are:\n" + '\n'.join(final_recommendations))
    
    #creating a list of genres of recommended movies
    genres=[]      
    for ele in final_recommendations:
        genres.append(all_movies[all_movies.eq(ele).any(1)].genres.astype(str))

    
    
    unique_genres=[]

    for genre in genres:
        ele=genre.to_numpy()
        f=ele[0].split('|')
        for value in f:
            unique_genres.append(value)
    
    unique_genres = list(dict.fromkeys(unique_genres)) 
    
    
    print("\nGenres liked by given user: " , unique_genres)
    
    
    #finding cosine similarity between the given genre subset with genres of all other books in dataset
    X_list1 = np.array(unique_genres)
    
    similar_books=[]
    
    for i in range(12840):
        Y_list1 = books_new.iloc[i].to_numpy()[1]
        cosine_of_X_Y = findSimilarity(X_list1, Y_list1)
        #print(cosine_of_X_Y)
        similar_books.append([books_new.iloc[i].to_numpy()[0],cosine_of_X_Y])
    
    
    df = pd.DataFrame(similar_books, columns = ['Name', 'Similarity Score'])
    df.head()
    top_book_recommendations = df.sort_values(by='Similarity Score',ascending=False)
    
    
    final_top_10_book_recommendations = top_book_recommendations.head(10)
    print("\nBook Recommendations for the given user are: \n" ,final_top_10_book_recommendations.Name)

In [37]:
recommend_movies_and_books(101)

Movie Recommendations for given user are:
Shawshank Redemption, The (1994)
Apollo 13 (1995)
True Lies (1994)
Crimson Tide (1995)
Die Hard: With a Vengeance (1995)
Aladdin (1992)
Firm, The (1993)
Speed (1994)
Outbreak (1995)
Lion King, The (1994)

Genres liked by given user:  ['Crime', 'Drama', 'Adventure', 'IMAX', 'Action', 'Comedy', 'Romance', 'Thriller', 'War', 'Animation', 'Children', 'Musical', 'Sci-Fi']

Book Recommendations for the given user are: 
 0                                             Animal Farm
8563                   Grania: She-King of the Irish Seas
8553                                      One Good Knight
8554                              Kai Lung's Golden Hours
8555                                    Descent into Hell
8556                                     The Last Empress
8557                                   Journey to a Woman
8558                                      Death of a Doxy
8559                                The Stone of Laughter
8560    The Shield