In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances

ratings=pd.read_csv("ratings.csv")
users=pd.read_csv("users.csv")
movies=pd.read_csv("movies.csv")
ratings.drop("timestamp",axis=1,inplace=True)
ratings.head()
movies.drop("x",axis=1,inplace=True)
users.drop("age",axis=1,inplace = True)
data = pd.merge(pd.merge(movies,ratings),users)


In [2]:
# Make a census of the genre keywords
genre_labels = set()
for s in movies['genre'].str.split('|').values:
    genre_labels = genre_labels.union(set(s))
genre_labels
# Function that counts the number of times each of the genre keywords appear
def count_word(dataset, ref_col, census):
    keyword_count = dict()
    for s in census: 
        keyword_count[s] = 0
    for census_keywords in dataset[ref_col].str.split('|'):        
        if type(census_keywords) == float and pd.isnull(census_keywords): 
            continue        
        for s in [s for s in census_keywords if s in census]: 
            if pd.notnull(s): 
                keyword_count[s] += 1
    # convert the dictionary in a list to sort the keywords by frequency
    keyword_occurences = []
    for k,v in keyword_count.items():
        keyword_occurences.append([k,v])
    keyword_occurences.sort(key = lambda x:x[1], reverse = True)
    return keyword_occurences, keyword_count

# Calling this function gives access to a list of genre keywords which are sorted by decreasing frequency
keyword_occurences, dum = count_word(movies, 'genre', genre_labels)
keyword_occurences[:5]

[['Drama', 1604],
 ['Comedy', 1200],
 ['Action', 504],
 ['Thriller', 494],
 ['Romance', 471]]

In [3]:
movies.genre = movies.genre.str.split('|')

movies.genre = movies.genre.fillna("").astype('str')

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(movies['genre'])
tfidf_matrix.shape
tfidf_matrix

<3885x128 sparse matrix of type '<class 'numpy.float64'>'
	with 9581 stored elements in Compressed Sparse Row format>

In [5]:
from sklearn.metrics.pairwise import linear_kernel
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim[:4, :4]

array([[1.        , 0.14195895, 0.09012667, 0.1056405 ],
       [0.14195895, 1.        , 0.        , 0.        ],
       [0.09012667, 0.        , 1.        , 0.17202271],
       [0.1056405 , 0.        , 0.17202271, 1.        ]])

In [6]:
# Build a 1-dimensional array with movie titles
titles = movies['title']
indices = pd.Series(movies.index, index=movies['title'])

# Function that get movie recommendations based on the cosine similarity score of movie genres
def genre_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:21]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

Collaborative Filtering

In [7]:
#content based function for new movie feature 
def genre_recommendations1(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:21]
    movie_indices = [i[0] for i in sim_scores]
    return movie_indices

In [8]:
Mean = ratings.groupby(by="user_id",as_index=False)['rating'].mean()
Rating_avg = pd.merge(ratings,Mean,on='user_id')
Rating_avg['adg_rating']=Rating_avg['rating_x']-Rating_avg['rating_y']
Rating_avg.head()

Unnamed: 0,user_id,movie_id,rating_x,rating_y,adg_rating
0,1,1193,5,4.188679,0.811321
1,1,661,3,4.188679,-1.188679
2,1,914,3,4.188679,-1.188679
3,1,3408,4,4.188679,-0.188679
4,1,2355,5,4.188679,0.811321


In [9]:
check = pd.pivot_table(Rating_avg,values='rating_x',index='user_id',columns='movie_id')
check.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,2.0,,,,,...,,,,,,,,,,


In [10]:
final = pd.pivot_table(Rating_avg,values='adg_rating',index='user_id',columns='movie_id')
final.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.811321,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,-1.146465,,,,,...,,,,,,,,,,


In [11]:
new_movie = [-5 for i in range(6040)]
final[3953] = new_movie
final.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,3944,3945,3946,3947,3948,3949,3950,3951,3952,3953
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.811321,,,,,,,,,,...,,,,,,,,,,-5
2,,,,,,,,,,,...,,,,,,,,,,-5
3,,,,,,,,,,,...,,,,,,,,,,-5
4,,,,,,,,,,,...,,,,,,,,,,-5
5,,,,,,-1.146465,,,,,...,,,,,,,,,,-5


In [12]:
# type(final)
# len(final) 
# new_movie = [-5 for i in range(6040)]
# final['3593'] = new_movie

# final.head()

In [13]:
# Replacing NaN by Movie Average
final_movie = final.fillna(final.mean(axis=0))


In [14]:
final_movie.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,3944,3945,3946,3947,3948,3949,3950,3951,3952,3953
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.811321,-0.324143,-0.440247,-0.698816,-0.419777,0.280853,-0.11491,-0.443714,-0.725926,0.000378,...,-1.175689,-1.844868,-1.228738,-0.084698,0.027156,0.549132,0.052553,0.356937,0.165338,-5
2,0.479497,-0.324143,-0.440247,-0.698816,-0.419777,0.280853,-0.11491,-0.443714,-0.725926,0.000378,...,-1.175689,-1.844868,-1.228738,-0.084698,0.027156,0.549132,0.052553,0.356937,0.165338,-5
3,0.479497,-0.324143,-0.440247,-0.698816,-0.419777,0.280853,-0.11491,-0.443714,-0.725926,0.000378,...,-1.175689,-1.844868,-1.228738,-0.084698,0.027156,0.549132,0.052553,0.356937,0.165338,-5
4,0.479497,-0.324143,-0.440247,-0.698816,-0.419777,0.280853,-0.11491,-0.443714,-0.725926,0.000378,...,-1.175689,-1.844868,-1.228738,-0.084698,0.027156,0.549132,0.052553,0.356937,0.165338,-5
5,0.479497,-0.324143,-0.440247,-0.698816,-0.419777,-1.146465,-0.11491,-0.443714,-0.725926,0.000378,...,-1.175689,-1.844868,-1.228738,-0.084698,0.027156,0.549132,0.052553,0.356937,0.165338,-5


In [15]:
# user similarity on replacing NAN by item(movie) avg
cosine = cosine_similarity(final_movie)
np.fill_diagonal(cosine, 0 )
similarity_with_movie = pd.DataFrame(cosine,index=final_movie.index)
similarity_with_movie.columns=final_movie.index
similarity_with_movie.head()

user_id,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.954602,0.972706,0.980836,0.918597,0.972286,0.983186,0.958635,0.971956,0.894384,...,0.952271,0.960182,0.974644,0.984564,0.862058,0.846426,0.938203,0.980347,0.96851,0.892837
2,0.954602,0.0,0.950468,0.95637,0.891822,0.947639,0.959516,0.932646,0.946847,0.869886,...,0.928126,0.935664,0.95211,0.961149,0.844912,0.837705,0.919718,0.956105,0.946421,0.868523
3,0.972706,0.950468,0.0,0.977399,0.911927,0.967648,0.97997,0.952631,0.964795,0.890417,...,0.946973,0.953354,0.967943,0.979904,0.856508,0.849043,0.935575,0.976067,0.965108,0.886402
4,0.980836,0.95637,0.977399,0.0,0.921675,0.975317,0.986827,0.962741,0.973144,0.89664,...,0.953546,0.962461,0.977528,0.987842,0.860341,0.85467,0.941936,0.983156,0.97007,0.89925
5,0.918597,0.891822,0.911927,0.921675,0.0,0.911106,0.925681,0.903995,0.908786,0.839097,...,0.897169,0.901016,0.916021,0.925133,0.817965,0.803381,0.877101,0.920718,0.909536,0.842621


In [16]:
def find_n_neighbours(df,n):
    order = np.argsort(df.values, axis=1)[:, :n]
    df = df.apply(lambda x: pd.Series(x.sort_values(ascending=False)
           .iloc[:n].index, 
          index=['top{}'.format(i) for i in range(1, n+1)]), axis=1)
    return df

In [17]:
# top 30 neighbours for each user
sim_user_30_m = find_n_neighbours(similarity_with_movie,30)
sim_user_30_m.head()

Unnamed: 0_level_0,top1,top2,top3,top4,top5,top6,top7,top8,top9,top10,...,top21,top22,top23,top24,top25,top26,top27,top28,top29,top30
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3324,4628,592,5007,283,5168,4073,1454,1986,2184,...,907,5694,4741,3730,4010,1568,277,2339,1307,4926
2,2467,2111,1454,592,367,2268,4628,124,4073,2388,...,5007,1669,1801,1986,5168,20,128,4741,1708,2339
3,3743,2111,5122,4010,1986,2268,3287,5168,213,2574,...,4881,3421,5068,4741,4537,4038,2538,4254,4665,298
4,446,5168,1236,3828,4073,2295,4176,907,4010,2729,...,4100,5781,2268,4288,5696,1245,455,5007,2184,298
5,5694,3899,1844,5309,782,4664,4628,5440,2633,277,...,1815,5007,298,1811,4558,276,681,5904,427,171


In [18]:
def User_item_score(user,item):
    a = sim_user_30_m[sim_user_30_m.index==user].values
    b = a.squeeze().tolist()
    c = final_movie.loc[:,item]
    d = c[c.index.isin(b)]
    f = d[d.notnull()]
    avg_user = Mean.loc[Mean['user_id'] == user,'rating'].values[0]
    index = f.index.values.squeeze().tolist()
    corr = similarity_with_movie.loc[user,index]
    fin = pd.concat([f, corr], axis=1)
    fin.columns = ['adg_score','correlation']
    fin['score']=fin.apply(lambda x:x['adg_score'] * x['correlation'],axis=1)
    nume = fin['score'].sum()
    deno = fin['correlation'].sum()
    final_score = avg_user + (nume/deno)
    return final_score

In [19]:
score = User_item_score(320,600)
print("score (u,i) is",score)

score (u,i) is 3.7986444904221344


In [20]:
Rating_avg = Rating_avg.astype({"movie_id": str})
Movie_user = Rating_avg.groupby(by = 'user_id')['movie_id'].apply(lambda x:','.join(x))

In [21]:
### Fuction to calculate the rating for a new movie for the neighbours who will be used to recommend movies to our user in collaborative filtering#
x = []
for i in final_movie:
    x.append(i)

def new_movie_rating(b):
    ind = []
    for i in final_movie:
        ind.append(i)
    #print(ind)
    replacable = [-5 for i in range(len(final_movie))]
    final_rat_list = []
    for i in b:
        x = final_movie.iloc[i].tolist()
        if(-5 in x):
            n = x.index(-5)
            sim_mov = genre_recommendations1(titles.iloc[n])
            #print(sim_mov)
            rat1 = []
            for j in sim_mov:
                try:
                    g = ind.index(j)
                    rat1.append(x[g])
                except:
                    continue
            try:
                av_rating = sum(rat1)/len(rat1)
                replacable[ind.index(i)] = av_rating
                final_rat_list.append(av_rating)
            except:
                continue
    final_movie.drop(3953,axis = 1,inplace = True)
    final_movie[3953] = replacable
    #print(replacable)




In [22]:
###Function to return the top recommended movie names



def User_item_score1(user):
    Movie_seen_by_user = check.columns[check[check.index==user].notna().any()].tolist()
    a = sim_user_30_m[sim_user_30_m.index==user].values
    b = a.squeeze().tolist()
    new_movie_rating(b)
    d = Movie_user[Movie_user.index.isin(b)]
    l = ','.join(d.values)
    Movie_seen_by_similar_users = l.split(',')
    Movies_under_consideration = list(set(Movie_seen_by_similar_users)-set(list(map(str, Movie_seen_by_user))))
    Movies_under_consideration = list(map(int, Movies_under_consideration))
    score = []
    for item in Movies_under_consideration:
        c = final_movie.loc[:,item]
        d = c[c.index.isin(b)]
        f = d[d.notnull()]
        avg_user = Mean.loc[Mean['user_id'] == user,'rating'].values[0]
        index = f.index.values.squeeze().tolist()
        corr = similarity_with_movie.loc[user,index]
        fin = pd.concat([f, corr], axis=1)
        fin.columns = ['adg_score','correlation']
        fin['score']=fin.apply(lambda x:x['adg_score'] * x['correlation'],axis=1)
        nume = fin['score'].sum()
        deno = fin['correlation'].sum()
        
        final_score = avg_user + (nume/deno)
        score.append(final_score)
    data = pd.DataFrame({'movie_id':Movies_under_consideration,'score':score})
    top_5_recommendation = data.sort_values(by='score',ascending=False).head(5)
    Movie_Name = top_5_recommendation.merge(movies, how='inner', on='movie_id')
    
    Movie_Names = Movie_Name.title.values.tolist()
    return Movie_Names
    


In [23]:
user = int(input("Enter the user id to whom you want to recommend : "))
mov = input("Enter the current movie:")
predicted_movies = User_item_score1(user)
print(" ")
print("The Recommendations for User Id : ",user)
print("   ")
list1 = genre_recommendations(mov).head(5)
l2=list(list1)
l2.extend(predicted_movies)
for i in l2:
    print(i)

 
The Recommendations for User Id :  157
   
To Die For (1995)
Kicking and Screaming (1995)
Big Bully (1996)
Last Summer in the Hamptons (1995)
Nobody Loves Me (Keiner liebt mich) (1994)
Usual Suspects The (1995)
Sixth Sense The (1999)
Rear Window (1954)
One Flew Over the Cuckoo's Nest (1975)
Third Man The (1949)


In [24]:
rat=pd.read_csv("ratings.csv")
rat.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [25]:
rat = rat.drop("timestamp",axis=1)

In [26]:
rat.head()

Unnamed: 0,user_id,movie_id,rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5


In [27]:
b=np.zeros(100000)
c=np.zeros(100000)
for i in range(100000):
    m=rat.user_id[i]
    n=rat.movie_id[i]
    b[i]=User_item_score(m,n)
    c[i]=rat.rating[i]

In [28]:
import sklearn
import math
mse = sklearn.metrics.mean_squared_error(b,c)
rmse = math.sqrt(mse)
print("Root mean squared error: ",rmse)

Root mean squared error:  0.9030916411116015
