In [46]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [50]:
df1 = pd.read_csv("movies.csv")
print(df1.head())

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  


In [52]:
df1.isna().sum()

movieId    0
title      0
genres     0
dtype: int64

In [54]:
import re

def clean_title(title):
    return re.sub("[^a-zA-Z0-9 ]", "", title)

In [56]:
df1['genres_list'] = df1['genres'].str.replace('|', ' ')
df1['clean_title'] = df1['title'].apply(clean_title)

movies_data = df1[['movieId', 'clean_title', 'genres_list']]
print(movies_data.head())

   movieId                       clean_title  \
0        1                    Toy Story 1995   
1        2                      Jumanji 1995   
2        3             Grumpier Old Men 1995   
3        4            Waiting to Exhale 1995   
4        5  Father of the Bride Part II 1995   

                                   genres_list  
0  Adventure Animation Children Comedy Fantasy  
1                   Adventure Children Fantasy  
2                               Comedy Romance  
3                         Comedy Drama Romance  
4                                       Comedy  


In [60]:
df2 = pd.read_csv("ratings.csv")
print(df2.head())

   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931


In [62]:
df2.isna().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [64]:
ratings_data = df2.drop(['timestamp'], axis=1)
print(ratings_data.head())

   userId  movieId  rating
0       1        1     4.0
1       1        3     4.0
2       1        6     4.0
3       1       47     5.0
4       1       50     5.0


In [66]:
combined_data = ratings_data.merge(movies_data, on='movieId')
print(combined_data.head())

   userId  movieId  rating              clean_title  \
0       1        1     4.0           Toy Story 1995   
1       1        3     4.0    Grumpier Old Men 1995   
2       1        6     4.0                Heat 1995   
3       1       47     5.0     Seven aka Se7en 1995   
4       1       50     5.0  Usual Suspects The 1995   

                                   genres_list  
0  Adventure Animation Children Comedy Fantasy  
1                               Comedy Romance  
2                        Action Crime Thriller  
3                             Mystery Thriller  
4                       Crime Mystery Thriller  


In [68]:
vectorizer_title = TfidfVectorizer(ngram_range=(1,2))

tfidf_title = vectorizer_title.fit_transform(movies_data['clean_title'])

def search_by_title(title):
    title = clean_title(title)
    query_vec = vectorizer_title.transform([title])
    similarity = cosine_similarity(query_vec, tfidf_title).flatten()
    indices = np.argpartition(similarity, -5)[-5:]
    results = movies_data.iloc[indices][::-1]
    return results

movie_results = search_by_title("Toy Story")
print(movie_results)

      movieId        clean_title  \
0           1     Toy Story 1995   
7355    78499   Toy Story 3 2010   
2355     3114   Toy Story 2 1999   
3595     4929       Toy The 1982   
4089     5843  Toy Soldiers 1991   

                                           genres_list  
0          Adventure Animation Children Comedy Fantasy  
7355  Adventure Animation Children Comedy Fantasy IMAX  
2355       Adventure Animation Children Comedy Fantasy  
3595                                            Comedy  
4089                                      Action Drama  


In [70]:
vectorizer_genres = TfidfVectorizer(ngram_range=(1,2))

tfidf_genres = vectorizer_genres.fit_transform(movies_data['genres_list'])

def search_similar_genres(genres):
    query_vec = vectorizer_genres.transform([genres])
    similarity = cosine_similarity(query_vec, tfidf_genres).flatten()
    indices = np.argpartition(similarity, -10)[-10:]
    results = movies_data.iloc[indices][::-1]
    return results

gen = 'Adventure Comedy'
print(search_similar_genres(gen))

      movieId                                        clean_title  \
732       952                   Around the World in 80 Days 1956   
3438     4686                         Weekend at Bernies II 1993   
2807     3752                              Me Myself  Irene 2000   
3449     4704                                        Hatari 1962   
6091    42009                        Cheaper by the Dozen 2 2005   
7682    89305                        Inbetweeners Movie The 2011   
3470     4734                Jay and Silent Bob Strike Back 2001   
7331    77841     St Trinians 2 The Legend of Frittons Gold 2009   
8010    97665  Asterix  Obelix God Save Britannia Astrix et O...   
8051    98697    Money Money Money Laventure cest laventure 1972   

           genres_list  
732   Adventure Comedy  
3438  Adventure Comedy  
2807  Adventure Comedy  
3449  Adventure Comedy  
6091  Adventure Comedy  
7682  Adventure Comedy  
3470  Adventure Comedy  
7331  Adventure Comedy  
8010  Adventure Comedy  
8

In [76]:
def scores_calculator(movie_id):
    #find the recommendations from users who like the same movie
    similar_users = combined_data[(combined_data['movieId']== movie_id) & (combined_data['rating']>=4)]['userId'].unique()
    similar_user_recs = combined_data[(combined_data['userId'].isin(similar_users)) & (combined_data['rating']>=4)]['movieId']
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    #print(similar_user_recs)
    
    #find the recommendations from all users who have watch the movies above
    all_users = combined_data[(combined_data['movieId'].isin(similar_user_recs.index)) & (combined_data['rating']>=4)]
    all_users_recs = all_users['movieId'].value_counts() / all_users['userId'].nunique()
    #print(all_users_recs)
    
    genres_of_selected_movie = combined_data[combined_data['movieId']==movie_id]['genres_list'].unique()
    genres_of_selected_movie = np.array2string(genres_of_selected_movie)
    movies_with_similar_genres = search_similar_genres(genres_of_selected_movie)
    
    indices = []
    for index in movies_with_similar_genres[(movies_with_similar_genres['movieId'].isin(similar_user_recs.index))]['movieId']:
        indices.append(index)
    
    #times a factor 1.5 to movies with similar genres and similar users
    similar_user_recs.loc[indices] = similar_user_recs.loc[indices]*1.5 

    #times a factor 0.9 to movies with similar genres and all users
    indices = []
    for index in movies_with_similar_genres[(movies_with_similar_genres['movieId'].isin(all_users_recs.index))]['movieId']:
        indices.append(index)
    all_users_recs.loc[indices] = all_users_recs.loc[indices]*0.9
    
    rec_percentages = pd.concat([similar_user_recs, all_users_recs], axis=1)
    rec_percentages.columns = ['similar', 'all']
    rec_percentages['score'] = rec_percentages['similar'] / rec_percentages['all']
    
    rec_percentages = rec_percentages.sort_values('score', ascending=False)
    return rec_percentages

scores_calculator(3114)

Unnamed: 0_level_0,similar,all,score
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3114,1.500000,0.082759,18.125000
5772,0.035714,0.003284,10.875000
6971,0.017857,0.001642,10.875000
102716,0.035714,0.003284,10.875000
6969,0.017857,0.001642,10.875000
...,...,...,...
339,0.035714,0.067323,0.530488
25,0.035714,0.070608,0.505814
454,0.035714,0.078818,0.453125
434,0.017857,0.039409,0.453125


In [78]:
def recommendation_results(user_input, title=0):
    # user_input = clean_title(user_input)
    title_candidates = search_by_title(user_input)
    movie_id = title_candidates.iloc[title]['movieId']
    scores = scores_calculator(movie_id)
    results = scores.head(10).merge(movies_data, left_index=True, right_on='movieId')[['clean_title', 'score', 'genres_list']]
    resutls = results.rename(columns={'clean_title': 'title', 'genres_list': 'genres'}, inplace=True)
    return results

user_input = "Toy Story"
print("Are you looking for (please choose a number): ")
for i in range(5):
    print(i, ": ", search_by_title(user_input)['clean_title'].iloc[i])

title = 0
if int(title) in range(5):
    print("We have following recommendations: ")
    print(recommendation_results(user_input, int(title)))
else:
    print("Sorry! please try again!")

Are you looking for (please choose a number): 
0 :  Toy Story 1995
1 :  Toy Story 3 2010
2 :  Toy Story 2 1999
3 :  Toy The 1982
4 :  Toy Soldiers 1991
We have following recommendations: 
                                             title     score  \
2809   Adventures of Rocky and Bullwinkle The 2000  6.904762   
0                                   Toy Story 1995  6.904762   
1706                                     Antz 1998  5.753968   
2355                              Toy Story 2 1999  5.671769   
2639             All the Vermeers in New York 1990  4.142857   
4835                             Dark Passage 1947  4.142857   
4746                                Red River 1948  4.142857   
4742  Beauty and the Beast La belle et la bte 1946  4.142857   
4740                      Birdman of Alcatraz 1962  4.142857   
4739                             Dark Victory 1939  4.142857   

                                           genres  
2809  Adventure Animation Children Comedy Fantasy  
0  