In [295]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import linear_kernel
import re
from scipy.sparse.linalg import svds

plt.style.use('dark_background')

In [296]:
df_total = pd.read_csv('C:\\Users\\User\\Desktop\\Data_Science\\Projects\\HW_23_Great_Rec\\movies_metadata.csv', 
                       low_memory = False)

df_total.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

In [297]:
df_rating = pd.read_csv('C:\\Users\\User\\Desktop\\Data_Science\\Projects\\HW_23_Great_Rec\\ratings_small.csv',
                       low_memory = False)

df_rating.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100004 entries, 0 to 100003
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100004 non-null  int64  
 1   movieId    100004 non-null  int64  
 2   rating     100004 non-null  float64
 3   timestamp  100004 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [298]:
columns = ['id', 'title', 'overview', 'genres', 'vote_average', 'vote_count', 'popularity', 'release_date']
df_movies = df_total.loc[:][columns] # Columns selected for future recomendations

non_numeric_ids = df_movies[~df_movies['id'].str.isnumeric()]['id'].unique() # Check for not numeric values in ID column
print(f"Non-numeric IDs found: {non_numeric_ids}")




Non-numeric IDs found: ['1997-08-20' '2012-09-29' '2014-01-01']


In [299]:
df_movies.drop(df_movies[df_movies['id'].isin(non_numeric_ids)].index, inplace=True) # Drop not numeric values

In [300]:
df_movies['id'] = df_movies['id'].astype(int) # Data type trasform

In [301]:
df_movies = df_movies.drop_duplicates(subset='id') # Drop id duplicates

In [302]:
df_movies.head()

Unnamed: 0,id,title,overview,genres,vote_average,vote_count,popularity,release_date
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",7.7,5415.0,21.946943,1995-10-30
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",6.9,2413.0,17.015539,1995-12-15
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",6.5,92.0,11.7129,1995-12-22
3,31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...","[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",6.1,34.0,3.859495,1995-12-22
4,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...,"[{'id': 35, 'name': 'Comedy'}]",5.7,173.0,8.387519,1995-02-10


In [444]:
# Find unique movies id in movies and rating datasets
y_cross = np.array(df_rating['movieId'].unique())
x_cross = np.array(df_movies['id'].unique())

In [304]:
# Creates mask that find matches between movie and rating movies IDs
mask_movies_id = np.in1d(x_cross, y_cross)
mask_movies_id.shape

(45433,)

In [305]:
df_movies_final = df_movies[mask_movies_id] # Final movies dataset for recomendations analysis
df_movies_final.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2830 entries, 5 to 45450
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            2830 non-null   int32  
 1   title         2830 non-null   object 
 2   overview      2809 non-null   object 
 3   genres        2830 non-null   object 
 4   vote_average  2830 non-null   float64
 5   vote_count    2830 non-null   float64
 6   popularity    2830 non-null   object 
 7   release_date  2829 non-null   object 
dtypes: float64(2), int32(1), object(5)
memory usage: 187.9+ KB


In [306]:
df_movies_final.dropna(inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_movies_final.dropna(inplace = True)


In [307]:
df_rating_final = df_rating[df_rating['movieId'].isin(df_movies_final['id'])] # Final rating dataset for analysis

In [308]:
df_movies_final.head()

Unnamed: 0,id,title,overview,genres,vote_average,vote_count,popularity,release_date
5,949,Heat,"Obsessive master thief, Neil McCauley leads a ...","[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",7.7,1886.0,17.924927,1995-12-15
9,710,GoldenEye,James Bond must unmask the mysterious head of ...,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",6.6,1194.0,14.686036,1995-11-16
14,1408,Cutthroat Island,"Morgan Adams and her slave, William Shaw, are ...","[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",5.7,137.0,7.284477,1995-12-22
15,524,Casino,The life of the gambling paradise – Las Vegas ...,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",7.8,1343.0,10.137389,1995-11-22
16,4584,Sense and Sensibility,"Rich Mr. Dashwood dies, leaving his second wif...","[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",7.2,364.0,10.673167,1995-12-13


In [309]:
df_keys = pd.read_csv('C:\\Users\\User\\Desktop\\Data_Science\\Projects\\HW_23_Great_Rec\\keywords.csv',
                     low_memory = False)
df_keys.head()

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


# Recommend

# Title

## Overview Similarity

In [310]:
# Create vectorizer
vectorizer = TfidfVectorizer(stop_words='english')
overview_matrix = vectorizer.fit_transform(df_movies_final['overview'])

overview_sim = linear_kernel(overview_matrix, overview_matrix) # Overview similarity matrix

In [455]:
def overview_rec(title: str, nrec: int = 10):
    
      """
        The funcion that finds recomendations analysing overview similarity

        output: List of 'nrec' IDs for top recommended movies
      """
      
      title = title.capitalize()  
      movie_idx = df_movies_final[df_movies_final['title'] == title].index[0]

      title_sim = list(enumerate(overview_sim[movie_idx]))
      title_sim = sorted(title_sim, key = lambda x: x[1], reverse = True)

      movie_rec = title_sim[1:nrec+1]


      movie_rec = [i[0] for i in movie_rec]


      return movie_rec


In [439]:
print(overview_rec('Heat')) # Test

[871, 2768, 808, 1457, 746, 1800, 2763, 2409, 788, 980]


## Genre Similarity

In [456]:
def str_detect(df, column: str,  substr = ['name', '}'], new_column:str = None):

      """
        The funcion that extracts all substrings from each row of the column in dataframe that are located between the first and 
        the second substring defined. 

        output: List of 'nrec' IDs for top recommended movies
      """
    
      todos_genres = []
      for line in df[column]:

        start_search = [m.end() for m in re.finditer(substr[0], line)]
        end_search = [m.end() for m in re.finditer(substr[1], line)]


        genres_list = []
        for i, j in zip(start_search, end_search):
            genres_list.append(line[i+4:j-2])
        todos_genres.append(genres_list)

      df[new_column] = todos_genres
      df[new_column] = df['genre'].apply(lambda x: str(x).replace('[', '').replace(']', '').replace("'", '').replace('"', ''))

      return todos_genres

In [447]:
todos_generes = str_detect(df_movies_final, 'genres', new_column = 'genre')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[new_column] = todos_genres
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[new_column] = df['genre'].apply(lambda x: str(x).replace('[', '').replace(']', '').replace("'", '').replace('"', ''))


In [448]:
df_movies_final.head()

Unnamed: 0,id,title,overview,genres,vote_average,vote_count,popularity,release_date,genre,Adventure,...,History,Science Fiction,Documentary,Animation,Western,War,Crime,Action,Romance,Horror
5,949,Heat,"Obsessive master thief, Neil McCauley leads a ...","[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",7.7,1886.0,17.924927,1995-12-15,"Action, Crime, Drama, Thriller",0,...,0,0,0,0,0,0,1,1,0,0
9,710,GoldenEye,James Bond must unmask the mysterious head of ...,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",6.6,1194.0,14.686036,1995-11-16,"Adventure, Action, Thriller",1,...,0,0,0,0,0,0,0,1,0,0
14,1408,Cutthroat Island,"Morgan Adams and her slave, William Shaw, are ...","[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",5.7,137.0,7.284477,1995-12-22,"Action, Adventure",1,...,0,0,0,0,0,0,0,1,0,0
15,524,Casino,The life of the gambling paradise – Las Vegas ...,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",7.8,1343.0,10.137389,1995-11-22,"Drama, Crime",0,...,0,0,0,0,0,0,1,0,0,0
16,4584,Sense and Sensibility,"Rich Mr. Dashwood dies, leaving his second wif...","[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",7.2,364.0,10.673167,1995-12-13,"Drama, Romance",0,...,0,0,0,0,0,0,0,0,1,0


In [316]:
genre_matrix = vectorizer.fit_transform(df_movies_final['genre'], )

genre_sim = linear_kernel(genre_matrix, genre_matrix) # Full genres line similarity matrix

In [457]:
def genre_rec(title, nrec = 10):
    
      """
        The funcion that finds recomendations analysing genre line similarity

        output: List of 'nrec' IDs for top recommended movies
      """
      
      
      title = title.capitalize()  
      movie_idx = df_movies_final[df_movies_final['title'] == title].index[0]

      title_sim = list(enumerate(genre_sim[movie_idx]))
      title_sim = sorted(title_sim, key = lambda x: x[1], reverse = True)

      movie_rec = title_sim[1:nrec+1]


      movie_rec = [i[0] for i in movie_rec]


      return movie_rec

In [318]:
print(genre_rec('Heat')) Test

[295, 484, 489, 496, 612, 702, 943, 1059, 1130, 1411]


## Keywords Similarity

In [319]:
df_keys_final = df_keys[df_keys['id'].isin(df_movies_final['id'])]

In [450]:
all_keys = str_detect(df_keys_final, 'keywords', new_column = 'keywords')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[new_column] = todos_genres
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[new_column] = df['genre'].apply(lambda x: str(x).replace('[', '').replace(']', '').replace("'", '').replace('"', ''))


In [451]:
df_keys_final.head()

Unnamed: 0,id,keywords,genre
5,949,"robbery, detective, bank, obsession, chase, sh...","robbery, detective, bank, obsession, chase, sh..."
9,710,"cuba, falsely accused, secret identity, comput...","cuba, falsely accused, secret identity, comput..."
14,1408,"exotic island, treasure, map, ship, scalp, pirate","exotic island, treasure, map, ship, scalp, pirate"
15,524,"poker, drug abuse, 1970s, overdose, illegal pr...","poker, drug abuse, 1970s, overdose, illegal pr..."
16,4584,"bowling, based on novel, servant, country life...","bowling, based on novel, servant, country life..."


In [322]:
keys_vector = vectorizer.fit_transform(df_keys_final['keywords'])

keys_sim = linear_kernel(keys_vector, keys_vector) # Keywords line similarity matrix

In [458]:
def key_recomended(title, nrec = 10):
   
      """
        The funcion that finds recomendations analysing keywords line similarity

        output: List of 'nrec' IDs for top recommended movies
      """
      
      title = title.capitalize()  
      movie_idx = df_movies_final[df_movies_final['title'] == title]['id'].values[0]
      keys_idx = df_keys_final[df_keys_final['id'] == movie_idx].index[0]

      key_sim = list(enumerate(keys_sim[keys_idx]))
      key_sim = sorted(key_sim, key = lambda x: x[1], reverse = True)

      key_sim = key_sim[1:nrec+1]
      key_sim = [i[0] for i in key_sim]

      movies_id = [i for i in df_keys_final.iloc[key_sim]['id'].values]

      return movies_id


In [377]:
print(key_recomended('Heat')) #Test

[26686, 6166, 31035, 54272, 1266, 3021, 318, 299, 86059, 37736]


# BLANK

## Blank Start Genre

In [452]:
# Find all unique genres in dataset
flatten_genres = [item for sublist in todos_generes for item in sublist]
unique_genres = list(set(flatten_genres))
unique_genres

['Adventure',
 'Thriller',
 'Fantasy',
 'Family',
 'Mystery',
 'Music',
 'Foreign',
 'Drama',
 'Comedy',
 'TV Movie',
 'History',
 'Science Fiction',
 'Documentary',
 'Animation',
 'Western',
 'War',
 'Crime',
 'Action',
 'Romance',
 'Horror']

In [326]:
# Create binary columns for each genre in movies dataset
for genre in unique_genres:

    df_movies_final[genre] = df_movies_final['genre'].apply(lambda x: 1 if genre in x else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_movies_final[genre] = df_movies_final['genre'].apply(lambda x: 1 if genre in x else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_movies_final[genre] = df_movies_final['genre'].apply(lambda x: 1 if genre in x else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_movies_final[genre] = 

In [327]:
df_movies_final.head()

Unnamed: 0,id,title,overview,genres,vote_average,vote_count,popularity,release_date,genre,Adventure,...,History,Science Fiction,Documentary,Animation,Western,War,Crime,Action,Romance,Horror
5,949,Heat,"Obsessive master thief, Neil McCauley leads a ...","[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",7.7,1886.0,17.924927,1995-12-15,"Action, Crime, Drama, Thriller",0,...,0,0,0,0,0,0,1,1,0,0
9,710,GoldenEye,James Bond must unmask the mysterious head of ...,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",6.6,1194.0,14.686036,1995-11-16,"Adventure, Action, Thriller",1,...,0,0,0,0,0,0,0,1,0,0
14,1408,Cutthroat Island,"Morgan Adams and her slave, William Shaw, are ...","[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",5.7,137.0,7.284477,1995-12-22,"Action, Adventure",1,...,0,0,0,0,0,0,0,1,0,0
15,524,Casino,The life of the gambling paradise – Las Vegas ...,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",7.8,1343.0,10.137389,1995-11-22,"Drama, Crime",0,...,0,0,0,0,0,0,1,0,0,0
16,4584,Sense and Sensibility,"Rich Mr. Dashwood dies, leaving his second wif...","[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",7.2,364.0,10.673167,1995-12-13,"Drama, Romance",0,...,0,0,0,0,0,0,0,0,1,0


In [372]:
df_movies_final['popularity'] = df_movies_final['popularity'].apply(lambda x: float(x)) # Transform datatype

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_movies_final['popularity'] = df_movies_final['popularity'].apply(lambda x: float(x))


In [387]:
df_movies_final.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2808 entries, 5 to 45450
Data columns (total 29 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               2808 non-null   int32  
 1   title            2808 non-null   object 
 2   overview         2808 non-null   object 
 3   genres           2808 non-null   object 
 4   vote_average     2808 non-null   float64
 5   vote_count       2808 non-null   float64
 6   popularity       2808 non-null   float64
 7   release_date     2808 non-null   object 
 8   genre            2808 non-null   object 
 9   Adventure        2808 non-null   int64  
 10  Thriller         2808 non-null   int64  
 11  Fantasy          2808 non-null   int64  
 12  Family           2808 non-null   int64  
 13  Mystery          2808 non-null   int64  
 14  Music            2808 non-null   int64  
 15  Foreign          2808 non-null   int64  
 16  Drama            2808 non-null   int64  
 17  Comedy           2

In [459]:
def genre_wo_title_rec(genres: list[str], nrec= 10):
   
      """
        The funcion that finds recomendations analysing each genre from a list of genres indicated for the function

        output: List of 'nrec' IDs for top recommended movies
      """
    
      temp_movies = df_movies_final.copy()

      for genre in genres:
          genre = genre.capitalize()  
          temp_movies = temp_movies[temp_movies[genre] == 1]

      temp_movies = temp_movies.sort_values(by = ['release_date', 'popularity', 'vote_count','vote_average'], ascending = False)
      temp_movies = temp_movies.head(nrec)
      temp_movies = temp_movies.loc[:,['id']]  
        

      return [y for i in temp_movies.values for y in i]

In [433]:
print(genre_wo_title_rec(['drama', 'comedy'])) #Test

[84187, 98369, 62764, 91690, 91628, 74458, 100272, 64501, 27022, 31035]


## Blank Start Keywords

In [460]:
def key_recomended_blank(keys: list[str], nrec = 10):
    
      """
        The funcion that finds recomendations analysing each keywords from a list of keywords indicated for the function

        output: List of 'nrec' IDs for top recommended movies
      """
        
      temp_movies = df_keys_final.copy()

      for key in keys:
          temp_movies = temp_movies[temp_movies['genre'].str.contains(key, case = False, na = False)]

      temp_movies = temp_movies['id']
      movies_rec = df_movies_final[df_movies_final['id'].isin(temp_movies)]
      movies_rec = movies_rec.sort_values(by = ['release_date', 'popularity', 'vote_count','vote_average'], ascending = False)
      movies_rec = movies_rec.head(nrec)

      return [i for i in temp_movies.values]

In [435]:
print(key_recomended_blank(['Space'])) # Test

[568, 11, 2164, 62, 1891, 679, 1892, 348, 828, 830, 8851, 199, 152, 174, 172, 154, 168, 18, 782, 563, 8656, 846, 95, 841, 1091, 4437, 200, 1687, 8974, 2900, 861, 957, 926, 2067, 2102, 593, 698, 7450, 3980, 2210, 869, 6974, 1894, 608, 2103, 201, 667, 2789, 831, 25952, 7348, 1884, 8765, 2191, 7453, 1895, 25874, 5172, 1272, 1979, 2397, 33380, 27441, 3007, 8464, 26947, 78088, 30, 3786, 6436, 2791, 2154]


# USER

## User Similarity

In [333]:
df_rating_final.info()

<class 'pandas.core.frame.DataFrame'>
Index: 44823 entries, 10 to 99997
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   userId     44823 non-null  int64  
 1   movieId    44823 non-null  int64  
 2   rating     44823 non-null  float64
 3   timestamp  44823 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 1.7 MB


In [334]:
# Split rating dataset because of calculation complexity limitations
df_rating_final_train, df_rating_final_test = train_test_split(df_rating_final, test_size = 0.9, 
                                                               shuffle = True, random_state = 42)

In [335]:
df_rating_final_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4482 entries, 84983 to 34865
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   userId     4482 non-null   int64  
 1   movieId    4482 non-null   int64  
 2   rating     4482 non-null   float64
 3   timestamp  4482 non-null   int64  
dtypes: float64(1), int64(3)
memory usage: 175.1 KB


In [336]:
df_rating_final_train.head()

Unnamed: 0,userId,movieId,rating,timestamp
84983,570,37727,2.5,1475784380
88829,590,380,4.0,848677295
93695,624,1125,4.0,1019127479
80999,550,338,4.0,942669183
24081,173,260,5.0,875356589


In [337]:
user_matrix = df_rating_final_train.pivot(index = 'movieId', columns = 'userId', values = 'rating')
user_corr = user_matrix.corr(method = 'pearson') # Users' similarity matrix

In [461]:
def user_recommend(user, nrec = 10):
       
        """
        The funcion that finds recomendations analysing users similarities based on ratings for movies

        output: List of 'nrec' IDs for top recommended movies
        """
        
        user_vector = list(enumerate(user_corr[user]))
        user_vector = sorted(user_vector, key = lambda x: x[1], reverse = True)
        
        user_movies = df_rating_final[df_rating_final['userId'] == user]['movieId']
        
        sim_users = [i[0] for i in user_vector]
        
        movies_rec= []
        
        for sim in sim_users:
            if len(movies_rec) == nrec:
                    break
            else:
                    mask_1 = df_rating_final['userId'] == sim
                    mask_2 = df_rating_final['rating'] >= 4.0
                    
                    sim_movies = df_rating_final[mask_1 & mask_2]['movieId']
                    
                    for movie in sim_movies:
                            if len(movies_rec) == nrec:
                                    break
                            else:
                                    movies_rec.append(movie)
                                    
        movies_fin = df_movies_final[df_movies_final['id'].isin(movies_rec)]
        
        return movies_rec

In [339]:
print(user_recommend(user = 570)) # Test

[2105, 17, 110, 150, 153, 222, 253, 261, 265, 266]


## SVD

In [340]:
# Test to check how many rating has each user in rating dataset
us, cnts = np.unique(df_rating_final_train['userId'], return_counts= True)
print(list(zip(us, cnts)))

[(2, 3), (3, 2), (4, 21), (5, 5), (6, 4), (7, 2), (8, 4), (9, 2), (10, 4), (14, 1), (15, 53), (17, 21), (18, 3), (19, 22), (20, 7), (21, 8), (22, 13), (23, 32), (24, 1), (25, 4), (26, 9), (27, 1), (28, 3), (30, 48), (31, 2), (32, 3), (33, 6), (34, 7), (36, 8), (37, 1), (38, 2), (39, 5), (41, 7), (42, 1), (43, 3), (44, 1), (47, 2), (48, 21), (49, 4), (50, 2), (52, 1), (54, 1), (55, 2), (56, 21), (57, 17), (58, 1), (59, 2), (60, 1), (61, 9), (63, 3), (64, 2), (65, 3), (66, 2), (67, 5), (68, 3), (69, 2), (70, 4), (72, 8), (73, 57), (74, 1), (75, 10), (76, 2), (77, 7), (78, 13), (80, 3), (81, 10), (82, 3), (83, 7), (84, 3), (85, 11), (86, 22), (87, 1), (88, 10), (89, 2), (90, 2), (91, 3), (92, 12), (93, 5), (94, 8), (95, 18), (97, 7), (98, 3), (99, 11), (100, 2), (101, 5), (102, 33), (103, 7), (104, 1), (105, 20), (106, 2), (107, 1), (108, 2), (109, 3), (110, 10), (111, 11), (113, 2), (114, 2), (115, 1), (117, 2), (118, 10), (119, 35), (120, 7), (121, 8), (122, 5), (123, 1), (124, 2), (125

In [453]:
# Data preparation for SVD
user_movie_matrix = df_rating_final_train.pivot(index = 'userId', columns = 'movieId', values = 'rating')
col_mean = user_movie_matrix.mean(axis = 1)
user_movie_matrix.fillna(0, inplace = True)

user_movie_demean = user_movie_matrix - col_mean.values.reshape(-1, 1)

In [342]:
U, sigma, Vt = svds(user_movie_demean.values, k = 50) # SVD

In [343]:
# SVD predictions
sigma = np.diag(sigma)
user_movie_pred = np.dot(np.dot(U, sigma), Vt) + col_mean.values.reshape(-1, 1)
user_movie_pred_df = pd.DataFrame(user_movie_pred, columns = user_movie_matrix.columns, index = user_movie_matrix.index)

In [344]:
user_movie_pred_df.loc[641,[253]].sort_values(ascending = False) # Prediction test for a user and a movie

movieId
253    0.309333
Name: 641, dtype: float64

In [462]:
def svd_user_recommend(user: int, nrec = 10):
       
        """
        The funcion that finds recomendations based on SVD rating predictions

        output: List of 'nrec' IDs for top recommended movies
        """
                
        user_vector = user_movie_pred_df.loc[user]
        user_vector = user_vector.sort_values(ascending = False).index
#         print(user_vector)
        
        movies_id = user_vector
#         print(movies_id)
        user_movies = df_rating_final_train[df_rating_final_train['userId'] == user]['movieId']
        
        rec_movies = []
        
        for movie in movies_id:
                if len(rec_movies) == nrec:
                        break
                else:
                        if movie not in user_movies:
                                if len(rec_movies) < nrec:
                                        rec_movies.append(movie)
                                else:
                                        break
                                        
        return rec_movies

In [346]:
print(svd_user_recommend(user = 641)) # Test

[364, 1073, 500, 587, 1732, 586, 262, 4993, 953, 588]


In [347]:
user_movies = df_rating_final_train[df_rating_final_train['userId'] == 641] # Test
print(user_movies)

       userId  movieId  rating  timestamp
96245     641      587     4.0  841551394
96285     641     1381     3.0  850023798
96238     641      500     5.0  836533738
96281     641     1073     4.0  850021177
96265     641      802     4.0  855980932
96286     641     1391     3.0  856748022
96219     641      364     4.0  834636704
96199     641      262     5.0  834984658
96228     641      434     3.0  834636305


# FINAL RECOMMENDATION SYSTEM

In [464]:
class Complex_Recommendation:
        def __init__(self, title: str = None, user_id:int = None, genres: list[str] = None, keywords: list[str] = None, 
                     nrec:int = 10):
                self.title = title
                self.user_id = user_id
                self.genres = genres
                self.keywords = keywords
                self.nrec = nrec
                self.rec_list = None
                
        
        # Title approach 
        def title_process(self, title, nrec):
            
                """
                Complex function that finds recomendations by title with 3 different methods, join results and select 
                the top by popularity, vote count and vote average 

                output: Dataframe with 'nrec' recommended movies 
                """
            
            
                movies_overview = overview_rec(title, nrec)
                movies_genres = genre_rec(title, nrec)
                movies_keys = key_recomended(title, nrec)
                
                total_movies = movies_overview + movies_genres + movies_keys
                total_movies = set(total_movies)
                total_movies = list(total_movies)
                
                movies_rec = df_movies_final[df_movies_final['id'].isin(total_movies)]
                movies_rec = movies_rec.sort_values(by = ['popularity', 'vote_count', 'vote_average'], ascending = False)
                movies_rec = movies_rec[:self.nrec]

                return movies_rec
            
        # User approach
        def user_process(self, user_id, nrec):
            
                """
                Complex function that finds recomendations by user with 2 different methods, join results and select the 
                top by popularity, vote count and vote average 

                output: Dataframe with 'nrec' recommended movies 
                """
                
                movies_user = user_recommend(user_id, nrec)
                movies_svd = svd_user_recommend(user_id, nrec)
                
                
                total_movies = movies_user + movies_svd
                total_movies = set(total_movies)
                total_movies = list(total_movies)
                
                movies_rec = df_movies_final[df_movies_final['id'].isin(total_movies)]
                movies_rec = movies_rec.sort_values(by = ['popularity', 'vote_count', 'vote_average'], 
                                                    ascending = [False,False,False])
                movies_rec = movies_rec[:self.nrec]
            
            
                return movies_rec
            
        # Cold start approach with genres/keywords query   
        def blank_process(self, genres, keywords, nrec):
            
                """
                Complex function that finds recomendations by genres, keys or both methods, join results and select the 
                top by popularity, vote count and vote average 

                output: Dataframe with 'nrec' recommended movies 
                """
                
                if self.genres != None and self.keywords != None:
                        movies_genre_blank = genre_wo_title_rec(genres, nrec)
                        movies_key_blank = key_recomended_blank(keywords, nrec)

                        total_movies = [x for x in movies_key_blank if x in movies_genre_blank]
                        
                if self.keywords == None:
                        total_movies = genre_wo_title_rec(genres, nrec)
                    
                if self.genres == None:
                        total_movies = key_recomended_blank(keywords, nrec)

                        
                return df_movies_final[df_movies_final['id'].isin(total_movies[:nrec])]
            
        # Cold start with no data   
        def full_blank(self, nrec):
            
                """
                Function that finds recomendations with mo data selecting the top by release date, popularity, 
                vote count and vote average 

                output: Dataframe with 'nrec' recommended movies 
                """       
            
                movies_rec = df_movies_final.sort_values(by = ['release_date','popularity', 'vote_count', 'vote_average'], 
                                                    ascending = [False,False,False,False])
                movies_rec = movies_rec[:nrec]
                
                return movies_rec
                
                
        # Processing function        
        def fit(self):
                if self.title != None:
                    rec_movies = self.title_process(self.title, self.nrec)
                    
                elif self.user_id != None:
                    rec_movies = self.user_process(self.user_id, self.nrec)
                   
                
                elif self.genres != None or self.keywords != None:
                    rec_movies = self.blank_process(self.genres, self.keywords, self.nrec)
                    
                else:
                    rec_movies = self.full_blank(self.nrec)
                    
                
                self.rec_list = rec_movies

                
        # Function to return only title and overview for final result       
        def pred(self):
                return self.rec_list[['title', 'overview']]
                

In [443]:
# Test Cell
rec = Complex_Recommendation()
rec.fit()
rec.pred()


Unnamed: 0,title,overview
28213,Knight of Cups,Rick is a screenwriter living in Los Angeles. ...
23957,Sin City: A Dame to Kill For,Some of Sin City's most hard-boiled citizens c...
14765,Dante's Hell Animated,Dante's Hell Animated is a real and truthful e...
31927,Heart of the Country,With her husband jailed for Wall Street-based ...
22176,The Cosmonaut,What if you got back home... and there was nob...
20850,Mud,Two teenage boys encounter a fugitive and make...
20612,Stolen Seas,Stolen Seas presents a chilling exploration of...
20052,This Is 40,"Pete and Debbie are both about to turn 40, the..."
41171,Tchoupitoulas,A lyrical documentary that follows three adol...
26119,The End,A group of old friends gets together for a wee...
