In [2]:
# Script dependencies
import os
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

# Importing data
movies = pd.read_csv('movies.csv', sep = ',')
ratings = pd.read_csv('ratings.csv')
movies.dropna(inplace=True)

In [14]:
# Drop duplicate movie titles?
movies.drop_duplicates( inplace=True)

In [15]:
movies.dropna(axis=0,inplace=True)

In [18]:
movies['title'] = movies['title'].astype('str')
movies['genres'] = movies['genres'].astype('str')

In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [42]:
def data_preprocessing(subset_size):
    """Prepare data for use within Content filtering algorithm.

    Parameters
    ----------
    subset_size : int
        Number of movies to use within the algorithm.

    Returns
    -------
    Pandas Dataframe
        Subset of movies selected for content-based filtering.

    """
    # Split genre data into individual words.
    movies['keyWords'] = movies['genres'].str.replace('|', ' ')
    # Subset of the data
    movies_subset = movies[:subset_size]
    return movies_subset
 
def content_model(movie_list,top_n=10): 
    """Performs Content filtering based upon a list of movies supplied
       by the app user.

    Parameters
    ----------
    movie_list : list (str)
        Favorite movies chosen by the app user.
    top_n : type
        Number of top recommendations to return to the user.

    Returns
    -------
    list (str)
        Titles of the top-n movie recommendations to the user.

    """
    # Initializing the empty list of recommended movies
    data = data_preprocessing(2700)
    # Instantiating and generating the count matrix
    #count_vec = CountVectorizer()
    #count_matrix = count_vec.fit_transform(data['keyWords'])
    tf = TfidfVectorizer(analyzer='word', ngram_range=(1,2),
                     min_df=0, stop_words='english')
    t_matrix = tf.fit_transform(data['keyWords'])
    indices = pd.Series(data['title'])
    cosine_sim = cosine_similarity(t_matrix, t_matrix)
    cosine_sim = pd.DataFrame(cosine_sim, index = data.index, columns = data.index)
    # Getting the index of the movie that matches the title
    idx_1 = indices[indices == movie_list[0]].index[0]
    idx_2 = indices[indices == movie_list[1]].index[0]
    idx_3 = indices[indices == movie_list[2]].index[0]
    # Creating a Series with the similarity scores in descending order
    rank_1 = cosine_sim[idx_1]
    rank_2 = cosine_sim[idx_2]
    rank_3 = cosine_sim[idx_3]
    # Calculating the scores
    score_series_1 = pd.Series(rank_1).sort_values(ascending = False)
    score_series_2 = pd.Series(rank_2).sort_values(ascending = False)
    score_series_3 = pd.Series(rank_3).sort_values(ascending = False)
    # Getting the indexes of the 10 most similar movies
    listings = score_series_1.append(score_series_2).append(score_series_3).sort_values(ascending = False)
    # Store movie names
    recommended_movies = []
    # Appending the names of movies
    top_50_indexes = list(listings.iloc[1:50].index)
    # Removing chosen movies
    top_indexes = np.setdiff1d(top_50_indexes,[idx_1,idx_2,idx_3])
    for i in top_indexes[:top_n]:
        recommended_movies.append(list(movies['title'])[i])
    return recommended_movies

In [34]:
user_row1 = {'userId': 500000, 'movieId': mov_ids[0], 'title': movie_list[0], 'rating': 5.0}
user_row2 = {'userId': 500000, 'movieId': mov_ids[1], 'title': movie_list[1], 'rating': 5.0}
user_row3 = {'userId': 500000, 'movieId': mov_ids[2], 'title': movie_list[2], 'rating': 5.0}


NameError: name 'mov_ids' is not defined

In [43]:
movie_list = ['Grumpier Old Men (1995)','Ace Ventura: When Nature Calls (1995)','Father of the Bride Part II (1995)']
content_model(movie_list,top_n=10)

  movies['keyWords'] = movies['genres'].str.replace('|', ' ')
  listings = score_series_1.append(score_series_2).append(score_series_3).sort_values(ascending = False)
  listings = score_series_1.append(score_series_2).append(score_series_3).sort_values(ascending = False)


['Sabrina (1995)',
 'Clueless (1995)',
 'Two if by Sea (1996)',
 'French Twist (Gazon maudit) (1995)',
 'Boomerang (1992)',
 'Forget Paris (1995)',
 'Nina Takes a Lover (1994)',
 'Only You (1994)',
 'Perez Family, The (1995)',
 "Pyromaniac's Love Story, A (1995)"]