### Recommendations with MovieTweetings: Most Popular Recommendation

#### Part I: Recommend the top n most popular movies without filters
#### Part II: Recommend the top n most popular movies, filtered by year of release and genre

##### Load libraries and data

In [31]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

# Load the cleaned movie and review tables, then discard the 'Unnamed: 0'
# column each CSV carries (presumably a saved index -- verify against the files).
movies = pd.read_csv('movies_clean.csv')
reviews = pd.read_csv('reviews_clean.csv')
movies = movies.drop(columns=['Unnamed: 0'])
reviews = reviews.drop(columns=['Unnamed: 0'])

#### Part I: The Most Popular Movies?

This is a simple recommendation: no matter who the user is, we provide a list of recommendations based purely on the most popular items.

For this task, we will consider what is most popular based on the following criteria:

* A movie with the highest average rating is considered best
* With ties, movies that have more ratings are better
* A movie must have a minimum of 5 ratings to be considered among the best movies
* If movies are tied in both their average rating and number of ratings, the movie with the most recent rating ranks higher

With these criteria, the goal for this notebook is to take a **user_id** and provide back the **n_top** recommendations.  

In [2]:
# Calculate the ranking criteria for every movie:
# average rating, number of ratings, and date of the most recent rating.
movie_ratings = reviews.groupby('movie_id')['rating']
avg_ratings = movie_ratings.mean()
num_ratings = movie_ratings.count()
# Aggregate only the 'date' column instead of taking the max of every column
# in reviews and then throwing all but one of them away.
last_rating = reviews.groupby('movie_id')['date'].max().to_frame('last_rating')

In [10]:
# Largest number of ratings received by any single movie
# (a direct max instead of sorting the whole Series to read its last element).
int(num_ratings.max())

3029

In [12]:
# Preview the latest rating date recorded for the first few movies
last_rating.head()

Unnamed: 0_level_0,last_rating
movie_id,Unnamed: 1_level_1
8,2014-04-08 18:20:11
10,2014-10-09 18:15:53
12,2015-08-10 23:16:19
25,2017-02-27 10:04:59
91,2013-11-23 18:59:55


In [13]:
 # Add Dates
# One row per movie_id: average rating and rating count side by side,
# with the date of the latest rating joined on as a third column.
rating_count_df = (
    pd.DataFrame({'avg_rating': avg_ratings, 'num_ratings': num_ratings})
    .join(last_rating)
)

In [15]:
# Preview the combined per-movie rating statistics
rating_count_df.head()

Unnamed: 0_level_0,avg_rating,num_ratings,last_rating
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
8,5.0,1,2014-04-08 18:20:11
10,10.0,1,2014-10-09 18:15:53
12,10.0,1,2015-08-10 23:16:19
25,8.0,1,2017-02-27 10:04:59
91,6.0,1,2013-11-23 18:59:55


In [16]:
 # merge with the movies dataset
# DataFrame.join defaults to a left join on the index, so every movie is kept;
# movies with no entry in rating_count_df get NaN in the rating columns.
movie_recs = movies.set_index('movie_id').join(rating_count_df)

In [17]:
# Preview the movie table with the rating statistics attached
movie_recs.head()

Unnamed: 0_level_0,movie,genre,date,1800's,1900's,2000's,History,News,Horror,Musical,...,Action,Documentary,Animation,Comedy,Short,Western,Thriller,avg_rating,num_ratings,last_rating
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,Edison Kinetoscopic Record of a Sneeze (1894),Documentary|Short,1894,1,0,0,0,0,0,0,...,0,1,0,0,1,0,0,5.0,1,2014-04-08 18:20:11
10,La sortie des usines Lumière (1895),Documentary|Short,1895,1,0,0,0,0,0,0,...,0,1,0,0,1,0,0,10.0,1,2014-10-09 18:15:53
12,The Arrival of a Train (1896),Documentary|Short,1896,1,0,0,0,0,0,0,...,0,1,0,0,1,0,0,10.0,1,2015-08-10 23:16:19
25,The Oxford and Cambridge University Boat Race ...,,1895,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,8.0,1,2017-02-27 10:04:59
91,Le manoir du diable (1896),Short|Horror,1896,1,0,0,0,0,1,0,...,0,0,0,0,1,0,0,6.0,1,2013-11-23 18:59:55


## Create a function that returns the movies ranked by all of these criteria

In [20]:
def create_ranked_df(movies, reviews, min_ratings=5):
    '''
    Rank movies from most to least popular.

    INPUT
    movies - the movies dataframe; needs a 'movie_id' column
    reviews - the reviews dataframe; needs 'movie_id', 'rating' and 'date' columns
    min_ratings - (optional, default 5) the minimum number of ratings a movie must
                  have to be included in the ranking

    OUTPUT
    ranked_movies - the movies dataframe indexed by movie_id with avg_rating, num_ratings
                    and last_rating columns added, keeping only movies with at least
                    min_ratings ratings, sorted by highest avg rating, then more ratings,
                    then most recent rating
    '''
    # Compute all three criteria in a single pass over the reviews; only the
    # 'rating' and 'date' columns are aggregated.
    rating_stats = reviews.groupby('movie_id').agg(
        avg_rating=('rating', 'mean'),
        num_ratings=('rating', 'count'),
        last_rating=('date', 'max'),
    )

    # Left join: movies without any review get NaN statistics here ...
    movie_recs = movies.set_index('movie_id').join(rating_stats)

    # ... and are dropped by this comparison (NaN >= min_ratings is False),
    # together with movies that have too few ratings.
    eligible = movie_recs[movie_recs['num_ratings'] >= min_ratings]

    # Best first: average rating, then number of ratings, then latest rating date
    ranked_movies = eligible.sort_values(
        ['avg_rating', 'num_ratings', 'last_rating'], ascending=False
    )

    return ranked_movies
    

### Create a function to recommend ranked movies to a specific user

In [21]:
def popular_recommendations(user_id, n_top, ranked_movies):
    '''
    Return the titles of the n_top best-ranked movies.

    INPUT:
    user_id - the user_id (str) of the individual you are making recommendations for
              (not read by this function: every user receives the same list)
    n_top - an integer of the number recommendations you want back
    ranked_movies - a pandas dataframe of the already ranked movies, best first,
                    with the titles in a 'movie' column

    OUTPUT:
    top_movies - a list of the n_top recommended movies by movie title in order best to worst
    '''
    # The frame is already ordered best-to-worst, so the leading titles are the answer.
    titles_best_first = ranked_movies['movie']
    top_movies = titles_best_first[:n_top].tolist()
    return top_movies
        

In [22]:
# Build the ranked movie table once; the recommendation calls below reuse it
ranked_movies = create_ranked_df(movies, reviews) 

In [24]:
# popular_recommendations does not read user_id, so these calls differ only in n_top.

# Top 20 movies recommended for id 1
recs_20_for_1 = popular_recommendations('1', 20, ranked_movies)

# Top 5 movies recommended for id 53968
recs_5_for_53968 = popular_recommendations('53968', 5, ranked_movies)

# Top 100 movies recommended for id 70000
recs_100_for_70000 = popular_recommendations('70000', 100, ranked_movies)

# Top 35 movies recommended for id 43
recs_35_for_43 = popular_recommendations('43', 35, ranked_movies)

In [30]:
# Show the 5 unfiltered recommendations
recs_5_for_53968

['MSG 2 the Messenger (2015)',
 'Avengers: Age of Ultron Parody (2015)',
 'Sorry to Bother You (2018)',
 'Selam (2013)',
 "Quiet Riot: Well Now You're Here, There's No Way Back (2014)"]

### Part II: Adding Filters

Now that you have created a function to give back the **n_top** movies, to make it a bit more robust we will add arguments that will act as filters for the movie **year** and **genre**.  

Results are filtered to only movies within the lists of provided years and genres (as `or` conditions).  If no list is provided, there should be no filter applied.



In [36]:
# Filter the ranked movies by year only: keep rows whose 'date' is 2010
ranked_movies[ranked_movies['date'].isin(['2010'])].head(3)

Unnamed: 0_level_0,movie,genre,date,1800's,1900's,2000's,History,News,Horror,Musical,...,Action,Documentary,Animation,Comedy,Short,Western,Thriller,avg_rating,num_ratings,last_rating
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1198101,Kites (2010),Action|Drama|Romance,2010,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,9.2,5,2017-09-16 17:30:31
1935828,Jim Jefferies Alcoholocaust (2010),Comedy,2010,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,9.0,5,2016-08-30 07:28:21
1375666,Inception (2010),Action|Mystery|Sci-Fi,2010,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,8.974185,736,2018-06-25 11:01:41


In [41]:
# Keep only the ranked movies whose 'date' is 2010
temp_2010 = ranked_movies[ranked_movies['date'].isin(['2010'])]
# Row-wise sum of the 'Drama' and 'Action' columns: a value above 0 means the
# movie is flagged for at least one of the two genres
temp_2010[['Drama', 'Action']].sum(axis=1)

movie_id
1198101    2
1935828    0
1375666    1
1255953    1
1188996    1
          ..
1459013    0
1313244    0
1392197    0
1316037    0
1523267    0
Length: 388, dtype: int64

In [43]:
# Filter for Drama or Action: keep the 2010 rows whose 'Drama' and 'Action'
# columns sum to more than 0
temp_2010.loc[temp_2010[['Drama', 'Action']].sum(axis=1) > 0,:].head(3)

Unnamed: 0_level_0,movie,genre,date,1800's,1900's,2000's,History,News,Horror,Musical,...,Action,Documentary,Animation,Comedy,Short,Western,Thriller,avg_rating,num_ratings,last_rating
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1198101,Kites (2010),Action|Drama|Romance,2010,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,9.2,5,2017-09-16 17:30:31
1375666,Inception (2010),Action|Mystery|Sci-Fi,2010,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,8.974185,736,2018-06-25 11:01:41
1255953,Incendies (2010),Drama|Mystery|War,2010,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,8.900826,363,2018-06-20 08:06:25


#### Create a function to return the top n popular movies with year and genre filters

In [44]:
def popular_recs_filtered(user_id, n_top, ranked_movies, years=None, genres=None):
    '''
    Return the titles of the n_top best-ranked movies, optionally restricted
    to given release years and/or genres.

    INPUT:
    user_id - the user_id (str) of the individual you are making recommendations for
              (not read by this function: every user receives the same list)
    n_top - an integer of the number recommendations you want back
    ranked_movies - a pandas dataframe of the already ranked movies based on avg rating,
                    count, and time; needs a 'movie' column, a 'date' column holding the
                    release year, and one column per genre name used in genres
    years - a list of years (strings or integers); a movie is kept if its year is any of
            them. None (the default) applies no year filter
    genres - a list of strings with genre column names; a movie is kept if it is flagged
             for any of them. None (the default) applies no genre filter

    OUTPUT:
    top_movies - a list of the n_top recommended movies by movie title in order best to worst
    '''
    # Year filter. Both sides are compared as strings so the match does not depend
    # on whether the 'date' column was read as integers or as text, nor on whether
    # the caller passed '2015' or 2015.
    if years is not None:
        wanted_years = [str(year) for year in years]
        year_match = ranked_movies['date'].astype(str).isin(wanted_years)
        ranked_movies = ranked_movies[year_match]

    # Genre filter ("or" semantics): keep rows where at least one requested genre
    # column is non-zero. list() matters because a tuple passed to [] would be
    # interpreted as a single column key rather than several columns.
    if genres is not None:
        num_genre_match = ranked_movies[list(genres)].sum(axis=1)
        ranked_movies = ranked_movies.loc[num_genre_match > 0, :]

    # The remaining rows keep their ranked order, so the leading titles are the answer
    top_movies = list(ranked_movies['movie'][:n_top])

    return top_movies


In [45]:
# Year and genre filters together:
# top 20 movies recommended for id 1 with years=['2015', '2016', '2017', '2018'], genres=['History']
recs_20_for_1_filtered = popular_recs_filtered('1', 20, ranked_movies, years=['2015', '2016', '2017', '2018'], genres=['History'])

# Year filter only (genres is left at its default of None):
# top 5 movies recommended for id 53968 with years=['2015', '2016', '2017', '2018']
recs_5_for_53968_filtered = popular_recs_filtered('53968', 5, ranked_movies, years=['2015', '2016', '2017', '2018'])

# Genre filter only (years is left at its default of None):
# top 100 movies recommended for id 70000 with genres=['History', 'News']
recs_100_for_70000_filtered = popular_recs_filtered('70000', 100, ranked_movies, genres=['History', 'News'])



In [46]:
# Show the 20 recommendations filtered by year and by the History genre
recs_20_for_1_filtered

['Ayla: The Daughter of War (2017)',
 'I Believe in Miracles (2015)',
 'The Farthest (2017)',
 'Sado (2015)',
 'Hatred (2016)',
 'Kincsem (2017)',
 'Nise - O Coração da Loucura (2015)',
 'LA 92 (2017)',
 'Straight Outta Compton (2015)',
 'Manjhi: The Mountain Man (2015)',
 'Only the Dead (2015)',
 'Spotlight (2015)',
 'Under sandet (2015)',
 'Airlift (2016)',
 'Dunkirk (2017)',
 'Taeksi Woonjunsa (2017)',
 'The Battleship Island (2017)',
 'Darkest Hour (2017)',
 'Best of Enemies (2015)',
 'The Ghazi Attack (2017)']

In [48]:
# Show the 5 recommendations filtered by year only
recs_5_for_53968_filtered

['MSG 2 the Messenger (2015)',
 'Avengers: Age of Ultron Parody (2015)',
 'Sorry to Bother You (2018)',
 'Make Like a Dog (2015)',
 'Pandorica (2016)']