### Most Popular Movie Simple Recommendation


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tests as t

%matplotlib inline

# Load the cleaned movie and review tables, discarding the 'Unnamed: 0'
# column that each CSV carries (presumably a saved index - confirm).
movies = pd.read_csv('movies_clean.csv').drop(columns='Unnamed: 0')
reviews = pd.read_csv('reviews_clean.csv').drop(columns='Unnamed: 0')


In [10]:
# Peek at the first rows of the reviews table.
reviews.head()
# Author's note: shape (863866, 5)

Unnamed: 0,user_id,movie_id,rating,timestamp,date
0,1,114508,8,1381006850,2013-10-05 21:00:50
1,2,208092,5,1586466072,2020-04-09 21:01:12
2,2,358273,9,1579057827,2020-01-15 03:10:27
3,2,10039344,5,1578603053,2020-01-09 20:50:53
4,2,6751668,9,1578955697,2020-01-13 22:48:17


In [4]:
# Number of (non-null) ratings submitted by user 2
reviews.loc[reviews['user_id'] == 2, 'rating'].count()

10

In [5]:
# Average rating given to movie 10039344
reviews.loc[reviews['movie_id'] == 10039344, 'rating'].mean()

5.1226993865030677

In [11]:
#full_movies = pd.merge(reviews,movies,on='movie_id')

In [3]:
#full_movies.head()

In [5]:
#last_rate = pd.DataFrame(reviews.groupby('movie_id').max()['date'])

In [6]:
# Every review row written by user 3
reviews.loc[reviews['user_id'] == 3]

Unnamed: 0,user_id,movie_id,rating,timestamp,date
11,3,790636,8,1391207279,2014-01-31 22:27:59
12,3,1800241,7,1388955438,2014-01-05 20:57:18
13,3,2278871,8,1383419733,2013-11-02 19:15:33
14,3,2395417,8,1388170007,2013-12-27 18:46:47
15,3,3344922,8,1422652427,2015-01-30 21:13:47


#### To Find The Most Popular Movies

A movie is considered "most popular" based on the following criteria:

* A movie with the highest average rating is considered best
* With ties, movies that have more ratings are better
* A movie must have a minimum of 5 ratings to be considered among the best movies
* If movies are tied in both their average rating and number of ratings, the movie with the most recent rating ranks higher

With these criteria, the goal is to take a **user_id** and provide back the **n_top** recommendations. 

In [7]:
def ranked_df(movies, reviews, min_ratings=5):
    '''
        Rank movies from most to least popular using per-movie review statistics.

        INPUT
        movies - the movies dataframe; must contain a 'movie_id' column
        reviews - the reviews dataframe; must contain 'movie_id', 'rating' and 'date' columns
        min_ratings - minimum number of ratings a movie needs to be kept (default 5,
                      i.e. "more than 4 ratings")

        OUTPUT
        ranked_movies - the movies dataframe indexed by movie_id with the extra columns
                        'avg_rating', 'num_ratings' and 'last_rate', sorted in descending
                        order by average rating, then number of ratings, then latest
                        rating date, and restricted to movies with at least
                        min_ratings ratings
    '''
    reviews_by_movie = reviews.groupby('movie_id')
    ratings_by_movie = reviews_by_movie['rating']

    # Select the 'date' column BEFORE aggregating so only that column is reduced;
    # taking .max() over every column and then picking 'date' does needless work.
    # NOTE(review): 'date' is compared as stored - if it holds strings this relies
    # on a sortable format such as 'YYYY-MM-DD HH:MM:SS'.
    rating_count_df = pd.DataFrame(
        {
            'avg_rating': ratings_by_movie.mean(),
            'num_ratings': ratings_by_movie.count(),
            'last_rate': reviews_by_movie['date'].max(),
        },
        columns=['avg_rating', 'num_ratings', 'last_rate'],
    )

    # Attach the statistics to the movie metadata; movies without any review get NaN.
    movies_new = movies.set_index('movie_id').join(rating_count_df)

    # Best first: highest average, then most ratings, then most recently rated.
    ranked_movies = movies_new.sort_values(
        ['avg_rating', 'num_ratings', 'last_rate'], ascending=False
    )

    # NaN counts (unreviewed movies) compare False and are dropped here as well.
    return ranked_movies[ranked_movies['num_ratings'] >= min_ratings]

In [61]:
# Rank every movie and display the table to check the ordering by eye.
x = ranked_df(movies, reviews)
x

Unnamed: 0_level_0,movie,genre,Date,1800's,1900's,2000's,Animation,Music,Reality-TV,Documentary,...,Mystery,Film-Noir,History,Comedy,Horror,Musical,Biography,avg_rating,num_ratings,last_rate
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4921860,MSG 2 the Messenger (2015),Comedy|Drama|Fantasy|Horror,2015,0,0,1,0,0,0,0,...,0,0,0,1,1,0,0,10.000000,48,2016-08-14 17:16:50
5262972,Avengers: Age of Ultron Parody (2015),Short|Comedy,2015,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,10.000000,28,2016-01-08 00:44:43
6662050,Five Minutes (2017),Short|Comedy,2017,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,10.000000,22,2019-04-20 22:29:19
2737018,Selam (2013),Drama|Romance,2013,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,10.000000,10,2015-05-10 22:56:01
5804314,Let There Be Light (2017),Drama,2017,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,10.000000,7,2019-12-25 16:27:47
2560840,"Quiet Riot: Well Now You're Here, There's No W...",Documentary|Music,2014,0,0,1,0,1,0,1,...,0,0,0,0,0,0,0,10.000000,6,2016-01-23 00:30:44
2219210,Crawl Bitch Crawl (2012),Horror|Sci-Fi|Thriller,2012,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,10.000000,6,2013-07-22 23:30:52
9882084,Chasing Happiness (2019),Documentary,2019,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,10.000000,5,2019-06-29 03:37:46
4448444,Make Like a Dog (2015),Short|Comedy|Drama,2015,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,10.000000,5,2017-09-09 13:51:48
5131914,Pandorica (2016),Sci-Fi,2016,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,10.000000,5,2016-04-04 05:52:22


In [49]:
def popular_recommendations(user_id, n_top, ranked_movies):
    '''
    Return the titles of the n_top best-ranked movies.

    INPUT:
    user_id - the user_id of the individual you are making recommendations for
              (accepted for interface purposes; the result does not depend on it)
    n_top - an integer of the number recommendations you want back
    ranked_movies - a dataframe with a 'movie' title column, already ordered
                    from best to worst
    OUTPUT:
    top_movies - a list of the n_top recommended movies by movie title in order best to worst
    '''
    titles = ranked_movies['movie']
    # Positional slice: take the first n_top rows regardless of index labels.
    best_titles = titles.iloc[:n_top]
    return list(best_titles)

In [44]:
# Build the ranked table once and reuse it for every recommendation call below.
ranked_movies = ranked_df(movies, reviews) # only run this once - it is not fast

In [52]:
# Top 20 movies recommended for user id 1.
# popular_recommendations does not use user_id, so every user gets this same list.
recs_20_for_1 = popular_recommendations(1, 20, ranked_movies)
recs_20_for_1

['MSG 2 the Messenger (2015)',
 'Avengers: Age of Ultron Parody (2015)',
 'Five Minutes (2017)',
 'Selam (2013)',
 'Let There Be Light (2017)',
 "Quiet Riot: Well Now You're Here, There's No Way Back (2014)",
 'Crawl Bitch Crawl (2012)',
 'Chasing Happiness (2019)',
 'Make Like a Dog (2015)',
 'Pandorica (2016)',
 'Third Contact (2011)',
 'Romeo Juliet (2009)',
 'Be Somebody (2016)',
 'Birlesen Gonuller (2014)',
 'Kitbull (2019)',
 'Agnelli (2017)',
 'Sátántangó (1994)',
 'Foster (2011)',
 'CM101MMXI Fundamentals (2013)',
 'Crystal Lake Memories: The Complete History of Friday the 13th (2013)']

### Adding Filters

Add optional arguments that filter the recommendations by movie **year** and **genre**.  
Example:
```
popular_recs_filtered('1', 20, ranked_movies, years=['2015', '2016', '2017', '2018'], genres=['History'])

```



In [91]:
def popular_recs_filtered(user_id, n_top, ranked_movies, years=None, genres=None):
    '''
    Return the titles of the n_top best-ranked movies, optionally restricted
    by year and/or genre.

    INPUT:
    user_id - the user_id of the individual you are making recommendations for
              (accepted for interface purposes; the result does not depend on it)
    n_top - an integer of the number recommendations you want back
    ranked_movies - a dataframe ordered from best to worst with a 'movie' title
                    column, a 'Date' column and one indicator column per genre
    years - optional year or list of years to keep, given as strings or numbers
            (e.g. ['2015', '2016'] or [2015, 2016]); None disables the filter
    genres - optional genre column name or list of names; a movie is kept when
             at least one of the listed genre columns is non-zero; None disables
             the filter
    OUTPUT:
    top_movies - a list of the n_top recommended movies by movie title in order best to worst
    '''
    if years is not None:
        # Accept a single year as well as a list of years.
        if isinstance(years, (str, int, float)):
            years = [years]
        # Compare on a normalised key so that '2015', 2015 and 2015.0 all match,
        # whatever dtype the 'Date' column was loaded with. A plain isin() does
        # not match strings against an integer column and would return nothing.
        wanted_years = sorted({_year_key(year) for year in years})
        year_mask = ranked_movies['Date'].map(_year_key).isin(wanted_years)
        ranked_movies = ranked_movies[year_mask]

    if genres is not None:
        # Accept a single genre name as well as a list of names.
        if isinstance(genres, str):
            genres = [genres]
        # Keep movies flagged in at least one of the requested genre columns.
        num_genre_match = ranked_movies[list(genres)].sum(axis=1)
        ranked_movies = ranked_movies.loc[num_genre_match > 0, :]

    # Positional slice: first n_top rows of the (already ranked) frame.
    top_movies = list(ranked_movies['movie'].iloc[:n_top])

    return top_movies


def _year_key(value):
    '''Normalise a year given as str/int/float to a canonical string such as '2015'.'''
    try:
        number = float(value)
    except (TypeError, ValueError):
        # Not numeric at all: fall back to its plain string form.
        return str(value)
    # Whole numbers collapse to their integer spelling; NaN/inf/fractions stay as-is.
    return str(int(number)) if number.is_integer() else str(value)

In [92]:
# Top 10 ranked movies flagged as History whose Date is one of 2015-2018.
popular_recs_filtered('1', 10, ranked_movies, years=['2015', '2016', '2017', '2018'], genres=['History'])

["Hillary's America: The Secret History of the Democratic Party (2016)",
 'I Believe in Miracles (2015)',
 'O.J.: Made in America (2016)',
 'Ayla: The Daughter of War (2017)',
 'Hacksaw Ridge (2016)',
 'They Shall Not Grow Old (2018)',
 'Namhansanseong (2017)',
 'The Farthest (2017)',
 'Kono sekai no katasumi ni (2016)',
 'Sado (2015)']