# Baseline Recommendations

Build a baseline recommendation system that recommends movies from the movies dataset based on a selection of genres and an optional range of release years.

In [1]:
#Run the previous notebook first so that everything it defines is loaded into this kernel.
#Later cells use `pd`, `ratings` and `movies_df` without defining them here -
#presumably they all come from this run (TODO confirm against 01_Data_Preparation.ipynb).
%run ./01_Data_Preparation.ipynb

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1128 entries, 0 to 1127
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   tagId   1128 non-null   int64 
 1   tag     1128 non-null   object
dtypes: int64(1), object(1)
memory usage: 17.8+ KB
None


Unnamed: 0,tagId,tag
0,1,007
1,2,007 (series)
2,3,18th century
3,4,1920s
4,5,1930s


(1128, 2)

Missing data:  tagId    0
tag      0
dtype: int64
No. of duplicates 0
No. of uniques tagId    1128
tag      1128
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14862528 entries, 0 to 14862527
Data columns (total 3 columns):
 #   Column     Dtype  
---  ------     -----  
 0   movieId    int64  
 1   tagId      int64  
 2   relevance  float64
dtypes: float64(1), int64(2)
memory usage: 340.2 MB
None


Unnamed: 0,movieId,tagId,relevance
0,1,1,0.029
1,1,2,0.02375
2,1,3,0.05425
3,1,4,0.06875
4,1,5,0.16


(14862528, 3)

Missing data:  movieId      0
tagId        0
relevance    0
dtype: int64
No. of duplicates 0
No. of uniques movieId      13176
tagId         1128
relevance     4000
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58098 entries, 0 to 58097
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  58098 non-null  int64 
 1   title    58098 non-null  object
 2   genres   58098 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.3+ MB
None


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


(58098, 3)

Missing data:  movieId    0
title      0
genres     0
dtype: int64
No. of duplicates 0
No. of uniques movieId    58098
title      58020
genres      1643
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27753444 entries, 0 to 27753443
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 847.0 MB
None


Unnamed: 0,userId,movieId,rating,timestamp
0,1,307,3.5,1256677221
1,1,481,3.5,1256677456
2,1,1091,1.5,1256677471
3,1,1257,4.5,1256677460
4,1,1449,4.5,1256677264


(27753444, 4)

Missing data:  userId       0
movieId      0
rating       0
timestamp    0
dtype: int64
No. of duplicates 0
No. of uniques userId         283228
movieId         53889
rating             10
timestamp    22131556
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1108997 entries, 0 to 1108996
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   userId     1108997 non-null  int64 
 1   movieId    1108997 non-null  int64 
 2   tag        1108981 non-null  object
 3   timestamp  1108997 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 33.8+ MB
None


Unnamed: 0,userId,movieId,tag,timestamp
0,14,110,epic,1443148538
1,14,110,Medieval,1443148532
2,14,260,sci-fi,1442169410
3,14,260,space action,1442169421
4,14,318,imdb top 250,1442615195


(1108997, 4)

Missing data:  userId        0
movieId       0
tag          16
timestamp     0
dtype: int64
No. of duplicates 0
No. of uniques userId        19325
movieId       45981
tag           74714
timestamp    911869
dtype: int64
Number of movies in movies dataframe:  58098
Number of movies in ratings:  53889
Number of movies in genome_scores:  13176
Number of movies in tags:  45981


Unnamed: 0,movieId,title,genres
21121,102190,"20,000 Leagues Under the Sea (1997)","[Adventure, Romance, Sci-Fi]"
24626,114130,"20,000 Leagues Under the Sea (1997)","[Romance, Sci-Fi]"
33067,136696,Absolution (2015),"[Action, Adventure, Crime, Thriller]"
39377,151797,Absolution (2015),"[Drama, Thriller]"
20141,98485,Aftermath (2012),"[Drama, Thriller]"
24599,114040,Aftermath (2012),"[Action, Thriller]"
582,588,Aladdin (1992),"[Adventure, Animation, Children, Comedy, Musical]"
24657,114240,Aladdin (1992),"[Adventure, Animation, Children, Comedy, Fantasy]"
34711,140633,Another World (2014),[Documentary]
41491,156686,Another World (2014),[(no genres listed)]


(156, 3)
58020


In [2]:
def baseline_rec(no_rec, selected_genres='', start_year='', final_year='', movies=None):
    '''
    Provides a number of top recommendations based on selected genres and movie release years

    Parameters
    ---
    no_rec: number of recommendations to return
    selected_genres: a genre name or a list of genre names to select from;
        all genres if not specified ('' or None)
    start_year: earliest year of release (inclusive);
        earliest year in the data if not specified ('' or None)
    final_year: latest year of release (inclusive);
        latest year in the data if not specified ('' or None)
    movies: dataframe with one row per (movie, genre) pair and the columns
        'movieId', 'genre', 'year', 'no_of_ratings' and 'avg_ratings';
        the notebook-level 'ratings_movies_df' if not specified


    Returns
    ---
    Dataframe with at most no_rec movies (one row per movie, 'genre' column removed),
    sorted by number of ratings and then by average rating, both descending.
    Movies without a year never fall inside the year range.

    ---
    '''

    def _unspecified(value):
        #'' is the historical "not specified" marker; None is accepted as well
        return value is None or (isinstance(value, str) and value == '')

    if movies is None:
        #Fall back to the dataframe built by the feature engineering cells
        movies = ratings_movies_df

    if _unspecified(selected_genres):
        #No genre restriction - keep every row
        baseline_rec_df = movies
    else:
        #A bare string would make .isin() raise, so wrap a single genre in a list
        if isinstance(selected_genres, str):
            selected_genres = [selected_genres]

        #Collect movies based on subset of genres
        baseline_rec_df = movies[movies['genre'].isin(selected_genres)]

    if _unspecified(start_year):
        start_year = movies['year'].min()

    if _unspecified(final_year):
        final_year = movies['year'].max()

    #Subset this further based on year range.
    #Comparisons against missing years give <NA>; turn those into an explicit False
    #so such movies are left out instead of relying on how masks with <NA> are indexed.
    years = baseline_rec_df['year']
    in_range = ((years >= start_year) & (years <= final_year)).fillna(False).astype(bool)
    baseline_rec_df = baseline_rec_df[in_range]

    #Remove duplicates - a movie appears once per matching genre.
    #Keyed on 'movieId' rather than 'title' so different movies sharing a title are all kept.
    baseline_rec_df = baseline_rec_df.drop_duplicates(subset='movieId')

    #Drop 'genre' column
    baseline_rec_df = baseline_rec_df.drop(columns='genre')

    #Sort by number of ratings, followed by average ratings for each movie
    baseline_rec_df = baseline_rec_df.sort_values(['no_of_ratings', 'avg_ratings'], ascending=False)

    return baseline_rec_df.iloc[:no_rec]

## Feature Engineering 

1. Collect rating information for each movie 
2. Convert 'year' from string to numbers 
3. Sort by the number of ratings, followed by each movie's average rating
4. Explode each movie's list of genres into one row per genre.

In [3]:
#1. Collect rating information for each movie

#Group the ratings by movie once and reuse the grouping for both statistics
ratings_per_movie = ratings.groupby(['movieId'])['rating']
no_of_ratings = ratings_per_movie.count()
avg_ratings = ratings_per_movie.mean()

#'ratings_' dataframe - number of ratings and average rating, one row per movieId
ratings_ = pd.concat(
    [no_of_ratings, avg_ratings],
    axis=1,
    keys=['no_of_ratings', 'avg_ratings'],
)

In [4]:
#2. Create 'ratings_movies_df' - 'movies_df' enriched with the rating statistics from 'ratings_'
ratings_movies_df = (
    movies_df
    .merge(ratings_, how='left', on='movieId')
    #Nullable integer dtype, so a missing rating count stays missing instead of forcing floats
    .astype({'no_of_ratings': 'Int64'})
    #Most rated movies first, then the higher average rating
    .sort_values(['no_of_ratings', 'avg_ratings'], ascending=[False, False])
)

In [5]:
#3. Convert years - from string into numbers

#pd.to_numeric() parses the values, .astype('Int64') stores them as nullable integers
ratings_movies_df['year'] = pd.to_numeric(ratings_movies_df['year']).astype('Int64')

print('Findings: ')

#Only report the caveat when at least one movie has no year
has_missing_years = ratings_movies_df['year'].isnull().any()
if has_missing_years:
    findings = (
        'Some movies do not have "year" information and will not be picked up as recommendations. '
        'This is the reason why pd.to_numeric() and .astype("Int64") are both used to convert years into integers. '
        'Note that these movies make up a small portion of the dataframe, so they were left in the datframe. '
        'They do not significantly affect results besides not picked up for recommendations.'
    )
    print(findings)

Findings: 
Some movies do not have "year" information and will not be picked up as recommendations. This is the reason why pd.to_numeric() and .astype("Int64") are both used to convert years into integers. Note that these movies make up a small portion of the dataframe, so they were left in the datframe. They do not significantly affect results besides not picked up for recommendations.


In [6]:
#4. Collect each individual genre of each movie

#Explode genres for every movie - one entry per (movie, genre), keeping the row labels of 'movies_df'
genres_ = movies_df['genres'].explode()

#'ratings_movies_df' - merging 'ratings_movies_df' and 'genres_' on their row labels.
#The exploded series is named 'genre' up front, so the merge adds it under that name and
#leaves the existing columns untouched (no positional relabelling of all columns).
#validate='one_to_many' raises a MergeError when the left row labels are not unique,
#e.g. if this cell is re-run on the already exploded dataframe.
#NOTE(review): the join is on row labels, so it assumes 'ratings_movies_df' still carries
#the row labels of 'movies_df' - confirm that 'movies_df' has a default 0..n-1 index.
ratings_movies_df = pd.merge(
    ratings_movies_df,
    genres_.rename('genre'),
    left_index=True,
    right_index=True,
    validate='one_to_many',
)

## Baseline Recommendations
Baseline recommendations are based on a selection of genres and an optional range of release years for the movies to recommend. The complete list of genres is shown below in `genres_list`.

In [7]:
#Every distinct genre, in alphabetical order
genres_list = sorted(set(ratings_movies_df['genre']))

#Preview
genres_list

['(no genres listed)',
 'Action',
 'Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'IMAX',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western']

In [8]:
#Top 15 movies tagged Comedy and/or Romance, released between 2005 and 2018
baseline_rec(
    no_rec=15,
    selected_genres=['Comedy', 'Romance'],
    start_year=2005,
    final_year=2018,
)

Unnamed: 0,movieId,title,genres,year,no_of_ratings,avg_ratings
12780,60069,WALL·E,"[Adventure, Animation, Children, Romance, Sci-Fi]",2008,28116,4.007345
11140,46578,Little Miss Sunshine,"[Adventure, Comedy, Drama]",2006,20038,3.873416
13139,63082,Slumdog Millionaire,"[Crime, Drama, Romance]",2008,20006,3.847871
12304,56367,Juno,"[Comedy, Drama, Romance]",2007,19131,3.749987
13829,69122,"Hangover, The","[Comedy, Crime]",2009,16717,3.620626
10363,35836,"40-Year-Old Virgin, The","[Comedy, Romance]",2005,15332,3.461812
15466,78499,Toy Story 3,"[Adventure, Animation, Children, Comedy, Fanta...",2010,14841,3.87009
22455,106782,"Wolf of Wall Street, The","[Comedy, Crime, Drama]",2013,14748,3.869135
11686,51255,Hot Fuzz,"[Action, Comedy, Crime, Mystery]",2007,14379,3.846964
13978,69844,Harry Potter and the Half-Blood Prince,"[Adventure, Fantasy, Mystery, Romance, IMAX]",2009,14115,3.848672


# Appendix

In [9]:
#Preview of the exploded dataframe - one row per (movie, genre) pair
ratings_movies_df

Unnamed: 0,movieId,title,genres,year,no_of_ratings,avg_ratings,genre
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,68469,3.886649,Adventure
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,68469,3.886649,Animation
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,68469,3.886649,Children
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,68469,3.886649,Comedy
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,68469,3.886649,Fantasy
...,...,...,...,...,...,...,...
58018,193882,Flora,"[Adventure, Drama, Horror, Sci-Fi]",2017,1,2.000000,Horror
58018,193882,Flora,"[Adventure, Drama, Horror, Sci-Fi]",2017,1,2.000000,Sci-Fi
58019,193886,Leal,"[Action, Crime, Drama]",2018,2,3.250000,Action
58019,193886,Leal,"[Action, Crime, Drama]",2018,2,3.250000,Crime


The cells below print the recommendation tables as Markdown, for reference.

In [10]:
#Top 15 movies across all genres and years, printed as a markdown table
top_all = baseline_rec(no_rec=15, selected_genres='', start_year='', final_year='')
print(top_all.to_markdown())

|      |   movieId | title                                                                   | genres                                                      |   year |   no_of_ratings |   avg_ratings |
|-----:|----------:|:------------------------------------------------------------------------|:------------------------------------------------------------|-------:|----------------:|--------------:|
|  315 |       318 | Shawshank Redemption, The                                               | ['Crime', 'Drama']                                          |   1994 |           97999 |       4.42419 |
|  352 |       356 | Forrest Gump                                                            | ['Comedy', 'Drama', 'Romance', 'War']                       |   1994 |           97040 |       4.05658 |
|  293 |       296 | Pulp Fiction                                                            | ['Comedy', 'Crime', 'Drama', 'Thriller']                    |   1994 |           92406 |       4.17397 |


In [11]:
#Top 15 movies tagged Comedy and/or Romance, released between 2005 and 2018, printed as a markdown table
top_comedy_romance = baseline_rec(
    no_rec=15,
    selected_genres=['Comedy', 'Romance'],
    start_year=2005,
    final_year=2018,
)
print(top_comedy_romance.to_markdown())

|       |   movieId | title                                                                               | genres                                                               |   year |   no_of_ratings |   avg_ratings |
|------:|----------:|:------------------------------------------------------------------------------------|:---------------------------------------------------------------------|-------:|----------------:|--------------:|
| 12780 |     60069 | WALL·E                                                                              | ['Adventure', 'Animation', 'Children', 'Romance', 'Sci-Fi']          |   2008 |           28116 |       4.00734 |
| 11140 |     46578 | Little Miss Sunshine                                                                | ['Adventure', 'Comedy', 'Drama']                                     |   2006 |           20038 |       3.87342 |
| 13139 |     63082 | Slumdog Millionaire                                                                 | ['Cr