In [4]:
import pandas as pd
import numpy as np

df = pd.read_csv('../data/movies_metadata.csv')

# print all features of the dataframe
df.columns

  df = pd.read_csv('../data/movies_metadata.csv')


Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [5]:
df = df[['title', 'genres', 'release_date', 'runtime', 'vote_average', 'vote_count']]
df.head()

Unnamed: 0,title,genres,release_date,runtime,vote_average,vote_count
0,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",1995-10-30,81.0,7.7,5415.0
1,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",1995-12-15,104.0,6.9,2413.0
2,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",1995-12-22,101.0,6.5,92.0
3,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",1995-12-22,127.0,6.1,34.0
4,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]",1995-02-10,106.0,5.7,173.0


In [6]:
# convert release_date into pandas datetime format
df['release_date'] = pd.to_datetime(df['release_date'], errors = 'coerce')

# extract year from the datetime
df['year'] = df['release_date'].apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

# transform year from an object(NaT) to int(0)
# helper function to convert NaT to 0 and all other years to integers
def convert_int(x):
    try:
        return int (x)
    except: 
        return 0
    
# Apply convert_int to the year feature
df['year'] = df['year'].apply(convert_int)

# drop the release_date column
df = df.drop('release_date', axis = 1)

# display dataframe
df.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,year
0,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",81.0,7.7,5415.0,1995
1,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",104.0,6.9,2413.0,1995
2,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",101.0,6.5,92.0,1995
3,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",127.0,6.1,34.0,1995
4,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]",106.0,5.7,173.0,1995


In [7]:
# print genres of the first movie 
df.iloc[0]['genres']

"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]"

In [8]:
# we need to transform the stringified dictionary into native py dictionary. We'll use literal_eval

from ast import literal_eval
a = '[1,2,3]'
print (type(a)) 
b = literal_eval(a)
print(type(b))

<class 'str'>
<class 'list'>


In [9]:
# convert all NaN into stringified empty lists
df['genres'] = df['genres'].fillna('[]')

# apply literal_eval to convert to the list object
df['genres'] = df['genres'].apply(literal_eval)
# convert list of dictionaries to a list of strings
df['genres'] = df['genres'].apply(lambda x: [i['name'] for i in x] if isinstance (x, list) else [])
df.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,year
0,Toy Story,"[Animation, Comedy, Family]",81.0,7.7,5415.0,1995
1,Jumanji,"[Adventure, Fantasy, Family]",104.0,6.9,2413.0,1995
2,Grumpier Old Men,"[Romance, Comedy]",101.0,6.5,92.0,1995
3,Waiting to Exhale,"[Comedy, Drama, Romance]",127.0,6.1,34.0,1995
4,Father of the Bride Part II,[Comedy],106.0,5.7,173.0,1995


In [10]:
# if a particular movie has multiple genres, we will create multiple copies of the movie, with each movie having one of the genres this is called exploding the genres
#  create a new feature by exploding genres
s = df.apply(lambda x:
    pd.Series(x['genres']), axis = 1).stack().reset_index(level = 1, drop =True)
s.name = 'genre'

#Create a new dataframe gen_df which by dropping the old 'genres' feature and adding the new genre
gen_df = df.drop('genres', axis=1).join(s)

gen_df.head()


  pd.Series(x['genres']), axis = 1).stack().reset_index(level = 1, drop =True)


Unnamed: 0,title,runtime,vote_average,vote_count,year,genre
0,Toy Story,81.0,7.7,5415.0,1995,Animation
0,Toy Story,81.0,7.7,5415.0,1995,Comedy
0,Toy Story,81.0,7.7,5415.0,1995,Family
1,Jumanji,104.0,6.9,2413.0,1995,Adventure
1,Jumanji,104.0,6.9,2413.0,1995,Fantasy


In [18]:
# What we will need for our recommender function (The build_chart function)
# - user input on their preference
# - extract movies that match the conditions set by the user
# - calculate the values of m and c for the extracted movies and proceed to build the chart as in the previous section
def build_chart(gen_df, percentile = 0.8):
    # Ask for preferred genres
    print ('Input preferred genre')
    genre = input()
    
    # Ask for the lower limit of duration
    print ('Input shortest duration')
    low_time = int(input())
    
    # Ask for upper limit of duration
    print ('Input longest duration')
    high_time = int(input())
    
    # Ask for lower limit of timeline
    print('Input earliest year')
    low_year = int(input())
    
    # upper limit of timeline
    print('Input latest year')
    high_year = int(input())
    
    # Define a new movies variable to store the preferred movies. Copy the contents of gen_df to movies
    movies = gen_df.copy()
    # filter based on the condition
    movies =movies[(movies['genre'] == genre) &
                   (movies['runtime'] >= low_time) &
                   (movies['runtime'] <= high_time) &
                   (movies['year'] >= low_year) &
                   (movies['year'] <= high_year)]
    # compute the value of C and m for the filtered movies
    C = movies['vote_average'].mean()
    m = movies['vote_count'].quantile(percentile)
    
    # Only consider movies that have higher than m votes. Save this is in a nes df q_movies
    q_movies = movies.copy().loc[movies['vote_count'] >= m]
    # calculate score using the IMDB formula
    q_movies['score'] = q_movies.apply(lambda x: (x['vote_count']/(x['vote_count']+m) * x['vote_average']) 
                                       + (m/(m+x['vote_count']) * C)
                                       ,axis=1)
    # Sort movies in descending order of their scores
    q_movies = q_movies.sort_values('score', ascending = False)
    return q_movies

In [21]:
# lets get recommendations for animated movies between 30 min and 2hrs in length, and released anywhere between 1990 and 2005
# generate the chart for top comedy movies and display top 5
build_chart(gen_df).head()


Input preferred genre
Input shortest duration
Input longest duration
Input earliest year
Input latest year


Unnamed: 0,title,runtime,vote_average,vote_count,year,genre,score
10309,Dilwale Dulhania Le Jayenge,190.0,9.1,661.0,1995,Comedy,8.610255
2211,Life Is Beautiful,116.0,8.3,3643.0,1997,Comedy,8.22294
351,Forrest Gump,142.0,8.2,8147.0,1994,Comedy,8.166313
1604,The Truman Show,103.0,7.8,4702.0,1998,Comedy,7.751427
4843,Amélie,122.0,7.8,3403.0,2001,Comedy,7.733471


In [22]:
# Convert the cleaned (non-exploaded) dataframe df into a csv file and save it in the data folder
# set parameter index info False as the index of the Dataframe has no inherent meaning

df.to_csv('../data/metadata_clean.csv', index=False)