## IMDb Movie Metadata Collection

In [1]:
# Initialization
import os
import time

import warnings
warnings.filterwarnings("ignore", message="numpy.dtype size changed")

# spark imports
from pyspark.sql import SparkSession

# data science imports
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

In [None]:
# Set path for loading and saving data
data_path = 'hdfs:///user/andrew/'
# The cells that write the parquet outputs read `data_save_path`, which was
# never defined; point it at the same location as the input data.
# Change this line if the outputs should be written somewhere else.
data_save_path = data_path

#### Import and collect movies data
This includes the movie ID, title, and genres provided in the MovieLens data.

In [2]:
# Read the data through Spark, since it is stored in Hadoop, and format the columns
from pyspark.sql.types import *
from pyspark.sql import SQLContext, Row
sqlContext = SQLContext(sc)

# Row factory naming the three fields kept from each line of movies.dat
Item = Row('item_id', 'title' ,'genres')

# Each line is "::"-delimited; only the first three fields are used
raw_movie_lines = sc.textFile(data_path + 'movies.dat')
movie_fields = raw_movie_lines.map(lambda line: line.split("::")[0:3])
# The id becomes an int and the '|'-separated genres become a list
movie_rows = movie_fields.map(
    lambda fields: Item(int(fields[0]), fields[1], fields[2].split('|')))
movies = sqlContext.createDataFrame(movie_rows)

# Collect the data in a simple pandas data frame for easier manipulation
movies_df = movies.toPandas()
movies_df.head(5)

Unnamed: 0,item_id,title,genres
0,1,Toy Story (1995),"[Animation, Children's, Comedy]"
1,2,Jumanji (1995),"[Adventure, Children's, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),"[Comedy, Drama]"
4,5,Father of the Bride Part II (1995),[Comedy]


In [1]:
# Use the OMDb API to collect more movie information
import requests
from bs4 import BeautifulSoup
# Needed for string conversion
import ast
import unicodedata

# Column order shared by every Series that movie_metadata_extract returns
_METADATA_COLUMNS = ['imdb_id', 
                     'imdb_rating', 'imdb_votes', 
                     'metascore', 'runtime', 
                     'MPAA_rating', 'imdb_genres', 
                     'director', 'actors']
# Articles that the input titles carry at the end, e.g. "Usual Suspects, The"
_TRAILING_ARTICLES = (', The', ', An', ', La', ', A')
# Start of a Google result link that points at an IMDb title page
_IMDB_LINK_PREFIX = '/url?q=https://www.imdb.com/title/'


def _split_title_and_year(title_val):
    """Return (search_title, year) parsed from a title such as "Usual Suspects, The (1995)"."""
    year = title_val[-5:-1]
    # drop the trailing " (YYYY)" and any alternate title given in parentheses
    title = title_val[0:-7].split(' (')[0]
    for suffix in _TRAILING_ARTICLES:
        if title.endswith(suffix):
            # move the article to the front: "Usual Suspects, The" -> "The Usual Suspects"
            title = suffix[2:] + ' ' + title[0:-len(suffix)]
            break
    return title, year


def _to_ascii(text):
    """Strip accents and other non-ASCII characters, returning the native str type."""
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    ascii_bytes = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')
    # On Python 2 the native str type is bytes; on Python 3 it must be decoded
    return ascii_bytes if str is bytes else ascii_bytes.decode('ascii')


def _split_names(raw_value, drop_parenthetical = False):
    """Turn a ', '-separated OMDb field into a list of ASCII names."""
    if raw_value is None:
        # same placeholder as the blank row returned when nothing is found
        return ['']
    names = [_to_ascii(name) for name in raw_value.split(', ')]
    if drop_parenthetical:
        # some names carry a parenthetical note; strip() also removes the space
        # left in front of it so the same person always maps to the same label
        names = [name.split('(')[0].strip() for name in names]
    return names


def _parse_number(raw_value, convert):
    """Apply convert to an OMDb field, returning 0 for missing, 'N/A' or unparseable values."""
    if raw_value is None or raw_value == 'N/A':
        return 0
    try:
        return convert(raw_value)
    except ValueError:
        return 0


def _query_omdb(api_key, params):
    """Query OMDb and return the decoded response as a dict."""
    movie_request = requests.get('http://www.omdbapi.com/?apikey=' + api_key, 
                                 params)
    try:
        # OMDb answers with JSON, so use a JSON decoder rather than ast.literal_eval
        movie_dict = movie_request.json()
    except ValueError:
        movie_dict = None
    if not isinstance(movie_dict, dict):
        # present an unreadable body in the same shape as an OMDb failure
        return {'Response': 'False', 'Error': 'Unreadable OMDb response.'}
    return movie_dict


def _find_imdb_id(title, year):
    """Scrape the IMDb id from a Google search result list; return '' when no IMDb link is found."""
    # passing the query as a parameter lets requests do the URL encoding
    response = requests.get('https://google.com/search', 
                            {'q': title + ' ' + year + ' movie'})
    soup = BeautifulSoup(response.text, 'html.parser')
    for anchor in soup.find_all('a'):
        # anchors without an href yield None
        href = anchor.get('href') or ''
        if href.startswith(_IMDB_LINK_PREFIX):
            # the id is the path segment right after ".../title/"
            return href[len(_IMDB_LINK_PREFIX):].split('/')[0].split('&')[0]
    return ''


def _build_metadata_row(movie_dict):
    """Convert a successful OMDb response into the metadata Series."""
    # extract numeric features
    imdb_rating = _parse_number(movie_dict.get('imdbRating'), float)
    imdb_votes = _parse_number(movie_dict.get('imdbVotes'), 
                               lambda votes: int(votes.replace(',', '')))
    metascore = _parse_number(movie_dict.get('Metascore'), int)
    # runtime is reported as "<minutes> min"
    runtime = _parse_number(movie_dict.get('Runtime'), 
                            lambda minutes: int(minutes[0:-4]))
    # extract text features
    # names are reduced to ASCII because many contain accents, etc
    genre_list = _split_names(movie_dict.get('Genre'))
    director_list = _split_names(movie_dict.get('Director'), drop_parenthetical = True)
    actor_list = _split_names(movie_dict.get('Actors'), drop_parenthetical = True)
    return pd.Series([movie_dict.get('imdbID'), 
                      imdb_rating,
                      imdb_votes,
                      metascore,
                      runtime,
                      movie_dict.get('Rated'), 
                      genre_list,
                      director_list,
                      actor_list],
                     index=_METADATA_COLUMNS)


def movie_metadata_extract(title_val, api_key = '', request_delay = 2):
    """Collect IMDb metadata for one movie through the OMDb API.

    The movie is first looked up in OMDb by title and year. When that fails,
    the IMDb id is scraped from a Google search and OMDb is queried by id.

    Parameters
    ----------
    title_val : str
        Title ending in " (YYYY)", e.g. "Toy Story (1995)". An article may be
        placed at the end of the name ("Usual Suspects, The (1995)").
    api_key : str
        OMDb API key.
    request_delay : number
        Seconds to sleep before the first request of each call.

    Returns
    -------
    pandas.Series
        Indexed by imdb_id, imdb_rating, imdb_votes, metascore, runtime,
        MPAA_rating, imdb_genres, director, actors. Numeric fields are 0 when
        missing or 'N/A'. When no metadata is found the OMDb error is printed
        and a blank row ('' / 0 / ['']) is returned.
    """
    time.sleep(request_delay)
    title, year = _split_title_and_year(title_val)

    # first attempt: look the movie up by name and year
    movie_dict = _query_omdb(api_key, {'t': title, 'y': year})
    if movie_dict.get('Response') != 'True':
        # second attempt: find the IMDb id, then look the movie up by id
        movie_dict = _query_omdb(api_key, {'i': _find_imdb_id(title, year)})

    if movie_dict.get('Response') == 'True':
        return _build_metadata_row(movie_dict)

    # when movie metadata is not found in OMDb, return blank values
    print((movie_dict.get('Error') or '') + ' ' + title)
    return pd.Series(['', 
                      0, 0, 0, 0,
                      '', [''], [''], ['']],
                     index=_METADATA_COLUMNS)

In [None]:
# Collect metadata: one OMDb lookup per title, appended as new columns
imdb_metadata = movies_df['title'].apply(movie_metadata_extract)
movies_df_2 = pd.concat([movies_df, imdb_metadata], axis = 1)

# Titles end in " (YYYY)": the last 7 characters hold the year in parentheses
titles_with_year = movies_df_2['title']
# separate the year from the title
movies_df_2['year'] = titles_with_year.str[-5:-1]
# remove the year from the title
movies_df_2['title'] = titles_with_year.str[:-7]

movies_df_2.head(2)

In [None]:
# One-hot encode the MovieLens genres with the sklearn MultiLabelBinarizer.
# It is used instead of pandas get_dummies because each row holds a list of labels.
mlb_movie_lens_genres = MultiLabelBinarizer()
ml_genre_flags = pd.DataFrame(
    mlb_movie_lens_genres.fit_transform(movies_df_2.genres),
    index = movies_df_2.index,
    columns = mlb_movie_lens_genres.classes_,
)
movies_df_3 = movies_df_2.join(ml_genre_flags.add_prefix('ml_genre_'))

In [None]:
# One-hot encode the IMDb genre lists, one 'imdb_genre_' column per genre
mlb_imdb_genres = MultiLabelBinarizer()
imdb_genre_flags = pd.DataFrame(
    mlb_imdb_genres.fit_transform(movies_df_3.imdb_genres),
    index = movies_df_3.index,
    columns = mlb_imdb_genres.classes_,
)
movies_df_3 = movies_df_3.join(imdb_genre_flags.add_prefix('imdb_genre_'))

In [None]:
# One-hot encode the director lists, one 'director_' column per director
mlb_imdb_director = MultiLabelBinarizer()
director_flags = pd.DataFrame(
    mlb_imdb_director.fit_transform(movies_df_3.director),
    index = movies_df_3.index,
    columns = mlb_imdb_director.classes_,
)
movies_df_3 = movies_df_3.join(director_flags.add_prefix('director_'))

In [None]:
# One-hot encode the actor lists, one 'actor_' column per actor
mlb_imdb_actors = MultiLabelBinarizer()
actor_flags = pd.DataFrame(
    mlb_imdb_actors.fit_transform(movies_df_3.actors),
    index = movies_df_3.index,
    columns = mlb_imdb_actors.classes_,
)
movies_df_3 = movies_df_3.join(actor_flags.add_prefix('actor_'))

In [None]:
# One-hot encode the MPAA ratings and append the indicator columns
ohe_mpaa_ratings = pd.get_dummies(movies_df_3['MPAA_rating'], prefix = 'MPAA_rating')
movies_df_3 = pd.concat([movies_df_3, ohe_mpaa_ratings], axis = 1, sort = False)

# The original columns have been one-hot encoded, so they can be removed
encoded_source_columns = ['genres', 'imdb_genres', 'director', 'actors', 'MPAA_rating']
movies_df_3 = movies_df_3.drop(encoded_source_columns, axis = 1)

# Column names: spaces become underscores and the punctuation / . - ' is removed
sanitized_columns = []
for column_name in movies_df_3.columns:
    column_name = column_name.replace(' ', '_')
    for unwanted in "/.-'":
        column_name = column_name.replace(unwanted, '')
    sanitized_columns.append(column_name)
movies_df_3.columns = sanitized_columns

# Make sure all titles are ASCII: decompose accented characters, then drop
# whatever cannot be encoded
ascii_titles = []
for movie_title in movies_df_3['title']:
    ascii_titles.append(unicodedata.normalize('NFKD', movie_title).encode('ascii', 'ignore'))
movies_df_3['title'] = ascii_titles
movies_df_3.head()

In [None]:
# Cast the leading columns to their final dtypes
movies_df_3['item_id'] = movies_df_3.item_id.astype(int)
movies_df_3['title'] = movies_df_3.title.astype(str)
movies_df_3['imdb_id'] = movies_df_3.imdb_id.astype(str)
movies_df_3['imdb_rating'] = movies_df_3.imdb_rating.astype(float)
# Every column from position 4 onward is cast to int
movies_df_3.iloc[:, 4:] = movies_df_3.iloc[:, 4:].astype(int)


def _merge_rating_columns(frame, target, sources):
    """Sum the indicator columns in `sources` that exist in `frame` into `target`.

    Which rating spellings occur depends on the scraped data, so a spelling
    listed here may have no column; it is skipped instead of raising KeyError.
    `target` is always present afterwards (all zeros if it had to be created).
    """
    present = [name for name in sources if name in frame.columns]
    # keep an existing target untouched when none of its sources remain,
    # e.g. when this cell is run again after the sources were dropped
    if present or target not in frame.columns:
        frame[target] = frame[present].sum(axis = 1).astype(int)


# Combine MPAA NOT RATED, Not Rated, Unrated, and UNRATED
_merge_rating_columns(movies_df_3, 'MPAA_rating_Unrated', [
    'MPAA_rating_Unrated', 'MPAA_rating_UNRATED', 'MPAA_rating_NOT_RATED', 
    'MPAA_rating_Not_Rated'])
# Combine APPROVED, Approved, blank, NA, Passed, PASSED
_merge_rating_columns(movies_df_3, 'MPAA_rating_Other', [
    'MPAA_rating_APPROVED', 'MPAA_rating_Approved', 'MPAA_rating_', 
    'MPAA_rating_NA', 'MPAA_rating_Passed', 'MPAA_rating_PASSED'])
# Combine M, GP, MPG, and PG - these are all the same, just how the ratings evolved
_merge_rating_columns(movies_df_3, 'MPAA_rating_PG', [
    'MPAA_rating_PG', 'MPAA_rating_GP', 'MPAA_rating_M', 
    'MPAA_rating_MPG'])
# Drop now unnecessary columns; errors = 'ignore' skips the ones that never existed
movies_df_3.drop(['MPAA_rating_UNRATED', 'MPAA_rating_NOT_RATED', 'MPAA_rating_APPROVED', 
                  'MPAA_rating_Approved', 'MPAA_rating_', 'MPAA_rating_NA', 
                  'MPAA_rating_Passed', 'MPAA_rating_PASSED', 'MPAA_rating_Not_Rated', 
                  'MPAA_rating_GP', 'MPAA_rating_M', 'MPAA_rating_MPG'], 
                 axis = 1, inplace = True, errors = 'ignore')

In [None]:
# Convert the one-hot encoded pandas frame to a spark dataframe
movies_metadata_OHE = sqlContext.createDataFrame(movies_df_3)
# Write movies metadata binarized to parquet file, replacing any previous output
ohe_parquet_writer = movies_metadata_OHE.write.format('parquet').mode('overwrite')
ohe_parquet_writer.save(data_save_path + 'movie_metadata_OHE')

In [None]:
# Stringify list fields. This simplifies saving, and the result can be used as
# the back up for the OHE dataset
_TEXT_TYPES = (str, type(u''))


def _join_labels(labels):
    """Join a sequence of labels with ', '; leave an already-joined string as is."""
    # Without this guard, running the cell a second time would join the
    # individual characters of the string produced by the first run
    if isinstance(labels, _TEXT_TYPES):
        return labels
    return ', '.join(labels)


for list_column in ['genres', 'imdb_genres', 'director', 'actors']:
    movies_df_2[list_column] = movies_df_2[list_column].apply(_join_labels)
movies_df_2.head(2)

In [None]:
# Convert the stringified pandas frame to a spark dataframe
movies_metadata = sqlContext.createDataFrame(movies_df_2)
# Write movies metadata to parquet file, replacing any previous output
original_parquet_writer = movies_metadata.write.format('parquet').mode('overwrite')
original_parquet_writer.save(data_save_path + 'movie_metadata_original')