## IMDb Movie Metadata Collection

In [1]:
# Initialization
import os
import time

import warnings
warnings.filterwarnings("ignore", message="numpy.dtype size changed")

# spark imports
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql import SQLContext, Row
sqlContext = SQLContext(sc)

# data science imports
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

# data collection imports
import requests
from bs4 import BeautifulSoup
import ast
import unicodedata

In [2]:
# Base HDFS location; parquet dataset names are appended to this prefix when reading and writing.
data_path = 'hdfs:///user/andrew/'

#### Import and collect movies data
This includes the movieId, title, and genres provided in the MovieLens data, together with the imdbId of each movie.

In [3]:
# # Read in data through spark since the data is stored in hadoop and format the columns
# from pyspark.sql.types import *
# from pyspark.sql import SQLContext, Row
# sqlContext = SQLContext(sc)

# movies = pd.read_csv('movies.csv', header = 0, dtype = {'movieId': 'int64', 'title': str, 'genres': str})
# links = pd.read_csv('links.csv', header = 0, dtype = {'movieId': 'int64', 'imdbId': str, 'tmdbId': str})
# ratings = pd.read_csv('ratings.csv', header = 0, dtype = {'userId': str, 'movieId': 'int64', 'rating': np.float64, 'timestamp': str})
# movies2 = movies.merge(links[['movieId', 'imdbId']], left_on=['movieId'], right_on = ['movieId'], how = 'left')

# movies_sdf = sqlContext.createDataFrame(movies2)
# ratings_sdf = sqlContext.createDataFrame(ratings)

# # write movies and ratings to parquet file
# movies_sdf.write.format('parquet').mode('overwrite').save(data_path + 'movies_20m')
# ratings_sdf.write.format('parquet').mode('overwrite').save(data_path + 'ratings_20m')

In [4]:
# Read in data through spark since the data is stored in hadoop and format the columns
# NOTE(review): Item is not referenced by any other visible cell — confirm it is still needed.
Item = Row('item_id', 'title' ,'genres', 'imdbId')

# Load the movies table from the parquet dataset stored under data_path
movies = sqlContext.read.parquet(data_path + 'movies_20m')

# Collect the data in a simple pandas data frame for easier manipulation
movies_df = movies.toPandas()
movies_df.head(10)

Unnamed: 0,movieId,title,genres,imdbId
0,66509,Funny People (2009),Comedy|Drama,1201167
1,66511,Berlin Calling (2008),Comedy|Drama,1213019
2,66513,Devil Hides in Doubt (Sollbruchstelle) (2008),Documentary,1322381
3,66517,Against the Dark (2009),Action|Horror,1194271
4,66537,"Letter for the King, The (Brief voor de koning...",Adventure,490377
5,66539,Firepower (1979),Action|Drama|Thriller,79153
6,66544,Nuremberg (2000),Drama|War,208629
7,66547,Bigger Than Life (1956),Drama|Mystery|Thriller,49010
8,66549,Going to Kansas City (1998),Drama|Romance|Thriller,119211
9,66551,Red Sands (2009),Action|Horror,1103256


In [None]:
# Use the OMDb API to collect more movie information
_OMDB_URL = 'http://www.omdbapi.com/'
_GOOGLE_SEARCH_URL = 'https://google.com/search'
# Shape of a Google result link that points at an IMDb title page
_IMDB_RESULT_PREFIX = '/url?q=https://www.imdb.com/title/'
# MovieLens moves a leading article to the end of the title ("King, The")
_TRAILING_ARTICLES = (', The', ', An', ', La', ', A')
_METADATA_FIELDS = ['imdb_id',
                    'imdb_rating', 'imdb_votes',
                    'metascore', 'runtime', 'year',
                    'MPAA_rating', 'imdb_genres',
                    'director', 'actors']


def _split_title_and_year(title_val):
    """Turn a title like 'King, The (Alt title) (2009)' into ('The King', '2009')."""
    title_val = title_val.strip()
    year = title_val[-5:-1]
    has_year = (len(year) == 4 and year.isdigit()
                and title_val[-6:-5] == '(' and title_val.endswith(')'))
    if has_year:
        title = title_val[:-6].rstrip()
    else:
        # No trailing "(YYYY)": keep the whole title instead of chopping
        # seven characters off it and inventing a year.
        title, year = title_val, ''

    # drop any alternate title given in parentheses
    title = title.split(' (')[0]
    for suffix in _TRAILING_ARTICLES:
        if title.endswith(suffix):
            title = suffix[2:] + ' ' + title[:-len(suffix)]
            break
    return title, year


def _clean_imdb_id(raw_id):
    """Return the digits of an imdbId zero-padded to 7 places, or '' when it is missing."""
    if raw_id is None or pd.isnull(raw_id):
        return ''
    digits = str(raw_id).strip()
    if digits.startswith('tt'):
        digits = digits[2:]
    # IMDb title ids are 'tt' followed by at least seven digits; the ids in
    # movies_df can show up without their leading zeros (e.g. 79153).
    return digits.zfill(7) if digits else ''


def _omdb_lookup(params, api_key):
    """Query OMDb with the given parameters and return the decoded JSON dict."""
    query = dict(params)
    query['apikey'] = api_key
    response = requests.get(_OMDB_URL, params=query)
    try:
        payload = response.json()
    except ValueError:
        payload = None
    if not isinstance(payload, dict):
        # A non-JSON body (error page, throttling notice, ...) is reported as a
        # failed lookup so a single bad reply does not abort a long-running apply.
        return {'Response': 'False', 'Error': 'Unparseable OMDb response'}
    return payload


def _search_imdb_id(title, year):
    """Search Google for the movie and return the first IMDb title id found, else ''."""
    query = ' '.join(part for part in (title.replace('&', 'and'), year, 'movie') if part)
    response = requests.get(_GOOGLE_SEARCH_URL, params={'q': query})
    soup = BeautifulSoup(response.text, 'html.parser')
    start = len(_IMDB_RESULT_PREFIX)
    for anchor in soup.find_all('a'):
        href = anchor.get('href') or ''   # anchors without an href yield None
        if href.startswith(_IMDB_RESULT_PREFIX):
            # the id is the path segment right after ".../title/"
            return href[start:start + 11].split('/')[0]
    return ''


def _number_or_zero(parse, raw):
    """Return parse(raw), or 0 when the field is missing or not numeric (e.g. 'N/A')."""
    try:
        return parse(raw)
    except (TypeError, ValueError, AttributeError):
        return 0


def _ascii_fold(text):
    """Strip accents / non-ASCII characters from text and return a native str."""
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    folded = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')
    return str(folded.decode('ascii'))


def _name_list(raw, drop_parenthetical=False):
    """Split a ', '-separated OMDb field into a list of ASCII names."""
    names = [_ascii_fold(name) for name in (raw or '').split(', ')]
    if drop_parenthetical:
        # "Jane Doe (co-director)" -> "Jane Doe"
        names = [name.split('(')[0].strip() for name in names]
    return names


def _metadata_row(movie_dict):
    """Build the output row from a successful OMDb response."""
    return pd.Series([movie_dict.get('imdbID'),
                      _number_or_zero(float, movie_dict.get('imdbRating')),
                      _number_or_zero(lambda raw: int(raw.replace(',', '')),
                                      movie_dict.get('imdbVotes')),
                      _number_or_zero(int, movie_dict.get('Metascore')),
                      # Runtime looks like "142 min": drop the 4-character unit suffix
                      _number_or_zero(lambda raw: int(raw[0:-4].replace(',', '')),
                                      movie_dict.get('Runtime')),
                      _number_or_zero(int, movie_dict.get('Year')),
                      movie_dict.get('Rated'),
                      _name_list(movie_dict.get('Genre')),
                      _name_list(movie_dict.get('Director'), drop_parenthetical=True),
                      _name_list(movie_dict.get('Actors'), drop_parenthetical=True)],
                     index=_METADATA_FIELDS)


def _empty_row():
    """Build the placeholder row returned when no lookup succeeded."""
    return pd.Series(['',
                      0, 0, 0, 0, 0,
                      '', [''], [''], ['']],
                     index=_METADATA_FIELDS)


def movie_metadata_extract(input_val, api_key = ''):
    """Fetch IMDb metadata for one movie through the OMDb API.

    Parameters
    ----------
    input_val : sequence or pandas.Series
        Two values, in order: the MovieLens title (e.g. 'Firepower (1979)')
        and the imdbId without the 'tt' prefix ('' / None / NaN when unknown).
    api_key : str
        OMDb API key. When left empty, the OMDB_API_KEY environment variable
        is used if it is set.

    Returns
    -------
    pandas.Series
        Indexed by imdb_id, imdb_rating, imdb_votes, metascore, runtime, year,
        MPAA_rating, imdb_genres, director, actors. Numeric fields that are
        missing or unparseable are 0. When every lookup fails, the error is
        printed and a placeholder row ('' / 0 / ['']) is returned.

    Notes
    -----
    The first lookup uses the IMDb id (when present) together with the title
    and year. If OMDb reports a failure, the function searches Google for an
    IMDb link and retries with the id found there. Each call sleeps for half
    a second first to throttle the request rate.
    """
    time.sleep(0.5)
    api_key = api_key or os.environ.get('OMDB_API_KEY', '')

    # list() gives positional access for Series, tuples and lists alike
    values = list(input_val)
    title, year = _split_title_and_year(values[0])
    imdb_id = _clean_imdb_id(values[1])

    params = {'t': title}
    if year:
        params['y'] = year
    if imdb_id:
        # Only send the id when there is one (the original test was inverted)
        params['i'] = 'tt' + imdb_id

    movie_dict = _omdb_lookup(params, api_key)
    if movie_dict.get('Response') == 'True':
        return _metadata_row(movie_dict)

    # Fall back to whatever IMDb id a web search turns up
    imdbID = _search_imdb_id(title, year)
    movie_dict = _omdb_lookup({'i': imdbID}, api_key)
    if movie_dict.get('Response') == 'True':
        return _metadata_row(movie_dict)

    print('%s %s %s %s' % (movie_dict.get('Error'), title, year, imdbID))
    return _empty_row()

In [None]:
%%time
# Rows 0-2999: fetch the extra metadata for every (title, imdbId) pair and
# attach the resulting columns to the original rows.
movies_df_subset_1 = pd.concat(
    [
        movies_df.iloc[0:3000],
        movies_df.iloc[0:3000][['title', 'imdbId']].apply(movie_metadata_extract, axis=1),
    ],
    axis=1,
)

In [None]:
# Rows 3000-5999: fetch the extra metadata for every (title, imdbId) pair and
# attach the resulting columns to the original rows.
movies_df_subset_2 = pd.concat(
    [
        movies_df.iloc[3000:6000],
        movies_df.iloc[3000:6000][['title', 'imdbId']].apply(movie_metadata_extract, axis=1),
    ],
    axis=1,
)

In [None]:
# Rows 6000-8999: fetch the extra metadata for every (title, imdbId) pair and
# attach the resulting columns to the original rows.
movies_df_subset_3 = pd.concat(
    [
        movies_df.iloc[6000:9000],
        movies_df.iloc[6000:9000][['title', 'imdbId']].apply(movie_metadata_extract, axis=1),
    ],
    axis=1,
)

In [None]:
# Rows 9000-11999: fetch the extra metadata for every (title, imdbId) pair and
# attach the resulting columns to the original rows.
movies_df_subset_4 = pd.concat(
    [
        movies_df.iloc[9000:12000],
        movies_df.iloc[9000:12000][['title', 'imdbId']].apply(movie_metadata_extract, axis=1),
    ],
    axis=1,
)

In [None]:
# Rows 12000-14999: fetch the extra metadata for every (title, imdbId) pair and
# attach the resulting columns to the original rows.
movies_df_subset_5 = pd.concat(
    [
        movies_df.iloc[12000:15000],
        movies_df.iloc[12000:15000][['title', 'imdbId']].apply(movie_metadata_extract, axis=1),
    ],
    axis=1,
)

In [None]:
# Rows 15000-17999: fetch the extra metadata for every (title, imdbId) pair and
# attach the resulting columns to the original rows.
movies_df_subset_6 = pd.concat(
    [
        movies_df.iloc[15000:18000],
        movies_df.iloc[15000:18000][['title', 'imdbId']].apply(movie_metadata_extract, axis=1),
    ],
    axis=1,
)

In [None]:
# Rows 18000-20999: fetch the extra metadata for every (title, imdbId) pair and
# attach the resulting columns to the original rows.
movies_df_subset_7 = pd.concat(
    [
        movies_df.iloc[18000:21000],
        movies_df.iloc[18000:21000][['title', 'imdbId']].apply(movie_metadata_extract, axis=1),
    ],
    axis=1,
)

In [None]:
# Rows 21000-23999: fetch the extra metadata for every (title, imdbId) pair and
# attach the resulting columns to the original rows.
movies_df_subset_8 = pd.concat(
    [
        movies_df.iloc[21000:24000],
        movies_df.iloc[21000:24000][['title', 'imdbId']].apply(movie_metadata_extract, axis=1),
    ],
    axis=1,
)

In [None]:
# Rows 24000 to the end: fetch the extra metadata for every (title, imdbId)
# pair and attach the resulting columns to the original rows.
movies_df_subset_9 = pd.concat(
    [
        movies_df.iloc[24000:],
        movies_df.iloc[24000:][['title', 'imdbId']].apply(movie_metadata_extract, axis=1),
    ],
    axis=1,
)

In [None]:
# Stack the nine enriched chunks into one frame; the raw imdbId column is
# dropped because the imdb_id returned by the lookup is kept instead.
movies_df_2 = pd.concat(
    [
        movies_df_subset_1, movies_df_subset_2, movies_df_subset_3,
        movies_df_subset_4, movies_df_subset_5, movies_df_subset_6,
        movies_df_subset_7, movies_df_subset_8, movies_df_subset_9,
    ],
    axis=0,
).drop(['imdbId'], axis=1)

movies_df_2.head(2)

In [None]:
# Flatten the multi-valued columns into comma-separated strings.
def _comma_join(value):
    """Join a list/tuple with ', '; return anything else (e.g. an already joined string) unchanged."""
    if isinstance(value, (list, tuple)):
        return ', '.join(value)
    return value

# MovieLens separates genres with '|'; use the same ', ' separator as the other columns
movies_df_2.genres = movies_df_2.genres.apply(lambda x: x.replace('|', ', '))
# Guarded join: re-running this cell must not split the joined strings into characters
movies_df_2.imdb_genres = movies_df_2.imdb_genres.apply(_comma_join)
movies_df_2.director = movies_df_2.director.apply(_comma_join)
movies_df_2.actors = movies_df_2.actors.apply(_comma_join)
movies_df_2.head(2)

In [None]:
# Convert the enriched pandas frame into a Spark DataFrame so it can be written out as parquet
movies_metadata = sqlContext.createDataFrame(movies_df_2)
# The pandas frame is not used after the conversion; drop the reference
del movies_df_2
# write movies metadata to parquet file
movies_metadata.write.format('parquet').mode('overwrite').save(data_path + 'movie_20m_metadata_original')
# Drop the Spark DataFrame reference as well once the write has been issued
del movies_metadata
# Read parquet: movies_metadata = sqlContext.read.parquet(data_path + 'movie_20m_metadata_original')