In [1]:
from bs4 import BeautifulSoup
import requests

import pandas as pd
import random
import math
import time
import json
import re

In [2]:
# Read the genre -> list-of-links mapping that the scraping cell below iterates over.
with open('movie_links.json', 'r') as links_file:
    movie_links = json.load(links_file)

In [3]:
# Sanity check: report how many links were loaded under each genre key.
print('You uploaded the following amount of links for each genre:')

for genre, links in movie_links.items():
    # sep=': ' yields the same "<genre>: <count>" line as string formatting would.
    print(genre, len(links), sep=': ')

You uploaded the following amount of links for each genre:
comedy: 10000
sci-fi: 10000
horror: 10000
romance: 10000
action: 10000
thriller: 10000
drama: 10000
mystery: 10000
crime: 10000
animation: 10000
adventure: 10000
fantasy: 10000
comedy,romance: 10000
action,comedy: 10000


In [None]:
# Scraping info from each movie link using BS4
#
# For every link of every genre, download the page, locate the
# <script type="application/ld+json"> element and parse its JSON payload.
# Parsed payloads are collected in `all_info_dict` (a list, despite the name);
# links that fail at any step (network, HTTP status, missing tag, bad JSON)
# are collected in `errors`.

REQUEST_HEADERS = {'user-agent': 'Mozilla/5.0'}
# Without a timeout a single stalled connection would block the whole run.
REQUEST_TIMEOUT = 30  # seconds

all_info_dict = []
errors = []

for genre, links in movie_links.items():
    print(genre)

    for link in links:
        try:
            response = requests.get(link, headers=REQUEST_HEADERS,
                                    timeout=REQUEST_TIMEOUT)

            # Random pause between requests to avoid hammering the server.
            time.sleep(random.uniform(0.5, 1.2))

            # Treat HTTP error responses as failures instead of parsing an error page.
            response.raise_for_status()

            soup = BeautifulSoup(response.content, features="lxml")
            all_info = soup.find('script', {'type': 'application/ld+json'}).text

            all_info_dict.append(json.loads(all_info))

        # `Exception` (not a bare `except`) so that KeyboardInterrupt/SystemExit
        # still stop the loop; a bare except made this long run uninterruptible.
        except Exception:
            errors.append(link)

print('You got {} right and {} errors.'.format(len(all_info_dict), len(errors)))

comedy


In [None]:
# Persist the scraped records so the slow scraping cell does not have to be re-run.
with open('movie_info_raw.json', 'w') as raw_file:
    json.dump(all_info_dict, raw_file)

In [None]:
# Restore the scraped records from disk (replaces any in-memory `all_info_dict`).
with open('movie_info_raw.json', 'r') as raw_file:
    all_info_dict = json.load(raw_file)

In [None]:
# Naming dictionary keys:
# Take the union of keys over *every* record. Sampling only every 5000th
# record would silently drop any field that the sampled records happen to
# lack. dict.fromkeys de-duplicates while keeping first-seen order, so the
# column order is deterministic (a set would not guarantee that).
headings = list(dict.fromkeys(key for dct in all_info_dict for key in dct))

# Creating dictionary to store items:
movie_dc = {k: [] for k in headings}

# Turning list of dictionaries into one dictionary with lists as values.
# Every list receives exactly one entry per record; a record that lacks a
# key contributes None, so all lists stay aligned by record position.
for dct in all_info_dict:
    for key, values in movie_dc.items():
        values.append(dct.get(key))

# Checking results: every key should report the same number of values.
print('Checking placement:')
for key, values in movie_dc.items():
    print('Found {} values for {}'.format(len(values), key))

In [None]:
# Keeping only what is needed
#
# Builds `movie_dict`, a dict of equally long lists holding the selected
# fields of `movie_dc` under shorter names. The assignments are done once
# (looping over movie_dc.values() only repeated the same work per key), and
# None placeholders for missing fields are passed through as None instead of
# raising TypeError when sliced or indexed.

# Number of records = length of any one list in movie_dc (0 when it is empty).
n_rows = len(next(iter(movie_dc.values()), []))


def _column(key):
    """Return movie_dc[key], or a None-filled list of n_rows if the key is absent."""
    return movie_dc.get(key, [None] * n_rows)


# Records without an 'aggregateRating' carry None; substitute an empty dict
# so the rating lookups below yield None rather than failing.
ratings = [entry or {} for entry in _column('aggregateRating')]

movie_dict = {
    # Drops the first 9 characters and the last character of each url;
    # presumably strips a '/title/tt' prefix and trailing '/' -- TODO confirm.
    'id': [url[9:-1] if url is not None else None for url in _column('url')],
    'title': _column('name'),
    'year': _column('datePublished'),
    'type': _column('@type'),
    'genre': _column('genre'),
    'rating': [entry.get('ratingValue') for entry in ratings],
    'rating_count': [entry.get('ratingCount') for entry in ratings],
    'duration': _column('duration'),
    'age': _column('contentRating'),
    'keywords': _column('keywords'),
    'summary': _column('description'),
}

In [None]:
# Build a DataFrame with one column per key of movie_dict (one row per record).
df = pd.DataFrame(data=movie_dict)

# Preview the first rows.
df.head()

In [None]:
# Write the DataFrame to CSV; the row index is written too (to_csv default).
df.to_csv(path_or_buf='movie_info_df.csv')