In [1]:
from bs4 import BeautifulSoup
import requests

import pandas as pd
import random
import math
import time
import json
import os
import re

In [2]:
# Read the whole links file as text, then decode the JSON into `movie_links`
with open('movie_links.json', 'r') as file:
    movie_links = json.loads(file.read())

In [3]:
# Summary of how many links were loaded under each genre key
print('You uploaded the following amount of links for each genre:')

for genre in movie_links:
    links = movie_links[genre]
    print('{}: {}'.format(genre, len(links)))

You uploaded the following amount of links for each genre:
comedy: 10000
sci-fi: 10000
horror: 10000
romance: 10000
action: 10000
thriller: 10000
drama: 10000
mystery: 10000
crime: 10000
animation: 10000
adventure: 10000
fantasy: 10000
comedy,romance: 10000
action,comedy: 10000


In [4]:
# Getting only unique links to scrape.
# A set iterates in an order that changes from one kernel session to the next
# (string hashing is randomised), so the de-duplicated links are sorted to
# give the scraping loop the same, reproducible order on every run.
uniques = sorted({link for links in movie_links.values() for link in links})
print('You have {} different movies to scrape.'.format(len(uniques)))

You have 75679 different movies to scrape.


In [5]:
# Scraping info from each movie link using BS4.
# For every link, the page's <script type="application/ld+json"> block is
# parsed and appended to `all_info_dict`; links that fail at any step are
# collected in `errors`.
SAVE_EVERY = 5000        # write a checkpoint after this many links
REQUEST_TIMEOUT = 30     # seconds; without a timeout a stalled request blocks forever


def _save_checkpoint(records, path='movie_info_raw_new.json'):
    """Overwrite `path` with every record scraped so far."""
    with open(path, 'w') as outfile:
        json.dump(records, outfile)


all_info_dict = []
errors = []
c = 0  # stays 0 if `uniques` is empty

for c, link in enumerate(uniques, start=1):

    try:
        html = requests.get(link, headers={'user-agent': 'Mozilla/5.0'},
                            timeout=REQUEST_TIMEOUT).content
        soup = BeautifulSoup(html, features="lxml")

        # `find` returns None when the page has no such script tag; the
        # resulting AttributeError is recorded as an error for this link.
        all_info = soup.find('script', {'type': 'application/ld+json'}).text

        all_info_dict.append(json.loads(all_info))

    except Exception:
        # Best-effort scrape: note the failing link and carry on. Catching
        # Exception (not a bare `except:`) lets KeyboardInterrupt through,
        # so the run can still be stopped by hand.
        errors.append(link)

    # Random pause after every request, including failed ones, so that a run
    # of errors does not turn into back-to-back requests.
    time.sleep(random.uniform(0.5, 1.2))

    # Saving after every SAVE_EVERY links
    if c % SAVE_EVERY == 0:

        print(c, 'links scraped.')
        time.sleep(1)
        print("Saving... don't quit!")

        _save_checkpoint(all_info_dict)

# Final save: the records scraped since the last full checkpoint would
# otherwise never be written to disk.
if c % SAVE_EVERY != 0:
    print("Saving... don't quit!")
    _save_checkpoint(all_info_dict)

print('You got {} right and {} errors.'.format(len(all_info_dict), len(errors)))

5000 links scraped.
Saving... don't quit!
10000 links scraped.
Saving... don't quit!
15000 links scraped.
Saving... don't quit!
20000 links scraped.
Saving... don't quit!
25000 links scraped.
Saving... don't quit!
30000 links scraped.
Saving... don't quit!
You got 30677 right and 2 errors.


In [5]:
# Loading saved files and putting them on the same list

info_dict = []

# os.listdir() returns names in arbitrary order; sorting keeps the combined
# list (and everything built from it) in the same order on every run.
for fname in sorted(os.listdir()):
    if fname.startswith('movie_info_raw'):
        # The handle gets its own name so the filename is not rebound mid-loop.
        with open(fname, 'r') as fh:
            info_dict.extend(json.load(fh))

print('Total of {} movies information scraped.'.format(len(info_dict)))

Total of 69997 movies information scraped.


In [6]:
# Naming dictionary keys: the union of the keys of *every* record.
# Sampling only every 5000th record would silently drop any key that does not
# occur in the sampled records. Sorted so the column order is stable.
headings = sorted({key for dct in info_dict for key in dct.keys()})

# Creating dictionary to store items:
movie_dc = {k: [] for k in headings}

# Turning list of dictionaries into one dictionary with lists as values.
# Every list receives exactly one entry per record; a record that lacks a key
# contributes None, so position i in each list refers to record i.
for dct in info_dict:
    for key, column in movie_dc.items():
        column.append(dct.get(key))

# Checking results
print('Checking placement:')
for key, values in movie_dc.items():
    assert len(values) == len(info_dict)
    print('Found {} values for {}'.format(len(values), key))

Checking placement:
Found 69997 values for trailer
Found 69997 values for contentRating
Found 69997 values for genre
Found 69997 values for url
Found 69997 values for director
Found 69997 values for duration
Found 69997 values for @context
Found 69997 values for actor
Found 69997 values for creator
Found 69997 values for keywords
Found 69997 values for image
Found 69997 values for datePublished
Found 69997 values for aggregateRating
Found 69997 values for name
Found 69997 values for description
Found 69997 values for @type
Found 69997 values for review


In [11]:
# Keeping only what is needed

def _rating_field(aggregate, field):
    """Return aggregate[field], or None when the record has no aggregateRating."""
    return aggregate[field] if aggregate is not None else None


movie_dict = {
    # Drops the first 9 characters and the last character of each url --
    # presumably a '/title/tt' prefix and a trailing '/'; confirm against the
    # scraped data. Records without a url yield None instead of raising
    # TypeError, matching how missing ratings are handled below.
    'id': [url[9:-1] if url is not None else None for url in movie_dc['url']],
    'title': movie_dc['name'],
    'year': movie_dc['datePublished'],   # datePublished values, stored as-is
    'type': movie_dc['@type'],
    'genre': movie_dc['genre'],
    'rating': [_rating_field(agg, 'ratingValue')
               for agg in movie_dc['aggregateRating']],
    'rating_count': [_rating_field(agg, 'ratingCount')
                     for agg in movie_dc['aggregateRating']],
    'duration': movie_dc['duration'],
    'age': movie_dc['contentRating'],
    'keywords': movie_dc['keywords'],
    'summary': movie_dc['description'],
}

In [12]:
# Build the DataFrame: each key of movie_dict becomes a column
df = pd.DataFrame.from_dict(data=movie_dict, orient='columns')

# Preview the first five rows
df.head(n=5)

Unnamed: 0,id,title,year,type,genre,rating,rating_count,duration,age,keywords,summary
0,99704,Grim Prairie Tales: Hit the Trail... to Terror,1990-09-14,Movie,"[Horror, Western]",5.4,877.0,PT1H26M,R,"campfire,vagina dentata,campfire story,antholo...",Grim Prairie Tales: Hit the Trail... to Terror...
1,1295093,Trust,2009-06-29,Movie,"[Crime, Drama, Mystery]",5.6,145.0,PT1H30M,TV-14,one word title,"Trust is a TV movie starring Jamie Luner, Nels..."
2,8525070,Les fauves,2019-01-23,Movie,"[Drama, Mystery, Thriller]",5.0,454.0,PT1H23M,,"night,sex scene,sex in car,breaking and enteri...","Les fauves is a movie starring Lily-Rose Depp,..."
3,8038720,Bulletproof,2018-05-15,TVSeries,"[Action, Crime, Drama, Thriller]",6.4,2291.0,,TV-14,,Bulletproof is a TV series starring Noel Clark...
4,5660206,Spider Woman,,Movie,Horror,2.7,38.0,PT1H28M,,psychotronic film,Spider Woman is a movie starring Krerk-krai Un...


In [13]:
# Write the frame to CSV; the row index is written as the first column
df.to_csv('movie_info_df.csv', index=True)