**This script is for crawling data from the data from TOP 250 movies on IMDB.**

The basic logic is that we firstly request the html data of the url and then parse the html with beautiful soup. Beautiful soup enables us to find the exact element that we want. 

In [42]:
# import all the required packages
import requests
from bs4 import BeautifulSoup
import requests
import json
import pandas as pd
import time
import numpy as np

In [43]:
url="https://www.imdb.com/chart/top?ref_=nv_mv_250"
rawhtml = requests.get(url).content
# this is the raw html file
rawhtml

b'\n\n\n<!DOCTYPE html>\n<html\n    xmlns:og="http://ogp.me/ns#"\n    xmlns:fb="http://www.facebook.com/2008/fbml">\n    <head>\n         \n\n        <meta charset="utf-8">\n        <meta http-equiv="X-UA-Compatible" content="IE=edge">\n\n    \n    \n    \n\n    \n    \n    \n\n\n\n\n        <script type="text/javascript">var IMDbTimer={starttime: new Date().getTime(),pt:\'java\'};</script>\n\n<script>\n    if (typeof uet == \'function\') {\n      uet("bb", "LoadTitle", {wb: 1});\n    }\n</script>\n  <script>(function(t){ (t.events = t.events || {})["csm_head_pre_title"] = new Date().getTime(); })(IMDbTimer);</script>\n        <title>Top 250 Movies - IMDb</title>\n  <script>(function(t){ (t.events = t.events || {})["csm_head_post_title"] = new Date().getTime(); })(IMDbTimer);</script>\n<script>\n    if (typeof uet == \'function\') {\n      uet("be", "LoadTitle", {wb: 1});\n    }\n</script>\n<script>\n    if (typeof uex == \'function\') {\n      uex("ld", "LoadTitle", {wb: 1});\n    }\n

In [44]:
# use beautiful soup to parse the html file
# a document for some bs attributes
# https://www.crummy.com/software/BeautifulSoup/bs4/doc/
soup = BeautifulSoup(rawhtml, 'html.parser') 

In [None]:
# all the data related to movied are in the <tbody> label, whose class='lister-list'.
# soup.find_all() returns a list
movie1=soup.find_all("tbody",{'class':'lister-list'})[0]
# movie1 is the content about the 250 movie list
f = open('content.html','w')
f.write(movie1.prettify())
f.close()
# type(movie1)
movie1

In [None]:
# movie2 contains a list of 250 movies' info
movie2=movie1.find_all('tr') # movie1 should be a bs4.element
movie2

In [111]:
# get links, titles, ratings, and release years of each movie
movie_links = []
movie_titles = []
movie_ratings = []
movie_years = []
for item in movie2:
  # link for each movie is like https://www.imdb.com/title/tt0111161/
  link = 'https://www.imdb.com/'+item.find_all('td')[1].a.get('href')
  movie_links.append(link)
  # use a.get('href) to get the href link in <a></a>
  # e.g. <a href="/title/tt0111161/">
  # use a.text to get the text between <a>text</a>
  title = item.find_all('td')[1].a.text
  movie_titles.append(title)
  # use strip to remove the "(",")" in (1994)
  year = item.find_all('td')[1].span.text.strip('(')
  year = year.strip(')')
  movie_years.append(year)
  rating = item.find_all('td')[2].strong.text
  movie_ratings.append(rating)

In [None]:
# generate the movie rankings
ranking = list(np.arange(1,251,1))
ranking

In [113]:
# summary the data in a dataframe
movie_data = pd.DataFrame({'movie_rankings':ranking,\
              'movie_titles':movie_titles,\
              'movie_links':movie_links,\
              'movie_ratings':movie_ratings,\
              'movie_years':movie_years})
movie_data

Unnamed: 0,movie_rankings,movie_titles,movie_links,movie_ratings,movie_years
0,1,The Shawshank Redemption,https://www.imdb.com//title/tt0111161/,9.2,1994
1,2,The Godfather,https://www.imdb.com//title/tt0068646/,9.1,1972
2,3,The Godfather: Part II,https://www.imdb.com//title/tt0071562/,9.0,1974
3,4,The Dark Knight,https://www.imdb.com//title/tt0468569/,9.0,2008
4,5,12 Angry Men,https://www.imdb.com//title/tt0050083/,8.9,1957
...,...,...,...,...,...
245,246,Drishyam 2,https://www.imdb.com//title/tt12361178/,8.0,2021
246,247,The Battle of Algiers,https://www.imdb.com//title/tt0058946/,8.0,1966
247,248,Hera Pheri,https://www.imdb.com//title/tt0242519/,8.0,2000
248,249,Nights of Cabiria,https://www.imdb.com//title/tt0050783/,8.0,1957


Now let's get deeper into the data of each movie.
We have the links in movie_links
Try to catch the valuable information in each movie web page

In [68]:
# define a function to get data for each movie
def get_content(link):
  html_content = requests.get(link).content
  soup = BeautifulSoup(html_content, 'html.parser')
  return soup

In [140]:
import json
# get the required info: genre,description,keywords,director,creator,content_rating,language,actor
def generate_info(soup):
  json_string = soup.find_all('script',{'type':'application/ld+json'})[0].text
  movie_info = json.loads(json_string)
  print(movie_info)
  genre = movie_info['genre']
  keywords = movie_info['keywords']
  description = movie_info['description']
  director = movie_info['director'][0]['name']
  # in the creator list, some elements don't have the name, so let's pick the 
  # element with a name
  creator_data = movie_info['creator']
  creator = []
  for i in creator_data:
    if i.get('name'):
      creator.append(i.get('name'))
  # some movies are not rated, so let's have a try test
  try:
    if movie_info['contentRating']:
      content_rating = movie_info['contentRating']
  # if the movie is not rated, run except
  except:
    content_rating = ''
  language = movie_info['review']['inLanguage']
  actors_data = movie_info['actor']
  actor = []
  for i in actors_data:
    if i.get('name'):
      actor.append(i.get('name'))
  return genre,description,keywords,director,creator,content_rating,language,actor
movie_info=generate_info(soup)
movie_info

{'@context': 'https://schema.org', '@type': 'Movie', 'url': '/title/tt0114709/', 'name': 'Toy Story', 'image': 'https://m.media-amazon.com/images/M/MV5BMDU2ZWJlMjktMTRhMy00ZTA5LWEzNDgtYmNmZTEwZTViZWJkXkEyXkFqcGdeQXVyNDQ2OTk4MzI@._V1_.jpg', 'description': 'A cowboy doll is profoundly threatened and jealous when a new spaceman figure supplants him as top toy in a boy&apos;s room.', 'review': {'@type': 'Review', 'itemReviewed': {'@type': 'CreativeWork', 'url': '/title/tt0114709/'}, 'author': {'@type': 'Person', 'name': 'SmileysWorld'}, 'dateCreated': '2005-12-30', 'inLanguage': 'English', 'name': 'Proof that Pixar not only cares about the quality of their work,they care about our kids.', 'reviewBody': 'Though I am not a big fan of computer animation,I have to give the folks at Pixar credit.This brand of animation is nothing short of brilliant.The attention to detail,such as eye and body movement is quite remarkable.Computers allow them to make their characters as close to human like as po

(['Animation', 'Adventure', 'Comedy'],
 'A cowboy doll is profoundly threatened and jealous when a new spaceman figure supplants him as top toy in a boy&apos;s room.',
 'toy,toy comes to life,rivalry,cowboy,claw crane',
 'John Lasseter',
 ['John Lasseter', 'Pete Docter', 'Andrew Stanton'],
 'G',
 'English',
 ['Tom Hanks', 'Tim Allen', 'Don Rickles'])

In [141]:
movie_genres = []
movie_descriptions = []
movie_keywords = []
movie_directors = []
movie_creators = []
content_ratings = []
movie_languages = []
movie_actors = []
# genre,description,keywords,director,creator,countent_rating,language,actor
for link in movie_links:
  soup = get_content(link)
  genre,description,keywords,director,creator,content_rating,language,actor = generate_info(soup)
  movie_genres.append(genre)
  movie_descriptions.append(description)
  movie_keywords.append(keywords)
  movie_directors.append(director)
  movie_creators.append(creator)
  content_ratings.append(content_rating)
  movie_languages.append(language)
  movie_actors.append(actor)

{'@context': 'https://schema.org', '@type': 'Movie', 'url': '/title/tt0111161/', 'name': 'The Shawshank Redemption', 'image': 'https://m.media-amazon.com/images/M/MV5BMDFkYTc0MGEtZmNhMC00ZDIzLWFmNTEtODM1ZmRlYWMwMWFmXkEyXkFqcGdeQXVyMTMxODk2OTU@._V1_.jpg', 'description': 'Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.', 'review': {'@type': 'Review', 'itemReviewed': {'@type': 'CreativeWork', 'url': '/title/tt0111161/'}, 'author': {'@type': 'Person', 'name': 'weswalker'}, 'dateCreated': '2002-08-27', 'inLanguage': 'English', 'name': 'Shawshank Redeems Hollywood', 'reviewBody': 'Can Hollywood, usually creating things for entertainment purposes only, create art?  To create something of this nature, a director must approach it in a most meticulous manner, due to the delicacy of the process.  Such a daunting task requires an extremely capable artist with an undeniable managerial capacity and an acutely developed awareness 

In [143]:
# summary the detailed data in a new dataframe
detailed_data = pd.DataFrame({'movie_rankings':ranking,\
              'movie_titles':movie_titles,\
              'movie_links':movie_links,\
              'movie_ratings':movie_ratings,\
              'movie_years':movie_years,\
              'movie_genres':movie_genres,\
              'movie_descriptions':movie_descriptions,\
              'movie_keywords':movie_keywords,\
              'movie_directors':movie_directors,\
              'movie_creators':movie_creators})
detailed_data

Unnamed: 0,movie_rankings,movie_titles,movie_links,movie_ratings,movie_years,movie_genres,movie_descriptions,movie_keywords,movie_directors,movie_creators
0,1,The Shawshank Redemption,https://www.imdb.com//title/tt0111161/,9.2,1994,[Drama],Two imprisoned men bond over a number of years...,"wrongful imprisonment,prison,based on the work...",Frank Darabont,"[Stephen King, Frank Darabont]"
1,2,The Godfather,https://www.imdb.com//title/tt0068646/,9.1,1972,"[Crime, Drama]",The aging patriarch of an organized crime dyna...,"crime family,mafia,patriarch,organized crime,g...",Francis Ford Coppola,"[Mario Puzo, Francis Ford Coppola]"
2,3,The Godfather: Part II,https://www.imdb.com//title/tt0071562/,9.0,1974,"[Crime, Drama]",The early life and career of Vito Corleone in ...,"revenge,1950s,corrupt politician,cuban revolut...",Francis Ford Coppola,"[Francis Ford Coppola, Mario Puzo]"
3,4,The Dark Knight,https://www.imdb.com//title/tt0468569/,9.0,2008,"[Action, Crime, Drama]",When the menace known as the Joker wreaks havo...,"dc comics,moral dilemma,psychopath,clown,super...",Christopher Nolan,"[Jonathan Nolan, Christopher Nolan, David S. G..."
4,5,12 Angry Men,https://www.imdb.com//title/tt0050083/,8.9,1957,"[Crime, Drama]",The jury in a New York City murder trial is fr...,"jury,dialogue driven,courtroom,single set prod...",Sidney Lumet,[Reginald Rose]
...,...,...,...,...,...,...,...,...,...,...
245,246,Drishyam 2,https://www.imdb.com//title/tt12361178/,8.0,2021,"[Crime, Drama, Thriller]",A gripping tale of an investigation and a fami...,"casting,cctv,alcohol,church,investigation",Jeethu Joseph,[Jeethu Joseph]
246,247,The Battle of Algiers,https://www.imdb.com//title/tt0058946/,8.0,1966,"[Drama, War]","In the 1950s, fear and violence escalate as th...","battle of algiers,torture,algeria,resistance,s...",Gillo Pontecorvo,"[Franco Solinas, Gillo Pontecorvo]"
247,248,Hera Pheri,https://www.imdb.com//title/tt0242519/,8.0,2000,"[Action, Comedy, Crime]",Three unemployed men look for answers to all t...,"remake of malayalam film,wrong telephone numbe...",Priyadarshan,"[Siddique, Lal, Neeraj Vora]"
248,249,Nights of Cabiria,https://www.imdb.com//title/tt0050783/,8.0,1957,[Drama],A waifish prostitute wanders the streets of Ro...,"prostitute,rescue from drowning,italian woman,...",Federico Fellini,"[Federico Fellini, Ennio Flaiano, Tullio Pinelli]"


In [144]:
# store the data into a csv file
movie_data.to_csv('top250_movie_data.csv')