### Scraping film data from imdb.com with requests and BeautifulSoup

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import json

In [2]:
def _box_office_millions(soup, testid):
    """Return the amount in box-office row `testid`, divided by one million.

    Returns NaN when the row is absent or its first token contains no digits.
    """
    row = soup.find('li', attrs={'data-testid': testid})
    if not row:
        return np.nan
    # Read the amount from the row's <label> when there is one; fall back to
    # the whole row so a missing label does not raise AttributeError.
    label = row.find('label')
    holder = label if label is not None else row
    first_token = holder.text.split(' ')[0]
    digits = ''.join(ch for ch in first_token if ch.isdigit())
    if not digits:
        # float('') would raise ValueError; treat as "no value".
        return np.nan
    return float(digits) / 1000000


def _joined_list_items(soup, testid):
    """Return the texts of the nested <li> items of row `testid`, joined by ', '.

    Returns NaN when the row itself is absent.
    """
    row = soup.find('li', attrs={'data-testid': testid})
    if not row:
        return np.nan
    return ', '.join(item.text for item in row.find_all('li'))


def _load_script_json(soup, attrs):
    """Return the parsed JSON of the first <script> matching `attrs`, or None."""
    tag = soup.find("script", attrs=attrs)
    if not tag or tag.string is None:
        return None
    try:
        return json.loads(tag.string)
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass; malformed payload.
        return None


def get_detail(url):
    """Fetch a film's detail page and extract its metadata.

    Parameters
    ----------
    url : str
        Absolute URL of the film's page.

    Returns
    -------
    list
        ``[genres, keywords, release_date, countries, languages, locations,
        companies, budget, gross]``. Any field that cannot be found on the
        page is ``np.nan``. ``budget`` and ``gross`` are the page amounts
        divided by one million; the list-like fields are ', '-joined strings.

    Raises
    ------
    requests.exceptions.RequestException
        If the HTTP request fails or exceeds the timeout.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    # A timeout keeps one stalled connection from hanging the whole scrape.
    result = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(result.text, "html.parser")

    # budget / gross
    budget = _box_office_millions(soup, "title-boxoffice-budget")
    gross = _box_office_millions(soup, "title-boxoffice-cumulativeworldwidegross")

    # release date
    release_date = np.nan
    release_row = soup.find('li', attrs={'data-testid': "title-details-releasedate"})
    if release_row:
        release_item = release_row.find('li')
        if release_item is not None:
            release_date = release_item.text

    # countries, languages, filming locations, production companies
    countries = _joined_list_items(soup, "title-details-origin")
    languages = _joined_list_items(soup, "title-details-languages")
    locations = _joined_list_items(soup, "title-details-filminglocations")
    companies = _joined_list_items(soup, "title-details-companies")

    # keywords, taken from the page's JSON-LD block
    keywords = np.nan
    ld_json = _load_script_json(soup, {'type': "application/ld+json"})
    if isinstance(ld_json, dict) and 'keywords' in ld_json:
        keywords = ld_json['keywords']

    # genres, taken from the embedded __NEXT_DATA__ payload
    genres = np.nan
    next_data = _load_script_json(soup, {'id': "__NEXT_DATA__", 'type': "application/json"})
    if next_data is not None:
        try:
            genre_entries = next_data['props']['pageProps']['aboveTheFoldData']['genres']['genres']
            genres = ', '.join(entry['text'] for entry in genre_entries)
        except (KeyError, TypeError):
            # Payload present but not shaped as expected: report as missing.
            genres = np.nan

    detail = [genres, keywords, release_date, countries, languages, locations, companies, budget, gross]

    return detail

In [3]:
def _parse_credits(text):
    """Split a credits paragraph into ``(director, casts)`` strings.

    `text` is expected to hold one or two '|'-separated segments, each of the
    form ``"<Label>:\\n<name>, \\n<name>..."``. A segment whose label starts
    with 'Director' fills `director`; any other labelled segment fills
    `casts`. Segments without a ':\\n' separator are ignored.
    """
    director = ''
    casts = ''
    for segment in text.strip().split('|'):
        parts = segment.strip().split(':\n')
        if len(parts) < 2:
            continue
        # Split the name list before joining; joining the raw string would
        # interleave ', ' between its individual characters.
        names = ', '.join(parts[1].strip().split(', \n'))
        if parts[0].startswith('Director'):
            director = names
        else:
            casts = names
    return director, casts


def get_info(film_tag):
    """Extract one film's data from a search-result entry and its detail page.

    Parameters
    ----------
    film_tag : bs4.element.Tag
        The listing element for a single film.

    Returns
    -------
    list
        ``[film_id, name, overview, certificate, runtime, num_vote, imdb_rate,
        director, casts]`` followed by the nine values returned by
        ``get_detail`` for the film's page. Missing numeric fields and a
        missing overview are ``np.nan``; a missing certificate, director or
        cast is ``''``.
    """
    overview = np.nan
    num_vote = np.nan
    runtime = np.nan
    imdb_rate = np.nan

    # href of the first link, e.g. '/title/<id>/...'
    film_href = film_tag.find('a').get('href')
    # id is the second path component
    film_id = film_href.split('/')[2]
    film_href = 'https://www.imdb.com' + film_href

    # number of votes
    vote_tag = film_tag.find('span', attrs={'name': 'nv'})
    if vote_tag is not None:
        num_vote = int(vote_tag.get_text().replace(',', ''))

    # name: third line of the header text
    name = film_tag.find('h3').get_text().split('\n')[2]

    # overview: second muted paragraph, when present
    muted_text = film_tag.find_all('p', class_='text-muted')
    if len(muted_text) > 1:
        overview = muted_text[1].get_text().strip()

    # certificate
    cer = film_tag.find('span', class_='certificate')
    certificate = cer.get_text() if cer else ''

    # runtime: leading number of the runtime text
    runtime_tag = film_tag.find('span', class_='runtime')
    if runtime_tag:
        runtime = float(runtime_tag.get_text().split(' ')[0])

    # imdb_rate
    imdb_rate_tag = film_tag.find('strong')
    if imdb_rate_tag:
        imdb_rate = float(imdb_rate_tag.get_text())

    # director and cast
    director = ''
    casts = ''
    credits_tag = film_tag.find('p', class_="")
    if credits_tag is not None:
        director, casts = _parse_credits(credits_tag.get_text())

    film_info = [film_id, name, overview, certificate, runtime, num_vote, imdb_rate, director, casts] + get_detail(film_href)

    return film_info

In [4]:
# Walk the search-result pages, scraping every listed film, until more than
# 10000 films are collected or there is no further page.
film_list = []
url = "https://www.imdb.com/search/title/?title_type=feature&release_date=2010-01-01,2021-12-31&sort=num_votes,desc&count=250"
# Same User-Agent that get_detail sends for the per-film pages.
request_headers = {'User-Agent': 'Mozilla/5.0'}
while len(film_list) <= 10000:
    page = requests.get(url, headers=request_headers, timeout=30)
    # Fail loudly on an HTTP error instead of parsing an error page.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")
    scraped_list = soup.find_all('div', class_='lister-item mode-advanced')
    for film in scraped_list:
        film_list.append(get_info(film))
    next_tag = soup.find('a', class_='lister-page-next next-page')
    if next_tag is None:
        # Last page reached: no "next" link to follow.
        break
    # Prefix without a trailing slash, as in get_info, to avoid a '//' path.
    url = 'https://www.imdb.com' + next_tag.get('href')

In [5]:
# Column names, in the same order as the lists built by get_info
# (its own nine fields followed by the nine from get_detail).
# NOTE(review): 'dicrector' and 'relesase_date' are misspelled but kept as-is;
# presumably consumers of the saved file rely on them — confirm before renaming.
key = ['id', 'name', 'overview', 'certificate', 'runtime', 'nvote',
       'imdb_rate', 'dicrector', 'casts', 'genres', 'keywords',
       'relesase_date', 'countries', 'languages', 'locations', 'companies', 'budget', 'gross']
df = pd.DataFrame(film_list, columns=key)

In [6]:
# Display the scraped table as the cell output.
df

Unnamed: 0,id,name,overview,certificate,runtime,nvote,imdb_rate,dicrector,casts,genres,keywords,relesase_date,countries,languages,locations,companies,budget,gross
0,tt1375666,Kẻ Đánh Cắp Giấc Mơ,A thief who steals corporate secrets through t...,PG-13,148.0,2347285,8.8,Christopher Nolan,"Leonardo DiCaprio, Joseph Gordon-Levitt, Ellio...","Action, Adventure, Sci-Fi, Thriller","dream,ambiguous ending,subconscious,mindbender...","August 6, 2010 (Vietnam)","United States, United Kingdom","English, Japanese, French","Fortress Mountain, Kananaskis Country, Alberta...","Warner Bros., Legendary Entertainment, Syncopy",160.000000,836.848102
1,tt0816692,Hố Đen Tử Thần,A team of explorers travel through a wormhole ...,C13,169.0,1824486,8.6,Christopher Nolan,"Matthew McConaughey, Anne Hathaway, Jessica Ch...","Adventure, Drama, Sci-Fi","astronaut,saving the world,space travel,wormho...","November 7, 2014 (Vietnam)","United States, United Kingdom, Canada",English,Iceland,"Paramount Pictures, Warner Bros., Legendary En...",165.000000,773.867216
2,tt1345836,Kỵ Sĩ Bóng Đêm Trỗi Dậy,Eight years after the Joker's reign of anarchy...,PG-13,164.0,1702927,8.4,Christopher Nolan,"Christian Bale, Tom Hardy, Anne Hathaway, Gary...","Action, Drama","dc comics,batman character,bruce wayne charact...","July 27, 2012 (Vietnam)","United States, United Kingdom","English, Arabic","Mehrangarh Fort, Jodhpur, Rajasthan, India","Warner Bros., Legendary Entertainment, DC Ente...",250.000000,1081.169825
3,tt1853728,Hành Trình Django,"With the help of a German bounty-hunter, a fre...",R,165.0,1551167,8.4,Quentin Tarantino,"Jamie Foxx, Christoph Waltz, Leonardo DiCaprio...","Drama, Western","racial vengeance,racial violence,slavery,one a...","March 15, 2013 (Vietnam)",United States,"English, German, French, Italian","Evergreen Plantation, 4677 Highway 18, Edgard,...","The Weinstein Company, Columbia Pictures",100.000000,426.074373
4,tt0993846,Sói Già Phố Wall,"Based on the true story of Jordan Belfort, fro...",R,180.0,1407088,8.2,Martin Scorsese,"Leonardo DiCaprio, Jonah Hill, Margot Robbie, ...","Biography, Comedy, Crime, Drama","based on true story,stockbroker,female nudity,...","January 11, 2014 (Vietnam)",United States,"English, French","Portofino, Genoa, Liguria, Italy","Red Granite Pictures, Appian Way, Sikelia Prod...",100.000000,406.878233
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10244,tt13279528,Was wir wollten,A couple facing fertility issues finds their m...,,93.0,1863,5.9,Ulrike Kofler,"Lavinia Wilson, Elyas M'Barek, Anna Unterberge...",Drama,"female nudity,child wish,sardinia,suicide atte...","November 11, 2020 (Argentina)",Austria,German,"Sardinia, Italy","Film AG Produktion, Netflix, Österreichischer ...",,
10245,tt12059016,Red Snow,A struggling vampire romance novelist must def...,,80.0,1863,5.4,Sean Nichols Lynch,"Dennice Cisneros, Nico Bellamy, Laura Kennon, ...","Comedy, Horror, Thriller","vampire,christmas,novelist,lake tahoe,bat","December 6, 2021 (United Kingdom)",United States,English,"South Lake Tahoe, California, USA","26th Ave Films, Bursell Productions, Evolve Me...",,
10246,tt8688912,Chi La Sow,"While Arjun doesn't want to get married, his p...",,135.0,1862,7.7,Rahul Ravindran,"Sushanth, Ruhani Sharma, Vennela Kishore, Anur...","Comedy, Romance",girl,"August 3, 2018 (India)",India,Telugu,,"Annapurna Studios, Manam Enterprises, Siruni C...",,0.004551
10247,tt8011328,Seema Raja,A youngster who belongs to a royal family need...,,158.0,1862,4.2,Ponram,"Sivakarthikeyan, Samantha Ruth Prabhu, Soori, ...","Action, Drama",play,"September 13, 2018 (India)",India,Tamil,,24AM Studios,,0.362593


In [7]:
# Save the table to disk. Note the file is tab-separated despite the
# '.csv' extension, and the row index is not written.
df.to_csv(
    'film.csv',
    sep='\t',
    encoding='utf-8',
    index=False,
)