In [1]:
import os
import time
import requests
import numpy as np
import pandas as pd
from tqdm import tqdm
import multiprocessing
from bs4 import BeautifulSoup

In [2]:
_IMDB_BASE_URL = 'https://www.imdb.com'
_PAGE_SIZE = 50          # titles per result page assumed when computing the page count
_REQUEST_TIMEOUT = 30    # seconds; without a timeout a stalled connection blocks the worker forever
_GHOST_SEPARATOR = '<span class="ghost">|</span>'
_CSV_COLUMNS = ['id', 'name', 'year', 'rating', 'certificate', 'duration', 'genre', 'votes',
                'gross_income', 'directors_id', 'directors_name', 'stars_id', 'stars_name',
                'description']


def _field(getter, default=np.nan):
    """Evaluate ``getter()`` and return ``default`` when the expected markup is absent.

    Only the errors produced by missing tags/attributes/indexes are absorbed
    (``find`` returning ``None``, a too-short ``find_all`` result, ...), so
    unrelated programming errors are no longer hidden.
    """
    try:
        return getter()
    except (AttributeError, IndexError, TypeError):
        return default


def _fetch_page(url):
    """Download ``url`` and return it parsed with the built-in ``html.parser``."""
    response = requests.get(url, timeout=_REQUEST_TIMEOUT)
    return BeautifulSoup(response.content, 'html.parser')


def _parse_credits(movie):
    """Return ``(directors_id, directors_name, stars_id, stars_name)`` for one list item.

    The third ``<p>`` of an item holds the credits, optionally split into a
    director part and a star part by a "ghost" ``|`` span. Each part is
    classified by its own label, so the same code path handles items with
    both parts, only one part, or neither. Values are comma-joined strings;
    a part that is not present stays NaN.
    """
    directors_id = directors_name = stars_id = stars_name = np.nan

    paragraphs = movie.find_all('p')
    if len(paragraphs) < 3:
        return directors_id, directors_name, stars_id, stars_name

    for chunk in str(paragraphs[2]).split(_GHOST_SEPARATOR):
        fragment = BeautifulSoup(chunk, 'html.parser')
        anchors = fragment.find_all('a')
        # hrefs look like /name/<id>/... ; the id is the second-to-last path segment.
        ids = ','.join(a.get('href').split('/')[-2] for a in anchors)
        names = ','.join(a.text.strip() for a in anchors)

        label_text = fragment.text
        if 'Director' in label_text:
            directors_id, directors_name = ids, names
        elif 'Star' in label_text:      # matches both "Star:" and "Stars:"
            stars_id, stars_name = ids, names

    return directors_id, directors_name, stars_id, stars_name


def _parse_votes_gross(movie):
    """Return ``(votes, gross)`` for one list item, NaN for whichever is missing.

    Each value is taken from the span that directly follows its "Votes"/"Gross"
    label span instead of from fixed positions, so the result does not depend
    on how many spans the paragraph contains. The ``data-value`` attribute is
    preferred (falling back to the visible text) so that every row of a column
    uses the same representation.
    """
    votes = gross = np.nan

    box = movie.find('p', class_='sort-num_votes-visible')
    if box is None:
        return votes, gross

    spans = box.find_all('span')
    for label, value in zip(spans, spans[1:]):
        reading = value.get('data-value') or value.text.strip()
        if 'Vote' in label.text:
            votes = reading
        elif 'Gros' in label.text:
            gross = reading

    return votes, gross


def _parse_movie(m):
    """Turn one ``lister-item`` element into a row ordered like ``_CSV_COLUMNS``.

    Every field is extracted independently and starts from NaN, so a value can
    never leak from the previously parsed movie.
    """
    id_ = _field(lambda: m.find('div', class_='lister-item-image float-left')
                 .find('a').get('href').split('/')[-2].strip())
    name = _field(lambda: m.find('h3').find('a').text)
    year = _field(lambda: m.find('h3').find('span', class_='lister-item-year').text)
    certificate = _field(lambda: m.find('p', class_='text-muted')
                         .find('span', class_='certificate').text.strip())
    duration = _field(lambda: m.find('p', class_='text-muted')
                      .find('span', class_='runtime').text.strip())
    genre = _field(lambda: m.find('p', class_='text-muted')
                   .find('span', class_='genre').text.strip())
    rating = _field(lambda: m.find('div', class_='ratings-bar').find('strong').text.strip())
    description = _field(lambda: m.find_all('p', class_='text-muted')[1].text.strip())

    directors_id, directors_name, stars_id, stars_name = _field(
        lambda: _parse_credits(m), default=(np.nan, np.nan, np.nan, np.nan))
    votes, gross = _field(lambda: _parse_votes_gross(m), default=(np.nan, np.nan))

    return [id_, name, year, rating, certificate, duration, genre, votes, gross,
            directors_id, directors_name, stars_id, stars_name, description]


def download(link, no_of_movies):
    """Scrape IMDb's title search for one genre and save the rows as ``<genre>.csv``.

    Parameters
    ----------
    link : str
        Genre slug appended to ``/search/title/?genres=``; its lower-cased form
        is also used as the CSV file name (without extension).
    no_of_movies : int
        Number of titles to request. At most ``no_of_movies // 50`` result
        pages are visited.

    Returns
    -------
    None
        The result is written to ``<genre>.csv`` in the current working
        directory and a confirmation line is printed.

    Notes
    -----
    Scraping is best effort: when the result list or the "next page" link is
    missing, or a follow-up request fails, the reason is printed and the rows
    gathered so far are still written. A failure of the very first request
    propagates to the caller.
    """
    file_name = link.lower()
    url = _IMDB_BASE_URL + '/search/title/?genres=' + link
    soup = _fetch_page(url)

    data = []
    total_pages = int(no_of_movies // _PAGE_SIZE)

    for page_index in tqdm(range(total_pages), desc=file_name):
        listing = soup.find('div', class_='lister-list')
        if listing is None:
            print(file_name + ': no result list found at ' + url + ', stopping early')
            break

        for m in listing.find_all('div', class_='lister-item mode-advanced'):
            data.append(_parse_movie(m))

        if page_index == total_pages - 1:
            break   # last requested page: do not fetch a page that would be thrown away

        next_anchor = soup.find('a', class_='lister-page-next next-page')
        next_href = next_anchor.get('href') if next_anchor is not None else None
        if not next_href:
            print(file_name + ': no next-page link after page ' + str(page_index + 1)
                  + ', stopping early')
            break

        url = _IMDB_BASE_URL + next_href
        try:
            soup = _fetch_page(url)
        except requests.RequestException as exc:
            print(file_name + ': request for ' + url + ' failed (' + str(exc)
                  + '), stopping early')
            break

    df = pd.DataFrame(data, columns=_CSV_COLUMNS)
    df.to_csv(file_name + '.csv', index=False)

    print(file_name + '.csv is created!')

In [None]:
# One worker process per genre: (genre slug, value passed as `no_of_movies`).
# NOTE(review): the counts look like the number of titles IMDb listed for each
# genre when the notebook was written -- confirm before re-running.
GENRE_TITLE_COUNTS = [
    ('drama',       2580167),
    ('comedy',      1936679),
    ('talk-show',   1318249),
    ('short',       1117705),
    ('romance',     938806),
    ('documentary', 883693),
    ('news',        883562),
    ('family',      767551),
]

# NOTE: multiprocessing needs the target to be importable by the child process;
# a function defined in a notebook cell only works with the "fork" start method.
processes = [
    multiprocessing.Process(target=download, args=(genre, title_count))
    for genre, title_count in GENRE_TITLE_COUNTS
]

# Keep the individual handles bound under their original names for any cell
# that still refers to them.
p1, p2, p3, p4, p5, p6, p7, p8 = processes

# Start every worker before waiting on any of them so the genres are scraped
# concurrently, then block until all of them have finished.
for process in processes:
    process.start()

for process in processes:
    process.join()

  2%|▏         | 436/17671 [31:28<20:44:18,  4.33s/it]


news.csv is created!


  1%|▏         | 719/51603 [34:03<40:10:03,  2.84s/it]
  4%|▍         | 602/15351 [34:04<15:31:37,  3.79s/it]

drama.csv is created!


  4%|▍         | 1522/38733 [1:12:18<29:27:41,  2.85s/it]
  7%|▋         | 1286/17673 [1:12:19<14:53:50,  3.27s/it]

comedy.csv is created!


 43%|████▎     | 8113/18776 [6:49:52<7:48:56,  2.64s/it]

In [2]:
# IPython shell escape: list the current working directory in long format.
!ls -l

total 4
drwxr-xr-x 1 root root 4096 Jul  6 13:22 sample_data
