## Multiprocessing Basics

In [None]:
import multiprocessing
import time

In [None]:
def f1(n):
    """Demo worker: print a start message, sleep two seconds, print a done message.

    The argument ``n`` is accepted but not used by the body.
    """
    pause_seconds = 2
    print("f1 started")
    time.sleep(pause_seconds)
    print("f1 done")
    
def f2(n):
    """Demo worker: print a start message, sleep two seconds, print a done message.

    The argument ``n`` is accepted but not used by the body.
    """
    pause_seconds = 2
    print("f2 started")
    time.sleep(pause_seconds)
    print("f2 done")
    
def f3(n):
    """Demo worker: print a start message, sleep two seconds, print a done message.

    The argument ``n`` is accepted but not used by the body.
    """
    pause_seconds = 2
    print("f3 started")
    time.sleep(pause_seconds)
    print("f3 done")

In [None]:
# One child process per demo function; every worker gets the same dummy argument.
p1, p2, p3 = (
    multiprocessing.Process(target=task, args=(5,)) for task in (f1, f2, f3)
)
demo_workers = (p1, p2, p3)

# Launch all children before waiting on any of them so their sleeps overlap.
for worker in demo_workers:
    worker.start()

print('All Started')

# Block until every child has finished.
for worker in demo_workers:
    worker.join()

f1 startedf2 started

f3 started
All Started
f1 done
f2 done
f3 done


# IMDB Scraper using Multiprocessing


In [None]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import time
import os
from tqdm import tqdm
import multiprocessing

In [None]:
# Exceptions raised when an expected tag or attribute is missing or malformed.
_PARSE_ERRORS = (AttributeError, IndexError, TypeError, ValueError)


def _extract(getter):
    """Return ``getter()``, or ``np.nan`` when the markup it expects is absent."""
    try:
        return getter()
    except _PARSE_ERRORS:
        return np.nan


def _parse_movie(movie):
    """Return the 14 CSV fields for one ``lister-item`` tag, ``np.nan`` for missing ones."""
    # Shared lookups; each may be None/short, which the getters below turn into NaN.
    meta = movie.find('p', class_='text-muted')
    paragraphs = movie.find_all('p')
    people = paragraphs[2] if len(paragraphs) > 2 else None  # director/stars paragraph
    votes_bar = movie.find('p', class_='sort-num_votes-visible')
    vote_spans = votes_bar.find_all('span') if votes_bar is not None else []

    id_ = _extract(
        lambda: movie.find('div', class_='lister-item-image float-left')
        .find('a').get('href').split('/')[-2]
    )
    movie_name = _extract(
        lambda: movie.find('h3', class_='lister-item-header').find('a').text.strip()
    )
    # Year text looks like "(2019)" possibly preceded by other tokens; keep the
    # last token and strip its surrounding parentheses.
    year = _extract(
        lambda: movie.find('span', class_='lister-item-year text-muted unbold')
        .text.strip().split(' ')[-1][1:-1]
    )
    certificate = _extract(lambda: meta.find('span', class_='certificate').text.strip())
    runtime = _extract(lambda: meta.find('span', class_='runtime').text.strip())
    genre = _extract(lambda: meta.find('span', class_='genre').text.strip())
    rating = _extract(
        lambda: movie.find('div', class_='ratings-bar')
        .find('div', class_='inline-block ratings-imdb-rating').text.strip()
    )
    description = _extract(
        lambda: movie.find_all('p', class_='text-muted')[1].text.strip()
    )
    # The people paragraph reads "<label>: names | <label>: names"; keep what
    # follows the first colon on each side of the "|".
    director = _extract(
        lambda: ",".join(people.text.strip().split('|')[0].split(":")[1:]).strip()
    )
    star = _extract(
        lambda: ",".join(people.text.strip().split('|')[1].split(":")[1:]).strip()
    )
    # First link in the paragraph is taken as the director, the rest as stars.
    director_id = _extract(lambda: people.find_all('a')[0].get('href'))
    star_id = _extract(
        lambda: ",".join([a.get('href') for a in people.find_all('a')[1:]])
    )
    votes = _extract(lambda: int(vote_spans[1].get('data-value')))
    gross = _extract(
        lambda: int(vote_spans[4].get('data-value').replace(',', ''))
    )

    return [id_, movie_name, year, certificate, runtime, genre, rating, description,
            director, director_id, star, star_id, votes, gross]


def download(link, file_name, no_of_movies, timeout=30):
    """Scrape an IMDb search listing page by page and save it as ``<file_name>.csv``.

    Starting at ``link``, each page's ``lister-item`` entries are parsed into one
    row each, then the "next page" link is followed.  Fields that cannot be
    found in the markup are stored as ``np.nan``.

    Args:
        link: URL of the first result page.
        file_name: Output path without extension; ``.csv`` is appended.
        no_of_movies: Expected number of titles.  At most
            ``int(no_of_movies / 50) + 1`` pages are visited (this assumes the
            site serves 50 titles per page).
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        None.  The result is written to disk.

    Paging stops early, and the rows gathered so far are still written, when a
    request fails, when a page has no ``lister-list`` container, or when there
    is no "next page" link.
    """
    rows = []
    page_url = link
    for _ in tqdm(range(int(no_of_movies / 50) + 1)):
        # Fetch lazily at the top of the loop so no page is downloaded without
        # being parsed.
        try:
            res = requests.get(page_url, timeout=timeout)
        except requests.RequestException as err:
            print(f"{file_name}: request for {page_url} failed ({err}); "
                  f"saving the {len(rows)} rows collected so far")
            break
        soup = BeautifulSoup(res.content, 'html.parser')

        listing = soup.find('div', class_='lister-list')
        if listing is None:
            # Not a result page (error page, changed layout, ...): nothing to parse.
            break
        rows.extend(
            _parse_movie(movie)
            for movie in listing.find_all('div', class_='lister-item mode-advanced')
        )

        next_anchor = soup.find('a', class_='lister-page-next next-page')
        next_href = next_anchor.get('href') if next_anchor is not None else None
        if not next_href:
            # Last page reached; re-requesting the previous URL would only
            # append duplicate rows.
            break
        page_url = 'https://www.imdb.com' + next_href

    df = pd.DataFrame(rows, columns=[
        'movie_id', 'movie_name', 'year', 'certificate', 'runtime', 'genre',
        'rating', 'description', 'director', 'director_id', 'star', 'star_id',
        'votes', 'gross(in $)',
    ])
    df.to_csv(file_name + '.csv', index=False)

In [None]:
# (genre slug used in the search URL, output file name, title count passed to download)
genre_jobs = [
    ('action', 'action', 52485),
    ('adventure', 'adventure', 25666),
    ('crime', 'crime', 35856),
    ('sci-fi', 'scifi', 16561),
    ('comedy', 'comedy', 105130),
    ('family', 'family', 17091),
    ('history', 'history', 8997),
    ('horror', 'horror', 36688),
    ('mystery', 'mystery', 18963),
    ('romance', 'romance', 52415),
    ('fantasy', 'fantasy', 17167),
    ('thriller', 'thriller', 53368),
    ('war', 'war', 9911),
]

# Create every scraper process up front; none is started yet.
genre_processes = [
    multiprocessing.Process(
        target=download,
        args=(
            'https://www.imdb.com/search/title/?genres=' + slug + '&title_type=feature',
            out_name,
            title_count,
        ),
    )
    for slug, out_name, title_count in genre_jobs
]

# Keep the individual handles available under their original names.
(p1, p2, p3, p4, p5, p6, p7,
 p8, p9, p10, p11, p12, p13) = genre_processes

# Start all scrapers so they run side by side ...
for proc in genre_processes:
    proc.start()

# ... then wait for each of them, in creation order.
for proc in genre_processes:
    proc.join()