## Web Scraping

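This notebook scrapes movie links from IMDb: it reads the genre pages listed at https://www.imdb.com/feature/genre and then collects the links to the movies shown on each genre's result pages.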



In [1]:
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
import selenium

import random
import time
import json
import re

# Chrome options shared by every webdriver.Chrome(...) started in this notebook
options = Options()
for chrome_flag in ('--headless', '--disable-gpu'):
    options.add_argument(chrome_flag)

In [2]:
def get_movie_links(genre_links, total_pages):
    '''
    This function will get the movie links per page in each of the respective genre.

    :params: genre_links: list of genre links in 'https://www.imdb.com/feature/genre'.
             total_pages: number of pages to scrape in each genre, given either as
                          an int or as a sized iterable such as range(200) (only
                          its length is used).

    :return: dictionary with keys as genre and list of each movie links as values.
             The genre is the text between the first '=' and the following '&' of
             the link; when a link has no such part, the link itself is the key.
             An empty genre_links returns {} without starting a browser.
    '''
    genre_links = list(genre_links)
    if not genre_links:
        # Nothing to scrape, so do not launch a browser at all
        return {}

    # Accept both the documented int and the range the notebook passes in
    if isinstance(total_pages, int):
        page_count = total_pages
    else:
        page_count = len(list(total_pages))

    # Separate genre name from links
    genres = []
    for link in genre_links:
        found = re.search('(?<==)(.+?)(?=&)', link)
        genres.append(found.group(1) if found else link)

    # Creates dictionary to return
    movie_links = {genre: [] for genre in genres}

    # Start driver
    driver = webdriver.Chrome(options=options)

    # try/finally guarantees the browser is released even when a page fails
    try:
        # Loops through genre_links
        for genre, link in zip(genres, genre_links):
            driver.get(link)
            time.sleep(2)
            print('Scraping', genre)

            # Loops through desired number of pages in each genre
            for page in range(page_count):
                # find_elements(by, value) is used because the
                # find_elements_by_* helpers were removed in Selenium 4.3
                anchors = driver.find_elements("xpath", "//h3[@class='lister-item-header']/a")
                movie_links[genre] += [anchor.get_attribute("href") for anchor in anchors]

                time.sleep(random.uniform(0.3, 0.6))

                next_page = driver.find_elements("xpath", "//a[@class='lister-page-next next-page']")

                time.sleep(random.uniform(0.3, 0.6))

                if not next_page:
                    # No "next" link: this genre has no more pages. The driver
                    # is kept open because the remaining genres still need it.
                    print('{} genre movie ended on page {}'.format(genre, page + 1))
                    break

                try:
                    next_page[0].click()
                except Exception as error:
                    # Best effort: report the failed click and keep going.
                    # NOTE(review): the next iteration reads whatever page is
                    # currently loaded, which may be the same page again.
                    print('Page {} of genre {} did not work!'.format(page + 1, genre), repr(error))
    finally:
        driver.quit()

    return movie_links

In [3]:
# Genre overview page whose image tiles link to the per-genre movie lists
url = 'https://www.imdb.com/feature/genre'

driver = webdriver.Chrome(options=options)

# try/finally so the browser is released even if loading or parsing fails
try:
    driver.get(url)
    time.sleep(random.uniform(0.6,1.2))

    # get links to genre clustered movies
    # find_elements(by, value) is used because the find_elements_by_* helpers
    # were removed in Selenium 4.3
    genre_links = [i.get_attribute("href") for i in driver.find_elements("xpath", "//div[@class='image']/a")]
finally:
    driver.quit()

In [9]:
# Pages to visit in each genre; passed to get_movie_links as total_pages
total_pages = range(0, 200)

# Scrape the movie links of every genre except the last entry (superheroes).
# NOTE(review): presumably left commented out so the long scrape is not re-run.
# movie_links = get_movie_links(genre_links[:-1], total_pages)

Scraping comedy
Scraping sci-fi
Scraping horror
Scraping romance
Scraping action
Scraping thriller
Scraping drama
Scraping mystery
Scraping crime
Scraping animation
Scraping adventure
Scraping fantasy
Scraping comedy,romance
Scraping action,comedy


In [13]:
# Saving to file (same file name that the loading cell below reads)
#with open('movie_links.json', 'w') as outfile:
#    json.dump(movie_links, outfile)

In [18]:
# Read the saved genre -> movie links mapping back from disk
with open('movie_links.json', 'r') as file:
    movie_links = json.loads(file.read())

In [19]:
# Report how many links were collected for every genre
print('We scraped the following amount of links for each genre:')

for genre in movie_links:
    links = movie_links[genre]
    print(genre, len(links), sep=': ')

We scraped the following amount of links for each genre:
comedy: 10000
sci-fi: 10000
horror: 10000
romance: 10000
action: 10000
thriller: 10000
drama: 10000
mystery: 10000
crime: 10000
animation: 10000
adventure: 10000
fantasy: 10000
comedy,romance: 10000
action,comedy: 10000
