In [2]:
#Importing the libraries
import scrapy
import crochet
import json
import logging

from crochet import setup, wait_for
from scrapy.crawler import CrawlerRunner

In [3]:
# Item pipeline that writes every scraped item to a JSON Lines file ('imdb.jl')
class JsonWriterPipeline:
    """Item pipeline that dumps every item it receives into 'imdb.jl'.

    Each item is serialised as a single JSON object on its own line.
    """

    def open_spider(self, spider):
        """Open 'imdb.jl' for writing; mode 'w' truncates any existing file."""
        self.file = open('imdb.jl', 'w')

    def close_spider(self, spider):
        """Close the file handle created by ``open_spider``."""
        self.file.close()

    def process_item(self, item, spider):
        """Append ``item`` as one JSON line and return it unchanged."""
        record = dict(item)
        self.file.write(f"{json.dumps(record)}\n")
        return item

In [209]:
# Initialise crochet (imported in the first cell) before any function
# decorated with @wait_for is called.
# NOTE(review): presumably this is what lets the Twisted-based crawl run
# inside the notebook kernel -- confirm against the crochet docs.
setup()

# Creating the spider
class ImdbSpider(scrapy.Spider):
    """Spider that scrapes the IMDb "Oscar Best Picture winners" search page.

    Every ``.lister-item-content`` element on the page becomes one item with
    the keys title, year, rating, metascore, director, stars, runtime(min)
    and genre. A field that is missing from a listing is emitted as ``None``
    (or an empty list for ``stars``) instead of raising and losing the item.
    """
    name = 'imdb'
    start_urls = ['https://www.imdb.com/search/title/?count=100&groups=oscar_best_picture_winners&sort=year%2Cdesc&ref_=nv_ch_osc']
    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'ITEM_PIPELINES': {'__main__.JsonWriterPipeline': 1}, # Used for pipeline 1
        # Used for pipeline 2. FEEDS replaces the deprecated FEED_FORMAT /
        # FEED_URI pair. 'overwrite' (Scrapy >= 2.4) makes a re-run of the
        # crawl replace imdb.json; the local-file default is to append, which
        # leaves two JSON arrays in the file and breaks pd.read_json.
        'FEEDS': {'imdb.json': {'format': 'json', 'overwrite': True}},
    }

    @staticmethod
    def _clean(text, token=''):
        """Return ``text`` without ``token`` and surrounding whitespace.

        ``None`` (selector matched nothing) is passed through unchanged so a
        missing field does not raise ``AttributeError``.
        """
        if text is None:
            return None
        return text.replace(token, '').strip()

    # Creating the parser responsible to give that path to the elements that will be scraped
    def parse(self, response):
        """Yield one dict per movie listing found in ``response``."""
        for movie in response.css('.lister-item-content'):
            header_spans = movie.css('.lister-item-header span::text')
            # Linked names in the listing's paragraphs: first is treated as
            # the director, the rest as the stars.
            people = movie.css('.lister-item-content p a::text')
            yield {
                'title': movie.css('.lister-item-header a::text').get(),
                # The second header span carries the year in parentheses,
                # sometimes with a disambiguation marker such as "(I)".
                # Slicing (instead of [1]) avoids IndexError when the span is
                # absent, and the regex keeps only the 4-digit year.
                'year': header_spans[1:2].re_first(r'\d{4}'),
                'rating': movie.css('.lister-item-content strong::text').get(),
                # NOTE(review): only elements with class "favorable" match;
                # a score rendered with another class comes back as None --
                # confirm this is intended.
                'metascore': movie.css('.favorable::text').get(),
                # .get() on the list returns the first match or None.
                'director': people.get(),
                'stars': people[1:].getall(),
                'runtime(min)': self._clean(movie.css('.runtime::text').get(), 'min'),
                'genre': self._clean(movie.css('.genre::text').get()),
            }

# Function to run the spider with a single command
@wait_for(10)
def run_spider():
    """Start a crawl of ``ImdbSpider`` and return what ``crawl`` hands back.

    A fresh ``CrawlerRunner`` is created for every call. The function is
    wrapped in crochet's ``wait_for(10)``.
    NOTE(review): presumably the decorator blocks the caller for up to
    10 seconds while the crawl runs -- confirm against the crochet docs.
    """
    return CrawlerRunner().crawl(ImdbSpider)

In [210]:
# Run the spider once. run_spider is decorated with @wait_for(10), so a slow
# crawl may presumably fail with a timeout here -- re-run or raise the limit
# on the decorator if that happens.
run_spider()

In [211]:
# Check the crawl result: load the 'imdb.json' feed written by the spider
# into a DataFrame and display it.
# NOTE(review): this import sits mid-notebook; it would conventionally live
# with the other imports in the first cell.
import pandas as pd
df_movies = pd.read_json('imdb.json')
df_movies

Unnamed: 0,title,year,rating,metascore,director,stars,runtime(min),genre
0,CODA,2021,8.0,72.0,Sian Heder,"[Emilia Jones, Marlee Matlin, Troy Kotsur, Dan...",111,"Comedy, Drama, Music"
1,Nomadland,2020,7.3,92.0,Chloé Zhao,"[Frances McDormand, David Strathairn, Linda Ma...",107,Drama
2,Parasite,2019,8.5,96.0,Bong Joon Ho,"[Song Kang-ho, Lee Sun-kyun, Cho Yeo-jeong, Ch...",132,"Drama, Thriller"
3,Green Book,2018,8.2,69.0,Peter Farrelly,"[Viggo Mortensen, Mahershala Ali, Linda Cardel...",130,"Biography, Comedy, Drama"
4,The Shape of Water,2017,7.3,87.0,Guillermo del Toro,"[Sally Hawkins, Octavia Spencer, Michael Shann...",123,"Drama, Fantasy, Romance"
...,...,...,...,...,...,...,...,...
90,Cimarron,1931,5.8,70.0,Wesley Ruggles,"[Richard Dix, Irene Dunne, Estelle Taylor, Nan...",123,"Drama, Western"
91,All Quiet on the Western Front,1930,8.1,91.0,Lewis Milestone,"[Lew Ayres, Louis Wolheim, John Wray, Arnold L...",152,"Drama, War"
92,The Broadway Melody,1929,5.6,,Harry Beaumont,"[Bessie Love, Anita Page, Charles King, Eddie ...",100,"Drama, Musical, Romance"
93,Wings,1927,7.6,,William A. Wellman,"[Harry d'Abbadie d'Arrast, Clara Bow, Charles ...",144,"Drama, Romance, War"


In [212]:
# Save the scraped table to 'imdb.csv'.
# NOTE(review): to_csv is called with its defaults, so the DataFrame index is
# written as an extra unnamed first column; pass index=False if that column
# is not wanted.
df_movies.to_csv('imdb.csv')