First, we need to open a terminal and use our PIC16B virtual environment to create our web-scraping project.

Type the following commands in the terminal:

In [None]:
# Switch to the PIC16B conda environment.
conda activate PIC16B
# Generate a new Scrapy project skeleton named IMDB_scraper.
scrapy startproject IMDB_scraper
# Move into the newly created project directory.
cd IMDB_scraper

Now we can get down to writing our scraper.

In [None]:
import scrapy

class ImdbSpider(scrapy.Spider):
    """Spider whose crawl begins at a single IMDb title page."""

    # Name used to select this spider, e.g. `scrapy crawl imdb_spider`.
    name = "imdb_spider"

    # The crawl starts from this title page.
    start_urls = ["https://www.imdb.com/title/tt0106145/"]

### 1. `parse(self, response)`

In [None]:
def parse(self, response):
    """Follow the link from the title page to its cast-and-crew page.

    Looks up the first ``a.ipc-metadata-list-item__icon-link`` element on
    the page and, if it carries an ``href``, requests that URL with
    ``parse_full_credits`` as the callback.

    Args:
        response: The downloaded page to search for the link.

    Yields:
        scrapy.Request: A request for the linked page. Nothing is yielded
        when the link (or its ``href``) is not present on the page.
    """
    # `.attrib` is an empty dict when the selector matches nothing, so read
    # it with `.get` instead of indexing: a missing link then ends this
    # callback quietly rather than raising KeyError.
    credits_href = response.css("a.ipc-metadata-list-item__icon-link").attrib.get("href")
    if credits_href is None:
        return

    # The href may be relative; resolve it against the current page URL.
    yield scrapy.Request(response.urljoin(credits_href), callback=self.parse_full_credits)

### 2. `parse_full_credits(self, response)`

In [None]:
def parse_full_credits(self, response):
    """Request the page behind every link in the ``td.primary_photo`` cells.

    Each ``<a>`` found inside a ``td.primary_photo`` cell is followed with
    ``parse_actor_page`` as the callback.

    Args:
        response: The downloaded full-credits page.

    Yields:
        scrapy.Request: One request per anchor that has an ``href``.
        Anchors without an ``href`` are skipped.
    """
    for anchor in response.css("td.primary_photo a"):
        # Use `.get` so an anchor lacking `href` is skipped instead of
        # raising KeyError and aborting the whole callback.
        href = anchor.attrib.get("href")
        if href is None:
            continue

        # The href may be relative; resolve it against the current page URL.
        yield scrapy.Request(response.urljoin(href), callback=self.parse_actor_page)

### 3. `parse_actor_page(self, response)` 

In [None]:
def parse_actor_page(self, response):
    """Yield one item per ``div.filmo-row`` found on the page.

    Args:
        response: The downloaded page for a single person.

    Yields:
        dict: ``{"actor": <name>, "movie_name": [<title>]}``, where the name
        is the first ``span.itemprop`` text on the page and the title is the
        first anchor text inside the row. The title is wrapped in a
        one-element list.
    """
    # The name is read once and repeated on every yielded item.
    performer = response.css("span.itemprop::text").get()

    film_rows = response.css("div.filmo-row")
    for row in film_rows:
        title = row.css("a::text").get()
        yield {"actor": performer, "movie_name": [title]}

In [None]:
# Run the spider named imdb_spider and export the scraped items to results.csv.
# NOTE(review): -o is expected to append when results.csv already exists, so
# remove the old file before re-running; confirm against the installed Scrapy version.
scrapy crawl imdb_spider -o results.csv

First, import the pandas package.

In [1]:
import pandas as pd

In [2]:
# Load the rows the crawl wrote to results.csv into a DataFrame and display it.
results = pd.read_csv("results.csv")
results

Unnamed: 0,actor,movie_name
0,Fay Genens,Forrest Gump
1,Frank Geyer,Bet Your Life
2,Tyler Long,All Saints Eve
3,Frank Geyer,Forrest Gump
4,Tyler Long,Taking Chances
...,...,...
16800,John Lennon,Go Go Mania
16801,John Lennon,Jaaroverzicht
16802,John Lennon,Follow the Beatles
16803,John Lennon,The Jack Paar Program


Now, we use `df.groupby(...).aggregate(...)` to count, for each movie, how many of the scraped actors appear in it.

In [3]:
# Count the rows for each movie title; the count ends up in the "actor" column.
# This rebinds `results`, so the cell expects the frame as loaded from the CSV.
results = (
    results
    .groupby("movie_name")["actor"]
    .agg(len)
    .reset_index()
)
results

Unnamed: 0,movie_name,actor
0,$ellebrity,1
1,'92 Skybox Alonzo Mourning Rookie Card,1
2,'Catch Me If You Can': Behind the Camera,1
3,'Catch Me If You Can': In Closing,1
4,'Catch Me If You Can': The Casting of the Film,1
...,...,...
10823,À l'affiche du monde,1
10824,Ànima,1
10825,Året der gik,1
10826,Ídolos,1


In [4]:
# Order the titles from most to fewest counted actors, give the count column
# a descriptive name, and show the ten highest-ranked titles.
results = (
    results
    .sort_values("actor", ascending=False, ignore_index=True)
    .rename(columns={"actor": "number of shared actors"})
)
results.head(10)

Unnamed: 0,movie_name,number of shared actors
0,Forrest Gump,179
1,Biography,30
2,Xscape,22
3,Today,22
4,Entertainment Tonight,21
5,60 Minutes,19
6,Partners,19
7,Cumulus 9,19
8,The Tonight Show Starring Johnny Carson,17
9,The Sixties,17
