In [1]:
import requests
from bs4 import BeautifulSoup
import time
import numpy as np
import pandas as pd

In [2]:
# first pull the HTML from the page that links to all of the pages with the links.
# in this case, this page gives the links list pages of sci-fi films by decade.
# just go to https://en.wikipedia.org/wiki/Lists_of_science_fiction_films
# to see what I'm pulling from.
html = requests.get('https://en.wikipedia.org/wiki/Lists_of_science_fiction_films')

#turn the HTML into a beautiful soup text object
b = BeautifulSoup(html.text, 'lxml')

In [3]:

# create an mpty list where those links will go.
links = []

# in this case, all of the links we're in a '<li>' brackets.
for i in b.find_all(name = 'li'):
    # pull the actual link for each one
    for link in i.find_all('a', href=True):
        links.append(link['href'])
# the above code ends up pulling more links than I want,
# so I just use the ones I want
links = links[1:11]
# each link only returns something like 'wiki/List_of_science_fiction_films_of_the_1920s'
# so I add the other part of the URL to each.
decade_links = ['https://en.wikipedia.org' + i for i in links]

In [4]:
decade_links

['https://en.wikipedia.org/wiki/List_of_science_fiction_films_before_1920',
 'https://en.wikipedia.org/wiki/List_of_science_fiction_films_of_the_1920s',
 'https://en.wikipedia.org/wiki/List_of_science_fiction_films_of_the_1930s',
 'https://en.wikipedia.org/wiki/List_of_science_fiction_films_of_the_1940s',
 'https://en.wikipedia.org/wiki/List_of_science_fiction_films_of_the_1950s',
 'https://en.wikipedia.org/wiki/List_of_science_fiction_films_of_the_1960s',
 'https://en.wikipedia.org/wiki/List_of_science_fiction_films_of_the_1970s',
 'https://en.wikipedia.org/wiki/List_of_science_fiction_films_of_the_1980s',
 'https://en.wikipedia.org/wiki/List_of_science_fiction_films_of_the_1990s',
 'https://en.wikipedia.org/wiki/List_of_science_fiction_films_of_the_2000s',
 'https://en.wikipedia.org/wiki/List_of_science_fiction_films_of_the_2010s',
 'https://en.wikipedia.org/wiki/List_of_science_fiction_films_of_the_2020s',
 'https://en.wikipedia.org/wiki/List_of_science_fiction_television_films',
 '

In [12]:
# create two new lists, one for the title of the page, 
# and one for the link to the page
film_titles = []
film_links = []
# for loop to pull from each decade page with list of films.
# look at https://en.wikipedia.org/wiki/List_of_science_fiction_films_of_the_1920s
# to follow along as an exampe
for decade in decade_links[-2:]:
    print(f'Collecting films from {decade}')
    html = requests.get(decade)
    b = BeautifulSoup(html.text, 'lxml')
    # get to the table on the page
    for i in b.find_all(name='table', class_='wikitable'):
        # get to the row of each film
        for j in i.find_all(name='tr'):
            #get just the title cell for each row.
            # contains the title and the URL
            for k in j.find_all(name='i'):
                # get within that cell to just get the words
                for link in k.find_all('a', href=True):
                    # get the title and add to the list
                    film_titles.append(link['title'])
                    # get the link and add to that list
                    film_links.append(link['href'])
    #be a conscientious scraper and pause between scrapes
    time.sleep(1)
print(f'Number of Film Links Collected: {len(film_links)}')
print(f'Number of Film Titles Collected: {len(film_titles)}')
# remove film links that don't have a description page on Wikipedia
new_film_links = [i for i in film_links if 'redlink' not in i]
# same goes for titles
new_film_titles = [i for i in film_titles if '(page does not exist)' not in i]
print(f'Number of Film Links with Wikipedia Pages: {len(new_film_links)}')
print(f'Number of Film Titles with Wikipedia Pages: {len(new_film_titles)}')
#use this list to fetch from the API
title_links = list(zip(new_film_titles, new_film_links))

Collecting films from https://en.wikipedia.org/wiki/List_of_science_fiction_films_of_the_2010s
Collecting films from https://en.wikipedia.org/wiki/List_of_science_fiction_films_of_the_2020s
Number of Film Links Collected: 361
Number of Film Titles Collected: 361
Number of Film Links with Wikipedia Pages: 358
Number of Film Titles with Wikipedia Pages: 358


In [None]:
title_links

In [41]:
titulo = []
director = []
actores = []
pais = []
fecha = []

In [46]:
for decade in decade_links[-2:]:
    print(f'Collecting films from {decade}')
    html = requests.get(decade)
    b = BeautifulSoup(html.text, 'lxml')
    # get to the table on the page
    for i in b.find_all(name='table', class_='wikitable'):
        for row in i.find_all('tr'):
#             print(row)
            cells=row.find_all('td')
            if len(cells)==5:
                titulo.append(cells[0].find(text=True))
                director.append(cells[1].find(text=True))
                actores.append(cells[2].find(text=True))
                pais.append(cells[3].find("a",href=True)['title'])
                fecha.append(cells[4].find(text=True).rstrip('\n'))
#                 break

Collecting films from https://en.wikipedia.org/wiki/List_of_science_fiction_films_of_the_2010s
Collecting films from https://en.wikipedia.org/wiki/List_of_science_fiction_films_of_the_2020s


In [47]:
len(titulo)

373

In [40]:
# cells[0].find(text=True)
# cells[1].find(text=True)
# cells[2].find(text=True)
# cells[3].find("a",href=True)['title']
# cells[4].find(text=True).rstrip('\n')

'2020'

In [48]:
df=pd.DataFrame(titulo,columns=['Titulo'])
df['Director']=director
df['Actores']=actores
df['Pais']=pais
df['Fecha']=fecha
        

In [49]:
df

Unnamed: 0,Titulo,Director,Actores,Pais,Fecha
0,After Yang,Kogonada,Colin Farrell,United States,2020
1,Bill & Ted Face the Music,Dean Parisot,Keanu Reeves,United States,"August 21, 2020"
2,BIOS,Miguel Sapochnik,Tom Hanks,United States,"October 2, 2020"
3,Chaos Walking,Doug Liman,Tom Holland,United States,2020
4,Dune,Denis Villeneuve,Timothée Chalamet,United States,"November 20, 2020"
5,Evangelion: 3.0+1.0,Hideaki Anno,,Japan,"June, 2020"
6,Godzilla vs. Kong,Adam Wingard,Kyle Chandler,United States,"March 13, 2020"
7,The New Mutants,Josh Boone,Anya Taylor-Joy,United States,"April 3, 2020"
8,Stowaway,Joe Penna,Anna Kendrick,United States,2020
9,Voyagers,Neil Burger,Colin Farrell,United States,2020
