In [None]:
# Installing packages required for webscraping
!pip install requests==2.32.3 bs4==0.0.2

In [66]:
import requests
from bs4 import BeautifulSoup
import csv
from urllib.parse import urljoin
import time
from random import randint
import re
import os

In [26]:
# Collect the links to the individual film pages from the paginated ranking endpoint.
base_url = 'https://www.filmweb.pl'
ajax_url_template = 'https://www.filmweb.pl/ajax/ranking/film/{}'

film_links = []

for page_num in range(1, 21):  # 1 to 20 included
    ajax_url = ajax_url_template.format(page_num)
    print(f"Downloading {page_num}: {ajax_url}")

    # requests has no default timeout; without one a stalled connection
    # would block this cell indefinitely.
    r = requests.get(ajax_url, timeout=30)
    r.raise_for_status()

    soup = BeautifulSoup(r.text, 'html.parser')

    # Find each element with ranking on this page
    ranking_elements = soup.find_all('div', class_='rankingType__header')

    for element in ranking_elements:
        # Only anchors that actually carry an href are usable as film links.
        link_tag = element.find('a', href=True)
        if link_tag:
            relative_link = link_tag['href']
            # The hrefs are resolved against the site root to get absolute URLs.
            full_link = urljoin(base_url, relative_link)
            film_links.append(full_link)

    time.sleep(1)  # Pause to avoid overloading the server

print(f"Found {len(film_links)} links to movies.")

Downloading 1: https://www.filmweb.pl/ajax/ranking/film/1
Downloading 2: https://www.filmweb.pl/ajax/ranking/film/2
Downloading 3: https://www.filmweb.pl/ajax/ranking/film/3
Downloading 4: https://www.filmweb.pl/ajax/ranking/film/4
Downloading 5: https://www.filmweb.pl/ajax/ranking/film/5
Downloading 6: https://www.filmweb.pl/ajax/ranking/film/6
Downloading 7: https://www.filmweb.pl/ajax/ranking/film/7
Downloading 8: https://www.filmweb.pl/ajax/ranking/film/8
Downloading 9: https://www.filmweb.pl/ajax/ranking/film/9
Downloading 10: https://www.filmweb.pl/ajax/ranking/film/10
Downloading 11: https://www.filmweb.pl/ajax/ranking/film/11
Downloading 12: https://www.filmweb.pl/ajax/ranking/film/12
Downloading 13: https://www.filmweb.pl/ajax/ranking/film/13
Downloading 14: https://www.filmweb.pl/ajax/ranking/film/14
Downloading 15: https://www.filmweb.pl/ajax/ranking/film/15
Downloading 16: https://www.filmweb.pl/ajax/ranking/film/16
Downloading 17: https://www.filmweb.pl/ajax/ranking/film/1

In [27]:
# Function to extract boxoffice amount
def extract_amount(text):
    """Extract the first dollar amount from ``text``.

    An amount is a ``$`` sign followed by a digit and then any run of digits
    and whitespace (the whitespace acts as a thousands separator, e.g.
    ``"$28 341 469"``).

    Args:
        text: String to search. ``None`` or an empty string is accepted.

    Returns:
        The matched amount without surrounding whitespace, or ``None`` when
        ``text`` is empty or contains no dollar amount.
    """
    if not text:
        return None
    match = re.search(r"\$\d[\d\s]*", text)
    if match is None:
        return None
    # The character class also swallows the whitespace that separates the
    # amount from whatever follows it, so trim it off the result.
    return match.group(0).strip()

In [63]:
# URLs that have already been scraped; the scraping loop appends to this list
# and skips any URL found in it.
already_parsed_movies = list()

# Sanity check: how many film links were collected from the ranking pages.
print(len(film_links))

500


In [64]:
file_path = "filmweb_top500.csv"

# Write the header row only when the CSV does not exist yet, so re-running
# this cell leaves an existing file (and any rows in it) untouched.
if not os.path.isfile(file_path):
    with open(file_path, mode='a', newline='', encoding='utf8') as file:
        writer = csv.writer(file)
        writer.writerow([
            "position",
            "title",
            "year",
            "duration",
            "rating_value",
            "rating_count",
            "critic_rating_value",
            "critic_count",
            "directors",
            "screenwriters",
            "production_countries",
            "world_premiere",
            "polish_premiere",
            "boxoffice",
            "boxoffice_usa",
            "boxoffice_outside_usa",
            "budget",
            "genres",
        ])
        print("Created csv with headers")

Created csv with headers


In [62]:
def _stripped_text(tag, default="N/A"):
    """Return ``tag.text`` without surrounding whitespace, or ``default`` when ``tag`` is None."""
    return tag.text.strip() if tag is not None else default


def _credited_names(soup, data_type):
    """Return the ``itemprop="name"`` texts inside ``<span data-type=data_type>``, or [] when absent."""
    info = soup.find("span", {"data-type": data_type})
    if info is None:
        return []
    return [span.text.strip() for span in info.find_all("span", itemprop="name")]


def scrap_single_movie(url, timeout=30):
    """Download one film page and extract its details.

    Elements that are missing from the page yield a placeholder (``"N/A"``,
    ``None`` or an empty list) instead of raising, so a single incomplete
    page does not abort a long scraping run.

    Args:
        url: Absolute URL of the film page.
        timeout: Seconds to wait for the server, passed to ``requests.get``.

    Returns:
        A list of 18 values in this order: ranking position, title, year,
        duration, rating value, rating count, critic rating value, critic
        count, directors (list), screenwriters (list), production countries
        (list), world premiere, Polish premiere, box office, box office USA,
        box office outside USA, budget, genres (list).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    r = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of parsing an error page.
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')

    # NOTE(review): the spelling "wolrdRanking" is kept exactly as it was;
    # presumably it mirrors the attribute value used by the site - confirm
    # before "correcting" it.
    ranking_divs = soup.select('div[data-ranking-category="wolrdRanking"]')
    ranking_position = ranking_divs[0].get("data-position", "N/A") if ranking_divs else "N/A"
    print(f"{url} ({ranking_position})")

    # Prefer the original title (first text node of the <h2>); fall back to
    # the <h1> title when there is none.
    h2 = soup.find("h2", class_="filmCoverSection__originalTitle")
    first_child = h2.contents[0] if h2 is not None and h2.contents else None
    # The first child may be a nested tag rather than text; only text nodes
    # (str subclasses) can be stripped.
    if isinstance(first_child, str) and first_child.strip():
        title = first_child.strip()
    else:
        title = _stripped_text(soup.find("h1", class_="filmCoverSection__title"))

    # The year is looked up in a <div class="filmCoverSection__year"> first
    # and, failing that, in an <h2> with the same class.
    year = "N/A"
    for tag_name in ("div", "h2"):
        year_tag = soup.find(tag_name, class_="filmCoverSection__year")
        if year_tag is not None and year_tag.text.strip():
            year = year_tag.text.strip()
            break

    # The duration is stored as displayed; it may need a transformation later
    # (e.g. to minutes only).
    duration = _stripped_text(soup.find("div", class_="filmCoverSection__duration"))

    rating_value = _stripped_text(soup.find("span", itemprop="ratingValue"))
    rating_count = _stripped_text(soup.find("span", itemprop="ratingCount"))

    # A class_ string containing a space is matched against the exact value
    # of the class attribute, so each variant is tried in turn.
    critic_rating_value = "N/A"
    for rating_class in ("filmRating__rateValue isHigh", "filmRating__rateValue isMedium", "filmRating__rateValue isLow"):
        span = soup.find("span", class_=rating_class)
        if span is not None:
            critic_rating_value = span.text.strip()
            break

    critic_span = soup.find("span", class_="filmRating__count", attrs={"data-rating-count": True})
    critic_count = critic_span["data-rating-count"] if critic_span is not None else None

    directors = _credited_names(soup, "directing-info")
    screenwriters = _credited_names(soup, "screenwriting-info")

    # The world premiere is the first "block" span that has a content
    # attribute and whose text mentions "Światowa".
    world_premiere = "N/A"
    for block in soup.find_all("span", class_="block"):
        if block.has_attr("content") and "Światowa" in block.text:
            world_premiere = block["content"]
            break

    country_info = soup.find("span", class_="filmInfo__info filmInfo__info--productionCountry")
    country_links = country_info.find_all("a") if country_info is not None else []
    production_countries = [
        a.find("span").text.strip()
        for a in country_links
        if a.find("span")
    ]

    premiere_span = soup.find("span", class_="block premiereCountry")
    polish_premiere = premiere_span["content"] if premiere_span is not None and premiere_span.has_attr("content") else None

    # The amounts are assigned purely by position among the "gross" spans.
    # NOTE(review): this assumes the page lists them as total, USA, outside
    # USA, budget; if a page omits one, the later values shift - confirm.
    gross_spans = soup.find_all("span", attrs={"data-i18n": "film:info.gross.label"})
    amounts = [extract_amount(span.text.strip()) for span in gross_spans[:4]]
    amounts += [None] * (4 - len(amounts))
    boxoffice, boxoffice_usa, boxoffice_outside_usa, budget = amounts

    genres = [span.text.strip() for span in soup.find_all("span", itemprop="genre")]
    return [ranking_position, title, year, duration, rating_value, rating_count, critic_rating_value, critic_count, directors, screenwriters, production_countries, world_premiere, polish_premiere, boxoffice, boxoffice_usa, boxoffice_outside_usa, budget, genres]

In [65]:
# Scrape every film page and append its row to the CSV. The cell can be
# interrupted and re-run: URLs already in already_parsed_movies are skipped.
for url in film_links:
    if url in already_parsed_movies:
        continue

    movie_data = scrap_single_movie(url)
    # The file is reopened for every row so each row is on disk as soon as
    # the `with` block exits.
    with open(file_path, mode='a', newline='', encoding='utf8') as file:
        writer = csv.writer(file)
        writer.writerow(movie_data)

    # Record the URL right after the row is written, before the pause.
    # If the kernel is interrupted during the sleep, a resumed run must not
    # scrape this film again and write a duplicate row.
    already_parsed_movies.append(url)

    # Random pause between requests to avoid overloading the server.
    time_to_wait = randint(1,4)
    time.sleep(time_to_wait)

https://www.filmweb.pl/film/Skazani+na+Shawshank-1994-1048 (1)
https://www.filmweb.pl/film/Zielona+mila-1999-862 (2)
https://www.filmweb.pl/film/Nietykalni-2011-583390 (3)
https://www.filmweb.pl/film/Ojciec+chrzestny-1972-1089 (4)
https://www.filmweb.pl/film/Dwunastu+gniewnych+ludzi-1957-30701 (5)
https://www.filmweb.pl/film/Forrest+Gump-1994-998 (6)
https://www.filmweb.pl/film/Lot+nad+kuku%C5%82czym+gniazdem-1975-1019 (7)
https://www.filmweb.pl/film/Ojciec+chrzestny+II-1974-1090 (8)
https://www.filmweb.pl/film/W%C5%82adca+Pier%C5%9Bcieni%3A+Powr%C3%B3t+kr%C3%B3la-2003-11841 (9)
https://www.filmweb.pl/film/Lista+Schindlera-1993-1211 (10)
https://www.filmweb.pl/film/Pulp+Fiction-1994-1039 (11)
https://www.filmweb.pl/film/%C5%BBycie+jest+pi%C4%99kne-1997-208 (12)
https://www.filmweb.pl/film/W%C5%82adca+Pier%C5%9Bcieni%3A+Dwie+wie%C5%BCe-2002-31451 (13)
https://www.filmweb.pl/film/Siedem-1995-702 (14)
https://www.filmweb.pl/film/Podziemny+kr%C4%85g-1999-837 (15)
https://www.filmweb.pl/fil