In [2]:
import requests
import sqlite3
import json
from bs4 import BeautifulSoup
import time
import re

In [3]:
URL = "https://en.wikipedia.org/wiki/List_of_highest-grossing_films"

response = requests.get(URL)
if response.status_code != 200:
    print("-----------------Error requesting wiki:", response.status_code)
    exit()

soup = BeautifulSoup(response.text, "html.parser")

In [4]:
films_table = soup.find("table", class_="wikitable")
if not films_table:
    print("---------------Error, no table found-------------------")
    exit()

rows = films_table.find_all("tr")[1:] 

data = []

for row in rows:
    cols = row.find_all("td")
    if len(cols) < 5:
        continue

    title_tag = row.find("th", scope="row").find("a") if row.find("th", scope="row") else None
    if not title_tag:
        continue

    title = title_tag.get_text(strip=True)
    film_url = "https://en.wikipedia.org" + title_tag["href"]
    
    print(f"Парсим {title} -> {film_url}")

    try:
        film_response = requests.get(film_url)
        film_soup = BeautifulSoup(film_response.text, "html.parser")
        

        year_tag = film_soup.find("span", class_="bday")
        release_year = year_tag.get_text(strip=True) if year_tag else "Unknown"

        director_tag = film_soup.find("th", string="Directed by")
        director = "Unknown"
        if director_tag:
            director_data = director_tag.find_next_sibling("td")
            if director_data:
                director = ", ".join([a.get_text(strip=True) for a in director_data.find_all("a")]) or director_data.get_text(strip=True)

        box_office_tag = film_soup.find("th", string="Box office")
        box_office = "Unknown"
        if box_office_tag:
            box_office_data = box_office_tag.find_next_sibling("td")
            box_office = box_office_data.get_text(strip=True) if box_office_data else "Unknown"

        country_tag = film_soup.find("th", string="Country")
        country = "Unknown"
        if country_tag:
            country_data = country_tag.find_next_sibling("td")
            if country_data:
                country = ", ".join([a.get_text(strip=True) for a in country_data.find_all("a")]) or country_data.get_text(strip=True)

        print(f"Добавлен фильм: {title}, {release_year}, {director}, {box_office}, {country}\n")

        data.append((title, release_year, director, box_office, country))
    
    except Exception as e:
        print(f"Error while proccessing {title}: {e}")

    time.sleep(1)

Парсим Avatar -> https://en.wikipedia.org/wiki/Avatar_(2009_film)
Добавлен фильм: Avatar, 2009-12-10, James Cameron, $2.923 billion[5], Unknown

Парсим Avengers: Endgame -> https://en.wikipedia.org/wiki/Avengers:_Endgame
Добавлен фильм: Avengers: Endgame, 2019-04-22, Anthony RussoJoe Russo, $2.799billion[4], United States

Парсим Avatar: The Way of Water -> https://en.wikipedia.org/wiki/Avatar:_The_Way_of_Water
Добавлен фильм: Avatar: The Way of Water, 2022-12-06, James Cameron, $2.320 billion[4][5], United States

Парсим Titanic -> https://en.wikipedia.org/wiki/Titanic_(1997_film)
Добавлен фильм: Titanic, 1997-11-01, James Cameron, $2.264 billion[7], United States

Парсим Star Wars: The Force Awakens -> https://en.wikipedia.org/wiki/Star_Wars:_The_Force_Awakens
Добавлен фильм: Star Wars: The Force Awakens, 2015-12-14, J. J. Abrams, $2.07billion[3], United States

Парсим Avengers: Infinity War -> https://en.wikipedia.org/wiki/Avengers:_Infinity_War
Добавлен фильм: Avengers: Infinity Wa

In [5]:
if not data:
    print("Error in collecting data, no movies found!")
    exit()

In [6]:
def clean_box_office(value):
    if value == "Unknown":
        return value
    value = re.sub(r"\[.*?\]", "", value)  
    value = re.sub(r"(\d)(?=[mbMB])", r"\1 ", value)  
    return value.strip()

data = [(title, release_year, director, clean_box_office(box_office), country) for title, release_year, director, box_office, country in data]


In [7]:
conn = sqlite3.connect("films.db")
cursor = conn.cursor()

cursor.execute("""
CREATE TABLE IF NOT EXISTS films (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    title TEXT NOT NULL,
    release_year TEXT,
    director TEXT,
    box_office TEXT,
    country TEXT
)
""")

cursor.executemany("INSERT INTO films (title, release_year, director, box_office, country) VALUES (?, ?, ?, ?, ?)", data)

conn.commit()
conn.close()


In [8]:
with open("films.json", "w", encoding="utf-8") as f:
    json.dump([{"title": d[0], "release_year": d[1], "director": d[2], "box_office": d[3], "country": d[4]} for d in data], f, indent=4, ensure_ascii=False)

print("Movies are saved in films.db and films.json")

Movies are saved in films.db and films.json
