In [17]:
import requests
from bs4 import BeautifulSoup
import sqlite3
import re
import time

# Wikipedia page URL
url = 'https://en.wikipedia.org/wiki/List_of_highest-grossing_films'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# Locate the first table
table = soup.find_all('table', {'class': 'wikitable'})[0]
films = []

# Function to fetch director and country from a film's Wikipedia page
def get_film_details(film_url):
    film_response = requests.get(film_url)
    film_soup = BeautifulSoup(film_response.content, 'html.parser')

    # Extract director(s)
    director = 'Unknown'
    director_tag = film_soup.find('th', text='Directed by')
    if director_tag:
        director_cell = director_tag.find_next('td')
        if director_cell:
            director = ', '.join([a.get_text(strip=True) for a in director_cell.find_all('a')]) or director_cell.get_text(strip=True)

    # Extract country
    country = 'Unknown'
    country_tag = film_soup.find('th', text='Country')
    if country_tag:
        country_cell = country_tag.find_next('td')
        if country_cell:
            country = ', '.join([a.get_text(strip=True) for a in country_cell.find_all('a')]) or country_cell.get_text(strip=True)

    return director, country

# Iterate over table rows
for row in table.find_all('tr')[1:]:
    cells = row.find_all(['th', 'td'])

    if len(cells) >= 5:
        # Extract film title and Wikipedia link
        title_cell = cells[2].find('a')
        title = title_cell.get_text(strip=True) if title_cell else cells[2].get_text(strip=True)
        film_url = 'https://en.wikipedia.org' + title_cell['href'] if title_cell else None

        # Extract box office revenue
        box_office_text = cells[3].get_text(strip=True)
        box_office_cleaned = re.sub(r'[^\d.]', '', box_office_text)
        box_office = float(box_office_cleaned) if box_office_cleaned else None

        # Extract release year
        year_text = re.search(r'\d{4}', cells[4].get_text(strip=True))
        year = int(year_text.group()) if year_text else None

        # Extract additional details from film page
        director, country = get_film_details(film_url) if film_url else ('Unknown', 'Unknown')

        print(title, year, director, box_office, country)

        films.append((title, year, director, box_office, country))

        # Small delay to prevent overwhelming Wikipedia servers
        time.sleep(0.1)

# Store in SQLite database
conn = sqlite3.connect('films1.db')
c = conn.cursor()

c.execute('''
    CREATE TABLE IF NOT EXISTS films (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        title TEXT NOT NULL,
        release_year INTEGER,
        director TEXT,
        box_office REAL,
        country TEXT
    )
''')

c.executemany('''
    INSERT INTO films (title, release_year, director, box_office, country)
    VALUES (?, ?, ?, ?, ?)
''', films)

conn.commit()
conn.close()

print("Data extraction and storage complete.")


  director_tag = film_soup.find('th', text='Directed by')
  country_tag = film_soup.find('th', text='Country')


Avatar 2009 James Cameron 2923706026.0 Unknown
Avengers: Endgame 2019 Anthony RussoJoe Russo 2797501328.0 United States
Avatar: The Way of Water 2022 James Cameron 2320250281.0 United States
Titanic 1997 James Cameron 2257844554.0 United States
Star Wars: The Force Awakens 2015 J. J. Abrams 2068223624.0 United States
Avengers: Infinity War 2018 Anthony RussoJoe Russo 2048359754.0 United States
Ne Zha 2 2025 Jiaozi 1983302560.0 China
Spider-Man: No Way Home 2021 Jon Watts 1922598800.0 United States
Inside Out 2 2024 Kelsey Mann 1698863816.0 United States
Jurassic World 2015 Colin Trevorrow 1671537444.0 United States
The Lion King 2019 Jon Favreau 1656943394.0 United States
The Avengers 2012 Joss Whedon 1518815515.0 United States
Furious 7 2015 James Wan 1515341399.0 Unknown
Top Gun: Maverick 2022 Joseph Kosinski 1495696292.0 United States
Frozen 2 2019 Chris Buck, Jennifer Lee 1450026933.0 United States
Barbie 2023 Greta Gerwig 1447038421.0 Unknown
Avengers: Age of Ultron 2015 Joss Whed

In [14]:
conn = sqlite3.connect('highest_grossing_films.db')
c = conn.cursor()

# Удаляем таблицу, если она существует
c.execute('DROP TABLE IF EXISTS films')

# Сохраняем изменения и закрываем соединение
conn.commit()
conn.close()

In [19]:
import sqlite3
import json

# Connect to the SQLite database
conn = sqlite3.connect('films1.db')
c = conn.cursor()

# Fetch all records from the films table
c.execute('SELECT title, release_year, director, box_office, country FROM films')
films = c.fetchall()

# Define a list to hold the film data
film_list = []
for film in films:
    print(film)
    film_dict = {
        'title': film[0],
        'release_year': film[1],
        'directors': film[2],
        'box_office': film[3],
        'country': film[4]
    }
    film_list.append(film_dict)

# Write the data to a JSON file
with open('films.json', 'w') as f:
    json.dump(film_list, f, indent=4)

# Close the database connection
conn.close()


('Avatar', 2009, 'James Cameron', 2923706026.0, 'Unknown')
('Avengers: Endgame', 2019, 'Anthony RussoJoe Russo', 2797501328.0, 'United States')
('Avatar: The Way of Water', 2022, 'James Cameron', 2320250281.0, 'United States')
('Titanic', 1997, 'James Cameron', 2257844554.0, 'United States')
('Star Wars: The Force Awakens', 2015, 'J. J. Abrams', 2068223624.0, 'United States')
('Avengers: Infinity War', 2018, 'Anthony RussoJoe Russo', 2048359754.0, 'United States')
('Ne Zha 2', 2025, 'Jiaozi', 1983302560.0, 'China')
('Spider-Man: No Way Home', 2021, 'Jon Watts', 1922598800.0, 'United States')
('Inside Out 2', 2024, 'Kelsey Mann', 1698863816.0, 'United States')
('Jurassic World', 2015, 'Colin Trevorrow', 1671537444.0, 'United States')
('The Lion King', 2019, 'Jon Favreau', 1656943394.0, 'United States')
('The Avengers', 2012, 'Joss Whedon', 1518815515.0, 'United States')
('Furious 7', 2015, 'James Wan', 1515341399.0, 'Unknown')
('Top Gun: Maverick', 2022, 'Joseph Kosinski', 1495696292.0,