In [1]:
%pip install scrapy
%pip install "pymongo[srv]"
%pip install python-dotenv


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1
Note: you may need to restart the kernel to use updated packages.


## Script Overview

The script consists of:

1. **WikiSpider Class**: A Scrapy spider that:
   - Crawls the Wikipedia page for highest-grossing films.
   - Extracts movie titles, release years, box office revenue, and country of origin.
   - Follows links to individual movie pages for more details.
   - Stores the data in a MongoDB collection.

2. **MongoDB Connection**:
   - Uses `pymongo` to connect to a MongoDB database.
   - Stores scraped movie data in the database under the collection.

3. **Crawler Process**:
   - Initializes and starts the Scrapy spider.

In [None]:
import os

import scrapy

from scrapy.crawler import CrawlerProcess
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
from dotenv import load_dotenv


load_dotenv()
class WikiSpider(scrapy.Spider):
    name = "high_rates_films"
    start_urls = ["https://en.wikipedia.org/wiki/List_of_highest-grossing_films"]
    def __init__(self):
        uri = os.getenv("MONGODB_URI")
        db_name = os.getenv("MONGODB_DB_NAME")
        collection_name = os.getenv("MONGODB_COLLECTION")
        self.client = MongoClient(uri, server_api=ServerApi('1'))
        self.db = self.client[db_name]
        self.collection = self.db[collection_name]
    def parse(self, response):
        rows = response.xpath('//*[@id="mw-content-text"]/div[1]/table[1]/tbody')
        titles = set()
        for row in rows.xpath('.//tr'):
            if row.xpath('.//th/span//text()').get() is not None:
                title = row.xpath('.//th/span//text()').get()
                link = row.xpath('.//th/span/i/a/@href').get()
                year = row.xpath(".//td[4]//text()").get()
                year = row.xpath(".//td[4]//text()").get()
                if year:
                    year = year.strip()

        
                gross_value = row.xpath(".//td[3]//text()").get()
                if gross_value:
                    gross_value = row.xpath(".//td[3]/text()[not(parent::sup)]").getall()[-1].replace('$', '').replace(',', '').strip()
            else:
                title = row.xpath('.//th/i/a//text()').get()
                link = row.xpath('.//th/i/a/@href').get()
                year = row.xpath(".//td[4]//text()").get()
                year = row.xpath(".//td[4]//text()").get()
                if year:
                    year = year.strip()

        
                gross_value = row.xpath(".//td[3]//text()").get()
                if gross_value:
                    gross_value = row.xpath(".//td[3]/text()[not(parent::sup)]").getall()[-1].replace('$', '').replace(',', '').strip()
            if (title and link) and (title not in titles):
                    titles.add(title)
                    full_link = response.urljoin(link)    # Передаем название фильма и ссылку в следующий запрос через meta
                    yield scrapy.Request(
                        url=full_link, 
                        callback=self.parse_movie, 
                        meta={"title": title, "link": full_link, "year" : year, "gross_value" : gross_value}
            )

    def parse_movie(self, response):
        title = response.meta["title"].replace("\u2013", "-")
        link = response.meta["link"]
        year = response.meta["year"]
        gross_value = response.meta["gross_value"]
        movie_info = response.xpath('//*[@id="mw-content-text"]/div[1]/table[1]/@class')
        if movie_info.get() == "box-Expand_language plainlinks metadata ambox ambox-notice skin-invert-image":
             movie_info = response.xpath('//*[@id="mw-content-text"]/div[1]/table[2]')
        else:
            movie_info = response.xpath('//*[@id="mw-content-text"]/div[1]/table[1]')
        countries = None
        directed_by = None
        for row in movie_info.xpath('.//tr'):
            header = row.xpath('.//th/text()').get()
            if header:
                if header.strip() == "Directed by":
                    directed_by = row.xpath('.//td//li/text()').getall()
                if not directed_by:
                    directed_by = row.xpath('.//td//a/text()').getall()
                elif header.strip() == "Country":
                    countries = row.xpath('.//td//text()').getall()
                elif header.strip() == "Countries":
                    countries = row.xpath('.//td//li/text()').getall()
                    if not countries:
                        countries = row.xpath('.//td//text()').getall()
                        countries = [value.strip() for value in countries if value.strip()]

        movie_doc = {
            "Film Title": title,
            "Release Year": year,
            "Director(s)": directed_by,
            "Box Office Revenue": gross_value,
            "Country of Origin": countries
        }

        self.collection.insert_one(movie_doc)
        yield movie_doc
process = CrawlerProcess(settings=None)

process.crawl(WikiSpider)


process.start()

2025-03-02 18:30:29 [scrapy.utils.log] INFO: Scrapy 2.12.0 started (bot: scrapybot)
2025-03-02 18:30:29 [scrapy.utils.log] INFO: Versions: lxml 5.3.1.0, libxml2 2.12.9, cssselect 1.2.0, parsel 1.10.0, w3lib 2.3.1, Twisted 24.11.0, Python 3.9.21 (main, Dec 11 2024, 10:21:40) - [Clang 14.0.6 ], pyOpenSSL 25.0.0 (OpenSSL 3.4.1 11 Feb 2025), cryptography 44.0.1, Platform macOS-15.3-arm64-arm-64bit
2025-03-02 18:30:30 [pymongo.topology] DEBUG: {"topologyId": {"$oid": "67c47996758ff19447473668"}, "message": "Starting topology monitoring"}
2025-03-02 18:30:30 [pymongo.topology] DEBUG: {"topologyId": {"$oid": "67c47996758ff19447473668"}, "previousDescription": "<TopologyDescription id: 67c47996758ff19447473668, topology_type: Unknown, servers: []>", "newDescription": "<TopologyDescription id: 67c47996758ff19447473668, topology_type: ReplicaSetNoPrimary, servers: [<ServerDescription ('cluster0-shard-00-00.nputb.mongodb.net', 27017) server_type: Unknown, rtt: None>, <ServerDescription ('cluster0

2025-03-02 18:31:23 [pymongo.topology] DEBUG: {"topologyId": {"$oid": "67c47996758ff19447473668"}, "driverConnectionId": 1, "serverConnectionId": 60952, "serverHost": "cluster0-shard-00-00.nputb.mongodb.net", "serverPort": 27017, "awaited": true, "durationMS": 9851.956416, "reply": "{\"topologyVersion\": {\"processId\": {\"$oid\": \"67c1faf94104d0bbb56b807b\"}, \"counter\": 4}, \"hosts\": [\"cluster0-shard-00-00.nputb.mongodb.net:27017\", \"cluster0-shard-00-01.nputb.mongodb.net:27017\", \"cluster0-shard-00-02.nputb.mongodb.net:27017\"], \"setName\": \"atlas-x1qjng-shard-0\", \"setVersion\": 255, \"secondary\": true, \"primary\": \"cluster0-shard-00-01.nputb.mongodb.net:27017\", \"tags\": {\"nodeType\": \"ELECTABLE\", \"region\": \"US_EAST_1\", \"workloadType\": \"OPERATIONAL\", \"diskState\": \"READY\", \"provider\": \"AWS\", \"availabilityZone\": \"use1-az4\"}, \"me\": \"cluster0-shard-00-00.nputb.mongodb.net:27017\", \"lastWrite\": {\"opTime\": {\"ts\": {\"$timestamp\": {\"t\": 1740

## Exporting Scraped Data to JSON

This script retrieves movie data from a MongoDB database and saves it as a JSON file for use in a frontend application.

### Steps:

1. **MongoDB Connection**:
   - Connects to a MongoDB database using `pymongo`.
   - Retrieves all documents from the `high_grossing_films` collection in the `movies_db` database.

2. **Data Retrieval**:
   - Queries the database for all movie records while excluding the `_id` field.

3. **Saving to JSON**:
   - Writes the retrieved data to `output.json` inside the `frontend/public/` directory.
   - Ensures proper encoding and formatting for easy use in frontend applications.

In [None]:
import json
import os

from pymongo import MongoClient
from pymongo.server_api import ServerApi
from dotenv import load_dotenv


load_dotenv()


uri = os.getenv("MONGODB_URI")
db_name = os.getenv("MONGODB_DB_NAME")
collection_name = os.getenv("MONGODB_COLLECTION")


client = MongoClient(uri, server_api=ServerApi('1'))
db = client[db_name]
collection = db[collection_name]

data = list(collection.find({}, {"_id": 0}))


output_path = os.path.join("frontend", "public", "output.json")


with open(output_path, "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=4)



2025-03-02 18:31:27 [pymongo.topology] DEBUG: {"topologyId": {"$oid": "67c479cf758ff1944747369b"}, "message": "Starting topology monitoring"}
2025-03-02 18:31:27 [pymongo.topology] DEBUG: {"topologyId": {"$oid": "67c479cf758ff1944747369b"}, "previousDescription": "<TopologyDescription id: 67c479cf758ff1944747369b, topology_type: Unknown, servers: []>", "newDescription": "<TopologyDescription id: 67c479cf758ff1944747369b, topology_type: ReplicaSetNoPrimary, servers: [<ServerDescription ('cluster0-shard-00-00.nputb.mongodb.net', 27017) server_type: Unknown, rtt: None>, <ServerDescription ('cluster0-shard-00-01.nputb.mongodb.net', 27017) server_type: Unknown, rtt: None>, <ServerDescription ('cluster0-shard-00-02.nputb.mongodb.net', 27017) server_type: Unknown, rtt: None>]>", "message": "Topology description changed"}
2025-03-02 18:31:27 [pymongo.topology] DEBUG: {"topologyId": {"$oid": "67c479cf758ff1944747369b"}, "serverHost": "cluster0-shard-00-00.nputb.mongodb.net", "serverPort": 27017

2025-03-02 18:31:40 [pymongo.topology] DEBUG: {"topologyId": {"$oid": "67c479cf758ff1944747369b"}, "driverConnectionId": 1, "serverConnectionId": 60971, "serverHost": "cluster0-shard-00-00.nputb.mongodb.net", "serverPort": 27017, "awaited": true, "durationMS": 10698.006249999991, "reply": "{\"topologyVersion\": {\"processId\": {\"$oid\": \"67c1faf94104d0bbb56b807b\"}, \"counter\": 4}, \"hosts\": [\"cluster0-shard-00-00.nputb.mongodb.net:27017\", \"cluster0-shard-00-01.nputb.mongodb.net:27017\", \"cluster0-shard-00-02.nputb.mongodb.net:27017\"], \"setName\": \"atlas-x1qjng-shard-0\", \"setVersion\": 255, \"secondary\": true, \"primary\": \"cluster0-shard-00-01.nputb.mongodb.net:27017\", \"tags\": {\"availabilityZone\": \"use1-az4\", \"region\": \"US_EAST_1\", \"diskState\": \"READY\", \"workloadType\": \"OPERATIONAL\", \"provider\": \"AWS\", \"nodeType\": \"ELECTABLE\"}, \"me\": \"cluster0-shard-00-00.nputb.mongodb.net:27017\", \"lastWrite\": {\"opTime\": {\"ts\": {\"$timestamp\": {\"t\