In [4]:
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
from dotenv import load_dotenv
import os
load_dotenv()

uri = f"mongodb+srv://{os.environ.get('mongo_username')}:{os.environ.get('mongo_password')}@scrapy-engine.cnaygdb.mongodb.net/?retryWrites=true&w=majority&appName=scrapy-engine"

# Create a new client and connect to the server
client = MongoClient(uri, server_api=ServerApi('1'))

# Send a ping to confirm a successful connection
try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    print(e)

Pinged your deployment. You successfully connected to MongoDB!


In [6]:
# Creating mangodb
from pymongo import MongoClient

db =  client['scrapy-engine']
collection = db['scrapy-engine']

# Sample data with some duplicate elements
data = {"name": "Item A", "unique_values": ["value1", "value2", "value1", "value3"]}

collection.insert_one({'urls_crawled':['https://google.com', 'https://facebook.com']})



InsertOneResult(ObjectId('6637a3666add56aa87b92946'), acknowledged=True)

In [56]:
# Example from https://www.mongodb.com/docs/manual/tutorial/query-arrays/
import time

db.inventory.insert_many(
    [
        {"item": "journal", "qty": 25, "tags": ["blank", "red"], "dim_cm": [14, 21], "timestamp": time.time()},
        {"item": "notebook", "qty": 50, "tags": ["red", "blank"], "dim_cm": [14, 21], "timestamp": time.time()},
        {
            "item": "paper",
            "qty": 100,
            "tags": ["red", "blank", "plain"],
            "dim_cm": [14, 21],
            "datetime": time.time()
        },
        {"item": "planner", "qty": 75, "tags": ["blank", "red"], "dim_cm": [22.85, 30], "timestamp": time.time()},
        {"item": "postcard", "qty": 45, "tags": ["blue"], "dim_cm": [10, 15.25], "timestamp": time.time()},
    ]
)

#  array with exactly two elements, "red" and "blank", in the specified order
cursor = db.inventory.find({"tags": ["red", "blank"]})

# contains both the elements "red" and "blank", without regard to orde
cursor = db.inventory.find({"tags": {"$all": ["red", "blank"]}})

# elements with timestamps less than the current time
cursor = db.inventory.find({"timestamp": {"$lt": time.time()}})
list(cursor)

# elements with timestamps more than an hour ago
cursor = db.['inventory'].find({"timestamp": {"$lt": time.time() - 3600}})
print(list(cursor))

# cursor = db.inventory.find({"datetime": {"$lt": datetime(2021, 1, 1)}})

[{'_id': ObjectId('66379e6e08ad9df06d752a84'), 'item': 'journal', 'qty': 25, 'tags': ['blank', 'red'], 'dim_cm': [14, 21], 'timestamp': 1714921070.751505}, {'_id': ObjectId('66379e6e08ad9df06d752a85'), 'item': 'notebook', 'qty': 50, 'tags': ['red', 'blank'], 'dim_cm': [14, 21], 'timestamp': 1714921070.75152}, {'_id': ObjectId('66379e6e08ad9df06d752a87'), 'item': 'planner', 'qty': 75, 'tags': ['blank', 'red'], 'dim_cm': [22.85, 30], 'timestamp': 1714921070.751525}, {'_id': ObjectId('66379e6e08ad9df06d752a88'), 'item': 'postcard', 'qty': 45, 'tags': ['blue'], 'dim_cm': [10, 15.25], 'timestamp': 1714921070.751527}, {'_id': ObjectId('66379e8808ad9df06d752a89'), 'item': 'journal', 'qty': 25, 'tags': ['blank', 'red'], 'dim_cm': [14, 21], 'timestamp': 1714921096.490667}, {'_id': ObjectId('66379e8808ad9df06d752a8a'), 'item': 'notebook', 'qty': 50, 'tags': ['red', 'blank'], 'dim_cm': [14, 21], 'timestamp': 1714921096.490681}, {'_id': ObjectId('66379e8808ad9df06d752a8c'), 'item': 'planner', 'qty

In [61]:
db.url_to_crawl.insert_many([
    {'url':'https://google.com/', 'timestamp':time.time()},
    {'url':'https://facebook.com/', 'timestamp':time.time()}
    ])

InsertManyResult([ObjectId('6637a1c308ad9df06d752aad'), ObjectId('6637a1c308ad9df06d752aae')], acknowledged=True)

In [8]:
# find all urls crawled before timestamp now
cursor = db['url_to_crawl'].find({"timestamp": {"$lt": time.time()}})
list(cursor)

[{'_id': ObjectId('6637a00608ad9df06d752aac'),
  'url': 'https://google.com/',
  'timestamp': 1714921478.812755},
 {'_id': ObjectId('6637a1c308ad9df06d752aad'),
  'url': 'https://google.com/',
  'timestamp': 1714921923.807448},
 {'_id': ObjectId('6637a1c308ad9df06d752aae'),
  'url': 'https://facebook.com/',
  'timestamp': 1714921923.8074486}]

In [3]:
from mongo_db_handler import MongoDBHandler
import time

db_handler = MongoDBHandler(collection_name="scrapy-engine", db_name="scrapy-engine")

# insert item
db_handler.insert_one({'url':'https://instagram.com/', 'timestamp':time.time()}, collection_name='url_to_crawl')

# insert many items
db_handler.insert_many([
    {'url':'https://youtube.com/', 'timestamp':time.time()},
    {'url':'https://twitter.com/', 'timestamp':time.time()}
    ], collection_name='url_to_crawl')

# Get item before ti
# timestamp
db_handler.get_items_before_timestamp(timestamp=time.time(), collection_name='url_to_crawl')

# Check if url exists
db_handler.exists(field='url', vlaue='https://youtube.com/', collection_name='url_to_crawl')

# Delete all items in collection url_to_crawl
db_handler.delete_all(collection_name='url_to_crawl')

# Delete an item in collection url_to_crawl
db_handler.insert_one({'url':'https://instagram.com/', 'timestamp':time.time()}, collection_name='url_to_crawl')
db_handler.delete_one(field='url', value='https://instagram.com/', collection_name='url_to_crawl')

# Get all items in collection url_to_crawl
db_handler.get_all_entries(collection_name='url_to_crawl')

[{'_id': ObjectId('6637af473ebbc93e4f23e593'),
  'url': 'https://instagram.com/',
  'timestamp': 1714925383.6190534}]

In [14]:
list(db_handler.db['url_to_crawl'].find({"timestamp": {"$lt": time.time()}}))

[]

In [48]:
# get current datetime in seconds
type(datetime.now().timestamp())

float

In [34]:
# check if 'https://facebook.com' is in the list of urls_crawled
collection.find_one({'urls_crawled': 'https://facebook.com'})

In [28]:
dir(collection.find_one)

['__call__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__func__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__self__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__']