## Mongo Speed Check
* locally hostd mongo community edition
* sqlite was very slow (~700 entries/sec.)
* But mongo seems impressive

In [None]:
# Checking write speed of mongo
import time
from sqlite_handler import URLDatabase
sqlite_db = URLDatabase('urls.db')

# number_of_entries = 1,679,043
to_crawl_data = sqlite_db.fetch('to_crawl', 100000)
to_crawl_data = [{'url': data[0], 'timestamp':data[1]} for data in to_crawl_data]
# upload to mongo db
from pymongo import MongoClient
client = MongoClient('localhost', 27017)
db = client['url_database']
collection = db['to_crawl']

# Create index by url
# collection.create_index('url', unique=True)

# set unique index to false
collection.create_index('url', unique=False)

# lets see how it performs with 2 million entries
time_array = {}
entries = 0
write_rates = {}
prevoius_time = time.time()
start_time = time.time()
for i in range(20):
    sth = collection.insert_many([{'url':data['url'], 'timestamp':data['timestamp']} for data in to_crawl_data])
    entries += 100000
    time_array[entries] = time.time() - prevoius_time
    write_rates[entries] = (100000 / time_array[entries])
    prevoius_time = time.time()
    print(f"n. Entries: {entries}, time: {time_array[entries]}, rate:{write_rates[entries]}")


print("--- %s seconds ---" % (time.time() - start_time))
print(f'write rates: {write_rates} entries per second')
print(f'average write rate: {entries / time_array[entries]} entries per second')


'''
n. Entries: 100000, time: 3.170332908630371, rate:31542.428786509183
n. Entries: 200000, time: 4.867839813232422, rate:20542.99316262759
n. Entries: 300000, time: 7.408970832824707, rate:13497.151258439291
n. Entries: 400000, time: 8.563728332519531, rate:11677.156971486862
n. Entries: 500000, time: 8.947088718414307, rate:11176.819985498367
n. Entries: 600000, time: 9.271768569946289, rate:10785.42882575199
n. Entries: 700000, time: 10.188531875610352, rate:9814.956778943131
n. Entries: 800000, time: 11.131722927093506, rate:8983.335343050081
n. Entries: 900000, time: 12.331193685531616, rate:8109.514986966069
n. Entries: 1000000, time: 12.482326745986938, rate:8011.326897218898
n. Entries: 1100000, time: 12.25938105583191, rate:8157.018657351303
n. Entries: 1200000, time: 13.163069486618042, rate:7597.012239558782
n. Entries: 1300000, time: 13.669374227523804, rate:7315.623841700537
n. Entries: 1400000, time: 15.704973459243774, rate:6367.409678183258
n. Entries: 1500000, time: 14.366358995437622, rate:6960.705912455436
n. Entries: 1600000, time: 16.173240184783936, rate:6183.052923067434
n. Entries: 1700000, time: 15.36324954032898, rate:6509.039623257897
n. Entries: 1800000, time: 16.55576729774475, rate:6040.191203558541
n. Entries: 1900000, time: 15.8906729221344, rate:6292.9997042924615
n. Entries: 2000000, time: 19.180275201797485, rate:5213.6895298889385
'''

In [None]:
# Can we use same momngo collection writer for multiple threads- we cant for sqlite?
# it seems we can.

from mongo import Mongo
import random
import time
import threading
mongo = Mongo(local=True)

def get_random_data(n):
    return [{'url':f'https://www.doomain.com/{random.randint(1, 10000000)}/', 'status':'test', 'timestamp':time.time()} for i in range(n)]

batch_size = 20000
data = get_random_data(batch_size)

start_time= time.time()
_ = mongo.collection.insert_many(data, ordered=False)
stop_time = time.time()
print(f'time: {stop_time - start_time} rate: {batch_size / (stop_time - start_time)} entries per second')

# Delete the inserted data
# mongo.collection.delete_many({'status':'test', 'url':{'$in':[d['url'] for d in data]}})
data_urls = [d['url'] for d in data]
mongo.collection.delete_many({'status':'test', 'url':{'$in':data_urls}})

def writer_1():
    while True:
        test_data = get_random_data(100)
        # Save to mongo
        mongo.collection.insert_many(test_data)
        print(f"writer_1: inserted 100 entries")
        time.sleep(5)

def writer_2():
    test_data = get_random_data(100)
    # Save to mongo
    mongo.collection.insert_many(test_data)
    print(f"writer_2: inserted 100 entries")
    time.sleep(5)

thread1 = threading.Thread(target=writer_1)
thread2 = threading.Thread(target=writer_2)

thread1.start()
thread2.start()
thread1.join()

In [None]:
# Remove all urls starting with 'https://www.doomain.com/'
mongo.collection.delete_many({'url': {'$regex': '^https://www.doomain.com/'}})

In [4]:
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
from dotenv import load_dotenv
import os
load_dotenv()

uri = f"mongodb+srv://{os.environ.get('mongo_username')}:{os.environ.get('mongo_password')}@scrapy-engine.cnaygdb.mongodb.net/?retryWrites=true&w=majority&appName=scrapy-engine"

# Create a new client and connect to the server
client = MongoClient(uri, server_api=ServerApi('1'))

# Send a ping to confirm a successful connection
try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    print(e)

Pinged your deployment. You successfully connected to MongoDB!


In [6]:
# Creating mangodb
from pymongo import MongoClient

db =  client['scrapy-engine']
collection = db['scrapy-engine']

# Sample data with some duplicate elements
data = {"name": "Item A", "unique_values": ["value1", "value2", "value1", "value3"]}

collection.insert_one({'urls_crawled':['https://google.com', 'https://facebook.com']})



InsertOneResult(ObjectId('6637a3666add56aa87b92946'), acknowledged=True)

In [56]:
# Example from https://www.mongodb.com/docs/manual/tutorial/query-arrays/
import time

db.inventory.insert_many(
    [
        {"item": "journal", "qty": 25, "tags": ["blank", "red"], "dim_cm": [14, 21], "timestamp": time.time()},
        {"item": "notebook", "qty": 50, "tags": ["red", "blank"], "dim_cm": [14, 21], "timestamp": time.time()},
        {
            "item": "paper",
            "qty": 100,
            "tags": ["red", "blank", "plain"],
            "dim_cm": [14, 21],
            "datetime": time.time()
        },
        {"item": "planner", "qty": 75, "tags": ["blank", "red"], "dim_cm": [22.85, 30], "timestamp": time.time()},
        {"item": "postcard", "qty": 45, "tags": ["blue"], "dim_cm": [10, 15.25], "timestamp": time.time()},
    ]
)

#  array with exactly two elements, "red" and "blank", in the specified order
cursor = db.inventory.find({"tags": ["red", "blank"]})

# contains both the elements "red" and "blank", without regard to orde
cursor = db.inventory.find({"tags": {"$all": ["red", "blank"]}})

# elements with timestamps less than the current time
cursor = db.inventory.find({"timestamp": {"$lt": time.time()}})
list(cursor)

# elements with timestamps more than an hour ago
cursor = db.['inventory'].find({"timestamp": {"$lt": time.time() - 3600}})
print(list(cursor))

# cursor = db.inventory.find({"datetime": {"$lt": datetime(2021, 1, 1)}})

[{'_id': ObjectId('66379e6e08ad9df06d752a84'), 'item': 'journal', 'qty': 25, 'tags': ['blank', 'red'], 'dim_cm': [14, 21], 'timestamp': 1714921070.751505}, {'_id': ObjectId('66379e6e08ad9df06d752a85'), 'item': 'notebook', 'qty': 50, 'tags': ['red', 'blank'], 'dim_cm': [14, 21], 'timestamp': 1714921070.75152}, {'_id': ObjectId('66379e6e08ad9df06d752a87'), 'item': 'planner', 'qty': 75, 'tags': ['blank', 'red'], 'dim_cm': [22.85, 30], 'timestamp': 1714921070.751525}, {'_id': ObjectId('66379e6e08ad9df06d752a88'), 'item': 'postcard', 'qty': 45, 'tags': ['blue'], 'dim_cm': [10, 15.25], 'timestamp': 1714921070.751527}, {'_id': ObjectId('66379e8808ad9df06d752a89'), 'item': 'journal', 'qty': 25, 'tags': ['blank', 'red'], 'dim_cm': [14, 21], 'timestamp': 1714921096.490667}, {'_id': ObjectId('66379e8808ad9df06d752a8a'), 'item': 'notebook', 'qty': 50, 'tags': ['red', 'blank'], 'dim_cm': [14, 21], 'timestamp': 1714921096.490681}, {'_id': ObjectId('66379e8808ad9df06d752a8c'), 'item': 'planner', 'qty

In [61]:
db.url_to_crawl.insert_many([
    {'url':'https://google.com/', 'timestamp':time.time()},
    {'url':'https://facebook.com/', 'timestamp':time.time()}
    ])

InsertManyResult([ObjectId('6637a1c308ad9df06d752aad'), ObjectId('6637a1c308ad9df06d752aae')], acknowledged=True)

In [8]:
# find all urls crawled before timestamp now
cursor = db['url_to_crawl'].find({"timestamp": {"$lt": time.time()}})
list(cursor)

[{'_id': ObjectId('6637a00608ad9df06d752aac'),
  'url': 'https://google.com/',
  'timestamp': 1714921478.812755},
 {'_id': ObjectId('6637a1c308ad9df06d752aad'),
  'url': 'https://google.com/',
  'timestamp': 1714921923.807448},
 {'_id': ObjectId('6637a1c308ad9df06d752aae'),
  'url': 'https://facebook.com/',
  'timestamp': 1714921923.8074486}]

In [5]:
from mongo_db_handler import MongoDBHandler
import time

db_handler = MongoDBHandler(collection_name="scrapy-engine", db_name="scrapy-engine")

# insert item
db_handler.insert_one({'url':'https://instagram.com/', 'timestamp':time.time()}, collection_name='url_to_crawl')

# insert many items
db_handler.insert_many([
    {'url':'https://youtube.com/', 'timestamp':time.time()},
    {'url':'https://twitter.com/', 'timestamp':time.time()}
    ], collection_name='url_to_crawl')

# Get item before ti
# timestamp
db_handler.get_items_before_timestamp(timestamp=time.time(), collection_name='url_to_crawl')

# Check if url exists
db_handler.exists(field='url', vlaue='https://youtube.com/', collection_name='url_to_crawl')

# Delete all items in collection url_to_crawl
db_handler.delete_all(collection_name='url_to_crawl')

# Delete an item in collection url_to_crawl
db_handler.insert_one({'url':'https://instagram.com/', 'timestamp':time.time()}, collection_name='url_to_crawl')
# db_handler.delete_one(field='url', value='https://instagram.com/', collection_name='url_to_crawl')

# Get all items in collection url_to_crawl
db_handler.get_all_entries(collection_name='url_to_crawl')

[{'_id': ObjectId('66385c238a00704fbbd2673f'),
  'url': 'https://instagram.com/',
  'timestamp': 1714969635.2607012}]

In [8]:
# the entry does not exists: it should not throw error
db_handler.delete_one(field='url', value='https://hululu.com/', collection_name='url_to_crawl')

In [14]:
list(db_handler.db['url_to_crawl'].find({"timestamp": {"$lt": time.time()}}))

[]

In [48]:
# get current datetime in seconds
type(datetime.now().timestamp())

float

In [34]:
# check if 'https://facebook.com' is in the list of urls_crawled
collection.find_one({'urls_crawled': 'https://facebook.com'})

### Unique index

In [6]:
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
from dotenv import load_dotenv
import os
load_dotenv()

uri = f"mongodb+srv://{os.environ.get('mongo_username')}:{os.environ.get('mongo_password')}@scrapy-engine.cnaygdb.mongodb.net/?retryWrites=true&w=majority&appName=scrapy-engine"

# Create a new client and connect to the server
client = MongoClient(uri, server_api=ServerApi('1'))

db =  client['scrapy-engine']
collection = db['test']

collection.create_index( { "a.loc": 1, "a.qty": 1 }, { 'unique': True } )


AttributeError: 'dict' object has no attribute 'in_transaction'

In [12]:
from mongo_db_handler import MongoDBHandler
import time

db_handler = MongoDBHandler(collection_name="scrapy-engine", db_name="scrapy-engine")

In [15]:
# db_handler.db['test'].insertMany( [
#    { '_id': 2, 'a': [ { 'loc': "A" }, { 'qty': 5 } ] },
#    { '_id': 3, 'a': [ { 'loc': "A", 'qty': 10 } ] }
# ] )
# db.test.create_index({ "a.loc": 1, "a.qty": 1 }, { 'unique': True })

TypeError: 'Collection' object is not callable. If you meant to call the 'insertMany' method on a 'Collection' object it is failing because no such method exists.

In [35]:
collection_names = ['url_crawled', 'url_to_crawl', 'url_crawling']
for collection_name in collection_names:
    db_handler.delete_all(collection_name=collection_name)
    db_handler.db[collection_name].create_index('url', unique=True)

In [30]:
try:
    db_handler.insert_one({'url':'https://instagram.com/', 'timestamp':time.time()}, collection_name='test3')
except Exception as e:
    print(e)


db_handler.insert_many({'url':'https://instagram.com/', 'timestamp':time.time()}, {'url':'https://instagram2.com/', 'timestamp':time.time()}, collection_name='test3')

E11000 duplicate key error collection: scrapy-engine.test3 index: url_1 dup key: { url: "https://instagram.com/" }, full error: {'index': 0, 'code': 11000, 'errmsg': 'E11000 duplicate key error collection: scrapy-engine.test3 index: url_1 dup key: { url: "https://instagram.com/" }', 'keyPattern': {'url': 1}, 'keyValue': {'url': 'https://instagram.com/'}}


TypeError: MongoDBHandler.insert_many() got multiple values for argument 'collection_name'

In [31]:
db_handler.get_all_entries(collection_name='test3')

[{'_id': ObjectId('6638804952a503b1b1e0cff1'),
  'url': 'https://instagram.com/',
  'timestamp': 1714978889.2119324}]

In [22]:
db_handler.insert_many( data=[
   { '_id': 2, 'a': [ { 'url': "A" }, { 'qty': 5 } ] },
   { '_id': 3, 'a': [ { 'loc': "A", 'qty': 10 } ] }
], collection_name='test2' )

db_handler.get_all_entries(collection_name='test2')

[{'_id': 2, 'a': [{'loc': 'A'}, {'qty': 5}]},
 {'_id': 3, 'a': [{'loc': 'A', 'qty': 10}]}]

### Single Collection

In [45]:
db_handler.delete_all(collection_name='urls-collection')
db_handler.db['urls-collection'].create_index('url', unique=True)  # Create index
db_handler.get_all_entries(collection_name='urls-collection')

[]

In [None]:
db_handler.insert_many(
    [
        {'url':'https://instagram.com/', 'timestamp':time.time(), 'status':'crawling'},
        {'url':'https://facebook.com/', 'timestamp':time.time(), 'status':'crawled'},
        {'url':'https://twitter.com/', 'timestamp':time.time(), 'status':'to_crawl'}
    ],
    collection_name='urls-collection'
)

In [84]:
db_handler.db['urls-collection'].insert_one({'url':'url-test', 'timestamp':time.time(), 'status':'crawled'})
# db_handler.insert_one({'url':'url', 'timestamp':time.time(), 'status':'crawled'}, collection_name='urls-collection')

InsertOneResult(ObjectId('66388b8d52a503b1b1e0cffe'), acknowledged=True)

In [52]:
# def fetch_start_urls(self, number_of_urls_required=15)
n=15
# Get all entries with  status 'to_crawl'
urls = list(db_handler.db['urls-collection'].find({'status':'to_crawl'}).limit(n))


In [56]:
for url in urls:
    ## update status to crawling
    db_handler.db['urls-collection'].update_one({'_id':url['_id']}, {'$set': {'status':'crawling'}})
list(db_handler.db['urls-collection'].find({'status':'crawling'}).limit(n))

[{'_id': ObjectId('6638860752a503b1b1e0cff6'),
  'url': 'https://instagram.com/',
  'timestamp': 1714980359.6815581,
  'status': 'crawling'},
 {'_id': ObjectId('6638860752a503b1b1e0cff8'),
  'url': 'https://twitter.com/',
  'timestamp': 1714980359.6815598,
  'status': 'crawling'}]

In [60]:
# get items with status 'crawling' and before timestamp now
timestamp = time.time() - 3600  # 1 hour ago
list(db_handler.db['urls-collection'].find({'status':'crawling', 'timestamp': {'$lt': timestamp}}))

[{'_id': ObjectId('6638860752a503b1b1e0cff6'),
  'url': 'https://instagram.com/',
  'timestamp': 1714980359.6815581,
  'status': 'crawling'},
 {'_id': ObjectId('6638860752a503b1b1e0cff8'),
  'url': 'https://twitter.com/',
  'timestamp': 1714980359.6815598,
  'status': 'crawling'}]

In [88]:
# update to crawling only if status is 'to_crawl' otherwise return False
list(db_handler.db['urls-collection'].find({'url':'https://twitter.com/'}))[0]

{'_id': ObjectId('6638860752a503b1b1e0cff8'),
 'url': 'https://twitter.com/',
 'timestamp': 1714980359.6815598,
 'status': 'crawled'}

In [66]:
url = 'https://instagram.com/'
success_reponse=db_handler.db['urls-collection'].update_one({'url':url}, {'$set': {'status':'crawled'}})

# failure_response=db_handler.db['urls-collection'].update_one({'url':url+'noise'}, {'$set': {'status':'crawled'}})

# print(f'success_reponse: {success_reponse}')
# print(f'failure_response: {failure_response}')

success_reponse: UpdateResult({'n': 1, 'electionId': ObjectId('7fffffff0000000000000400'), 'opTime': {'ts': Timestamp(1714981246, 30), 't': 1024}, 'nModified': 1, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1714981246, 31), 'signature': {'hash': b'\x826\xd1\xc9\xae\x10+\xeb%#\xe3\xeb\x85\xd3o^\xb2<\xfa\xb5', 'keyId': 7318892626235621378}}, 'operationTime': Timestamp(1714981246, 30), 'updatedExisting': True}, acknowledged=True)
failure_response: UpdateResult({'n': 0, 'electionId': ObjectId('7fffffff0000000000000400'), 'opTime': {'ts': Timestamp(1714981246, 37), 't': 1024}, 'nModified': 0, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1714981246, 37), 'signature': {'hash': b'\x826\xd1\xc9\xae\x10+\xeb%#\xe3\xeb\x85\xd3o^\xb2<\xfa\xb5', 'keyId': 7318892626235621378}}, 'operationTime': Timestamp(1714981246, 37), 'updatedExisting': False}, acknowledged=True)


In [78]:
success_reponse.raw_result['n']     # 1
failure_response.raw_result['n']    # 0

0

In [103]:
# get all entries with status 'crawled'
list(db_handler.db['urls-collection'].find())

[{'_id': ObjectId('6638860752a503b1b1e0cff6'),
  'url': 'https://instagram.com/',
  'timestamp': 1714980359.6815581,
  'status': 'crawled'},
 {'_id': ObjectId('6638860752a503b1b1e0cff7'),
  'url': 'https://facebook.com/',
  'timestamp': 1714980359.6815593,
  'status': 'crawled'},
 {'_id': ObjectId('6638860752a503b1b1e0cff8'),
  'url': 'https://twitter.com/',
  'timestamp': 1714980359.6815598,
  'status': 'crawling'},
 {'_id': ObjectId('66388b2e52a503b1b1e0cffd'),
  'url': 'url',
  'timestamp': 1714981678.4797213,
  'status': 'crawled'},
 {'_id': ObjectId('66388b8d52a503b1b1e0cffe'),
  'url': 'url-test',
  'timestamp': 1714981773.4163313,
  'status': 'crawled'}]

In [102]:
# update status to crawling

# Change status to crawled only if status is 'crawling' otherwise return False
# upsert=True: if the item does not exist, insert it
url = 'https://twitter.com/'

result = db_handler.db['urls-collection'].update_one(
    {'url': url, 'status': {'$in': ['crawled']}},
    {'$set': {'status':'crawling'}}, 
)

if result.upserted_id is not None:
    print("A new document was inserted with the id", result.upserted_id)
elif result.modified_count > 0:
    print("An existing document was updated")
else:
    print("No changes were made")

An existing document was updated


In [99]:
result

UpdateResult({'n': 1, 'electionId': ObjectId('7fffffff0000000000000400'), 'opTime': {'ts': Timestamp(1714982990, 13), 't': 1024}, 'nModified': 1, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1714982990, 13), 'signature': {'hash': b'y\xce)LO\x8f~F\x0eI3(\xa6)\x93d\xcbF\x01=', 'keyId': 7318892626235621378}}, 'operationTime': Timestamp(1714982990, 13), 'updatedExisting': True}, acknowledged=True)

In [1]:
from mongo import Mongo
db=Mongo()
a=db.append_url_crawling('https://pornhub.com')
db.fetch_all()

[{'_id': ObjectId('6638860752a503b1b1e0cff6'),
  'url': 'https://instagram.com/',
  'timestamp': 1714980359.6815581,
  'status': 'crawled'},
 {'_id': ObjectId('6638860752a503b1b1e0cff7'),
  'url': 'https://facebook.com/',
  'timestamp': 1714980359.6815593,
  'status': 'crawled'},
 {'_id': ObjectId('6638860752a503b1b1e0cff8'),
  'url': 'https://twitter.com/',
  'timestamp': 1714980359.6815598,
  'status': 'crawling'},
 {'_id': ObjectId('66388b2e52a503b1b1e0cffd'),
  'url': 'url',
  'timestamp': 1714981678.4797213,
  'status': 'crawled'},
 {'_id': ObjectId('66388b8d52a503b1b1e0cffe'),
  'url': 'url-test',
  'timestamp': 1714981773.4163313,
  'status': 'crawled'},
 {'_id': ObjectId('663896692731c9124c75305e'),
  'url': 'https://pornhub.com',
  'timestamp': 1714984553.1539218,
  'status': 'crawled'},
 {'_id': ObjectId('663898a4aba62d06e2a16eff'),
  'url': 'https://onlinemajdoor.com/',
  'timestamp': 1714985124.0123963,
  'status': 'crawling'},
 {'_id': ObjectId('663898a4aba62d06e2a16f00'),

In [4]:
db.fetch_start_urls()

[{'_id': ObjectId('6638860752a503b1b1e0cff8'),
  'url': 'https://twitter.com/',
  'timestamp': 1714980359.6815598,
  'status': 'to_crawl'},
 {'_id': ObjectId('663898a4aba62d06e2a16eff'),
  'url': 'https://onlinemajdoor.com/',
  'timestamp': 1714985124.0123963,
  'status': 'to_crawl'},
 {'_id': ObjectId('663898a4aba62d06e2a16f00'),
  'url': 'http://nepalipost.com/beta/',
  'timestamp': 1714985124.012397,
  'status': 'to_crawl'},
 {'_id': ObjectId('663898a4aba62d06e2a16f01'),
  'url': 'https://nepalkhabar.com/',
  'timestamp': 1714985124.0123975,
  'status': 'to_crawl'},
 {'_id': ObjectId('663898a4aba62d06e2a16f02'),
  'url': 'https://www.nepalipaisa.com/',
  'timestamp': 1714985124.0123978,
  'status': 'to_crawl'},
 {'_id': ObjectId('663898a4aba62d06e2a16f03'),
  'url': 'https://topnepalnews.com/',
  'timestamp': 1714985124.012398,
  'status': 'to_crawl'},
 {'_id': ObjectId('663898a4aba62d06e2a16f04'),
  'url': 'https://www.dainiknepal.com/',
  'timestamp': 1714985124.0123985,
  'status

In [6]:
db.recover_expired_crawling(0)
db.fetch_all()

[{'_id': ObjectId('6638860752a503b1b1e0cff6'),
  'url': 'https://instagram.com/',
  'timestamp': 1714980359.6815581,
  'status': 'crawled'},
 {'_id': ObjectId('6638860752a503b1b1e0cff7'),
  'url': 'https://facebook.com/',
  'timestamp': 1714980359.6815593,
  'status': 'crawled'},
 {'_id': ObjectId('6638860752a503b1b1e0cff8'),
  'url': 'https://twitter.com/',
  'timestamp': 1714980359.6815598,
  'status': 'to_crawl'},
 {'_id': ObjectId('66388b2e52a503b1b1e0cffd'),
  'url': 'url',
  'timestamp': 1714981678.4797213,
  'status': 'crawled'},
 {'_id': ObjectId('66388b8d52a503b1b1e0cffe'),
  'url': 'url-test',
  'timestamp': 1714981773.4163313,
  'status': 'crawled'},
 {'_id': ObjectId('663896692731c9124c75305e'),
  'url': 'https://pornhub.com',
  'timestamp': 1714984553.1539218,
  'status': 'crawled'},
 {'_id': ObjectId('663898a4aba62d06e2a16eff'),
  'url': 'https://onlinemajdoor.com/',
  'timestamp': 1714985124.0123963,
  'status': 'to_crawl'},
 {'_id': ObjectId('663898a4aba62d06e2a16f00'),