Collections stored in MongoDB (one per crawl state):
* url_to_crawl
* url_crawled
* url_crawling

In [None]:
from mongo_db_handler import MongoDBHandler
import time

class Mongo():
    """Track the crawl state of URLs across three MongoDB collections.

    A URL lives in at most one of these collections at a time:

    * ``url_to_crawl`` -- discovered but not yet handed to a crawler
    * ``url_crawling`` -- handed to a crawler, result still pending
    * ``url_crawled``  -- crawling finished

    All database access goes through ``self.db_handler`` (a
    ``MongoDBHandler``); documents are stored as
    ``{'url': <url>, 'timestamp': <time.time()>}``.
    """

    # Age in seconds after which an entry in url_crawling counts as expired (2 hours).
    CRAWLING_EXPIRY_SECONDS = 7200

    def __init__(self):
        self.db_handler = MongoDBHandler(collection_name="scrapy-engine", db_name="scrapy-engine")

    def _in_collection(self, url, collection_name):
        """Return whether a document with this ``url`` exists in ``collection_name``."""
        return self.db_handler.exists(field='url', value=url, collection_name=collection_name)

    def append_url_crawled(self, url):
        """Record ``url`` as crawled and remove it from the other two collections.

        The insert is skipped when the URL is already in ``url_crawled``;
        the removals from ``url_crawling`` and ``url_to_crawl`` always run.
        """
        if not self._in_collection(url, 'url_crawled'):
            self.db_handler.insert_one({'url': url, 'timestamp': time.time()}, collection_name='url_crawled')

        # The URL is finished, so it must not linger in the pending collections.
        self.db_handler.delete_one(field='url', value=url, collection_name='url_crawling')
        self.db_handler.delete_one(field='url', value=url, collection_name='url_to_crawl')

    def append_url_crawling(self, url):
        """Move ``url`` into ``url_crawling`` unless it is already crawling or crawled.

        Returns:
            True if the URL was inserted into ``url_crawling`` (and removed
            from ``url_to_crawl``), False if nothing was changed.
        """
        if not self.not_crawling_or_not_crawled(url):
            return False

        self.db_handler.insert_one({'url': url, 'timestamp': time.time()}, collection_name='url_crawling')
        self.db_handler.delete_one(field='url', value=url, collection_name='url_to_crawl')
        return True

    def append_url_to_crawl(self, url):
        """Queue ``url`` in ``url_to_crawl`` if it is not known in any collection.

        Returns:
            True if the URL was inserted, False if it was already present in
            ``url_crawled``, ``url_crawling`` or ``url_to_crawl``.
        """
        if not self.not_to_crawl_or_not_crawling_or_not_crawled(url):
            return False

        self.db_handler.insert_one({'url': url, 'timestamp': time.time()}, collection_name='url_to_crawl')
        return True

    def not_crawling_or_not_crawled(self, url):
        """Return True only if ``url`` is in neither ``url_crawled`` nor ``url_crawling``."""
        # `or` short-circuits, so url_crawling is only queried when needed.
        return not (
            self._in_collection(url, 'url_crawled')
            or self._in_collection(url, 'url_crawling')
        )

    def not_to_crawl_or_not_crawling_or_not_crawled(self, url):
        """Return True only if ``url`` is absent from all three collections."""
        return not (
            self._in_collection(url, 'url_crawled')
            or self._in_collection(url, 'url_crawling')
            or self._in_collection(url, 'url_to_crawl')
        )

    def get_expired_url_crawling(self):
        """Return the ``url_crawling`` items older than ``CRAWLING_EXPIRY_SECONDS``.

        The result is whatever ``get_items_before_timestamp`` returns for the
        cutoff ``time.time() - CRAWLING_EXPIRY_SECONDS``.
        """
        cutoff = time.time() - self.CRAWLING_EXPIRY_SECONDS
        return self.db_handler.get_items_before_timestamp(timestamp=cutoff, collection_name='url_crawling')

    def fetch_urls_to_crawl(self, number_of_urls_required=10):
        """Take up to ``number_of_urls_required`` items from ``url_to_crawl``.

        Each fetched item is passed to ``append_url_crawling``, which moves it
        to ``url_crawling`` when it is not already crawling or crawled.

        Returns:
            The items exactly as returned by ``get_n_items``.
        """
        urls = self.db_handler.get_n_items(collection_name='url_to_crawl', n=number_of_urls_required)

        # NOTE(review): assumes get_n_items yields plain URL values; if it
        # yields whole documents, the 'url' field must be extracted here -- confirm.
        for url in urls:
            self.append_url_crawling(url)

        return urls


# One time operation

In [4]:
# Seed the 'urls-collection' collection with the initial start URLs.
# Every seed document is stored with status 'to_crawl' and the current time.
import time
from mongo_db_handler import MongoDBHandler
db_handler = MongoDBHandler(collection_name="scrapy-engine", db_name="scrapy-engine")

start_urls = [
    "https://onlinemajdoor.com/",
    "http://nepalipost.com/beta/",
    "https://nepalkhabar.com/",
    "https://www.nepalipaisa.com/",
    "https://topnepalnews.com/",
    "https://www.dainiknepal.com/",
    "https://www.bbc.com/nepali",
]

# Build one document per start URL, then insert them in a single call.
seed_documents = []
for start_url in start_urls:
    seed_documents.append({'url': start_url, 'timestamp': time.time(), 'status': 'to_crawl'})

db_handler.insert_many(seed_documents, collection_name='urls-collection')

In [None]:
# Reset the three URL collections and create a unique index on 'url' in each.
mongo = Mongo()
handler = mongo.db_handler

collection_names = ['url_crawled', 'url_to_crawl', 'url_crawling']
for name in collection_names:
    # WARNING: destructive -- every document in the collection is deleted.
    # Presumably done first so the unique index cannot fail on existing
    # duplicate URLs -- confirm.
    handler.delete_all(collection_name=name)
    handler.db[name].create_index('url', unique=True)

In [17]:
# Recover expired "crawling" entries by calling recover_expired_crawling(0).
# NOTE(review): the original comment said "Convert crawled to to_crawl", but
# the method name refers to *crawling* entries; presumably they are moved back
# to to_crawl, and the argument 0 is presumably an age threshold -- confirm
# against the `mongo` module, which is not shown in this notebook.

from mongo import Mongo
db=Mongo()
db.recover_expired_crawling(0)

In [3]:
# Print every document whose status is 'to_crawl'.
print(list(db.collection.find({'status':'to_crawl'})))


# WARNING: destructive -- the empty filter {} matches every document, so this
# deletes the whole collection's contents.
# Original note: deleting all documents does not require re-indexing.
db.collection.delete_many({})

DeleteResult({'n': 419, 'electionId': ObjectId('7fffffff0000000000000400'), 'opTime': {'ts': Timestamp(1714993939, 435), 't': 1024}, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1714993939, 443), 'signature': {'hash': b'\xcf\xc11\r^(r\xd3a\xb0\xba\xf2\xe9e:\xcb)\xb3\xae\t', 'keyId': 7318892626235621378}}, 'operationTime': Timestamp(1714993939, 435)}, acknowledged=True)

In [61]:
# Show every document in the collection (find() with no filter).
list(db.collection.find())

[{'_id': ObjectId('6638b7f5c7df94274912fb62'),
  'url': 'https://onlinemajdoor.com/',
  'timestamp': 1714993141.1548414,
  'status': 'to_crawl'},
 {'_id': ObjectId('6638b7f5c7df94274912fb63'),
  'url': 'http://nepalipost.com/beta/',
  'timestamp': 1714993141.1548426,
  'status': 'to_crawl'},
 {'_id': ObjectId('6638b7f5c7df94274912fb64'),
  'url': 'https://nepalkhabar.com/',
  'timestamp': 1714993141.154843,
  'status': 'to_crawl'},
 {'_id': ObjectId('6638b7f5c7df94274912fb65'),
  'url': 'https://www.nepalipaisa.com/',
  'timestamp': 1714993141.1548433,
  'status': 'to_crawl'},
 {'_id': ObjectId('6638b7f5c7df94274912fb66'),
  'url': 'https://topnepalnews.com/',
  'timestamp': 1714993141.1548436,
  'status': 'to_crawl'},
 {'_id': ObjectId('6638b7f5c7df94274912fb67'),
  'url': 'https://www.dainiknepal.com/',
  'timestamp': 1714993141.1548448,
  'status': 'to_crawl'},
 {'_id': ObjectId('6638b7f5c7df94274912fb68'),
  'url': 'https://www.bbc.com/nepali',
  'timestamp': 1714993141.154845,
  '

### Error Data

In [1]:
# Record a sample error entry (HTTP 403 / HttpError) through append_error_data.
# `Mongo` here comes from the `mongo` module, which is not shown in this notebook.
from mongo import Mongo
db = Mongo()
error_data = {'url': 'https://nepalkhabar.com/', 'timestamp': 1714993152.5575845, 'status': 'error', 'status_code': 403, 'error_type': 'HttpError'}
db.append_error_data(error_data)

True

In [7]:
# Show every document whose status is 'error'.
list(db.collection.find({'status':'error'}))

[{'_id': ObjectId('6638bb23a304c2ebb7f6fe81'),
  'url': 'https://nepalkhabar.com/',
  'timestamp': 1714993954.888785,
  'status': 'error',
  'status_code': 403,
  'error_type': 'HttpError'}]