### save_other_data
*  boolean variable stored in server.py which is updated in online_mongo
* spider checks this variable in online_mongo to decide whether or not to crawl other_data

In [4]:
# Save to mongo
from mongo import Mongo
mongo = Mongo()

# # unique Index for config name
# mongo.db['config'].create_index('name', unique=True)

# # Remove index
# mongo.db['config'].drop_index('name_1')

configs = [
        {'crawl_other_data': False},
        {'crawl_paragraph_data':True},
        # {'some_config':1000}
    ]
# Save configs
mongo.set_configs(configs)

print('Configs saved')

# Get configs
configs = mongo.get_configs()
crawled_data = got_configs['crawl_other_data'] in got_configs[0]
 and got_configs[0]['crawl_other_data'] == False

{'crawl_other_data': False, 'crawl_paragraph_data': True}


In [10]:
# Get configs
configs = mongo.get_configs()
# default: Do not Crawl other data
crawl_other_data = configs['crawl_other_data'] if 'crawl_other_data' in configs else False
print(f'crawl_other_data: {crawl_other_data}')
# default: Crawl paragraph data
crawl_paragraph_data = configs['crawl_paragraph_data'] if 'crawl_paragraph_data' in configs else True
print(f'crawl_paragraph_data: {crawl_paragraph_data}')

crawl_other_data: False
crawl_paragraph_data: True


In [8]:
'crawl_other_data' in configs

True

In [41]:
crawl_other_data = mongo.db['config'].find_one({'name': 'save_other_data'})
if crawl_other_data:
            print(crawl_other_data['value'])

False


{'save_other_data': False, 'some_config': 1000, 'crawl_paragraph_data': True, 'crawl_other_data': False}


In [45]:
configs = list(mongo.db['config'].find({}))
[config for config in configs if config['name'] == 'save_other_data']

[{'_id': ObjectId('665c2b374e0b765a6d0780aa'),
  'name': 'save_other_data',
  'value': False}]

In [1]:
# list(mongo.db['config'].find({}))
mongo.db['config'].delete_many({})

NameError: name 'mongo' is not defined

## mongoDb  is full:
```
 * more than 900k urls to_crawl
 * >160K crawled_urls
 * (500MB) Mongo free
```
#### Solution:
* only keep 100k-200K to_crawls in mongo at a time
* remove crawled_urls
* Avoid crawling data urls

get all to_crawl (920054)   | Save to csv
get all crawled  (160842)   | Save to csv



* make scrapy_engine_spider to push to "to_crawl?" instead of "to_crawl"
* avoid updating "crawled" from scrapy_spider to mongo

* Get Urls from "to_crawl?"
    [ ] Store Crawled urls in bloom function.
    [ ] bloom filter to check urls in "to_crawl?" does not exist in urls_crawled
    [X] store to csv/sqlite (avoid duplicate): csv files: "to_crawl", "crawled" or sqlite_tables
    [X] Delete to_crawls and crawled from mongo

    if len(to_crawl_in_mongo < 100000):
        [ ] shuffle(to_crawl_urls)
        [X] add 100k to_crawl urls from local to mongo
        [X] delete all attempted mongo inserts.
        [ ] delete only successful mongo insert and store unsuccessful ones somewhere

update to_crawl and "crawled" using crawled_data
[X] for data in crawled_data:
        if data['parent_url'] in to_crawl:
            remove data['parent_url'] from to_crawl
        append_to_crawled(data['parent_url'])




Note:
* "to_crawl?" -> uploaded by scrapy spider to mongo
* "to_crawl"  -> actual to_crawl link uploaded to mongo by server
* sqlite seems to handle concurrency by itself without throwing errors.

## Fetch Error Data from Mongo and save it to csv

In [11]:
import os
import csv
from mongo import Mongo
mongo=Mongo()

error_data = mongo.collection.find({'status': 'error'}) # .limit(10)
formatted_for_csv = [{
        'url': error['url'],
        'timestamp': error['timestamp'],
        'status': error['status'],
        'status_code': error['status_code'] if 'status_code' in error else None,
        'error_type': error['error_type']
    } for error in error_data]
# Append error data to csv
csv_file_path = 'error_data.csv'
file_exists = os.path.exists(csv_file_path)
with open(csv_file_path, 'a') as csvfile:
    fieldnames = ['url', 'timestamp', 'status', 'status_code', 'error_type']
    csv_writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    if not file_exists:
        print(f"creating csv file: {csv_file_path}")
        # Write header only if file is empty
        csv_writer.writeheader()
    else:
        print(f'csv file: \"{csv_file_path}\" exists')
    csv_writer.writerows(formatted_for_csv)

print(f'Saved error_data to {csv_file_path}')

# Delete from mongo
mongo.collection.delete_many({'url': {'$in': [error['url'] for error in formatted_for_csv]}, 'status': 'error'})


In [14]:
formatted_for_csv=[{'url': 'https://www.dainiknepal.com/', 'timestamp': 1714996985.424532, 'status': 'error', 'status_code': 403, 'error_type': 'HttpError'}, {'url': 'https://nepalkhabar.com/', 'timestamp': 1714996985.8666382, 'status': 'error', 'status_code': 403, 'error_type': 'HttpError'}, {'url': 'https://belkotgadhimun.gov.np/faq', 'timestamp': 1715018151.0237563, 'status': 'error', 'status_code': 404, 'error_type': 'HttpError'}, {'url': 'https://epsnepal.gov.np/documents/service-sector-%e0%a4%ae%e0%a4%be-%e0%a4%b0%e0%a5%8b%e0%a4%b7%e0%a5%8d%e0%a4%9f%e0%a4%b0-%e0%a4%aa%e0%a4%b0%e0%a4%bf%e0%a4%b5%e0%a4%b0%e0%a5%8d%e0%a4%a4%e0%a4%a8-%e0%a4%b8%e0%a4%ae%e0%a5%8d%e0%a4%b5/', 'timestamp': 1715036967.8365157, 'status': 'error', 'status_code': 404, 'error_type': 'HttpError'}, {'url': 'https://www.bbc.com/nepali/send/u50853473', 'timestamp': 1715078773.2192092, 'status': 'error', 'status_code': 404, 'error_type': 'HttpError'}, {'url': 'https://www.bbc.com/nepali/undefined', 'timestamp': 1715083194.0109007, 'status': 'error', 'status_code': 404, 'error_type': 'HttpError'}, {'url': 'https://www.bbc.com/nepali/resources/%5Bhttps:/www.bbc.com/nepali%5D', 'timestamp': 1715092717.9155746, 'status': 'error', 'status_code': 404, 'error_type': 'HttpError'}, {'url': 'http://dohs.gov.np/ne/mohpnep', 'timestamp': 1715097008.6633568, 'status': 'error', 'status_code': 404, 'error_type': 'HttpError'}, {'url': 'http://nhtc.gov.np/index.php/trainingevent/training', 'timestamp': 1715098146.3797266, 'status': 'error', 'status_code': 404, 'error_type': 'HttpError'}, {'url': 'https://mofaga.gov.np/prov-1', 'timestamp': 1715100428.9299655, 'status': 'error', 'status_code': 404, 'error_type': 'HttpError'}]

['https://www.dainiknepal.com/',
 'https://nepalkhabar.com/',
 'https://belkotgadhimun.gov.np/faq',
 'https://epsnepal.gov.np/documents/service-sector-%e0%a4%ae%e0%a4%be-%e0%a4%b0%e0%a5%8b%e0%a4%b7%e0%a5%8d%e0%a4%9f%e0%a4%b0-%e0%a4%aa%e0%a4%b0%e0%a4%bf%e0%a4%b5%e0%a4%b0%e0%a5%8d%e0%a4%a4%e0%a4%a8-%e0%a4%b8%e0%a4%ae%e0%a5%8d%e0%a4%b5/',
 'https://www.bbc.com/nepali/send/u50853473',
 'https://www.bbc.com/nepali/undefined',
 'https://www.bbc.com/nepali/resources/%5Bhttps:/www.bbc.com/nepali%5D',
 'http://dohs.gov.np/ne/mohpnep',
 'http://nhtc.gov.np/index.php/trainingevent/training',
 'https://mofaga.gov.np/prov-1']

In [4]:
from mongo import Mongo
mongo = Mongo()

# Get 10 data urls from to_crawl.
urls = mongo.collection.find({"status": 'to_crawl?'}).limit(10)

# get urls starting with 'data:'
data_urls = mongo.collection.find({"url": {"$regex": "^data:"}})
list(data_urls)

[]

In [11]:
mongo.collection.count_documents({"status": 'to_crawl'})

920054

In [3]:
from mongo import Mongo
db = Mongo()
# Count Entries with status="to_crawl"
db.collection.find({"status": "to_crawl"}).count()

AttributeError: 'Cursor' object has no attribute 'count'

## One-Time-Operation: Get all urls from mongo and save to sqlite

In [None]:
from mongo import Mongo
mongo = Mongo()
# get all to_crawl urls 50000 at a time
# Initialize skip to 0
skip = 0
bulk_size = 50000

the_to_crawl_urls = []
while True:
    # Get the next 50000 documents
    to_crawl_urls = mongo.collection.find({"status": 'to_crawl'}).skip(skip).limit(bulk_size)
    # Convert the cursor to a list
    crawled_urls_list = list(to_crawl_urls)
    # If the list is empty, break the loop
    if not crawled_urls_list:
        break
    # Process the documents
    the_to_crawl_urls.extend([(url['url'], url['timestamp']) for url in crawled_urls_list])
    # Increase skip by 50000
    skip += bulk_size
    print(skip)

len(the_to_crawl_urls)

In [8]:
# Save to sqlite
from sqlite_handler import URLDatabase
url_db = URLDatabase(db_path="urls.db")

# Insert the data into the database
url_db.bulk_insert("to_crawl", the_to_crawl_urls)

url_db.count_entries("to_crawl")

# # Get all the urls from the database
# urls = url_db.fetch('to_crawl', 10)
# urls

# delete from mongo
mongo.collection.delete_many({"status": 'to_crawl'})

In [None]:
from mongo import Mongo
mongo = Mongo()
# get all crawled urls 50000 at a time
# Initialize skip to 0
skip = 0
bulk_size = 50000

the_crawled_urls = []
while True:
    # Get the next 50000 documents
    crawled_urls = mongo.collection.find({"status": 'crawled'}).skip(skip).limit(bulk_size)
    # Convert the cursor to a list
    crawled_urls_list = list(crawled_urls)
    # If the list is empty, break the loop
    if not crawled_urls_list:
        break
    # Process the documents
    the_crawled_urls.extend([(url['url'], url['timestamp']) for url in crawled_urls_list])
    # Increase skip by 50000
    skip += bulk_size
    print(skip)

len(the_crawled_urls)

In [None]:
# Save to sqlite
from sqlite_handler import URLDatabase
url_db = URLDatabase(db_path="urls.db")

# Insert the data into the database
url_db.bulk_insert("crawled", the_crawled_urls)


import time;start=time.time();a=url_db.fetch('crawled', 100000);print(start-time.time())

# # Get all the urls from the database
urls = url_db.fetch('crawled', 10)

In [15]:
from mongo import Mongo
mongo = Mongo()


sample_url = {'url': 'https://wdww.iana.org/f_img/2013.1/iana-logo-header.svg1', 'status': 'to_crawl?', 'timestamp': '2021-07-01T00:00:00.000Z'}
mongo.collection.insert_one(sample_url)

# Get all urls from "to_crawl?"
urls = mongo.collection.find({"status": 'to_crawl?'})
to_crawl_urls = [(url['url'], url['timestamp']) for url in list(urls)]
to_crawl_urls

# Remove multiple url from "to_crawl?"
mongo.collection.delete_many({"status": 'to_crawl?', "url": {"$in": [url[0] for url in to_crawl_urls]}})


DeleteResult({'n': 3, 'electionId': ObjectId('7fffffff0000000000000405'), 'opTime': {'ts': Timestamp(1716833546, 33), 't': 1029}, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1716833546, 35), 'signature': {'hash': b"\xa7.\x84\xd8K\x97\xd6\xbc\xf5\x08W'\x06\xe1o\x9b\xa0\xaf\x1cl", 'keyId': 7318892626235621378}}, 'operationTime': Timestamp(1716833546, 33)}, acknowledged=True)

In [1]:
# Get all urls from "to_crawl?"
urls = mongo.collection.find({"status": 'to_crawl?'})
to_crawl_urls = [(url['url'], url['timestamp']) for url in list(urls)]
to_crawl_urls

# Save to sqlite
from sqlite_handler import URLDatabase
url_db = URLDatabase(db_path="urls.db")

# url_db.exists("crawled", to_crawl_urls[0][0])

# Insert the data into the database
url_db.bulk_insert("crawled", to_crawl_urls)

# Delete 
# url_db.count_entries("crawled")

# url_db.fetch_all("crawled")

# # Get all the urls from the database
# # urls = url_db.fetch_all('test')

NameError: name 'to_crawl_urls' is not defined

##### Final Code


In [13]:
'''
* creating a giant thread to avoid concurrency issues
'''
import sys
from mongo import Mongo
mongo = Mongo()
from sqlite_handler import URLDatabase
url_db = URLDatabase(db_path="urls.db")


# Get all urls from "to_crawl?"
urls = mongo.collection.find({"status": 'to_crawl?'})
to_crawl_urls = [(url['url'], url['timestamp']) for url in list(urls)]
# to_crawl_urls

# Save to sqlite
# Insert the data into the database
url_db.bulk_insert("crawled", to_crawl_urls)

# delete from mongo
mongo.collection.delete_many([{'url': url[0], 'status': 'to_crawl?'} for url in to_crawl_urls])


if mongo.collection.count_documents({"status": 'to_crawl'}) < 100000:
    new_to_crawl_urls = url_db.fetch('to_crawl', 100000)
    n_failed_to_upload = 0
    try:
        # insert many
        mongo.collection.insert_many([{'url': url[0], 'status': 'to_crawl', 'timestamp': url[1]} for url in new_to_crawl_urls], ordered=False)
    except Exception as bwe:
        # pass
        # Get the details of the operations that failed
        failed_ops = bwe.details['writeErrors']
        

        # Get the documents that failed to insert
        failed_docs = [op['op'] for op in failed_ops]

        # Get the URLs that failed to insert
        urls_failed_to_upload_to_mongo = [(doc['url'], doc['timestamp']) for doc in failed_docs]
        n_failed_to_upload = len(urls_failed_to_upload_to_mongo)
        # for url in urls_failed_to_upload_to_mongo:
        #     logging.error(f"Failed to upload {url} to MongoDB")
        # success_urls = [url for url in new_to_crawl_urls if url not in urls_failed_to_upload_to_mongo]

        # # Delete successful urls from sqlite
        # url_db.delete("to_crawl", success_urls)
        # # print(f'success_urls:{success_urls}, len:{len(success_urls)}')
        # print(failed_urls, len(failed_urls))
    # # delete from sqlite
    if n_failed_to_upload < 10000:
        url_db.delete("to_crawl", new_to_crawl_urls)
        # print(n_failed_to_upload)
    else:
        print(f'failed to upload {n_failed_to_upload} urls to mongo')
        # exit the python script
        sys.exit(1)
        


1463


In [1]:
from mongo import Mongo
mongo = Mongo()

# Delete all urls with status 'crawled'
# mongo.collection.delete_many({"status": 'to_crawl'})
mongo.collection.delete_many({"status": 'crawled'})
mongo.collection.delete_many({"status": 'test'})
mongo.collection.delete_many({"status": 'to_crawl?'})

DeleteResult({'n': 0, 'electionId': ObjectId('7fffffff0000000000000405'), 'opTime': {'ts': Timestamp(1716835666, 11), 't': 1029}, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1716835666, 11), 'signature': {'hash': b'a\x0cbO\xfd"\xda\xff0,*\xbaK&\xad@\xba\xb5\xe6o', 'keyId': 7318892626235621378}}, 'operationTime': Timestamp(1716835666, 11)}, acknowledged=True)

### DB stats

In [None]:
from mongo import Mongo
mongo = Mongo()

# crawled_count = mongo.collection.count_documents({"status": 'crawled'})
to_crawl_spider_count = mongo.collection.count_documents({"status": 'to_crawl?'})
to_crawl_spider_count

In [8]:
from mongo import Mongo
mongo = Mongo()

crawled_data_count = mongo.collection.count_documents({'status':'crawling'})

other_data_count = mongo.db['other_data'].count_documents({})
print(crawled_data_count)
print(other_data_count)

102914
0


In [41]:
import time

# Before 5 minutes
timestamp = time.time()

mongo.collection.count_documents({'status':'crawling', 'timestamp': ['$lt', '1714996990.47649']})
# mongo.collection.count_documents({'status':'crawling', { $gt: ["$timeStamp", 1432201420790] }})

# using aggregate


0

In [45]:
result

[]

## Debug: recover_expired_crawling

In [None]:
import time

# Before 5 minutes
timestamp = time.time()
pipeline = [
    {"$match": {"status": "crawling", "timestamp": {"$lt": str(timestamp)}}},
    # {"$count": "count"}
]
# The result is a list of documents returned by the aggregation pipeline
expired_crawling_urls = list(mongo.collection.aggregate(pipeline))

expired_crawling_urls[:10]

In [59]:
def convert_from_crawling_to_to_crawl(urls):
            # for url in urls:
            #     self.collection.update_one(
            #         {'_id':url['_id'], 'status': {'$in': ['crawling']}},
            #         {'$set': {'status':'to_crawl'}}
            #         )
            # perform bulk update
            if urls:
                mongo.collection.update_many(
                    {'_id': {'$in': [url['_id'] for url in urls]}},
                    {'$set': {'status':'to_crawl'}}
                )


convert_from_crawling_to_to_crawl(result)

In [63]:
' count to_crawl'
mongo.collection.count_documents({'status':'to_crawl?'})

0

# Mongo is full by indexing


In [1]:
from mongo import Mongo
mongo=Mongo()

mongo.check_connection()

Pinged your deployment. You successfully connected to MongoDB!


In [13]:
# Add a url to 'to_crawl' status
import time
# mongo.collection.insert_one({'url': 'https://www.bbc.com/nepali', 'status': 'to_crawl', 'timestamp': time.time()})
# mongo.collection.create_index('url', unique=True)
mongo.collection.delete_many({'status': 'to_crawl'})

DeleteResult({'n': 1, 'electionId': ObjectId('7fffffff000000000000000e'), 'opTime': {'ts': Timestamp(1716896107, 4000), 't': 14}, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1716896107, 4000), 'signature': {'hash': b'\xc0\x84\x0e\x19\x11\xa6\xae\x92\xcd\x85\x8e\xfb\n\xf1.h\x8f\x8d\xe1E', 'keyId': 7351058967755227142}}, 'operationTime': Timestamp(1716896107, 4000)}, acknowledged=True)

In [1]:
# list all collections
from mongo import Mongo
mongo = Mongo()

a = mongo.recover_expired_crawling()


In [4]:
# expired_crawling_urls=a
# expired_crawling_urls[0]['url']
from sqlite_handler import URLDatabase

In [12]:
# Save to sqlite
another_sqlite_instance = URLDatabase(db_path="urls.db")
another_sqlite_instance.fetch_all("to_crawl")
not_already_crawled = []
for entry in expired_crawling_urls:
    if not another_sqlite_instance.exists("crawled", entry['url']):
        not_already_crawled.append(entry)
# Save to sqlite
# entries = ['url': url['url'], 'timestamp': url['timestamp']} for url in not_already_crawled]
entries2 = [(url['url'], url['timestamp']) for url in not_already_crawled]
if entries2:
    another_sqlite_instance.bulk_insert("to_crawl", entries2, show_progress=False)
# another_sqlite_instance.close()

# Delete from mongo
mongo.collection.delete_many({'status': 'crawling', 'url': {'$in': [entry['url'] for entry in not_already_crawled]}})

In [9]:
another_sqlite_instance.bulk_insert("to_crawl", entries, show_progress=False)

In [13]:
another_sqlite_instance.count_entries('to_crawl')

472