# Re-initialize Database
* Note: this will delete everything in MongoDB, Redis and the .csv files.

### Mongo

In [None]:
# Convert all urls with status: crawled -> to_crawl
# NOTE(review): the method name suggests it resets urls stuck in "crawling"
# (not "crawled"), and 0 is presumably the expiry age so that every such url
# qualifies — confirm against mongo.py.

from mongo import Mongo
db=Mongo()
db.recover_expired_crawling(0)

: 

In [3]:
# Delete every url starting with "https://www.bbc.com" that is still pending
# ("to_crawl") or in progress ("crawling"), except those that start with
# "https://www.bbc.com/nepali".
collection = db.collection
# Dots are escaped so that "." only matches a literal dot, not any character.
bbc_non_nepali = {"$regex": r"^https://www\.bbc\.com", "$not": {"$regex": r"^https://www\.bbc\.com/nepali"}}
collection.delete_many({"status": "to_crawl", "url": bbc_non_nepali})
collection.delete_many({"status": "crawling", "url": bbc_non_nepali})

DeleteResult({'n': 2186, 'electionId': ObjectId('7fffffff0000000000000401'), 'opTime': {'ts': Timestamp(1715109691, 2209), 't': 1025}, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1715109691, 2214), 'signature': {'hash': b'\x17#\xfc$\x87\xde\xcc\x16\xba0\xebi\x0e\x95\x04\xc7\xff\x16$#', 'keyId': 7318892626235621378}}, 'operationTime': Timestamp(1715109691, 2209)}, acknowledged=True)

In [2]:
# Delete all documents (does not require re-indexing)
db.collection.delete_many({})

DeleteResult({'n': 1842, 'electionId': ObjectId('7fffffff0000000000000400'), 'opTime': {'ts': Timestamp(1714996543, 904), 't': 1024}, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1714996543, 906), 'signature': {'hash': b'S\x07\xf5\x03l\x98\xdd\xc2d\xa7t\xfa(T\xd8aH\xc2\xe9,', 'keyId': 7318892626235621378}}, 'operationTime': Timestamp(1714996543, 904)}, acknowledged=True)

In [3]:
# Create index for unique url
db.collection.create_index('url', unique=True)

'url_1'

In [None]:
# Populate initial Start Urls
import time
from mongo_db_handler import MongoDBHandler
# NOTE(review): the handler is created with collection_name="scrapy-engine", but the
# insert below targets 'urls-collection' — confirm which collection the crawler reads.
db_handler = MongoDBHandler(collection_name="scrapy-engine", db_name="scrapy-engine")

# Seed sites the crawl starts from
start_urls = ["https://onlinemajdoor.com/", "http://nepalipost.com/beta/", "https://nepalkhabar.com/", "https://www.nepalipaisa.com/", "https://topnepalnews.com/",  "https://www.dainiknepal.com/", "https://www.bbc.com/nepali", "https://deshsanchar.com/"]

# Insert one document per seed url with status 'to_crawl' and the current time as timestamp
db_handler.insert_many([{'url':url, 'timestamp':time.time(), 'status':'to_crawl'} for url in start_urls], collection_name='urls-collection')

: 

### Redis

In [5]:
# Load environment variables (REDIS_* settings) via python-dotenv
from dotenv import load_dotenv
load_dotenv()

import redis
import os

# Connect using REDIS_HOST / REDIS_PORT / REDIS_PASSWORD, falling back to
# localhost:6379 with no password when they are not set
redis_client = redis.Redis(
            host=os.environ.get('REDIS_HOST', 'localhost'),
            port = int(os.environ.get('REDIS_PORT', 6379)),
            password=os.environ.get('REDIS_PASSWORD', None),
        )

# delete all key-value pairs
# WARNING: FLUSHALL clears every database on this Redis server, not only the
# lists used by the crawler.
redis_client.flushall()

# Get all keys from redis (the recorded output is an empty list after the flush)
redis_client.keys()

[]

### CSV Files

In [6]:
# Remove crawled_data.csv
!rm crawled_data.csv

# Remove other_data.csv
!rm other_data.csv

## Tasks
* Continuously fetch data from Redis

## Save Crawled Data

In [None]:

import csv
import dotenv
import json
import os
import redis
import threading
import time
dotenv.load_dotenv()


def pop_from_redis():
    """Drain the 'crawled_data' and 'other_data' Redis lists.

    Every entry is removed with RPOP (via the module-level ``redis_client``)
    and decoded with ``json.loads``.

    Returns:
        dict: one key per list name, each mapped to the (possibly empty)
        list of decoded items, e.g.::

            {
                'crawled_data': [...],
                'other_data': [...],
            }
    """
    lists_to_pop = ['crawled_data', 'other_data']       # , 'crawled', 'to_crawl']
    popped_data = {}
    for list_name in lists_to_pop:
        items = popped_data.setdefault(list_name, [])

        # Pop until RPOP itself reports an empty list (returns None).
        # Checking LLEN first and popping afterwards is racy: if the list is
        # emptied between the two calls, rpop() returns None and
        # json.loads(None) raises TypeError.
        while True:
            raw_item = redis_client.rpop(list_name)
            if raw_item is None:
                break
            items.append(json.loads(raw_item))
    return popped_data

def push_to_redis(list_name, data):
    """Push ``data`` onto the Redis list ``list_name`` with ``redis_client.lpush``.

    Uses the module-level ``redis_client``; the value returned by ``lpush``
    is discarded.
    """
    redis_client.lpush(list_name, data)


# Connect to your Redis server
redis_client = redis.Redis(
    host=os.environ.get('REDIS_HOST', 'localhost'),
    port = int(os.environ.get('REDIS_PORT', 6379)),
    password=os.environ.get('REDIS_PASSWORD', None),
)



def save_to_csv(data, data_type="crawled_data"):
    """Append each non-empty item list in ``data`` to its own ``<key>.csv`` file.

    Args:
        data: mapping of name -> list of row dicts. The rows stored under
            ``name`` are appended to ``name + ".csv"`` (relative to the
            current working directory). Names mapped to an empty list are
            skipped and no file is created for them.
        data_type: unused; kept so existing callers keep working.

    A header row is written when the target file does not exist yet or is
    empty. When appending to a file that already has a header, the columns
    are NOT reconciled with that header.
    """
    for key, data_items in data.items():
        if not data_items:
            continue
        csv_file_path = key + ".csv"

        # Columns are the union of the keys of all rows, in first-seen order.
        # Taking only the first row's keys makes DictWriter raise ValueError
        # as soon as a later row carries an extra key; rows that lack a
        # column simply get an empty cell.
        field_names = list(dict.fromkeys(name for item in data_items for name in item))

        file_exists = os.path.exists(csv_file_path)
        print(f'file_exists: {file_exists}')
        # A file that exists but is empty still needs its header row
        needs_header = (not file_exists) or os.path.getsize(csv_file_path) == 0

        # Open the CSV file in append mode
        with open(csv_file_path, 'a', newline='', encoding='utf-8') as csvfile:
            csv_writer = csv.DictWriter(csvfile, fieldnames=field_names)
            if needs_header:
                csv_writer.writeheader()

            # Append the new data
            csv_writer.writerows(data_items)

def load_from_csv(csv_file_path="crawled_data.csv"):
    """Read a CSV file with a header row into a list of row dicts.

    Args:
        csv_file_path: path of the UTF-8 encoded CSV file to read.

    Returns:
        list: one dict per data row, keyed by the column names taken from
        the header row. All values are strings.
    """
    # newline='' leaves line-ending handling to the csv module, so newlines
    # embedded in quoted fields (e.g. multi-line paragraphs) come back
    # exactly as they were written instead of being translated to '\n'.
    with open(csv_file_path, 'r', newline='', encoding='utf-8') as csvfile:
        return list(csv.DictReader(csvfile))

def producer():
    """Simulate a crawler by feeding sample paragraphs into ``data_paragraphs``.

    Runs forever. Each round appends the first 0-3 entries of a fixed set of
    sample records to the module-level ``data_paragraphs`` list, reports how
    many were added, then sleeps for a random whole number of seconds
    between 0 and 5.
    """
    import random
    print('producer')
    sample_paragraphs = [
        {
            'paragraph': 'प्रदेश सरकार र निजी क्षेत्रको सहकार्यमा पहाडी तथा हिमाली क्षेत्रमा मनोरञ्जनात्मक तथा साहसिक पर्यटनको लागि हिलस्टेशनहरू विकास गर्न आवश्यक छ भन्दै उनले सांस्कृतिक, धार्मिक, साहसिक, कृषि, स्वास्थ्य तथा खेल पर्यटक आकर्षित गर्दै यस क्षेत्रको मौलिक संस्कृति संरक्षणमा महोत्सवले सहयोग गर्ने विश्वास व्यक्त गरे ।\xa0',
            'parent_url': 'https://hamrakura.com/news-details/161504/2023-12-27',
            'page_title': 'लोकतन्त्रको आन्दोलन उठाउँदाकै आस्थाबाट निर्देशित छु – राष्ट्रपति पौडेल',
            'is_nepali_confidence': '-1914.427728056908',
        },
        {
            'paragraph': 'त्रिवेणी बाहेक अन्य पालिकाबाट कुन-कुन घर परिवारले रकम पाउने भन्ने विवरण नआइसकेकाले ती पालिकाका लागि रकम निकासा भने हुन सकेको छैन ।',
            'parent_url': 'https://hamrakura.com/news-details/159820/2023-11-28',
            'page_title': 'भूकम्पपीडितको अस्थायी आवासका लागि रकम निकासा',
            'is_nepali_confidence': '-800.0689898729324',
        },
        {
            'paragraph': 'निर्वाचित मण्डलले निर्वाचनका सबै प्रक्रिया अघि बढाएपनी सहमतिका लागि शीर्ष नेताहरूले समय मागेकाले निर्वाचन कमिटीले समय दिएको थियो । निर्वाचन कमिटीका संयोजक जगत बहादुर रोकायाले बताए ।',
            'parent_url': 'https://hamrakura.com/news-details/160003/2023-12-01',
            'page_title': 'अध्यक्ष मण्डलले घोषणा गरे जिल्ला कमिटी, टिके प्रथा चलाएको रावल पक्षको आरोप',
            'is_nepali_confidence': '-1128.5258438587189',
        },
        {
            'paragraph': 'अहिलेसम्म एनसेलले शेयर किनबेच गरेको सम्बन्धमा नेपाल दूरसञ्चार प्राधिकरणले गरेको काम कारबाहीको सम्बन्धमा जानकारी माग्ने पत्र लेख्ने',
            'parent_url': 'https://hamrakura.com/news-details/161068/2023-12-19',
            'page_title': 'एनसेलले राज्यलाई तिर्नुपर्ने कर असुल उपर गर्न सरकारलाई समितिको निर्देशन [भिडियो]',
            'is_nepali_confidence': '-800.2218471765518',
        },
    ]
    while True:
        # Decide how many sample records (0-3) to enqueue this round
        batch_size = random.randint(0, 3)
        data_paragraphs.extend(sample_paragraphs[:batch_size])
        print(f'produced: {batch_size}')

        # Idle for a random whole number of seconds in [0, 5]
        pause_seconds = random.randint(0, 5)
        time.sleep(pause_seconds)

def publisher():
    """Continuously move buffered paragraphs from ``data_paragraphs`` to Redis.

    Runs forever. Whenever the module-level ``data_paragraphs`` list is
    non-empty, every item is popped from its end, JSON-encoded and pushed to
    the 'crawled_data' Redis list through ``push_to_redis``; the number of
    items published in that pass is printed. When the buffer is empty the
    loop sleeps for 5 seconds before checking again.
    """
    print('published')
    while True:
        if data_paragraphs:
            pushed = 0
            # Drain with `while` + pop() instead of `for ... in data_paragraphs`:
            # popping from a list while iterating over it ends the iteration
            # early, so only about half of the buffer was published per pass
            # and the reported count was wrong.
            while data_paragraphs:
                push_to_redis('crawled_data', json.dumps(data_paragraphs.pop()))
                pushed += 1
            print(f'---published: {pushed}---')
        else:
            time.sleep(5)  # nothing buffered; wait before checking again

def consumer():
    """Continuously drain Redis via ``pop_from_redis`` and append the data to CSV.

    Runs forever. Each round pops everything currently queued; if at least
    one item was received the whole batch is handed to ``save_to_csv`` and
    the item count is printed, otherwise a separator line is printed and the
    loop sleeps for 1 second before polling again.
    """
    print('consumer')
    pulled = 0  # running total of items consumed so far
    while True:
        paragraphs = pop_from_redis()

        # pop_from_redis() returns a dict with one key per Redis list even
        # when nothing was popped, so the dict itself is always truthy and
        # len(dict) is always the number of lists. Count the actual items
        # instead; otherwise the "no data" branch (and its sleep) is never
        # reached and the loop polls Redis without pause.
        n_items = sum(len(items) for items in paragraphs.values())

        if n_items:
            save_to_csv(paragraphs)
            pulled += n_items
            print(f'======consumed: {n_items}')
        else:
            print('------------------')   # No Data
            time.sleep(1)  # sleep for a while before consuming more items

data_paragraphs = []

# # Create producer and consumer threads
# producer_thread = threading.Thread(target=producer)
# publisher_thread = threading.Thread(target=publisher)
# consumer_thread = threading.Thread(target=consumer)


# # Start the threads
# producer_thread.start()
# # consumer_thread.start()
# publisher_thread.start()

# # Wait for both threads to finish
# producer_thread.join()
# publisher_thread.join()
# # consumer_thread.join()


### One time operation

In [5]:
# Populate initial Start Urls
import time
from mongo_db_handler import MongoDBHandler
db_handler = MongoDBHandler(collection_name="scrapy-engine", db_name="scrapy-engine")

start_urls = ["https://onlinemajdoor.com/", "http://nepalipost.com/beta/", "https://nepalkhabar.com/", "https://www.nepalipaisa.com/", "https://topnepalnews.com/",  "https://www.dainiknepal.com/", "https://www.bbc.com/nepali"]

db_handler.insert_many([{'url':url, 'timestamp':time.time(), 'status':'to_crawl'} for url in start_urls], collection_name='urls-collection')

In [None]:
# Indexing by url
mongo = Mongo()

collection_names = ['url_crawled', 'url_to_crawl', 'url_crawling']
for collection_name in collection_names:
    mongo.db_handler.delete_all(collection_name=collection_name)
    mongo.db_handler.db[collection_name].create_index('url', unique=True)

In [8]:
# Convert all urls with status: crawled -> to_crawl
# NOTE(review): the method name suggests it resets urls stuck in "crawling"
# (not "crawled"), and 0 is presumably the expiry age so that every such url
# qualifies — confirm against mongo.py.

from mongo import Mongo
db=Mongo()
db.recover_expired_crawling(0)

In [9]:
# Create index for unique url
db.collection.create_index('url', unique=True)

'url_1'

In [None]:
# print all urls with status to_crawl
print(list(db.collection.find({'status':'to_crawl'})))

In [None]:
## Delete all (does-not require re-indexing)
db.collection.delete_many({})

In [None]:
# Display all entries
list(db.collection.find())

### Error Data

In [None]:
# Append error manually
from mongo import Mongo
db = Mongo()
error_data = {'url': 'https://nepalkhabar.com/', 'timestamp': 1714993152.5575845, 'status': 'error', 'status_code': 403, 'error_type': 'HttpError'}
db.append_error_data(error_data)

In [None]:
# Get urls with status: error
list(db.collection.find({'status':'error'}))

### Upload/Download crawled_data from mongo

In [4]:
paragraph_data = {'paragraph': 'प्रदेश सरकार र निजी क्षेत्रको सहकार्यमा पहाडी तथा हिमाली क्षेत्रमा मनोरञ्जनात्मक तथा साहसिक पर्यटनको लागि हिलस्टेशनहरू विकास गर्न आवश्यक छ भन्दै उनले सांस्कृतिक, धार्मिक, साहसिक, कृषि, स्वास्थ्य तथा खेल पर्यटक आकर्षित गर्दै यस क्षेत्रको मौलिक संस्कृति संरक्षणमा महोत्सवले सहयोग गर्ने विश्वास व्यक्त गरे ।\xa0', 'parent_url': 'https1://hamrakura.com/news-details/161504/2023-12-27', 'page_title': 'लोकतन्त्रको आन्दोलन उठाउँदाकै आस्थाबाट निर्देशित छु – राष्ट्रपति पौडेल', 'is_nepali_confidence':'-1914.427728056908'}
from mongo import Mongo
mongo = Mongo()

# Create index for unique url
# mongo.db['crawled_data'].create_index('parent_url', unique=True)
# mongo.db['other_data'].create_index('parent_url', unique=True)

# mongo.db['crawled_data'].drop_index('parent_url_1')
# mongo.db['other_data'].drop_index('parent_url_1')

In [36]:
# mongo.db['crawled_data'].delete_many({})
# mongo.db['other_data'].delete_many({})

DeleteResult({'n': 0, 'electionId': ObjectId('7fffffff0000000000000401'), 'opTime': {'ts': Timestamp(1715141605, 5), 't': 1025}, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1715141605, 8), 'signature': {'hash': b'>[\xfcSb\xb3\xb6"\xc0\xd8 \x830"\xe3\x9f\xf2\x87\xf3\xf4', 'keyId': 7318892626235621378}}, 'operationTime': Timestamp(1715141605, 5)}, acknowledged=True)

In [87]:
# Insert the data (Done by scrapy)
try:
    paragraph_data = {'paragraph': 'test', 'parent_url': 'https://hamrakura.com/news-details/161504/2023-12-27', 'page_title': 'लोकतन्त्रको आन्दोलन उठाउँदाकै आस्थाबाट निर्देशित छु – राष्ट्रपति पौडेल', 'is_nepali_confidence':'-1914.427728056908'}
    mongo.db['crawled_data'].insert_one(paragraph_data)
except Exception as e:
    print(e)
try:
    mongo.db['other_data'].insert_one({'paragraph':'test', 'parent_url':'some_url'})
except Exception as ex:
    # Report the failure like the insert above instead of silently discarding it
    print(ex)

# Find all the data
crawled_data = list(mongo.db['crawled_data'].find())
other_data = list(mongo.db['other_data'].find())
combined_data = {"crawled_data":crawled_data, "other_data":other_data}
combined_data

{'crawled_data': [{'_id': ObjectId('663b079024073125d38e9547'),
   'paragraph': 'test',
   'parent_url': 'https://hamrakura.com/news-details/161504/2023-12-27',
   'page_title': 'लोकतन्त्रको आन्दोलन उठाउँदाकै आस्थाबाट निर्देशित छु – राष्ट्रपति पौडेल',
   'is_nepali_confidence': '-1914.427728056908'}],
 'other_data': [{'_id': ObjectId('663b079024073125d38e9548'),
   'paragraph': 'test',
   'parent_url': 'some_url'}]}

In [88]:
# Snapshot both collections first, so exactly what is exported is what gets deleted
crawled_data = list(mongo.db['crawled_data'].find())
other_data = list(mongo.db['other_data'].find())
combined_data = {"crawled_data": crawled_data, "other_data": other_data}

# Export the snapshot: each key is appended to its own <key>.csv file
save_to_csv(combined_data)

# Remove the exported documents from MongoDB, matching on their _id
crawled_ids = [doc['_id'] for doc in crawled_data]
other_ids = [doc['_id'] for doc in other_data]
mongo.db['crawled_data'].delete_many({"_id": {"$in": crawled_ids}})
mongo.db['other_data'].delete_many({"_id": {"$in": other_ids}})


file_exists: False
file_exists: False


DeleteResult({'n': 1, 'electionId': ObjectId('7fffffff0000000000000401'), 'opTime': {'ts': Timestamp(1715144594, 1), 't': 1025}, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1715144594, 1), 'signature': {'hash': b'f\xa32WG\xb6\x187\xe2\xd3\xa8v\xab\xb7\xb0!\x94\xaf\x11\xf3', 'keyId': 7318892626235621378}}, 'operationTime': Timestamp(1715144594, 1)}, acknowledged=True)

In [83]:
# NOTE(review): these ids are plain strings, while the documents displayed in
# this notebook carry ObjectId `_id` values, so this filter matches nothing
# (the recorded result is 'n': 0). Presumably the ids should be wrapped in
# bson.ObjectId(...) — confirm. Also, `collection` is whatever an earlier cell
# bound it to (db.collection), not mongo.db['crawled_data'] — verify the target.
collection.delete_many({"_id": {"$in": ["663b070124073125d38e9543", "663b05c524073125d38e953d"]} })

DeleteResult({'n': 0, 'electionId': ObjectId('7fffffff0000000000000401'), 'opTime': {'ts': Timestamp(1715144489, 19), 't': 1025}, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1715144489, 20), 'signature': {'hash': b'\xcf\x0f;\x0cj\x95w\xa7\x9b\x96ZA\xcew\xdfBI\x01\xfa:', 'keyId': 7318892626235621378}}, 'operationTime': Timestamp(1715144489, 19)}, acknowledged=True)

In [89]:
# Get crawled_data
crawled_data = list(mongo.db['crawled_data'].find())
other_data = list(mongo.db['other_data'].find())
combined_data = {"crawled_data":crawled_data, "other_data":other_data}
combined_data

{'crawled_data': [], 'other_data': []}

In [60]:
import os, csv
def save_to_csv(data, data_type="crawled_data"):
    """Append each non-empty item list in ``data`` to its own ``<key>.csv`` file.

    Args:
        data: mapping of name -> list of row dicts. The rows stored under
            ``name`` are appended to ``name + ".csv"`` (relative to the
            current working directory). Names mapped to an empty list are
            skipped and no file is created for them.
        data_type: unused; kept so existing callers keep working.

    A header row is written when the target file does not exist yet or is
    empty. When appending to a file that already has a header, the columns
    are NOT reconciled with that header.
    """
    for key, data_items in data.items():
        if not data_items:
            continue
        csv_file_path = key + ".csv"

        # Columns are the union of the keys of all rows, in first-seen order.
        # Taking only the first row's keys makes DictWriter raise ValueError
        # as soon as a later row carries an extra key; rows that lack a
        # column simply get an empty cell.
        field_names = list(dict.fromkeys(name for item in data_items for name in item))

        file_exists = os.path.exists(csv_file_path)
        print(f'file_exists: {file_exists}')
        # A file that exists but is empty still needs its header row
        needs_header = (not file_exists) or os.path.getsize(csv_file_path) == 0

        # Open the CSV file in append mode
        with open(csv_file_path, 'a', newline='', encoding='utf-8') as csvfile:
            csv_writer = csv.DictWriter(csvfile, fieldnames=field_names)
            if needs_header:
                csv_writer.writeheader()

            # Append the new data
            csv_writer.writerows(data_items)



file_exists: True
