In [5]:
import pymongo
from pymongo import MongoClient,ReturnDocument
from bson import ObjectId
import requests
from PIL import Image 
import pytesseract 
import sys 
from pdf2image import convert_from_path,pdfinfo_from_path
from pdf2image.exceptions import PDFSyntaxError,PDFPageCountError
import os
import glob
import time

In [6]:
def download_pdf(url,storage_dir,title,timeout=120):
    """Download the PDF at ``url`` and save it as ``./<storage_dir>/<title>.pdf``.

    Parameters
    ----------
    url : str
        Address of the PDF; HTTP redirects are followed.
    storage_dir : str
        Directory, relative to the current working directory, that receives
        the file. It must already exist.
    title : str
        Used as the file name (without extension). Path separators in the
        title are replaced by ``_`` so that a title such as ``"A/B"`` does not
        point into a non-existent sub-directory.
    timeout : float, optional
        Seconds handed to ``requests.get`` so a stalled server cannot block
        the caller forever.

    Returns
    -------
    str
        Path of the file that was written.

    Raises
    ------
    requests.exceptions.RequestException
        On network failure, timeout, or an HTTP error status.
    OSError
        If the target file cannot be written.
    """
    response = requests.get(url, allow_redirects=True, timeout=timeout)
    # Fail here on 4xx/5xx rather than saving an HTML error page as a ".pdf".
    response.raise_for_status()

    safe_title = str(title).replace('/', '_').replace('\\', '_')
    pdf_path = f"./{storage_dir}/{safe_title}.pdf"
    # The context manager flushes and closes the handle before anyone reads the file.
    with open(pdf_path, 'wb') as pdf_file:
        pdf_file.write(response.content)
    return pdf_path
    
def pdf_to_images(pdf_path,storage_dir,object_id,mongodb_collection,dpi=200,batch_size=10):
    """Render each page of a PDF to ``./<storage_dir>/page_<n>.jpg``.

    The page count reported by ``pdfinfo_from_path`` is first written to the
    ``numb_pages`` field of the document whose ``_id`` is ``object_id``. The
    PDF is then converted ``batch_size`` pages at a time, so that only one
    batch of rendered pages is held at once, and each page is saved as a JPEG.
    Output files are numbered from 0 in page order.

    Parameters
    ----------
    pdf_path : str
        Path of the PDF to render.
    storage_dir : str
        Directory, relative to the current working directory, that receives
        the images.
    object_id
        ``_id`` of the document to update with the page count.
    mongodb_collection
        Collection object providing ``update_one``.
    dpi : int, optional
        Resolution passed to ``convert_from_path`` (default 200).
    batch_size : int, optional
        Number of pages converted per ``convert_from_path`` call (default 10).

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    info = pdfinfo_from_path(pdf_path)
    page_count = info["Pages"]
    mongodb_collection.update_one({'_id': object_id},
                                  {"$set": {"numb_pages": page_count}})

    image_counter = 0
    for first_page in range(1, page_count + 1, batch_size):
        # Page numbers are 1-based and the range is inclusive; clamp the last batch.
        last_page = min(first_page + batch_size - 1, page_count)
        pages = convert_from_path(pdf_path, dpi=dpi,
                                  first_page=first_page, last_page=last_page)

        for individual_page in pages:
            filename = f"./{storage_dir}/page_{image_counter}.jpg"
            individual_page.save(filename, 'JPEG')
            image_counter += 1

def images_to_text(storage_dir):
    """OCR every ``page_<n>.jpg`` in ``storage_dir`` and return the joined text.

    Only files named ``page_<number>.jpg`` are processed, in ascending numeric
    order (so ``page_10`` follows ``page_9``, not ``page_1``). Any other file
    in the directory, including an unrelated ``.jpg``, is ignored rather than
    shifting the expected page indices.

    Parameters
    ----------
    storage_dir : str
        Directory containing the page images.

    Returns
    -------
    str
        Text of all pages concatenated without a separator; ``''`` when the
        directory holds no page images.
    """
    prefix = 'page_'
    numbered_pages = []
    for filename in os.listdir(storage_dir):
        stem, extension = os.path.splitext(filename)
        page_number = stem[len(prefix):]
        if extension == '.jpg' and stem.startswith(prefix) and page_number.isdecimal():
            numbered_pages.append((int(page_number), filename))

    # Sort on the integer page number; a plain string sort would misorder pages >= 10.
    numbered_pages.sort()

    page_texts = []
    for _, filename in numbered_pages:
        # Close each image as soon as it is OCR'd so long PDFs do not pile up open handles.
        with Image.open(os.path.join(storage_dir, filename)) as page_image:
            page_texts.append(pytesseract.image_to_string(page_image))

    return ''.join(page_texts)

def upload_text_to_mongodb(mongodb_collection,object_id,master_text):
    """Write ``master_text`` to the ``full_text`` field of one document.

    The document is selected by ``_id == object_id`` and updated with a single
    ``update_one`` call on ``mongodb_collection``. Nothing is returned.
    """
    selector = {'_id': object_id}
    change = {'$set': {'full_text': master_text}}
    mongodb_collection.update_one(selector, change)
def delete_temp_content(storage_dir):
    """Delete every path matched by the glob pattern ``./<storage_dir>/*``.

    Each match is removed with ``os.remove``; the directory itself is kept.
    """
    pattern = f'./{storage_dir}/*'
    for leftover in glob.glob(pattern):
        os.remove(leftover)

In [None]:
client = MongoClient('localhost', 27017)
db = client.db_world_bank
reports_coll = db["reports"]

# Scratch directory shared by the download / render / OCR helpers.
storage_dir = 'tempstorage'
os.makedirs(storage_dir, exist_ok=True)

# no_cursor_timeout is requested because each document can take minutes
# (OCR plus the pauses below); such a cursor must be closed explicitly.
coll_knowledge_notes = reports_coll.find({'collection':'Knowledge Notes'},no_cursor_timeout=True).batch_size(100)

# Position in the query result at which this batch of 100 documents starts.
# NOTE(review): the query has no explicit sort, so this offset relies on the
# server returning documents in the same order on every run - confirm.
start_index = 1079
try:
    for element in coll_knowledge_notes[start_index:start_index+100]:
        print(start_index)

        url = element.get("pdf_link")
        object_id = element['_id']
        title = element["title"]
        start_index+=1

        print(url)
        print(title)

        # Skip reports without a PDF link and reports that already have OCR text.
        if not url or 'full_text' in element:
            continue

        for attempt in (1, 2):
            try:
                # Start from an empty scratch directory: page images left by a
                # failed attempt would otherwise be OCR'd into this document.
                delete_temp_content(storage_dir)
                pdf_path = download_pdf(url,storage_dir,title)
                pdf_to_images(pdf_path,storage_dir,object_id,reports_coll)
                master_text = images_to_text(storage_dir)
                upload_text_to_mongodb(reports_coll,object_id,master_text)
                delete_temp_content(storage_dir)
            except Exception as exc:
                # `Exception`, not a bare except, so Ctrl-C / kernel interrupt still stops the run.
                if attempt == 1:
                    print('ERROR - trying again')
                    print(repr(exc))
                    time.sleep(300)
                else:
                    print('ERROR AGAIN WITH THIS DOCUMENT - MOVING ON')
                    print(repr(exc))
            else:
                # Pause between successful downloads to go easy on the server.
                time.sleep(60)
                break
finally:
    coll_knowledge_notes.close()


1079
https://openknowledge.worldbank.org/bitstream/handle/10986/21467/939740BRI0IFC00r000Entrepreneurship.pdf?sequence=4&isAllowed=y
Entrepreneurship around the World--Before, During, and After the Crisis
1080
https://openknowledge.worldbank.org/bitstream/handle/10986/20014/895320WP0The0N00Box385285B00PUBLIC0.pdf?sequence=1&isAllowed=y
The New Trade Environment and Trade Performance in the Caribbean
1081
https://openknowledge.worldbank.org/bitstream/handle/10986/18932/892270BRI00Box0al0ADD0VC0KNOW0NOTES.pdf?sequence=1&isAllowed=y
Empowering Communities : The Local Initiatives Support Program in Russia
1082
https://openknowledge.worldbank.org/bitstream/handle/10986/18674/886940BRI0Live00Box385194B00PUBLIC0.pdf?sequence=7&isAllowed=y
Promoting Renewable Energy through Auctions
1083
https://openknowledge.worldbank.org/bitstream/handle/10986/20262/909520BRI0Box30Bangladesh0July02014.pdf?sequence=1&isAllowed=y
bKash Bangladesh : A Fast Start for Mobile Financial Services
1084
https://openkn

1120
https://openknowledge.worldbank.org/bitstream/handle/10986/20951/924580NEWS0Res00Box385356B00PUBLIC0.pdf?sequence=1&isAllowed=y
World Bank Research Digest, Vol. 9(1)
1121
https://openknowledge.worldbank.org/bitstream/handle/10986/23656/926720BRI0Box385366B00PUBLIC00IN32.pdf?sequence=1&isAllowed=y
Do Special Tax Regimes for Micro and Small Enterprises Encourage Formal Firm Creation? : Do They Lead Firms to Underreport Revenues?
1122
https://openknowledge.worldbank.org/bitstream/handle/10986/21062/933890BRI00P1200sectoral0synergies.pdf?sequence=1&isAllowed=y
Tackling NCDIs in Cambodia : An Opportunity for Inter - and Itra-Sectoral Synergies
1123
https://openknowledge.worldbank.org/bitstream/handle/10986/23659/91851.pdf?sequence=2&isAllowed=y
Tax Incentives for Research and Development
1124
https://openknowledge.worldbank.org/bitstream/handle/10986/20430/912260BRI0Box30D0VC0KNOWLEDGE0NOTES.pdf?sequence=1&isAllowed=y
Barriers to Entrepreneurship in Rural Pakistan
1125
https://openknow

In [None]:
# Continue with the next 100 documents. `reports_coll` and `start_index` must
# already exist in the kernel (they are set by the earlier batch cell).
storage_dir = 'tempstorage'
os.makedirs(storage_dir, exist_ok=True)

# A no_cursor_timeout cursor must be closed explicitly; see the `finally` below.
coll_knowledge_notes = reports_coll.find({'collection':'Knowledge Notes'},no_cursor_timeout=True)

try:
    for element in coll_knowledge_notes[start_index:start_index+100]:
        print(start_index)

        url = element.get("pdf_link")
        object_id = element['_id']
        title = element["title"]
        start_index+=1

        print(url)
        print(title)

        # Skip reports without a PDF link and reports that already have OCR text.
        if not url or 'full_text' in element:
            continue

        for attempt in (1, 2):
            try:
                # Start from an empty scratch directory: page images left by a
                # failed attempt would otherwise be OCR'd into this document.
                delete_temp_content(storage_dir)
                pdf_path = download_pdf(url,storage_dir,title)
                pdf_to_images(pdf_path,storage_dir,object_id,reports_coll)
                master_text = images_to_text(storage_dir)
                upload_text_to_mongodb(reports_coll,object_id,master_text)
                delete_temp_content(storage_dir)
            except Exception as exc:
                # `Exception`, not a bare except, so Ctrl-C / kernel interrupt still stops the run.
                if attempt == 1:
                    print('ERROR - trying again')
                    print(repr(exc))
                    time.sleep(300)
                else:
                    print('ERROR AGAIN WITH THIS DOCUMENT - MOVING ON')
                    print(repr(exc))
            else:
                # Pause between successful downloads to go easy on the server.
                time.sleep(60)
                break
finally:
    coll_knowledge_notes.close()


In [100]:
# Display the counter that the batch loops increment once per document visited.
start_index

210

In [75]:
# Peek at a single "Knowledge Notes" document (the second match) to inspect
# its fields. A separate name is used so this ad-hoc check does not rebind the
# `coll_knowledge_notes` cursor used by the batch cells.
sample_docs = reports_coll.find({'collection':'Knowledge Notes'})[1:2]
for sample_doc in sample_docs:
    print(sample_doc)



{'_id': ObjectId('5f770a0a9d1d71848dfe8814'), 'title': 'COVID-19 Crisis Through a Migration Lens', 'abstract': 'The economic crisis induced by COVID‐19 could be long, deep, and pervasive when viewed through amigration lens. Lockdowns, travel bans, and social distancing have brought global economic activities to a near standstill. Host countries face additional challenges in many sectors, such as health and agriculture, that depend on the availability of migrant workers. Migrants face the risk of contagion and also the possible loss of employment, wages, and health insurance coverage. This Migration and Development Brief provides a prognosis of how these events might affect global trends in international economic migration and remittances in 2020 and 2021. Considering that migrants tend to be concentrated in urban economic centers (cities), and are vulnerable to infection by the coronavirus, there is a need to include migrants in efforts to fight thecoronavirus. Migrant remittances prov