In [1]:
import json
import os 
import requests
import time
from datetime import datetime,timedelta
from datetime import date
import re
import sys
import urllib, urllib.request, urllib.parse
import random
from scrawl import *
    
# ---------------------------------------------------------------------------
# Run setup: timestamps, crawl target, helper object, output directories,
# logger and the counters reported at the end of the run.
# ---------------------------------------------------------------------------

# Wall-clock start (for the elapsed-time log line) plus the time/date strings
# passed to the logger factory; created_on is also stored on every record.
start_time = time.time()
current_time = datetime.now().strftime("%H-%M-%S")
created_on = date.today().strftime("%Y-%m-%d")

# Crawl target. The client id is hard-coded here; the command-line variant
# is kept for reference.
# client_id = sys.argv[1]
client_id = '5f69d22ef472d6646f577fa6'  # Europe
site = 'inss_ndu_edu_publication'

# Values copied unchanged onto every stored record.
publish_source, country, language = 'inss.ndu.edu', 'United States', 'English'
images_path = []

# Helper object whose methods do the downloading and scraping.
# NOTE(review): Crawl, project_path and log_func are not defined in this
# file - presumably they come from `from scrawl import *`; confirm.
c = Crawl()

# Output locations, created in the same order as before: logs, images, PDFs.
log_path = c.create_directories(project_path, client_id, site)
image_directory = c.create_image_directories(project_path)
pdf_directory = c.create_pdf_directories(project_path, site)

# Logger for this run.
logger = log_func(log_path, created_on, current_time)
logger.info("Process Started ...\n")

# Run counters, all starting at zero.
skipped_due_to_headline = skipped_due_to_content = skipped_due_to_date = 0
missing_overall_tonality = no_of_data = duplicate_data = 0
unable_to_fetch_article_url = unable_to_fetch_cat_url = unable_to_download_pdf = 0

# ---------------------------------------------------------------------------
# Crawl: news landing page -> category pages -> article pages -> one stored
# record per article that is not already in the collection.
# NOTE(review): `c`, `logger`, `cl_data`, `pdf_directory` and the counters are
# defined outside this section; cl_data is used through find_one/insert_one,
# presumably a MongoDB collection supplied by scrawl - confirm.
# ---------------------------------------------------------------------------

# Root used to resolve relative PDF links found on article pages.
_SITE_ROOT = 'https://inss.ndu.edu'

# Abbreviated month spellings mapped to the full names that strptime's %B
# directive accepts.
_MONTH_ABBREVIATIONS = {
    'Jan.': 'January',
    'Feb.': 'February',
    'Mar.': 'March',
    'Apr.': 'April',
    'Jun.': 'June',
    'Jul.': 'July',
    'Aug.': 'August',
    'Sep.': 'September',
    'Sept.': 'September',
    'Oct.': 'October',
    'Nov.': 'November',
    'Dec.': 'December',
}


def _parse_publish_date(raw_date):
    """Turn 'Month D, YYYY' text into a ``(pub_date, publish_time)`` pair.

    Abbreviated months such as 'Feb.' are expanded before parsing. The page
    supplies a date only, so the time component is always '00:00:00' and no
    timezone shift is applied.

    Returns ``('', '')`` when the text is empty or not in the expected
    format, which the caller treats as "skip this article".
    """
    if not raw_date:
        return '', ''
    cleaned = raw_date.strip()
    for abbreviation, full_name in _MONTH_ABBREVIATIONS.items():
        cleaned = cleaned.replace(abbreviation, full_name)
    try:
        parsed = datetime.strptime(cleaned, '%B %d, %Y')
    except ValueError:
        # Unexpected format: report "no date" instead of hiding every error.
        return '', ''
    return parsed.strftime('%Y-%m-%d'), parsed.strftime('%H:%M:%S')


def _absolute_pdf_url(raw_url):
    """Resolve a scraped link against the site root and encode its spaces.

    An empty scrape result stays empty, so the bare domain is never stored as
    a PDF link. Every space is encoded; the earlier re.sub call passed re.S
    in the positional ``count`` slot, which capped the replacement at 16.
    """
    if not raw_url:
        return ''
    return urllib.parse.urljoin(_SITE_ROOT, raw_url).replace(' ', '%20')


cat_pages = c.download_page('https://inss.ndu.edu/Media/News/')
if cat_pages.startswith('Unable to fetch'):
    # The split below would silently produce no categories; make the failure visible.
    logger.info(cat_pages)

for cat_chunk in cat_pages.split(' <li class=" dm  ">')[1:]:

    cat_url = c.scrap(r'<a\s*href="(.*?)"', cat_chunk)
    logger.info(f'Fetching cat url {cat_url}\n')
    cat_page = c.download_page(cat_url)

    if cat_page.startswith('Unable to fetch'):
        logger.info(cat_page)  # writes error message with error code
        unable_to_fetch_cat_url += 1
        continue

    for article_chunk in cat_page.split('<div class="poster">')[1:]:

        # source_link
        source_link = c.scrap(r"<a\s*href='(.*?)'", article_chunk)
        logger.info(f'Fetching {source_link}\n')

        # handle duplicates: skip links that already have a stored record
        if cl_data.find_one({'source_link': source_link}, {'source_link': 1}):
            duplicate_data += 1
            continue

        # random 1-3 second pause before each article request
        time.sleep(random.randint(1, 3))
        page = c.download_page(source_link)
        if page.startswith('Unable to fetch'):
            logger.info(page)  # writes error message with error code
            unable_to_fetch_article_url += 1
            continue

        # source_headline: the pattern deliberately stops at the first '>'
        # after <title>, not at </title>.
        source_headline = c.scrap('<title>(.*?)>', page)

        # skip if headline not found
        if not source_headline:
            logger.info(f'Skipping due to headline {source_link}\n')
            skipped_due_to_headline += 1
            continue

        # publication date and time
        pub_date, publish_time = _parse_publish_date(
            c.scrap(r'<meta\s*itemprop="datePublished"\s*content="(.*?)"', page)
        )

        # skip null date
        if not pub_date:
            logger.info(f'Skipping due to date {source_link}\n')
            skipped_due_to_date += 1
            continue

        # NOTE: an early exit for articles not published today is
        # intentionally disabled:
        #     if pub_date != created_on:
        #         break

        # source_content, falling back to the headline when the body is not found
        source_content = c.scrap(
            r'class="action-item virin".*?</script>(.*?)<div\s*class="related">', page
        )
        if not source_content:
            source_content = source_headline
        source_content = source_content.replace('Advertisement', '')
        source_content = c.strip_html(source_content)

        # skip if content not found
        if not source_content:
            logger.info(f'Skipping due to content {source_link}\n')
            skipped_due_to_content += 1
            continue

        # journalist
        journalist = c.scrap('class="line">(.*?)<', page) or 'NA'

        # current time of harvesting
        harvest_time = datetime.now().strftime("%H:%M:%S")

        # word count over headline plus content
        word_count = len((source_headline + ' ' + source_content).split())

        # PDF link: prefer the dateline link, otherwise the second link in the body
        pdf_path, pdf_name = '', ''
        raw_pdf_url = c.scrap('class="dateline-text".*?href="(.*?)"', page)
        if 'Documents' not in raw_pdf_url:
            raw_pdf_url = c.scrap('itemprop="articleBody">.*?href.*?href="(.*?)"', page)
        pdf_url = _absolute_pdf_url(raw_pdf_url)

        if '.pdf' in pdf_url:
            # pdf_name: last path segment reduced to word characters, '|' and '.'
            # NOTE(review): a '?ver=...' query string ends up inside the
            # file name - confirm whether that is wanted.
            pdf_name = c.scrap('.*/(.*)', pdf_url)
            pdf_name = re.sub(r'[^\w|\.]', '', pdf_name, flags=re.S)
            # pdf_path
            pdf_path = f'{pdf_directory}/{pdf_name}'

            # download pdf; the article is not stored when the download fails
            pdf = c.download_pdf(pdf_url, pdf_path)
            if pdf.startswith('Unable to fetch'):
                logger.info(pdf)  # writes error message with error code
                unable_to_download_pdf += 1
                continue

        # storing the above data in a dictionary
        clientdata = {
            "client_master": client_id,
            "articleid": client_id,
            "medium": 'Web',
            "searchkeyword": [],
            "entityname": [],
            "process_flage": "1",
            "na_flage": "0",
            "na_reason": "",
            "qc_by": "",
            "qc_on": "",
            "location": "",
            "spokeperson": "",
            "quota": "",
            "overall_topics": "",
            "person": "",
            "overall_entites": "",
            "overall_tonality": '',
            "overall_wordcount": word_count,
            "article_subjectivity": "",
            "article_summary": "",
            "pub_date": pub_date,
            "publish_time": publish_time,
            "harvest_time": harvest_time,
            "temp_link": source_link,
            "publish_source": publish_source,
            "programme": 'null',
            "feed_class": "News",
            "publishing_platform": "",
            "klout_score": "",
            "journalist": journalist,
            "headline": source_headline,
            "content": source_content,
            "source_headline": source_headline,
            "source_content": source_content,
            "language": language,
            "presence": 'null',
            "clip_type": 'null',
            "prog_slot": 'null',
            "op_ed": '0',
            "location_mention": '',
            "source_link": source_link,
            "author_contact": '',
            "author_emailid": '',
            "author_url": '',
            "city": '',
            "state": '',
            "country": country,
            "source": publish_source,
            "foot_fall": '',
            "created_on": created_on,
            "active": '1',
            'crawl_flag': 2,
            "images_path": images_path,
            "html_content": '',
            "pdf_url": pdf_url,
            "pdf_name": pdf_name,
            "pdf_path": pdf_path,
        }

        cl_data.insert_one(clientdata)
        no_of_data += 1


# End-of-run report: a completion marker, one line per counter, then the
# elapsed wall-clock time since start_time.
logger.info('Iteration complete\n')

summary_lines = [
    f'Number of data: {no_of_data}\n',
    f'Duplicate data: {duplicate_data}\n',
    f'Unable to fetch cat url: {unable_to_fetch_cat_url}\n',
    f'Unable to fetch article url: {unable_to_fetch_article_url}\n',
    f'Skipped due to headline: {skipped_due_to_headline}\n',
    f'Skipped due to content: {skipped_due_to_content}\n',
    f'Skipped due to date: {skipped_due_to_date}\n',
    f'unable to download pdf  {unable_to_download_pdf}\n',
]
for summary_line in summary_lines:
    logger.info(summary_line)

logger.info(f'Processing finished in {time.time() - start_time} seconds.\n')


http://inss.ndu.edu/Media/News/Article/2524839/want-to-grow-the-economy-try-fermenting-it-instead/
Want to Grow the Economy? Try Fermenting It Instead
By Peter Emanuel Ph.D., Brian Feeney Ph.D. and Diane DiEuliis Ph.D.
https://media.defense.gov/2021/Mar/04/2002593534/-1/-1/0/210304-D-BD104-001.JPG
http://inss.ndu.edu/Media/News/Article/2493376/intellectual-overmatch-is-impossible-if-we-teach-only-half-the-team-a-call-for/
Intellectual Overmatch Is Impossible If We Teach Only Half the Team: A Call for Professional Civilian Education
By Laura Junor Pulzone and Justin Lynch
https://media.defense.gov/2021/Feb/04/2002576070/-1/-1/0/210204-D-BD104-001.JPG
http://inss.ndu.edu/Media/News/Article/2425797/russias-escalating-use-of-private-military-companies-in-africa/
Russia’s Escalating Use of Private Military Companies in Africa
By R. Kim Cragin and Lachlan MacKenzie
https://media.defense.gov/2020/Nov/23/2002540698/-1/-1/0/201123-D-BD104-001.JPG
http://inss.ndu.edu/Media/News/Article/2412750/c

http://inss.ndu.edu/Media/News/Article/1723239/between-russia-and-iran-room-to-pursue-american-interests-in-syria/
Between Russia and Iran: Room to Pursue American Interests in Syria
By John W. Parker
https://inss.ndu.edu/Portals/68/Documents/stratperspective/inss/Strategic-Perspectives-27.pdf?ver=2019-01-02-140836-703
http://inss.ndu.edu/Media/News/Article/1664373/united-nations-peacekeeping-operations-environmental-sustainability/
United Nations Peacekeeping Operations: Environmental Sustainability
By Philip Stockdale, Rebekah Kirkwood, Julie Sapp, and Jonathan Daniel
http://ndupress.ndu.edu/Portals/68/Documents/DefenseTechnologyPapers/DTP-111.pdf
http://inss.ndu.edu/Media/News/Article/1866042/thucydides-other-traps-the-united-states-china-and-the-prospect-of-inevitable-w/
Thucydides’ Other “Traps”: The United States, China, and the Prospect of “Inevitable” War
By Alan Greeley Misenheimer
https://inss.ndu.edu/Portals/68/Documents/casestudies/nwc_casestudy-3.pdf?ver=2019-06-04-144701-