In [1]:
import json
import os 
import requests
import time
from datetime import datetime,timedelta
from datetime import date
import re
import sys
import urllib, urllib.request, urllib.parse
import random
from scrawl import *
    
# Run setup: timestamps, crawler/logger creation, counters and the RSS download.

# Date and time
start_time = time.time()  # reference point for the elapsed-time line logged at the end
current_time = datetime.now().strftime("%H-%M-%S")
created_on = date.today().strftime("%Y-%m-%d")

# The client id is hard-coded; reading it from sys.argv[1] is the disabled alternative.
client_id = '5f69d22ef472d6646f577fa6'  # Europe
site = 'rt_com'
c = Crawl()  # creating object

# project_path is not defined in this script; presumably it comes from the
# `scrawl` star-import -- TODO confirm.

# create directories to store logs.
log_path = c.create_directories(project_path, client_id, site)

# create image directories
image_directory = c.create_image_directories(project_path)

# logger
logger = log_func(log_path, created_on, current_time)
logger.info("Process Started ...\n")

# per-run counters, reported once the feed has been processed
skipped_due_to_headline = 0
skipped_due_to_content = 0
skipped_due_to_date = 0
missing_overall_tonality = 0
no_of_data = 0
duplicate_data = 0
unable_to_fetch_article_url = 0

# constant metadata stored with every article
publish_source = 'rt.com'
country = 'Russia'
language = 'English'
images_path = []

# archive page
rss_page = c.download_page('https://www.rt.com/rss-feed/')

# The article download below treats a result starting with 'Unable to fetch'
# as an error message. Apply the same convention to the feed itself so that a
# failed feed download is visible in the log instead of silently producing
# zero items.
if rss_page.startswith('Unable to fetch'):
    logger.info(rss_page)

# Walk every <item> of the RSS feed, skip duplicates / incomplete entries and
# store one document per remaining article in cl_data.
for item in rss_page.split('<item>')[1:]:

    # source_link (strip the CDATA wrapper if the feed uses one)
    source_link = c.scrap("<link>(.*?)</link>", item)
    source_link = source_link.replace("<![CDATA[", "").replace("]]>", "")

    # handle duplicates: skip links that are already stored
    source_link_query = {'source_link': source_link}
    dic = cl_data.find_one(source_link_query, {'source_link': 1})
    if dic:
        duplicate_data += 1
        continue

    # source_headline
    source_headline = c.scrap("<title>(.*?)<", item)

    # skip if headline not found
    if not source_headline:
        logger.info(f'Skipping due to headline {source_link}\n')
        skipped_due_to_headline += 1
        continue

    # Date and time
    pub_date, publish_time = '', ''

    try:
        # Captures the text between "<weekday>," and the "+" of the UTC offset.
        # Raw string: "\w" in a normal literal is an invalid escape sequence.
        date_time_str = c.scrap(r"<pubDate>\w+,(.*?)\+", item)
        # Drop every space and colon so the value matches the strptime format.
        # (The old call passed re.S positionally, which re.sub reads as
        # count=16 rather than as a flag.)
        date_time_str = re.sub(r'[ :]', '', date_time_str)
        date_time_obj = datetime.strptime(date_time_str, '%d%b%Y%H%M%S')
        # utc time to ist -- assumes the feed offset is always +0000; TODO confirm
        ist_date_time = date_time_obj + timedelta(hours=5, minutes=30)
        ist_date_time = ist_date_time.strftime('%Y-%m-%d %H:%M:%S')
        pub_date = ist_date_time[:10]
        publish_time = ist_date_time[11:]
    except Exception as exc:
        # Best effort: an unparsable date leaves pub_date empty and the item is
        # skipped below. Record why instead of swallowing the error silently.
        logger.info(f'Unable to parse pubDate for {source_link}: {exc}\n')

    # skip null date
    if not pub_date:
        logger.info(f'Skipping due to date {source_link}\n')
        skipped_due_to_date += 1
        continue

    # NOTE: the "break if pub_date != created_on" early exit is currently disabled,
    # so every item of the feed is processed regardless of its date.

    # Throttle only when an article page is actually requested.
    time.sleep(random.randint(1, 3))

    logger.info(f'Fetching {source_link}\n')

    page = c.download_page(source_link)
    if page.startswith('Unable to fetch'):
        logger.info(page)  # writes error message with error code
        unable_to_fetch_article_url += 1
        continue

    # source_content is taken from the feed item itself, not from the page
    source_content = c.scrap('<content:encoded>(.*?)</content:encoded>', item)
    # Remove every CDATA terminator (the old re.sub call was capped at 16
    # replacements because re.S was passed as the count argument).
    source_content = source_content.replace(']]>', '')
    source_content = c.strip_html(source_content)

    # skip if content not found
    if not source_content:
        logger.info(f'Skipping due to content {source_link}\n')
        skipped_due_to_content += 1
        continue

    # journalist (read from the article page's author meta tag)
    journalist = c.scrap('name="article:author" content="(.*?)"', page)
    if not journalist:
        journalist = 'NA'

    # current date and time
    harvest_time = datetime.now().strftime("%H:%M:%S")

    # temp link
    temp_link = source_link

    # headline and content
    headline = source_headline
    content = source_content

    # overall_tonality
    overall_tonality = ''

    # word count
    word_count = len((source_headline + ' ' + source_content).split())

    html_content = ''

    # image_urls
    image_urls = []

    # storing the above data in a dictionary
    clientdata = {
        "client_master": client_id,
        "articleid": client_id,
        "medium": 'Web',
        "searchkeyword": [],
        "entityname": [],
        "process_flage": "1",
        "na_flage": "0",
        "na_reason": "",
        "qc_by": "",
        "qc_on": "",
        "location": "",
        "spokeperson": "",
        "quota": "",
        "overall_topics": "",
        "person": "",
        "overall_entites": "",
        "overall_tonality": overall_tonality,
        "overall_wordcount": word_count,
        "article_subjectivity": "",
        "article_summary": "",
        "pub_date": pub_date,
        "publish_time": publish_time,
        "harvest_time": harvest_time,
        "temp_link": temp_link,
        "publish_source": publish_source,
        "programme": 'null',
        "feed_class": "News",
        "publishing_platform": "",
        "klout_score": "",
        "journalist": journalist,
        "headline": headline,
        "content": content,
        "source_headline": source_headline,
        "source_content": source_content,
        "language": language,
        "presence": 'null',
        "clip_type": 'null',
        "prog_slot": 'null',
        "op_ed": '0',
        "location_mention": '',
        "source_link": source_link,
        "author_contact": '',
        "author_emailid": '',
        "author_url": '',
        "city": '',
        "state": '',
        "country": country,
        "source": publish_source,
        "foot_fall": '',
        "created_on": created_on,
        "active": '1',
        'crawl_flag': 2,
        "images_path": images_path,
        "html_content": html_content,
    }

    cl_data.insert_one(clientdata)  # store the article document
    no_of_data += 1


# End-of-run report: one log line per counter, followed by the elapsed time.
logger.info('Iteration complete\n')

run_summary = (
    ('Number of data', no_of_data),
    ('Duplicate data', duplicate_data),
    ('Unable to article fetch url', unable_to_fetch_article_url),
    ('Skipped due to headline', skipped_due_to_headline),
    ('Skipped due to content', skipped_due_to_content),
    ('Skipped due to date', skipped_due_to_date),
)
for label, count in run_summary:
    logger.info(f'{label}: {count}\n')

# start_time was captured at the top of the script with time.time()
elapsed_seconds = time.time() - start_time
logger.info(f'Processing finished in {elapsed_seconds} seconds.\n')


https://www.rt.com/usa/520422-nude-photos-lawsuit-katie-hill/?utm_source=rss&utm_medium=rss&utm_campaign=RSS
Judge rules nude photos of ex-congresswoman Katie Hill, who resigned after sex scandal, are â€˜matter of public concernâ€™
https://www.rt.com/usa/520420-youtube-desantis-lockdown-video/?utm_source=rss&utm_medium=rss&utm_campaign=RSS
YouTube REMOVES roundtable video featuring Florida Gov. DeSantis &amp; anti-lockdown health experts, citing â€˜terms of serviceâ€™
https://www.rt.com/news/520419-state-department-iran-sanctions/?utm_source=rss&utm_medium=rss&utm_campaign=RSS
State Dept says US prepared to lift Iran sanctions, but warns of â€˜long road aheadâ€™ amid indirect talks to revive nuclear pact
https://www.rt.com/usa/520418-florida-police-identity-shootings/?utm_source=rss&utm_medium=rss&utm_campaign=RSS
â€˜Meets the definition of a crime victimâ€™: Florida court rules police can hide officersâ€™ identity after on-duty shootings
https://www.rt.com/usa/520413-twitch-deplatform

https://www.rt.com/news/520379-wef-smart-mask-future/?utm_source=rss&utm_medium=rss&utm_campaign=RSS
â€˜They want you in masks foreverâ€™: World Economic Forumâ€™s smart â€˜mask of the futureâ€™ tweet spooks Twitter users
https://www.rt.com/russia/520378-germany-ukraine-nato-unlikely/?utm_source=rss&utm_medium=rss&utm_campaign=RSS
NATO has an open door policy, but Ukraine unlikely to walk through it any time soon, German officials say amid Donbass escalations
https://www.rt.com/business/520368-air-passenger-traffic-fall/?utm_source=rss&utm_medium=rss&utm_campaign=RSS
No signs of demand recovery: Global air passenger traffic continues to fall, IATA says
https://www.rt.com/news/520374-turkey-aerobatic-jet-crash/?utm_source=rss&utm_medium=rss&utm_campaign=RSS
Pilot killed as jet from Turkish Air Force aerobatic team crashes during training
https://www.rt.com/news/520371-member-who-covid-19-study/?utm_source=rss&utm_medium=rss&utm_campaign=RSS
Inconclusive and terribly complex: Member of W

https://www.rt.com/uk/520333-extinction-rebellion-bank-barclays/?utm_source=rss&utm_medium=rss&utm_campaign=RSS
Extinction Rebellion activists arrested outside Barclays London HQ after breaking windows to protest bankâ€™s fossil fuel financing
https://www.rt.com/russia/520331-foreign-agents-election-participation/?utm_source=rss&utm_medium=rss&utm_campaign=RSS
â€˜Foreign agentsâ€™ who take cash from overseas will be allowed run in upcoming Russian elections, as long as they donâ€™t hide status
https://www.rt.com/business/520322-coinbase-record-revenue-nasdaq-listing/?utm_source=rss&utm_medium=rss&utm_campaign=RSS
Coinbase reports record $1.8 BILLION revenue for Q1 ahead of its Nasdaq listing
https://www.rt.com/uk/520330-britain-covid19-moderna-vaccine/?utm_source=rss&utm_medium=rss&utm_campaign=RSS
UK starts rollout of Moderna Covid-19 jab ahead of schedule, with 5,000 doses distributed to vaccination centres in Wales
https://www.rt.com/news/520329-iran-cargo-ship-attack/?utm_source=rs