In [1]:
import requests
import xml.etree.ElementTree as ET
import re
from datetime import datetime
from bs4 import BeautifulSoup
import time
import os
import schedule

In [2]:
# URL of the Dawn news RSS feed (XML)
url = 'https://www.dawn.com/feeds/home'
# Connection string passed to MongoClient when scraped articles are saved
MONGODB_URI = 'mongodb://localhost:27017/'

In [3]:
def get_xml(url, timeout=30):
    """Download the document at ``url`` and return its body as text.

    Args:
        url: Address of the feed to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely, which would stall
            every later scheduled run.

    Returns:
        The response body as a string when the server answers with HTTP 200.
        ``None`` when the request fails or any other status is returned; the
        reason is printed in that case.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as error:
        # Report connection errors / timeouts the same way as a bad status code
        print("could not fetch xml file : " , error)
        return None

    if response.status_code == 200:
        return response.text

    print("could not fetch xml file : " , response.status_code)
    return None

In [4]:
# One-off fetch of the feed so the parsed tree can be inspected interactively.
xml = get_xml(url)
# get_xml returns None (after printing the reason) when the download fails;
# do not hand None to the parser, which would raise an unrelated TypeError.
xml_root = ET.fromstring(xml) if xml is not None else None

In [5]:
def preprocess_publish_date(date):
    """Parse an RSS ``pubDate`` string into a timezone-aware ``datetime``.

    The expected layout is ``"Mon, 02 Sep 2024 10:15:30 +0500"``
    (weekday, day, month name, year, time, numeric UTC offset).
    ``datetime.strptime`` raises ``ValueError`` for any other layout.
    """
    return datetime.strptime(date, "%a, %d %b %Y %H:%M:%S %z")

def _item_text(item, tag):
    """Return the text of the first ``tag`` child of ``item``, or None if it is absent."""
    node = item.find(tag)
    return node.text if node is not None else None


def extract_xml(root):
    """Collect the 'Pakistan' category articles from a parsed RSS tree.

    Args:
        root: ElementTree element containing ``<item>`` descendants.

    Returns:
        A list of dicts with the keys ``title``, ``link``, ``publish_date``,
        ``scraped_date``, ``source``, ``image_url`` and ``description``.
        ``publish_date`` is None when the item has no ``pubDate`` and
        ``image_url`` is None when the item has no ``media:content`` element.
        Items without a title or link are skipped.

    Raises:
        ValueError: if a kept item's ``pubDate`` does not match the layout
            expected by ``preprocess_publish_date``.
    """
    news_articles = []
    namespaces = {'media': 'http://search.yahoo.com/mrss/'}

    for item in root.findall('.//item'):
        # Missing elements yield '' instead of raising AttributeError
        title = (_item_text(item, 'title') or '').strip()
        link = (_item_text(item, 'link') or '').strip()

        # Every feed link is printed, including those filtered out below
        print(link)

        # Filter first so discarded items are never date-parsed or HTML-parsed
        category = (_item_text(item, 'category') or '').strip()
        if category != 'Pakistan':
            continue

        # The title is the de-duplication key downstream, so it is required
        if not title or not link:
            continue

        raw_date = _item_text(item, 'pubDate')
        publish_date = preprocess_publish_date(raw_date.strip()) if raw_date else None

        # Not every item carries a media:content element
        media = item.find('media:content', namespaces)
        image_url = media.get('url') if media is not None else None

        # The description is HTML; keep only its text, flattened onto one line
        description_html = BeautifulSoup(_item_text(item, 'description') or '', 'lxml')
        description = description_html.get_text().replace('\n', ' ')

        news_articles.append({"title": title,
                              "link": link,
                              "publish_date": publish_date,
                              "scraped_date": datetime.now(),
                              "source": "DAWN",
                              "image_url": image_url,
                              "description": description})

    return news_articles


In [6]:
# This is for filtering out articles that have already been scraped if they appear again in the RSS feed

def find_disjoint(arr1, arr2):
    """Return the distinct elements of ``arr1`` that do not occur in ``arr2``.

    Both inputs are converted to sets, so duplicates collapse and the order
    of the returned list is not defined.
    """
    only_in_first = set(arr1) - set(arr2)
    return list(only_in_first)

def filter_articles(articles):
    """Return the articles whose titles are not in the previous-scrape cache.

    Previously seen titles are read, one per line, from
    ``dawn_prev_scraped_articles.txt`` in the current working directory.
    When that file does not exist the error is printed and every article
    is kept.
    """
    try:
        current_titles = []
        for entry in articles:
            current_titles.append(entry['title'])

        with open(os.path.abspath('dawn_prev_scraped_articles.txt') , 'r') as cache:
            prev_titles = [line.strip() for line in cache]

        # Keep only the titles that were not recorded last time
        current_titles = find_disjoint(current_titles , prev_titles)

    except FileNotFoundError as e:
        print(f"Error: {e}")

    unseen = []
    for entry in articles:
        if entry["title"] in current_titles:
            unseen.append(entry)

    return unseen

def cache_articles(articles):
    """Overwrite the title cache file with the titles of ``articles``.

    One title is written per line to ``dawn_prev_scraped_articles.txt`` in
    the current working directory; any previous content is replaced.
    """
    cache_path = os.path.abspath('dawn_prev_scraped_articles.txt')
    with open(cache_path , 'w') as cache:
        cache.writelines(entry["title"] + '\n' for entry in articles)

In [7]:
from pymongo import MongoClient

def save_articles(articles):
    """Insert ``articles`` into the ``neutra_news.news_articles`` collection.

    Args:
        articles: List of article dicts to insert. An empty list is a no-op
            and no database connection is opened for it.

    Raises:
        Exception: if the insert fails; the original error is attached both
            as the second argument and as ``__cause__``.
    """
    # Check for an empty batch first so no client is opened (and leaked) for it
    if(len(articles) == 0):
        print('No articles to insert.')
        return

    client = MongoClient(MONGODB_URI)

    try:
        database = client.get_database("neutra_news")
        news_articles = database.get_collection("news_articles")

        result = news_articles.insert_many(articles)

        print("Articles inserted : " , len(result.inserted_ids))

    except Exception as e:
        raise Exception("Unable to insert the articles due to the following error: ", e) from e

    finally:
        # Release the connection on both the success and the failure path
        client.close()

In [8]:
def scrape():
    """Run one scrape cycle: fetch the feed, keep unseen articles, store them.

    Steps: download and parse the feed, extract the articles, drop those
    whose titles are in the title cache, insert the remainder into MongoDB,
    then rewrite the title cache with the titles from this run. If the feed
    cannot be downloaded the cycle is skipped.
    """
    xml = get_xml(url)
    if xml is None:
        # get_xml has already printed the reason; there is nothing to parse
        return

    xml_root = ET.fromstring(xml)

    news_articles = extract_xml(xml_root)
    latest_news_articles = filter_articles(news_articles)

    # Save before updating the cache: if the insert raises, the cache still
    # describes the previous run, so these articles are retried next time
    # instead of being marked as seen without ever being stored.
    save_articles(latest_news_articles)
    cache_articles(news_articles)

    print("prev : " , len(news_articles))
    print("new : " , len(latest_news_articles))
    print('Time : ' , datetime.now().strftime("%A, %B %d, %Y %I:%M %p"))
    
    

In [9]:
# Run one scrape cycle immediately; extract_xml prints each feed item's link as it goes.
scrape()

https://www.dawn.com/news/1856848/us-to-continue-to-stand-shoulder-to-shoulder-with-pakistan-against-terrorism
https://www.dawn.com/news/1856793/vpn-users-actually-fared-better-during-internet-slowdown-report
https://www.dawn.com/news/1856789/ecp-refuses-to-share-financial-details-on-election-expenses-with-senate-panel
https://www.dawn.com/news/1856857/
https://www.dawn.com/news/1856749/sc-reserves-decision-on-review-pleas-in-monal-restaurant-case
https://www.dawn.com/news/1856811/rumours-swirl-after-imran-moves-ihc-against-possible-military-trial
https://www.dawn.com/news/1856809/allies-disagree-as-na-discusses-thorny-issues
https://www.dawn.com/news/1856761/provincial-assembly-demands-expulsion-of-all-illegal-immigrants-from-sindh
https://www.dawn.com/news/1856765/man-in-karachi-convicted-of-sharing-wifes-explicit-photos-on-whatsapp
https://www.dawn.com/news/1856763/sindh-govt-increases-monthly-octroi-zilla-tax-share-of-ucs-across-province
https://www.dawn.com/news/1856868/bangladesh

In [None]:
# Tag the hourly job and first drop any job carrying the same tag, so that
# re-running this cell in the same kernel does not register scrape twice.
schedule.clear('dawn-scrape')
schedule.every(1).hour.do(scrape).tag('dawn-scrape')

# Poll the scheduler once per second; runs until the kernel is interrupted.
while True:
    schedule.run_pending()
    time.sleep(1)