In [1]:
import requests
import xml.etree.ElementTree as ET
import re
from datetime import datetime
from bs4 import BeautifulSoup
import time
import os
import schedule

In [2]:
# URL of the GEO news XML file
url = 'https://feeds.feedburner.com/geo/GiKR'
# Connection string handed to MongoClient in save_articles
MONGODB_URI = 'mongodb://localhost:27017/'

In [3]:
def get_xml(url, timeout=30):
    """Download the RSS feed and return its body as text.

    Args:
        url: Address of the feed to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely, which would
            stall every later scheduled run.

    Returns:
        The response text when the server answers with HTTP 200, otherwise
        ``None`` (non-200 status, connection error or timeout).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        # Network-level failures are reported the same way as a bad status code.
        print("could not fetch xml file : " , e)
        return None

    if response.status_code == 200:
        return response.text

    print("could not fetch xml file : " , response.status_code)
    return None

In [4]:
def preprocess_publish_date(date):
    """Parse an RSS publish-date string into a timezone-aware ``datetime``.

    Args:
        date: Text laid out like ``"Wed, 04 Sep 2024 13:36:00 +0500"``.
            Leading and trailing whitespace is ignored.

    Returns:
        The parsed ``datetime`` carrying the UTC offset given in the text.

    Raises:
        ValueError: If the text does not match the expected layout.
    """
    date_format = "%a, %d %b %Y %H:%M:%S %z"
    # strptime rejects surrounding whitespace/newlines, so trim the element text first.
    return datetime.strptime(date.strip() , date_format)

def preprocess_description(description):
    """Trim the text and blank out paragraph tags and ``&mdash;`` entities.

    Every match of the pattern is replaced by a single space; the surrounding
    text is left as it is.
    """
    markup_pattern = r'&mdash;|<p>|</p>|<p class="">'
    trimmed = description.strip()
    return re.sub(markup_pattern, ' ', trimmed)

def preprocess_img_url(img_url):
    """Return the ``src`` of the first ``<img>`` tag found in an HTML snippet.

    Args:
        img_url: HTML markup expected to contain an ``<img>`` element.

    Returns:
        The value of the tag's ``src`` attribute, or ``None`` when the markup
        has no ``<img>`` tag or the tag has no ``src`` attribute.
    """
    soup = BeautifulSoup(img_url , 'lxml')
    image_element = soup.find('img')
    if image_element is None:
        # find() yields None when nothing matches; indexing it would raise TypeError.
        return None
    # .get avoids a KeyError for an <img> without a src attribute.
    return image_element.get('src')

In [5]:
def extract_xml(root):
    """Turn the ``item`` elements of the parsed feed into article dictionaries.

    Args:
        root: Root element of the parsed feed; items are searched under its
            first child.

    Returns:
        A list of dicts with the keys ``title``, ``link``, ``image_url``,
        ``publish_date``, ``scraped_date``, ``source`` and ``description``.
        Items that cannot be parsed are skipped with a log line instead of
        aborting the whole batch.
    """
    news_articles = []
    for item in root[0].iter('item'):
        try:
            # Children are read by position — presumably title, link, publish
            # date and (at index 4) description; TODO confirm against the feed.
            title = item[0].text.strip()
            link = item[1].text
            publish_date = preprocess_publish_date(item[2].text)

            # The description text is expected to hold two lines: the first is
            # stored as image_url, the second is the description proper.
            description_and_image = item[4].text.strip().split("\n")
            # NOTE(review): stored as the raw first line; preprocess_img_url is
            # defined in this file but not applied here — confirm that is intended.
            image_url = description_and_image[0]
            description = preprocess_description(description_and_image[1])
        except (AttributeError, IndexError, TypeError, ValueError) as e:
            # Missing child/text, single-line description or unparsable date:
            # drop this item only, keep processing the rest of the feed.
            print("skipping malformed feed item : " , e)
            continue

        news_articles.append({"title":title ,
                              "link" :link ,
                              "image_url" : image_url ,
                              "publish_date":publish_date ,
                              "scraped_date": datetime.now(),
                              "source": "GEO" ,
                              "description":description })

    return news_articles
        

In [6]:
# This is for filtering out articles that have already been scraped if they appear again in the RSS feed

def find_disjoint(arr1, arr2):
    """Return the distinct elements of ``arr1`` that do not occur in ``arr2``.

    The result is built from a set, so duplicates are collapsed and the order
    of the returned list is not tied to the order of ``arr1``.
    """
    only_in_first = set(arr1) - set(arr2)
    print("disjoint : " , only_in_first )
    return list(only_in_first)

In [7]:
def filter_articles(articles):
    """Keep only the articles whose title is not listed in the cache file.

    The cache is ``geo_prev_scraped_articles.txt`` in the working directory,
    one title per line. When the file does not exist the error is printed and
    every article is kept.
    """
    cache_path = os.path.abspath('geo_prev_scraped_articles.txt')
    current_titles = [ entry['title'] for entry in articles ]

    try:
        with open(cache_path , 'r') as file:
            prev_titles = [ line.strip() for line in file.readlines() ]
        # Narrow the candidate titles to those not seen in the previous run.
        current_titles = find_disjoint(current_titles , prev_titles)
    except FileNotFoundError as e:
        print(f"Error: {e}")

    return [ entry for entry in articles if entry["title"] in current_titles ]


In [8]:
def extract_content(page):
    """Collect the article body text from a parsed article page.

    Looks for a ``div`` with class ``content-area`` and falls back to one with
    class ``long-content``. The text of every ``<p>`` inside it is joined with
    single spaces and non-breaking spaces are replaced by regular spaces.

    Args:
        page: Parsed page object exposing BeautifulSoup's ``find``.

    Returns:
        The joined paragraph text, or ``None`` when neither container exists
        or the extraction fails.
    """
    try:
        content_area = page.find('div' , class_="content-area")
        if content_area is None:
            content_area = page.find('div' , class_="long-content")

        if content_area is None:
            # Handle the "no container" case explicitly rather than through an
            # AttributeError raised on None.
            print("Unknown Error scraping : " , "no content container found")
            return None

        paragraphs = content_area.find_all('p')
        content_area_text = " ".join(paragraph.text for paragraph in paragraphs)
        return content_area_text.replace('\xa0' , " ")

    except Exception as e:
        # Best effort: a page that cannot be parsed must not stop the other articles.
        print("Unknown Error scraping : " , e)
        return None

def cache_articles(articles):
    """Overwrite the cache file with the title of each article, one per line.

    The file is ``geo_prev_scraped_articles.txt`` in the working directory and
    is truncated on every call.
    """
    cache_path = os.path.abspath('geo_prev_scraped_articles.txt')
    with open(cache_path , 'w') as file:
        file.writelines(entry["title"] + '\n' for entry in articles)

def scrape_articles(news_articles , old_news_articles , request_timeout=30 , delay_seconds=5):
    """Download each article page and attach its body text under ``content``.

    Args:
        news_articles: Article dicts to scrape; each must have a ``link``.
            The dicts are updated in place.
        old_news_articles: Articles whose titles are written to the cache file
            once all pages have been processed.
        request_timeout: Seconds to wait for each page. Without a timeout a
            stalled connection would block the run indefinitely.
        delay_seconds: Pause after each request.

    Returns:
        The same ``news_articles`` list, with ``content`` set on every entry
        (``None`` where extraction failed).
    """
    for news in news_articles:
        article_url = news["link"]
        print(article_url)
        page = requests.get(article_url , timeout=request_timeout)
        page_scraped = BeautifulSoup(page.text , "html.parser")
        news["content"] = extract_content(page_scraped)

        # Space out requests to the site.
        time.sleep(delay_seconds)

    # NOTE(review): the cache is written here, before the caller has saved the
    # articles; if saving fails these titles are still treated as already
    # scraped on the next run — confirm this is acceptable.
    cache_articles(old_news_articles)
    return news_articles


In [9]:
from pymongo import MongoClient

def save_articles(articles):
    """Insert the articles into the ``news_articles`` collection of ``neutra_news``.

    Args:
        articles: List of article dicts to insert. An empty list is reported
            and skipped without opening a database connection.

    Raises:
        Exception: Wraps any error raised while inserting; the original error
            is kept as the second argument and as ``__cause__``.
    """
    if len(articles) == 0:
        print('No articles to insert')
        return

    client = MongoClient(MONGODB_URI)
    try:
        database = client.get_database("neutra_news")
        news_articles = database.get_collection("news_articles")

        result = news_articles.insert_many(articles)

        print("Articles inserted : " , len(result.inserted_ids))

    except Exception as e:
        raise Exception("Unable to insert the articles due to the following error: ", e) from e

    finally:
        # Release the connection on success and on failure alike.
        client.close()


In [10]:
def scrape():
    """Run one scrape cycle: fetch the feed, parse it, scrape new articles, store them.

    Any exception raised along the way is printed rather than propagated, so a
    failing cycle does not stop later scheduled runs.
    """
    try:
        xml = get_xml(url)
        if xml is None:
            # get_xml has already reported the failed fetch; there is nothing
            # to parse, so skip this cycle instead of feeding None to the parser.
            return

        xml_root = ET.fromstring(xml)

        news_articles = extract_xml(xml_root)
        latest_news_articles = filter_articles(news_articles)
        scraped_news_articles = scrape_articles(latest_news_articles , news_articles)

        print("prev : " , len(news_articles))
        print("new : " , len(latest_news_articles))
        print('Time : ' , datetime.now().strftime("%A, %B %d, %Y %I:%M %p"))

        save_articles(scraped_news_articles)

    except Exception as e:
        print("Unknown Error : " , e)
    

In [11]:
# Run a single scrape cycle right away.
scrape()

disjoint :  {'Light to moderate isolated rains may lash parts of Karachi today', 'Did CJP postpone meeting of JC in anticipation of judicial package for his extension?', 'Envoy Rizwan Saeed, Congressman Tom Suozzi discuss Pak-US bilateral relations', "US stands 'shoulder-to-shoulder' with Pakistan in fight against terrorism: Miller", 'Non-bailable arrest warrants issued against KP CM Ali Amin Gandapur'}
https://www.geo.tv/latest/562543-non-bailable-arrest-warrants-issued-against-kp-cm-ali-amin-gandapur
https://www.geo.tv/latest/562498-light-to-moderate-isolated-rains-may-lash-parts-of-karachi-today
https://www.geo.tv/latest/562495-legal-expert-weighs-in-on-rumours-about-postponement-of-jcp-meeting-by-cjp-isa
https://www.geo.tv/latest/562487-amb-rizwan-saeed-rep-tom-suozzi-discuss-pak-us-bilateral-relations
https://www.geo.tv/latest/562484-balochistan-attacks-us-reaffirms-support-to-pakistan-in-fight-against-terrorism
prev :  20
new :  5
Time :  Wednesday, September 04, 2024 01:36 PM
Ar

In [None]:
# Register scrape() with the schedule library to run every hour.
schedule.every(1).hour.do(scrape)

# Poll the scheduler once per second. The loop has no exit condition, so this
# cell keeps running until it is interrupted.
while True:
    schedule.run_pending()
    time.sleep(1)
