In [1]:
# pip install feedparser SQLAlchemy pymysql celery spacy nltk

In [144]:
import feedparser
import logging
from datetime import datetime
from email.utils import parsedate_to_datetime

# Example RSS feed URLs
rss_feeds = [
    "http://rss.cnn.com/rss/cnn_topstories.rss",
    "http://qz.com/feed",
    "http://feeds.foxnews.com/foxnews/politics",
    "http://feeds.reuters.com/reuters/businessNews",
    "http://feeds.feedburner.com/NewshourWorld",
    "https://feeds.bbci.co.uk/news/world/asia/india/rss.xml"
]

def fetch_articles(feed_url):
    """Fetch articles from RSS feeds"""
    articles = []
    feed = feedparser.parse(feed_url)
    for entry in feed.entries:
        pub_date = None
        if 'published' in entry:
            try:
                pub_date = parsedate_to_datetime(entry.published)
            except Exception as e:
                print(f"Error parsing date: {e}")
                pub_date = datetime.utcnow()
        article = {
            "title": entry.get('title', ''),
            "content": entry.get('summary', ''),
            "pub_date": pub_date,
            "source_url": entry.get('link', '')
        }
        articles.append(article)
    return articles


In [145]:
articles = []
for feed_url in rss_feeds:
    try:
        articles.extend(fetch_articles(feed_url))
    except Exception as e:
        logging.error(f"Error parsing RSS feed {feed_url}: {str(e)}")

In [146]:
# unique_articles = []
# for article in articles:
#     if article not in unique_articles:
#         unique_articles.append(article)

In [147]:
# for article in unique_articles:
#     print(article)

In [148]:
import sqlalchemy
from sqlalchemy import create_engine, Column, Integer, String, Text, DateTime
from sqlalchemy.orm import declarative_base
from sqlalchemy.orm import sessionmaker
import urllib.parse
import mysql.connector

In [149]:
# !pip install mysql-connector-python

In [150]:
username = 'root'
password = 'Bhagavan%401074'
host = 'localhost'
database = 'news_classifier'
database_url = f'mysql+mysqlconnector://{username}:{password}@{host}/{database}'
engine = create_engine(database_url)
Base = declarative_base()

In [151]:
from datetime import datetime
# Define the NewsArticle model
class NewsArticle(Base):
    __tablename__ = 'news_articles'

    id = Column(Integer, primary_key=True)
    title = Column(Text, nullable=False)
    content = Column(Text, nullable=False)
    pub_date = Column(DateTime, default=datetime.utcnow)
    source_url = Column(Text, unique=True)
    category = Column(String(50),  nullable=True)

# Create the table
Base.metadata.create_all(engine)

# Insert articles into the database
Session = sessionmaker(bind=engine)
session = Session()

def store_articles(articles):
    """Store articles in the database"""
    for article in articles:
        existing_article = session.query(NewsArticle).filter_by(source_url=article['source_url']).first()
        if not existing_article:
            new_article = NewsArticle(
                title=article['title'],
                content=article['content'],
                pub_date=article['pub_date'],
                source_url=article['source_url']
            )
            session.add(new_article)
    session.commit()


In [152]:
import spacy

In [153]:
nlp = spacy.load("en_core_web_sm")

# Define categories based on keywords
category_keywords = {
    "Terrorism / protest / political unrest / riot": ["terrorism", "protest", "riot", "violence", "unrest", "attack", "militant", "demonstration", "march", "civil unrest",
            "uprising", "revolt", "election", "democracy", "government", "law", "court", "legal", "lawsuit", "trial", "judge",
            "politician", "policy", "voting", "defamation", "settlement", "democrats", "republicans"],
    "Positive/Uplifting": ["happy", "joy", "success", "celebration", "achievement", "inspiring", "good news", "uplifting", "positive", 
            "motivation", "hero", "progress", "hope", "victory", "breakthrough", "compassion", "love", "kindness", "charity", 
            "support", "peace", "recovery", "healing", "solution", "growth", "innovation", "discovery"],
    "Natural Disasters": ["earthquake", "flood", "hurricane", "disaster", "tsunami", "wildfire", "storm", "devastation", "natural disaster", 
            "catastrophe", "landslide", "volcano", "drought", "tornado", "blizzard", "cyclone", "evacuation", "emergency", 
            "destruction", "famine", "mudslide", "tidal wave", "eruption", "fire", "global warming", "climate change", 
            "heatwave", "hazard"]
}

def categorize_article(content):
    """Categorize the article based on keywords"""
    doc = nlp(content)
    for category, keywords in category_keywords.items():
        if any(keyword in doc.text.lower() for keyword in keywords):
            return category
    return "Others"


In [154]:
from celery import Celery

# Celery app setup
celery_app = Celery('rss_parser', broker='redis://localhost:6379/0')

# Task to process articles
@celery_app.task
def process_articles_task():
    """Fetch, store, and categorize articles"""
    # Fetch new articles from RSS feeds
    try:
        articles = []
        for feed_url in rss_feeds:
            try:
                articles.extend(fetch_articles(feed_url))
            except Exception as e:
                logging.error(f"Error parsing RSS feed {feed_url}: {str(e)}")
        store_articles(articles)
        print(session.query(NewsArticle).filter_by(category=None).all())
        for article in session.query(NewsArticle).filter_by(category=None).all():
            article.category = categorize_article(article.content)
            session.commit()
            logging.info("Articles successfully stored in the database.")
    except Exception as e:
        logging.error(f"Error while storing articles: {str(e)}")
        session.rollback()

In [None]:
if __name__ == "__main__":
    store_articles(articles)
    print("Article fetching and storing completed.")

In [157]:
process_articles_task()

[<__main__.NewsArticle object at 0x00000180FB8FB350>, <__main__.NewsArticle object at 0x00000180FB8ED490>, <__main__.NewsArticle object at 0x00000180FB8EC410>, <__main__.NewsArticle object at 0x00000180FB8EC200>, <__main__.NewsArticle object at 0x00000180FB8ED940>, <__main__.NewsArticle object at 0x00000180FB8EFB90>, <__main__.NewsArticle object at 0x00000180FB8ECB60>, <__main__.NewsArticle object at 0x00000180FB8EC6B0>, <__main__.NewsArticle object at 0x00000180FB8EE2A0>, <__main__.NewsArticle object at 0x00000180FB8EF050>, <__main__.NewsArticle object at 0x00000180FB8EED80>, <__main__.NewsArticle object at 0x00000180FB8EE210>, <__main__.NewsArticle object at 0x00000180FB8ED880>, <__main__.NewsArticle object at 0x00000180FB8EC650>, <__main__.NewsArticle object at 0x00000180FB8EEEA0>, <__main__.NewsArticle object at 0x00000180FB8ECD70>, <__main__.NewsArticle object at 0x00000180FB8ED8B0>, <__main__.NewsArticle object at 0x00000180FB8ED550>, <__main__.NewsArticle object at 0x00000180FB8

In [155]:
# if __name__ == "__main__":
#     process_articles_task.delay()
#     print("Article fetching and categorization process started.")

Article fetching and categorization process started.


In [14]:
# !npm install redis

In [None]:
# CREATE TABLE news_articles (
#     id INTEGER NOT NULL AUTO_INCREMENT PRIMARY KEY,
#     title TEXT NOT NULL,
#     content TEXT NOT NULL,
#     pub_date DATETIME,
#     source_url VARCHAR(255) NOT NULL UNIQUE,
#     category TEXT
#     );