<a href="https://colab.research.google.com/github/Arnav-Ajay/Article-Parsing-System/blob/main/Article_Parsing_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Article Parsing System for Neural Network Processing

1. Fetch the RSS feed from https://cointelegraph.com/rss.
2. Extract the required attributes from each entry and save them to a JSON file.
3. Create a table in the SQLite database if it does not already exist.
4. Compare the entries in the JSON file with the rows already stored in the database.
5. Insert any new entries into the database.

In [2]:
! pip install feedparser



In [113]:
import pandas as pd
import feedparser
import sqlite3
import json

## Functions

In [144]:
# Exploratory look at the feed-level metadata returned by feedparser.
# `url` is assigned here because the pipeline cell that also sets it sits
# further down the notebook; without this the cell fails on a fresh kernel.
url = "https://cointelegraph.com/rss"
feed = feedparser.parse(url)
feed.feed

{'title': 'Cointelegraph.com News',
 'title_detail': {'type': 'text/plain',
  'language': None,
  'base': 'https://cointelegraph.com/rss',
  'value': 'Cointelegraph.com News'},
 'links': [{'href': 'https://cointelegraph.com/rss',
   'rel': 'self',
   'type': 'application/rss+xml'},
  {'rel': 'alternate',
   'type': 'text/html',
   'href': 'https://cointelegraph.com'}],
 'link': 'https://cointelegraph.com',
 'subtitle': 'Cointelegraph covers fintech, blockchain and Bitcoin bringing you the latest crypto news and analyses on the future of money.',
 'subtitle_detail': {'type': 'text/html',
  'language': None,
  'base': 'https://cointelegraph.com/rss',
  'value': 'Cointelegraph covers fintech, blockchain and Bitcoin bringing you the latest crypto news and analyses on the future of money.'},
 'generator_detail': {'name': 'cointelegraph.com'},
 'generator': 'cointelegraph.com',
 'updated': 'Mon, 16 Dec 2024 11:21:22 +0000',
 'updated_parsed': time.struct_time(tm_year=2024, tm_mon=12, tm_mday

In [4]:
# Get data from an RSS feed - modified to incorporate the Cointelegraph RSS feed
def fetch_rss_feed(url):
  """
  Fetches and parses an RSS feed from the given URL

  Args:
    url (str): The URL of the RSS feed.

  Returns:
    dict: ``{"feed": {...}, "entries": [...]}``. "feed" holds the feed-level
      title, link, description and updated values. Each entry is a dict of
      strings with the keys title, author, article_text, publication_date,
      article_link, tags, inbound_links and outbound_links.
      An empty dict is returned when feedparser flags the feed (``bozo``)
      or when any other error occurs; the problem is printed in both cases.
  """
  try:
    # Parse the RSS feed
    feed = feedparser.parse(url)
    if feed.bozo:
      # If there was a problem with the feed
      print(f"Error reading feed: {feed.bozo_exception}")
      return {}

    # Extract all feed data
    feed_data = {
        "feed": {
            "title": feed.feed.get("title", "No Title"),
            "link": feed.feed.get("link", "No Link"),
            "description": feed.feed.get("description", "No Description"),
            "updated": feed.feed.get("updated", "No Update Time"),
        },
        "entries": [],
    }
    # An entry link counts as inbound when it starts with the feed's own link.
    # Fall back to the feed URL when the link is missing or empty, because an
    # empty prefix would match every link.
    feed_host = feed.feed.get("link") or url

    for entry in feed.entries:
      entry_link = entry.get("link", "")
      is_inbound = bool(entry_link) and entry_link.startswith(feed_host)
      inbound_links = entry_link if is_inbound else ""
      outbound_links = "" if is_inbound else entry_link

      # Entries without tags must not abort the whole feed: collect the tag
      # terms when present and fall back to the "No Tags" placeholder.
      tag_terms = [
          str(tag.get("term"))
          for tag in (entry.get("tags") or [])
          if tag.get("term")
      ]

      # Extract all Entries
      feed_data["entries"].append({
          "title": entry.get("title", "No Title"),
          "author": entry.get("author", "No Author"),
          "article_text": entry.get("description", "No Description"),
          "publication_date": entry.get("published", "No Published Date"),
          "article_link": entry.get("link", "No Link"),
          "tags": ", ".join(tag_terms) if tag_terms else "No Tags",
          "inbound_links": inbound_links,
          "outbound_links": outbound_links,
      })
    return feed_data

  except Exception as e:
    # Best-effort contract: report the problem and hand back an empty result.
    print(f"An error occurred: {e}")
    return {}

In [124]:
def create_table_from_json(cursor, table_name, json_data):
    """
    Create a table in SQLite whose columns mirror the keys of the first entry.

    Every column is declared as TEXT. The statement uses
    ``CREATE TABLE IF NOT EXISTS``, so calling this for an existing table is
    a no-op.

    Args:
        cursor: SQLite cursor object.
        table_name (str): Name of the table to create. It is interpolated
            into the SQL text unchanged, so it must come from trusted code.
        json_data (dict): Parsed feed data holding an "entries" list of
            dictionaries.

    Raises:
        ValueError: If json_data is empty or contains no entries from which
            the columns could be derived.
    """
    if not json_data:
        raise ValueError(f"JSON data is empty or invalid in file {json_data}")

    entries = json_data.get("entries")
    if not entries:
        raise ValueError("JSON data has no entries to derive table columns from")

    # Column names come from the first record. They are double-quoted (with
    # embedded quotes doubled) so keys that are SQL keywords or contain
    # spaces still form a valid statement.
    column_definitions = ", ".join(
        '"' + str(col).replace('"', '""') + '" TEXT' for col in entries[0].keys()
    )
    create_table_query = f"CREATE TABLE IF NOT EXISTS {table_name} ({column_definitions})"

    # Execute the CREATE TABLE query
    cursor.execute(create_table_query)

In [126]:
def insert_data_into_table(cursor, table_name, json_data):
    """
    Insert JSON records into an SQLite table.

    The column list is taken from the first record. Every record is then
    bound by column name, so a different key order cannot shift values into
    the wrong column, and a missing key is stored as NULL.

    Args:
        cursor: SQLite cursor object.
        table_name (str): Name of the table to insert data into. It is
            interpolated into the SQL text unchanged, so it must come from
            trusted code.
        json_data (list): List of dictionaries, one per row to insert.

    Raises:
        ValueError: If json_data is empty.
    """
    if not json_data:
        raise ValueError("JSON data is empty or invalid - No Data to insert")

    # Extract column names from the first record.
    columns = list(json_data[0].keys())
    # Double-quote identifiers so keys that are SQL keywords remain valid.
    column_names = ", ".join('"' + str(col).replace('"', '""') + '"' for col in columns)
    placeholders = ", ".join("?" for _ in columns)

    # Prepare the INSERT query; the values themselves are bound parameters.
    insert_query = f"INSERT INTO {table_name} ({column_names}) VALUES ({placeholders})"

    # Insert every record in one call, looking values up by column name.
    cursor.executemany(
        insert_query,
        ([record.get(col) for col in columns] for record in json_data),
    )

In [132]:
def validate_new_entries(cursor, table_name, json_data):
  """
  Return the JSON entries whose title is not yet stored in the table.

  Args:
    cursor: SQLite cursor object.
    table_name (str): Name of the table to compare against. It is
      interpolated into the SQL text unchanged, so it must come from
      trusted code.
    json_data (dict): Parsed feed data holding an "entries" list of
      dictionaries.

  Returns:
    list: The entries from json_data["entries"] whose "title" does not match
      any title already present in the table. All entries are returned when
      the table is empty or has no "title" column.
  """
  # Fetch all data from the table
  query = f"SELECT * FROM {table_name}"
  cursor.execute(query)
  rows = cursor.fetchall()
  headers = [description[0] for description in cursor.description]

  # Titles already stored in the database are the de-duplication key.
  if "title" in headers:
    title_index = headers.index("title")
    existing_keys = {row[title_index] for row in rows}
  else:
    existing_keys = set()

  # Keep only the entries the database has not seen yet.
  new_data = [
      entry for entry in json_data['entries']
      if entry.get('title') not in existing_keys
  ]
  print(f"New Entries: {len(new_data)}")
  return new_data

In [128]:
def load_json_to_sqlite(json_file, db_file, table_name):
    """
    Load data from a JSON file into an SQLite database.

    The table is created when missing, the JSON entries are compared with
    the rows already stored, and only the new entries are inserted. Any
    error is printed rather than raised.

    Args:
        json_file (str): Path to the JSON file (read as UTF-8).
        db_file (str): Path to the SQLite database file.
        table_name (str): Name of the table to create/insert data into.
    """
    # Bound before the try block so the finally clause can always test it,
    # even when sqlite3.connect itself fails.
    conn = None
    try:
        # Connect to SQLite database
        conn = sqlite3.connect(db_file)
        cursor = conn.cursor()

        # Load JSON data; the snapshot is written as UTF-8, so read it the same way.
        with open(json_file, "r", encoding="utf-8") as f:
            json_data = json.load(f)

        # Create the table and insert only the entries not stored yet.
        create_table_from_json(cursor, table_name, json_data)

        new_entries = validate_new_entries(cursor, table_name, json_data)
        if new_entries:
            insert_data_into_table(cursor, table_name, new_entries)

        conn.commit()

    except Exception as e:
        print(f"Error: {e}")

    finally:
        # Close the database connection
        if conn is not None:
            conn.close()

In [130]:
def read_data_from_table(db_file, table_name):
    """
    Reads all data from a specified SQLite table.

    Args:
        db_file (str): Path to the SQLite database file.
        table_name (str): Name of the table to read data from. It is
            interpolated into the SQL text unchanged, so it must come from
            trusted code.

    Returns:
        list: A list of rows from the table as dictionaries keyed by column
            name. An empty list is returned (and the error printed) when the
            database cannot be opened or the query fails.
    """
    # Bound before the try block so the finally clause can always test it,
    # even when sqlite3.connect itself fails.
    conn = None
    try:
        # Connect to the SQLite database
        conn = sqlite3.connect(db_file)
        cursor = conn.cursor()

        # Fetch all data from the table
        query = f"SELECT * FROM {table_name}"
        cursor.execute(query)

        # Pair every row with the column names reported by the cursor.
        headers = [description[0] for description in cursor.description]
        return [dict(zip(headers, row)) for row in cursor.fetchall()]

    except Exception as e:
        print(f"Error: {e}")
        return []

    finally:
        # Close the database connection
        if conn is not None:
            conn.close()

## Main

In [137]:
import os

# Report whether a previously saved feed snapshot is already on disk.
file_path = "/content/feed_data.json"
print(
    f"{file_path} exists."
    if os.path.exists(file_path)
    else f"{file_path} does not exist."
)

/content/feed_data.json does not exist.


In [143]:
# Pipeline configuration: feed source, JSON snapshot and SQLite target.
url = "https://cointelegraph.com/rss"
json_file_path = "/content/feed_data.json"
sqlite_db_path = "/content/data.db"
table_name = "example_table"

# fetch_rss_feed returns {} when the feed cannot be read, so fall back to an
# empty entry list instead of indexing "entries" directly.
data = fetch_rss_feed(url)
fetched_entries = data.get("entries", [])

if os.path.exists(json_file_path):
  # Merge the freshly fetched entries into the saved snapshot, keyed by title.
  with open(json_file_path, "r", encoding="utf-8") as json_file:
    existing_data = json.load(json_file)
  existing_entries = existing_data.setdefault("entries", [])
  print(len(existing_entries))

  existing_keys = {entry['title'] for entry in existing_entries if 'title' in entry}
  for entry in fetched_entries:
    title = entry.get('title')
    if title not in existing_keys:
      # "entries" is a list, so new records are appended to it.
      existing_entries.append(entry)
      existing_keys.add(title)
  print(len(existing_entries))

  with open(json_file_path, "w", encoding="utf-8") as json_file:
    json.dump(existing_data, json_file, indent=4, ensure_ascii=False)
  print(f"Feed data successfully saved to {json_file_path}")

elif fetched_entries:
  # First run: the fetched feed becomes the snapshot.
  with open(json_file_path, "w", encoding="utf-8") as json_file:
    json.dump(data, json_file, indent=4, ensure_ascii=False)
  print(f"Feed data successfully saved to {json_file_path}")

else:
  # Do not persist an empty result as the snapshot.
  print("No feed entries were fetched; nothing was saved.")

if os.path.exists(json_file_path):
  load_json_to_sqlite(json_file_path, sqlite_db_path, table_name)

# Fetch all data from the table
data = read_data_from_table(sqlite_db_path, table_name)
df_data = pd.DataFrame(data)
df_data.describe()

30
30
Feed data successfully saved to /content/feed_data.json
New Entries: 0


Unnamed: 0,title,author,article_text,publication_date,article_link,tags,inbound_links,outbound_links
count,30,30,30,30,30,30,30,30.0
unique,30,17,30,30,30,30,30,1.0
top,Bitcoin to gold ratio posts new record as BTC ...,Cointelegraph by Ciaran Lyons,"<p style=""float: right; margin: 0 0 10px 15px;...","Mon, 16 Dec 2024 09:22:37 +0000",https://cointelegraph.com/news/bitcoin-to-gold...,"Gold, Bitcoin, Bitcoin-to-gold ratio, Dollar, ...",https://cointelegraph.com/news/bitcoin-to-gold...,
freq,1,5,1,1,1,1,1,30.0
