<a href="https://colab.research.google.com/github/Arnav-Ajay/Article-Parsing-System/blob/main/Article_Parsing_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Article Parsing System for Neural Network Processing

A system for collecting articles and storing them in a database that will be compatible with neural network processing.

The system uses JSON for article storage and SQLite as the database.

**Initial Source:** https://cointelegraph.com/  or  https://cointelegraph.com/rss

**Database Fields:**
- Title
- Author
- Publication date
- Article text
- Article URL
- Outbound links within the article
- Internal links (links to other articles on the same website)
- Tags

## Libraries

In [1]:
! pip install feedparser
! pip install cloudscraper

Collecting feedparser
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting sgmllib3k (from feedparser)
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading feedparser-6.0.11-py3-none-any.whl (81 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.3/81.3 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: sgmllib3k
  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone
  Created wheel for sgmllib3k: filename=sgmllib3k-1.0.0-py3-none-any.whl size=6047 sha256=e84c1780839e1290eb5cd5476e17fc1b6561b8d3d7d825c986f9c942ccbd5bca
  Stored in directory: /root/.cache/pip/wheels/f0/69/93/a47e9d621be168e9e33c7ce60524393c0b92ae83cf6c6e89c5
Successfully built sgmllib3k
Installing collected packages: sgmllib3k, feedparser
Successfully installed feedparser-6.0.11 sgmllib3k-1.0.0
Collecting cloudscraper
  Downloading cloudscraper-1.2.71-py2.py3-none-any.whl.m

In [4]:
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
import pandas as pd
import cloudscraper
import feedparser
import sqlite3
import json
import os

## Functions

#### RSS Feed

Check https://cointelegraph.com/robots.txt for their policy.

In [5]:
# extracting text from the article
def extract_main_content(response_text):
    soup = BeautifulSoup(response_text, 'html.parser')

    # Remove unwanted scripts, styles, and other tags
    for tag in soup(['script', 'style', 'header', 'footer', 'nav']):
        tag.decompose()

    # Extract visible text from the main content area (e.g., <article>, <div>, etc.)
    main_content = soup.find('article') or soup.find('div', {'id': 'content'}) or soup.body
    text_content = main_content.get_text(separator='\n', strip=True) if main_content else "No main content found"

    return text_content

In [6]:
# extracting inbound and outbound links
def extract_links(response_text, base_url):
    soup = BeautifulSoup(response_text, 'html.parser')

    # Find all <a> tags with href attributes
    links = [a.get('href') for a in soup.find_all('a', href=True)]

    # Classify links as inbound or outbound
    inbound_links = []
    outbound_links = []

    base_domain = urlparse(base_url).netloc

    for link in links:
        full_url = urljoin(base_url, link)  # Handle relative links
        link_domain = urlparse(full_url).netloc

        if link_domain == base_domain:
            inbound_links.append(full_url)
        else:
            outbound_links.append(full_url)

    return inbound_links, outbound_links

In [9]:
# get data from a RSS feed - Modified to incorporate Cointelegraph RSS feed
def fetch_rss_feed(url):
  """
  Fetches and parses an RSS feed from the given URL

  Args:
    url (str): The URL of the RSS feed.

  Returns:
    dict: A dictionary containing the feed data.
  """
  try:
    # Parse the RSS feed
    feed = feedparser.parse(url)
    if feed.bozo:
      # If there was a problem with the feed
      print(f"Error reading feed: {feed.bozo_exception}")
      return {}

    # Extract all feed data
    feed_data = {
        "feed": {
            "title": feed.feed.get("title", "No Title"),
            "link": feed.feed.get("link", "No Link"),
            "description": feed.feed.get("description", "No Description"),
            "updated": feed.feed.get("updated", "No Update Time"),
          },
          "entries": []
    }
    feed_host = feed.feed.link if "link" in feed.feed else url

    for entry in feed.entries:
      # Initialize sets for unique links
      inbound_links = set()
      outbound_links = set()

      # looping over each entry to retrieve outbound and inbound links
      entry_link = entry.get("link", "")
      if entry_link.startswith(feed_host):
        inbound_links.add(entry_link)
      else:
        outbound_links.add(entry_link)

      print(f'Entry Link: {entry_link}')

      scraper = cloudscraper.create_scraper()
      response = scraper.get(entry_link)

      if response.status_code == 200:
        print("Request successful!")
      else:
        print(f"Failed with status code: {response.status_code}")

      response_text = response.text
      main_text = extract_main_content(response_text)

      base_url = "https://cointelegraph.com/"  # Replace with the actual URL of the page
      inbound, outbound = extract_links(response_text, base_url)

      # Extractive only the tags of each entry
      tags = entry.get("tags", "No Tags") if "tags" in entry else "No Tags",
      str_tags = [item.term for item in tags[0]]
      # Extract all Entries
      feed_data["entries"].append({
          # Source (media outlet name)
          "title": entry.get("title", "No Title"),
          "author": entry.get("author", "No Author"),
          # "article_text": entry.get("description", "No Description"),
          "article_text": main_text,
          "publication_date": entry.get("published", "No Published Date"),
          "article_link": entry.get("link", "No Link"),
          "tags": ', '.join(str(x) for x in str_tags),
          "inbound_links" : ', '.join(str(x) for x in inbound),
          "outbound_links" : ', '.join(str(x) for x in outbound),
      })
    return feed_data

  except Exception as e:
    print(f"An error occurred: {e}")
    return {}

#### SQLite Database

In [12]:
def create_table_from_json(cursor, table_name, json_data):
    """
    Create a table in SQLite based on JSON data structure.

    Args:
        cursor: SQLite cursor object.
        table_name (str): Name of the table to create.
        json_data (list): List of dictionaries representing the JSON data.
    """
    if not json_data:
        raise ValueError(f"JSON data is empty or invalid in file {json_data}")

    # Extract column names and types from the first record in the JSON data
    columns = json_data["entries"][0].keys()

    # Define a SQLite CREATE TABLE query with dynamic columns
    column_definitions = ", ".join([f"{col} TEXT" for col in columns])
    create_table_query = f"CREATE TABLE IF NOT EXISTS {table_name} ({column_definitions})"

    # Execute the CREATE TABLE query
    cursor.execute(create_table_query)

In [13]:
def insert_data_into_table(cursor, table_name, json_data):
    """
    Insert JSON data into an SQLite table.

    Args:
        cursor: SQLite cursor object.
        table_name (str): Name of the table to insert data into.
        json_data (list): List of dictionaries representing the JSON data.
    """
    if not json_data:
        raise ValueError("JSON data is empty or invalid - No Data to insert")

    # Extract column names
    columns = json_data[0].keys()
    column_names = ", ".join(columns)
    placeholders = ", ".join(["?"] * len(columns))

    # Prepare the INSERT query
    insert_query = f"INSERT INTO {table_name} ({column_names}) VALUES ({placeholders})"

    # Insert each record
    for record in json_data:
        values = tuple(record.values())
        cursor.execute(insert_query, values)

In [14]:
def validate_new_entries(cursor, table_name, json_data):
  new_data = {}

  # Fetch all data from the table
  query = f"SELECT * FROM {table_name}"
  cursor.execute(query)

  rows = [row for row in cursor.fetchall()]
  if len(rows) == 0:
    new_data = json_data['entries']
    print(f"New Entries: {len(new_data)}")
    return new_data

  # Extract existing keys from db for quick lookup
  existing_keys = {entry['title'] for entry in json_data['entries'] if 'title' in entry}

  # Add new entries from json_data to new_data if their unique_key value is not in db_data
  for entry in json_data['entries']:
    if entry.get('title') not in existing_keys:
      new_data.update(entry)
  print(f"New Entries: {len(new_data)}")
  return new_data

In [15]:
def load_json_to_sqlite(json_file, db_file, table_name):
    """
    Load data from a JSON file into an SQLite database.

    Args:
        json_file (str): Path to the JSON file.
        db_file (str): Path to the SQLite database file.
        table_name (str): Name of the table to create/insert data into.
    """
    try:

        # Connect to SQLite database
        conn = sqlite3.connect(db_file)
        cursor = conn.cursor()

        # Load JSON data
        with open(json_file, "r") as f:
            json_data = json.load(f)

        # Create the table and insert data
        create_table_from_json(cursor, table_name, json_data)

        json_data = validate_new_entries(cursor, table_name, json_data)
        if json_data:
          insert_data_into_table(cursor, table_name, json_data)

        conn.commit()

    except Exception as e:
        print(f"Error: {e}")

    finally:
        # Close the database connection
        if conn:
            conn.close()

In [16]:
def read_data_from_table(db_file, table_name):
    """
    Reads all data from a specified SQLite table.

    Args:
        db_file (str): Path to the SQLite database file.
        table_name (str): Name of the table to read data from.

    Returns:
        list: A list of rows from the table as dictionaries.
    """
    try:
        # Connect to the SQLite database
        conn = sqlite3.connect(db_file)
        # conn.row_factory = sqlite3.Row
        cursor = conn.cursor()

        # Fetch all data from the table
        query = f"SELECT * FROM {table_name}"
        cursor.execute(query)

        headers = [description[0] for description in cursor.description]
        # Convert rows to a list of dictionaries
        rows = [row for row in cursor.fetchall()]
        return [dict(zip(headers, row)) for row in rows]

    except Exception as e:
        print(f"Error: {e}")
        return []

    finally:
        # Close the database connection
        if conn:
            conn.close()

## Main

In [10]:
# Global Variables and fetching RSS Feed
url = "https://cointelegraph.com/rss"
json_file_path = "/content/feed_data.json"
sqlite_db_path = "/content/data.db"
table_name = "example_table"

data = fetch_rss_feed(url)

Entry Link: https://cointelegraph.com/news/defi-security-improvements-vs-cefi-losses-2024?utm_source=rss_feed&utm_medium=rss&utm_campaign=rss_partner_inbound
Request successful!
Entry Link: https://cointelegraph.com/news/what-happened-in-crypto-today?utm_source=rss_feed&utm_medium=rss&utm_campaign=rss_partner_inbound
Request successful!
Entry Link: https://cointelegraph.com/news/why-is-solana-price-up-this-week?utm_source=rss_feed&utm_medium=rss&utm_campaign=rss_partner_inbound
Request successful!
Entry Link: https://cointelegraph.com/news/bitcoin-bull-market-over-decembear-has-only-sent-btc-price-2-lower?utm_source=rss_feed&utm_medium=rss&utm_campaign=rss_partner_inbound
Request successful!
Entry Link: https://cointelegraph.com/news/strange-crazy-weird-crypto-stories-2024?utm_source=rss_feed&utm_medium=rss&utm_campaign=rss_partner_inbound
Request successful!
Entry Link: https://cointelegraph.com/news/uae-retail-investors-crypto-investments-etoro-survey-2025?utm_source=rss_feed&utm_med

In [11]:
# This block has not been validated yet.
# Saving the feed to json file,
# Appending json file if already exists
if os.path.exists(json_file_path):
  with open(json_file_path, "r", encoding="utf-8") as json_file:
    existing_data = json.load(json_file)
  print(len(existing_data['entries']))

  existing_keys = {entry['title'] for entry in existing_data['entries'] if 'title' in entry}
  # Add new entries from json_data to new_data if their unique_key value is not in db_data
  for entry in data['entries']:
    if entry.get('title') not in existing_keys:
      existing_data['entries'].update(entry)
  print(len(existing_data['entries']))

  with open(json_file_path, "w", encoding="utf-8") as json_file:
    json.dump(existing_data, json_file, indent=4, ensure_ascii=False)

else:
  with open(json_file_path, "w", encoding="utf-8") as json_file:
    json.dump(data, json_file, indent=4, ensure_ascii=False)

print(f"Feed data successfully saved to {json_file_path}")

Feed data successfully saved to /content/feed_data.json


In [17]:
# Saving to DB
load_json_to_sqlite(json_file_path, sqlite_db_path, table_name)

# Fetch all data from the table
data = read_data_from_table(sqlite_db_path, table_name)
df_data = pd.DataFrame(data)
df_data.head()

New Entries: 30


Unnamed: 0,title,author,article_text,publication_date,article_link,tags,inbound_links,outbound_links
0,"DeFi hacks drop 40% in 2024, CeFi breaches sur...",Cointelegraph by Josh O&#039;Sullivan,Josh O'Sullivan\n1 hour ago\nDeFi hacks drop 4...,"Tue, 24 Dec 2024 13:00:56 +0000",https://cointelegraph.com/news/defi-security-i...,"DeFi security, CeFi breaches, crypto hacks 202...","https://cointelegraph.com/, https://cointelegr...","https://twitter.com/cointelegraph, https://tel..."
1,Here’s what happened in crypto today,Cointelegraph by Cointelegraph,Cointelegraph\n1 hour ago\nHere’s what happene...,"Tue, 24 Dec 2024 12:26:35 +0000",https://cointelegraph.com/news/what-happened-i...,"Bitcoin, BTC, Hex, Richard Heart, Interpol, Cr...","https://cointelegraph.com/, https://cointelegr...","https://twitter.com/cointelegraph, https://tel..."
2,Why is Solana (SOL) price up today?,Cointelegraph by Yashu Gola,Yashu Gola\n1 hour ago\nWhy is Solana (SOL) pr...,"Tue, 24 Dec 2024 12:06:56 +0000",https://cointelegraph.com/news/why-is-solana-p...,"solana, altcoins, SOL price, Solana ETF, Bitwi...","https://cointelegraph.com/, https://cointelegr...","https://www.facebook.com/cointelegraph, https:..."
3,Bitcoin bull market over? ‘Decembear’ has only...,Cointelegraph by William Suberg,William Suberg\n1 hour ago\nBitcoin bull marke...,"Tue, 24 Dec 2024 12:04:22 +0000",https://cointelegraph.com/news/bitcoin-bull-ma...,"Bitcoin, BTC price","https://cointelegraph.com/, https://cointelegr...","https://twitter.com/cointelegraph, https://tel..."
4,"Strange, but true: 5 outlandish and weird cryp...",Cointelegraph by Brayden Lindrea,"Brayden Lindrea\n2 hours ago\nStrange, but tru...","Tue, 24 Dec 2024 12:00:00 +0000",https://cointelegraph.com/news/strange-crazy-w...,"TruthOrDare Memecoin, Quant, Pump.Fun, Rug Pul...","https://cointelegraph.com/, https://cointelegr...","https://twitter.com/cointelegraph, https://tel..."


##### Extra - To be removed

##### Try w Cloudscraper

In [35]:
scraper = cloudscraper.create_scraper()
url = "https://cointelegraph.com/news/scammers-crypto-keys-steal-funds-from-wannabe-thieves"
response = scraper.get(url)
# print(response.content)

if response.status_code == 200:
    print("Request successful!")
    # print(response.text)
else:
    print(f"Failed with status code: {response.status_code}")


Request successful!


In [36]:
def extract_main_content(response_text):
    soup = BeautifulSoup(response_text, 'html.parser')

    # Remove unwanted scripts, styles, and other tags
    for tag in soup(['script', 'style', 'header', 'footer', 'nav']):
        tag.decompose()

    # Extract visible text from the main content area (e.g., <article>, <div>, etc.)
    main_content = soup.find('article') or soup.find('div', {'id': 'content'}) or soup.body
    text_content = main_content.get_text(separator='\n', strip=True) if main_content else "No main content found"

    return text_content

# Usage
response_text = response.text
main_text = extract_main_content(response_text)
print(main_text)


Martin Young
3 hours ago
Scammers share crypto keys aiming to steal from wannabe thieves: Kaspersky
Kaspersky says scammers are targeting digital thieves, baiting them with keys to loaded-up crypto wallets and swiping any crypto added to pay fees.
1487
Total views
1
Total shares
Listen to article
0:00
News
COINTELEGRAPH IN YOUR SOCIAL FEED
Follow our
Subscribe on
Scammers are pretending to be inexperienced crypto users, posting the seed phrase of a wallet supposedly filled with funds online. The scheme is a trap: when others try to access the wallet to steal the funds, the scammers exploit the attempt to steal crypto from them instead.
“Scammers have invented a new trick — they post crypto wallet seed phrases in YouTube comments using newly created accounts,” cybersecurity firm Kaspersky analyst Mikhail Sytnik said in a Dec. 23 blog
post
.
The researcher found comments in finance-related videos from users asking how to transfer Tether (
USDT
) from a crypto wallet to another wallet, wh

In [37]:
from bs4 import BeautifulSoup


def extract_links(response_text, base_url):
    soup = BeautifulSoup(response_text, 'html.parser')

    # Find all <a> tags with href attributes
    links = [a.get('href') for a in soup.find_all('a', href=True)]

    # Classify links as inbound or outbound
    inbound_links = []
    outbound_links = []

    base_domain = urlparse(base_url).netloc

    for link in links:
        full_url = urljoin(base_url, link)  # Handle relative links
        link_domain = urlparse(full_url).netloc

        if link_domain == base_domain:
            inbound_links.append(full_url)
        else:
            outbound_links.append(full_url)

    return inbound_links, outbound_links

# Usage
response_text = response.text
base_url = "https://cointelegraph.com/"  # Replace with the actual URL of the page
inbound, outbound = extract_links(response_text, base_url)

print("Inbound Links:")
print("\n".join(inbound))

print("\nOutbound Links:")
print("\n".join(outbound))


Inbound Links:
https://cointelegraph.com/
https://cointelegraph.com/rss-feeds
https://cointelegraph.com/
https://cointelegraph.com/markets
https://cointelegraph.com/tags/technology
https://cointelegraph.com/tags/regulation
https://cointelegraph.com/tags/business
https://cointelegraph.com/tags/investments
https://cointelegraph.com/tags/nft
https://cointelegraph.com/
https://cointelegraph.com/magazine
https://cointelegraph.com/category/opinion
https://cointelegraph.com/category/interview
https://cointelegraph.com/tags/investigation
https://cointelegraph.com/tags/features
https://cointelegraph.com/
https://cointelegraph.com/about
https://cointelegraph.com/advertise
https://cointelegraph.com/Careers
https://cointelegraph.com/newsletter-subscriptions
https://cointelegraph.com/
https://cointelegraph.com/price-indexes
https://cointelegraph.com/rankings/crypto-exchanges/
https://cointelegraph.com/converter
https://cointelegraph.com/price-indexes/memecoins
https://cointelegraph.com/research
htt