In [1]:
import pandas as pd
import numpy as np
import bs4
from bs4 import BeautifulSoup
import time
import sys
import requests

In [2]:
# Web request of target source(s). Starting with CISA's advisory page
cisa_advisory = requests.get("https://www.cisa.gov/news-events/cybersecurity-advisories")

In [3]:
# Status confirmation
print(cisa_advisory.status_code)

200


In [4]:
cisa_html = cisa_advisory.content

In [5]:
print(cisa_html)

b'\n\n<!DOCTYPE html>\n<html lang="en" dir="ltr" prefix="og: https://ogp.me/ns#" class="no-js">\n  <head>\n    <meta charset="utf-8" />\n<script async src="https://www.googletagmanager.com/gtag/js?id=G-9MDR73GM0K"></script>\n<script>window.dataLayer = window.dataLayer || [];function gtag(){dataLayer.push(arguments)};gtag("js", new Date());gtag("set", "developer_id.dMDhkMT", true);gtag("config", "G-9MDR73GM0K", {"groups":"default","page_placeholder":"PLACEHOLDER_page_location"});</script>\n<meta name="description" content="View Cybersecurity Advisories Only" />\n<link rel="canonical" href="https://www.cisa.gov/news-events/cybersecurity-advisories" />\n<meta property="og:site_name" content="Cybersecurity and Infrastructure Security Agency CISA" />\n<meta property="og:type" content="website" />\n<meta property="og:url" content="https://www.cisa.gov/news-events/cybersecurity-advisories" />\n<meta property="og:title" content="Cybersecurity Alerts &amp; Advisories | CISA" />\n<meta property=

In [6]:
# Find latest Articles by scanning for and collecting all HTML tags

In [7]:
cisa_soup = BeautifulSoup(cisa_html)

In [8]:
cisa_advisories = cisa_soup.find_all("article")

In [9]:
len(cisa_advisories)

10

In [10]:
import time

for adv in cisa_advisories:
    
    #print(adv)
    i = 0
    for content in adv:
        print("Iteration Entry Number: {}".format(i))
        print(content)
        time.sleep(2)
        i = i + 1
        # This helped pinpoint what part was actual content vs whitespace. Note: content[1] has all the meat
        # Also lows us to identify the specific pieces of data and where they are located in our advisory objects
        # Namely: <time> for our publication date, <span> for the adviory title, <a href> for the article link, 

Iteration Entry Number: 0


Iteration Entry Number: 1
<div class="c-teaser__row">
<div class="c-teaser__content">
<div class="c-teaser__eyebrow">
<div class="c-teaser__date"><time datetime="2023-07-12T12:00:00Z">Jul 12, 2023</time>
</div>
<div class="c-teaser__meta">Alert</div>
</div>
<h3 class="c-teaser__title">
<a href="/news-events/alerts/2023/07/12/cisa-releases-one-industrial-control-systems-advisory" target="_self">
<span>CISA Releases One Industrial Control Systems Advisory</span>
</a> </h3>
</div>
</div>
Iteration Entry Number: 2


Iteration Entry Number: 0


Iteration Entry Number: 1
<div class="c-teaser__row">
<div class="c-teaser__content">
<div class="c-teaser__eyebrow">
<div class="c-teaser__date"><time datetime="2023-07-12T12:00:00Z">Jul 12, 2023</time>
</div>
<div class="c-teaser__meta">Alert</div>
</div>
<h3 class="c-teaser__title">
<a href="/news-events/alerts/2023/07/12/cisa-and-fbi-release-cybersecurity-advisory-enhanced-monitoring-detect-apt-activity-targeting" targ

In [11]:
'''
Here we had two bs4.element.NavigableString objects as whitespace padding in each content object. 
We can ignore them and focus on bs4.element.Tag objects where the content we are looking for is actually held.

Focuses on the bs4.element.Tag and extracts the data we need.
'''
import time

for adv in cisa_advisories:
    for content in adv:
        print(type(content))
        # Extracting URLs, Titles, and publication date
        if type(content) == bs4.element.Tag:
            print(content.find("time").text)
            print(content.find("span").text)
            print(content.find("a")['href'])
            print(content.find("div").text)
            time.sleep(2)
        else:
            pass

<class 'bs4.element.NavigableString'>
<class 'bs4.element.Tag'>
Jul 12, 2023
CISA Releases One Industrial Control Systems Advisory
/news-events/alerts/2023/07/12/cisa-releases-one-industrial-control-systems-advisory


Jul 12, 2023

Alert



CISA Releases One Industrial Control Systems Advisory
 

<class 'bs4.element.NavigableString'>
<class 'bs4.element.NavigableString'>
<class 'bs4.element.Tag'>
Jul 12, 2023
CISA and FBI Release Cybersecurity Advisory on Enhanced Monitoring to Detect APT Activity Targeting Outlook Online
/news-events/alerts/2023/07/12/cisa-and-fbi-release-cybersecurity-advisory-enhanced-monitoring-detect-apt-activity-targeting


Jul 12, 2023

Alert



CISA and FBI Release Cybersecurity Advisory on Enhanced Monitoring to Detect APT Activity Targeting Outlook Online
 

<class 'bs4.element.NavigableString'>
<class 'bs4.element.NavigableString'>
<class 'bs4.element.Tag'>
Jul 12, 2023
Rockwell Automation Select Communication Modules
/news-events/ics-advisories/icsa-23-193-

In [12]:
# Finally we can modify this approach to ingest our data directly into a dataframe

import time

df = pd.DataFrame(columns=("Publication Date", "Title", "URL", "Classification")) # Maybe add description if theres time

for adv in cisa_advisories:
    entry =[]
    for content in adv:
        # Extracting URLs, Titles, and publication date
        if type(content) == bs4.element.Tag:
            entry.append(content.find("time").text)
            entry.append(content.find("span").text)
            entry.append(content.find("a")['href'])
            print(content.find("div").text.split("\n")[4])
            entry.append(content.find("div").text.split("\n")[4])
            #print(entry)
            df.loc[len(df)] = entry
            time.sleep(2)
        else:
            pass

Alert
Alert
ICS Advisory | ICSA-23-193-01
Cybersecurity Advisory | AA23-193A
Alert
Alert
Alert
Alert
ICS Advisory | ICSA-23-192-03
ICS Advisory | ICSA-23-192-02


In [13]:
# checking our DataFrame
df

Unnamed: 0,Publication Date,Title,URL,Classification
0,"Jul 12, 2023",CISA Releases One Industrial Control Systems A...,/news-events/alerts/2023/07/12/cisa-releases-o...,Alert
1,"Jul 12, 2023",CISA and FBI Release Cybersecurity Advisory on...,/news-events/alerts/2023/07/12/cisa-and-fbi-re...,Alert
2,"Jul 12, 2023",Rockwell Automation Select Communication Modules,/news-events/ics-advisories/icsa-23-193-01,ICS Advisory | ICSA-23-193-01
3,"Jul 12, 2023",Enhanced Monitoring to Detect APT Activity Tar...,/news-events/cybersecurity-advisories/aa23-193a,Cybersecurity Advisory | AA23-193A
4,"Jul 11, 2023",Microsoft Releases July 2023 Security Updates,/news-events/alerts/2023/07/11/microsoft-relea...,Alert
5,"Jul 11, 2023",Fortinet Releases Security Update for FortiOS ...,/news-events/alerts/2023/07/11/fortinet-releas...,Alert
6,"Jul 11, 2023",Adobe Releases Security Updates for ColdFusion...,/news-events/alerts/2023/07/11/adobe-releases-...,Alert
7,"Jul 11, 2023",Mozilla Releases Security Update for Firefox a...,/news-events/alerts/2023/07/11/mozilla-release...,Alert
8,"Jul 11, 2023",Panasonic Control FPWin Pro7,/news-events/ics-advisories/icsa-23-192-03,ICS Advisory | ICSA-23-192-03
9,"Jul 11, 2023",​Sensormatic Electronics iSTAR,/news-events/ics-advisories/icsa-23-192-02,ICS Advisory | ICSA-23-192-02


In [14]:
'''
Only shows 10 results because our main page only displays 10. 
Find a way to:
1. expand window width, 
2. overcome page limits, 
3. embed links and 
4. Create our rolling window
5. Stream Intel Feed
'''

'\nOnly shows 10 results because our main page only displays 10. \nFind a way to:\n1. expand window width, \n2. overcome page limits, \n3. embed links and \n4. Create our rolling window\n5. Stream Intel Feed\n'

## Expanding Width of our datarame feed window ##

In [15]:
pd.set_option('display.max_colwidth', 1000)
df.head()

Unnamed: 0,Publication Date,Title,URL,Classification
0,"Jul 12, 2023",CISA Releases One Industrial Control Systems Advisory,/news-events/alerts/2023/07/12/cisa-releases-one-industrial-control-systems-advisory,Alert
1,"Jul 12, 2023",CISA and FBI Release Cybersecurity Advisory on Enhanced Monitoring to Detect APT Activity Targeting Outlook Online,/news-events/alerts/2023/07/12/cisa-and-fbi-release-cybersecurity-advisory-enhanced-monitoring-detect-apt-activity-targeting,Alert
2,"Jul 12, 2023",Rockwell Automation Select Communication Modules,/news-events/ics-advisories/icsa-23-193-01,ICS Advisory | ICSA-23-193-01
3,"Jul 12, 2023",Enhanced Monitoring to Detect APT Activity Targeting Outlook Online,/news-events/cybersecurity-advisories/aa23-193a,Cybersecurity Advisory | AA23-193A
4,"Jul 11, 2023",Microsoft Releases July 2023 Security Updates,/news-events/alerts/2023/07/11/microsoft-releases-july-2023-security-updates,Alert


## Consolidating Pages to build a comprehensive list of advisories ##

In [16]:
# This can be done by creating new webrequest objects and again distilling down to our target content
# We can start by extracting the total number of pages from the "Last" page link.

for link in cisa_soup.find_all("a"):
    try:
        if link["class"]==['c-pager__link', 'c-pager__link--last']: #info obtained through combination of inspect and print
            last_page = int(link["href"].strip("?page="))
            print(last_page)
    except:
        continue

662


In [17]:
# Storing a list of all the advisory pages

i = 0
cisa_advisory_pages = []
while i < last_page:
    cisa_advisory_pages.append('https://www.cisa.gov/news-events/cybersecurity-advisories?page={}'.format(i))
    i = i + 1
cisa_advisory_pages

['https://www.cisa.gov/news-events/cybersecurity-advisories?page=0',
 'https://www.cisa.gov/news-events/cybersecurity-advisories?page=1',
 'https://www.cisa.gov/news-events/cybersecurity-advisories?page=2',
 'https://www.cisa.gov/news-events/cybersecurity-advisories?page=3',
 'https://www.cisa.gov/news-events/cybersecurity-advisories?page=4',
 'https://www.cisa.gov/news-events/cybersecurity-advisories?page=5',
 'https://www.cisa.gov/news-events/cybersecurity-advisories?page=6',
 'https://www.cisa.gov/news-events/cybersecurity-advisories?page=7',
 'https://www.cisa.gov/news-events/cybersecurity-advisories?page=8',
 'https://www.cisa.gov/news-events/cybersecurity-advisories?page=9',
 'https://www.cisa.gov/news-events/cybersecurity-advisories?page=10',
 'https://www.cisa.gov/news-events/cybersecurity-advisories?page=11',
 'https://www.cisa.gov/news-events/cybersecurity-advisories?page=12',
 'https://www.cisa.gov/news-events/cybersecurity-advisories?page=13',
 'https://www.cisa.gov/news-ev

## Final DataFrame: Aggregate Collection ##

In [18]:
df = pd.DataFrame(columns=("Publication Date", "Title", "URL","Classification"))

for page in cisa_advisory_pages[0:3]: # pages 0 to 3 for testing purposes; remove brackets for full version
    current_page = requests.get(page)
    page_html = current_page.content
    page_soup = BeautifulSoup(page_html)
    page_advisories = page_soup.find_all("article")
    for adv in page_advisories:
        entry = []
        for content in adv:
            # Extracting URLs, Titles, and publication date
            if type(content) == bs4.element.Tag:
                entry.append(content.find("time").text)
                entry.append(content.find("span").text)
                entry.append(content.find("a")['href'])
                entry.append(content.find("div").text.split("\n")[4])
                #print(entry)
                df.loc[len(df)] = entry
            else:
                pass

In [19]:
df.head()

Unnamed: 0,Publication Date,Title,URL,Classification
0,"Jul 12, 2023",CISA Releases One Industrial Control Systems Advisory,/news-events/alerts/2023/07/12/cisa-releases-one-industrial-control-systems-advisory,Alert
1,"Jul 12, 2023",CISA and FBI Release Cybersecurity Advisory on Enhanced Monitoring to Detect APT Activity Targeting Outlook Online,/news-events/alerts/2023/07/12/cisa-and-fbi-release-cybersecurity-advisory-enhanced-monitoring-detect-apt-activity-targeting,Alert
2,"Jul 12, 2023",Rockwell Automation Select Communication Modules,/news-events/ics-advisories/icsa-23-193-01,ICS Advisory | ICSA-23-193-01
3,"Jul 12, 2023",Enhanced Monitoring to Detect APT Activity Targeting Outlook Online,/news-events/cybersecurity-advisories/aa23-193a,Cybersecurity Advisory | AA23-193A
4,"Jul 11, 2023",Microsoft Releases July 2023 Security Updates,/news-events/alerts/2023/07/11/microsoft-releases-july-2023-security-updates,Alert


## Embedding html Links and completing our URL snippets. ##

In [20]:
def make_clickable(val):
    return f'<a target="_blank" href="https://www.cisa.gov{val}">https://www.cisa.gov{val}</a>'

In [21]:
# Applies formatting to make urls clickable. A happy side effect was an increased width!
df.style.format({'URL': make_clickable})

Unnamed: 0,Publication Date,Title,URL,Classification
0,"Jul 12, 2023",CISA Releases One Industrial Control Systems Advisory,https://www.cisa.gov/news-events/alerts/2023/07/12/cisa-releases-one-industrial-control-systems-advisory,Alert
1,"Jul 12, 2023",CISA and FBI Release Cybersecurity Advisory on Enhanced Monitoring to Detect APT Activity Targeting Outlook Online,https://www.cisa.gov/news-events/alerts/2023/07/12/cisa-and-fbi-release-cybersecurity-advisory-enhanced-monitoring-detect-apt-activity-targeting,Alert
2,"Jul 12, 2023",Rockwell Automation Select Communication Modules,https://www.cisa.gov/news-events/ics-advisories/icsa-23-193-01,ICS Advisory | ICSA-23-193-01
3,"Jul 12, 2023",Enhanced Monitoring to Detect APT Activity Targeting Outlook Online,https://www.cisa.gov/news-events/cybersecurity-advisories/aa23-193a,Cybersecurity Advisory | AA23-193A
4,"Jul 11, 2023",Microsoft Releases July 2023 Security Updates,https://www.cisa.gov/news-events/alerts/2023/07/11/microsoft-releases-july-2023-security-updates,Alert
5,"Jul 11, 2023",Fortinet Releases Security Update for FortiOS and FortiProxy,https://www.cisa.gov/news-events/alerts/2023/07/11/fortinet-releases-security-update-fortios-and-fortiproxy,Alert
6,"Jul 11, 2023",Adobe Releases Security Updates for ColdFusion and InDesign,https://www.cisa.gov/news-events/alerts/2023/07/11/adobe-releases-security-updates-coldfusion-and-indesign,Alert
7,"Jul 11, 2023",Mozilla Releases Security Update for Firefox and Firefox ESR,https://www.cisa.gov/news-events/alerts/2023/07/11/mozilla-releases-security-update-firefox-and-firefox-esr,Alert
8,"Jul 11, 2023",Panasonic Control FPWin Pro7,https://www.cisa.gov/news-events/ics-advisories/icsa-23-192-03,ICS Advisory | ICSA-23-192-03
9,"Jul 11, 2023",​Sensormatic Electronics iSTAR,https://www.cisa.gov/news-events/ics-advisories/icsa-23-192-02,ICS Advisory | ICSA-23-192-02


## Creating a Rolling Window Feed (Batch Mode) ##

In [22]:
# Update feed routinely
from IPython.display import display, clear_output

In [27]:
# Display and refresh feed on a routine basis 
# with i serving as a marker / placeholder for the next iteration of entries in the sequence. 

refresh_rate = 5 # needs to refresh every 5 seconds with the next 10 entries
window_size = 10
i = 0

#df.style.format({'URL': make_clickable})

while(i < len(df)):
    view = df[df.index > i]
    clear_output(wait=True)
    display(view.head(window_size).style.format({'URL': make_clickable}))
    time.sleep(refresh_rate)
    # num rows to move our window after the refresh. In this example iteration rate is consistent with window_size
    i = i + window_size

Unnamed: 0,Publication Date,Title,URL,Classification
21,"Jul 06, 2023",MAR-10445155-1.v1 Truebot Activity Infects U.S. and Canada Based Networks,https://www.cisa.gov/news-events/analysis-reports/ar23-187a,Analysis Report | AR23-187A
22,"Jun 30, 2023",DoS and DDoS Attacks against Multiple Sectors,https://www.cisa.gov/news-events/alerts/2023/06/30/dos-and-ddos-attacks-against-multiple-sectors,Alert
23,"Jun 29, 2023",CISA Releases Nine Industrial Control Systems Advisories,https://www.cisa.gov/news-events/alerts/2023/06/29/cisa-releases-nine-industrial-control-systems-advisories,Alert
24,"Jun 29, 2023",2023 CWE Top 25 Most Dangerous Software Weaknesses,https://www.cisa.gov/news-events/alerts/2023/06/29/2023-cwe-top-25-most-dangerous-software-weaknesses,Alert
25,"Jun 29, 2023",Medtronic Paceart Optima System,https://www.cisa.gov/news-events/ics-medical-advisories/icsma-23-180-01,ICS Medical Advisory | ICSMA-23-180-01
26,"Jun 29, 2023",​Mitsubishi Electric MELSEC-F Series (Update A),https://www.cisa.gov/news-events/ics-advisories/icsa-23-180-04,ICS Advisory | ​​ICSA-23-180-04
27,"Jun 29, 2023",​Ovarro TBox RTUs,https://www.cisa.gov/news-events/ics-advisories/icsa-23-180-03,ICS Advisory | ICSA-23-180-03
28,"Jun 29, 2023",Schneider Electric EcoStruxure Operator Terminal Expert,https://www.cisa.gov/news-events/ics-advisories/icsa-23-180-02,ICS Advisory | ICSA-23-180-02
29,"Jun 29, 2023",Delta Electronics InfraSuite Device Master,https://www.cisa.gov/news-events/ics-advisories/icsa-23-180-01,ICS Advisory | ICSA-23-180-01


## Creating a Flowing Window Feed (Streaming Mode) ##

In [24]:
df.head(1).index

Index([0], dtype='int64')

In [28]:
# However our feed can be made to flow more naturally with the next entry displayed at the top, streaming sequentially
# This requires slower iteration and a faster refresh rate

refresh_rate = 1
window_size = 10
iteration_rate = 1
marker = 0

while(marker < len(df)-window_size):
    view = df[df.index > marker]
    clear_output(wait=True)
    display(view.head(window_size).style.set_properties(**{'text-align': 'left'}, **{'background-color': '#b3cc83'}, **{'border-bottom': 'solid'}).format({'URL': make_clickable}))
    time.sleep(refresh_rate)
    # num rows to move our window after the refresh. 
    marker = marker + iteration_rate
    #df.style.set_properties(subset=["Title", "URL"], **{'text-align': 'left'}).format({'URL': make_clickable})

Unnamed: 0,Publication Date,Title,URL,Classification
20,"Jul 06, 2023",CISA and Partners Release Joint Cybersecurity Advisory on Newly Identified Truebot Malware Variants,https://www.cisa.gov/news-events/alerts/2023/07/06/cisa-and-partners-release-joint-cybersecurity-advisory-newly-identified-truebot-malware-variants,Alert
21,"Jul 06, 2023",MAR-10445155-1.v1 Truebot Activity Infects U.S. and Canada Based Networks,https://www.cisa.gov/news-events/analysis-reports/ar23-187a,Analysis Report | AR23-187A
22,"Jun 30, 2023",DoS and DDoS Attacks against Multiple Sectors,https://www.cisa.gov/news-events/alerts/2023/06/30/dos-and-ddos-attacks-against-multiple-sectors,Alert
23,"Jun 29, 2023",CISA Releases Nine Industrial Control Systems Advisories,https://www.cisa.gov/news-events/alerts/2023/06/29/cisa-releases-nine-industrial-control-systems-advisories,Alert
24,"Jun 29, 2023",2023 CWE Top 25 Most Dangerous Software Weaknesses,https://www.cisa.gov/news-events/alerts/2023/06/29/2023-cwe-top-25-most-dangerous-software-weaknesses,Alert
25,"Jun 29, 2023",Medtronic Paceart Optima System,https://www.cisa.gov/news-events/ics-medical-advisories/icsma-23-180-01,ICS Medical Advisory | ICSMA-23-180-01
26,"Jun 29, 2023",​Mitsubishi Electric MELSEC-F Series (Update A),https://www.cisa.gov/news-events/ics-advisories/icsa-23-180-04,ICS Advisory | ​​ICSA-23-180-04
27,"Jun 29, 2023",​Ovarro TBox RTUs,https://www.cisa.gov/news-events/ics-advisories/icsa-23-180-03,ICS Advisory | ICSA-23-180-03
28,"Jun 29, 2023",Schneider Electric EcoStruxure Operator Terminal Expert,https://www.cisa.gov/news-events/ics-advisories/icsa-23-180-02,ICS Advisory | ICSA-23-180-02
29,"Jun 29, 2023",Delta Electronics InfraSuite Device Master,https://www.cisa.gov/news-events/ics-advisories/icsa-23-180-01,ICS Advisory | ICSA-23-180-01


### Sources: ###

A Text-Mining Approach to Cyberrisk Management - https://www.isaca.org/-/media/files/isacadp/project/isaca/articles/journal/2021/volume-6/a-text-mining-approach-to-cyberrisk-management-joa-eng-1121.pdf

HTML Formatting in Pandas DataFrame - https://stackoverflow.com/questions/40724475/html-formatting-in-pandas-dataframe

How to create clickable Links - https://datascientyst.com/create-clickable-link-pandas-dataframe-jupyterlab/

Refresh a Panda dataframe while printing using loop - https://stackoverflow.com/questions/65024047/refresh-a-panda-dataframe-while-printing-using-loop

Get item from bs4.element.Tag - https://stackoverflow.com/questions/57395509/get-item-from-bs4-element-tag

How to get the next page on BeautifulSoup? - https://www.geeksforgeeks.org/how-to-get-the-next-page-on-beautifulsoup/#

CISA’s Shields Up: What it is, how to use it - https://www.scmagazine.com/sw-article/asset-management/cisas-shields-up-what-it-is-how-to-use-it

How to left align a dataframe column in python? - http://stackoverflow.com/questions/53460941/ddg#56042544

In [26]:
# For future improvement: 

## 1) Consider marking entries by a color scheme indicating varying levels of danger
## 2) Run a cron job on a weekly basis in order to get a feed of the latest articles and advisories