# QuiverQuant Government Contracts Web Scraper
Source: https://www.quiverquant.com/sources/govcontracts

In [1]:
## pulling data from XML and HTML files
from bs4 import BeautifulSoup 

## automating web browser interaction
from selenium import webdriver # module containing implementations of browser drivers

## data manipulation
import pandas as pd
from datetime import datetime
import numpy as np

## plotting
import matplotlib.pyplot as plt

# Parsing the HTML File

In [4]:
# Initialize Chrome browser and launch the QuiverQuant congress trading page.
# NOTE(review): the URL below is the congress trading page, not the government
# contracts page this notebook is about; the contracts page is opened in a later cell.
driver = webdriver.Chrome()
url = 'https://www.quiverquant.com/congresstrading/'
driver.get(url)
# Source: https://stackoverflow.com/questions/42478591/python-selenium-chrome-webdriver

In [5]:
# Parse the HTML of the page currently loaded in the browser into a BeautifulSoup tree
soup = BeautifulSoup(markup=driver.page_source, features='html.parser')

In [6]:
# Taking a look at the start of the HTML source code.
# The prettified page is very long, so only the first PREVIEW_CHARS characters are
# printed to keep the notebook readable; raise the limit to inspect more of the page.
PREVIEW_CHARS = 2000
print(soup.prettify()[:PREVIEW_CHARS])

<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <meta content="width=device-width, initial-scale=1.0" name="viewport"/>
  <title>
   Congress Trading - Quiver Quantitative
  </title>
  <meta content="en_US" property="og:locale"/>
  <meta content="website" property="og:type"/>
  <meta content="Congress Trading - Quiver Quantitative" property="og:title"/>
  <meta content="Track stock trading activity by members of U.S. Congress. View recent trades, search by politician or stock, and see the most active traders in the Senate and House of Representatives." name="description"/>
  <meta content="Track stock trading activity by members of U.S. Congress. View recent trades, search by politician or stock, and see the most active traders in the Senate and House of Representatives." property="og:description"/>
  <meta content="/static/images/site_preview.png" property="og:image"/>
  <meta content="image/png" property="og:image:type"/>


In [7]:
# Closes Chrome by quitting the WebDriver session.
# NOTE(review): `driver` presumably cannot be used again after this call; a new
# webdriver.Chrome() is created further down before the next page is loaded.
driver.quit()

### Parse Data from HTML
#### Getting the Data under the div class = "metrics__list_box_body" tag
* Source for converting data in HTML tags to text: https://www.geeksforgeeks.org/extracting-text-from-html-file-using-python/
* Source for navigating HTML tags with Beautiful Soup: https://beautiful-soup-4.readthedocs.io/en/latest/#navigating-the-tree

In [6]:
# Initialize a new Chrome browser session and launch the government contracts website
# (the session opened earlier in the notebook has already been ended with driver.quit())
driver = webdriver.Chrome()
url =  'https://www.quiverquant.com/sources/govcontracts'
driver.get(url)
# Source: https://stackoverflow.com/questions/42478591/python-selenium-chrome-webdriver

In [269]:
# Extract and store page HTML
# NOTE(review): this cell reads driver.page_source, so it needs a live browser session.
# The MaxRetryError recorded below looks like the result of re-running this cell after
# driver.quit() had been called -- TODO confirm, then re-run the notebook top to bottom.
soup = BeautifulSoup(driver.page_source, 'html.parser')

MaxRetryError: HTTPConnectionPool(host='localhost', port=64627): Max retries exceeded with url: /session/604382f2eef020a7c9a7d8332d1fc775/source (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000002A43057D5E0>: Failed to establish a new connection: [WinError 10061] No connection could be made because the target machine actively refused it'))

In [10]:
# Finds the first <table> tag on the page, which is presumably where the
# government contracts data is stored.
# NOTE(review): government_contracts_table is not referenced by the later cells shown in
# this notebook; they navigate from the 'metrics__list_box_body' div instead.
government_contracts_table = soup.find('table')

In [12]:
# Closes Chrome; the cells below only use the HTML already parsed into `soup`
driver.quit()

* All the table cells are under the "div" tag with class = "metrics__list_box_body"
* Cells are under the "tbody" tag and their rows are under the "tr" tags
* The cells' data are under "td" tags

In [21]:
# Look up every element whose class is "metrics__list_box_body" (the <div> wrapping the table)
soup.find_all(class_='metrics__list_box_body')

[<div class="metrics__list_box_body">
 <table class="sortable" id="myTable" style="min-width:0px">
 <thead>
 <tr class="sticky-table-headers">
 <th>Recipient <span>*</span></th>
 <th onclick="if (!window.__cfRLUnblockHandlers) return false; ">Date Awarded</th>
 <th onclick="if (!window.__cfRLUnblockHandlers) return false; ">Amount</th>
 <th onclick="if (!window.__cfRLUnblockHandlers) return false; ">Funding Agency</th>
 <th onclick="if (!window.__cfRLUnblockHandlers) return false; ">Description</th>
 </tr>
 </thead>
 <tbody>
 <tr>
 <td style="padding-right:2rem"><a href="../stock/AAPL/">AAPL</a></td>
 <td onclick="if (!window.__cfRLUnblockHandlers) return false; ">2025-01-01</td>
 <td onclick="if (!window.__cfRLUnblockHandlers) return false; ">$299</td>
 <td onclick="if (!window.__cfRLUnblockHandlers) return false; ">Department of Justice</td>
 <td onclick="if (!window.__cfRLUnblockHandlers) return false; ">APPLE DEVELOPER ACCOUNT RENEWAL</td>
 </tr>
 <tr>
 <td style="padding-right:2re

In [83]:
# Gets the <tbody> tag.
# Judging by the output above, the children of each tag alternate between whitespace
# text nodes (even indices) and tags (odd indices): under the div, .contents[1] is the
# <table>; under the table, .contents[1] would be <thead> and .contents[3] is <tbody>.
soup.find_all(class_ = 'metrics__list_box_body')[0].contents[1].contents[3]

<tbody>
<tr>
<td style="padding-right:2rem"><a href="../stock/AAPL/">AAPL</a></td>
<td onclick="if (!window.__cfRLUnblockHandlers) return false; ">2025-01-01</td>
<td onclick="if (!window.__cfRLUnblockHandlers) return false; ">$299</td>
<td onclick="if (!window.__cfRLUnblockHandlers) return false; ">Department of Justice</td>
<td onclick="if (!window.__cfRLUnblockHandlers) return false; ">APPLE DEVELOPER ACCOUNT RENEWAL</td>
</tr>
<tr>
<td style="padding-right:2rem"><a href="../stock/A/">A</a></td>
<td onclick="if (!window.__cfRLUnblockHandlers) return false; ">2024-12-31</td>
<td onclick="if (!window.__cfRLUnblockHandlers) return false; ">$387,492</td>
<td onclick="if (!window.__cfRLUnblockHandlers) return false; ">Department of Health and Human Services</td>
<td onclick="if (!window.__cfRLUnblockHandlers) return false; ">SERVICE/MAINTENANCE AGREEMENT FOR THE AGILENT INSTRUMENTATION AT NCATS FOR THE ACC, TDB, AND ADST GROUPS</td>
</tr>
<tr>
<td style="padding-right:2rem"><a href="../s

In [143]:
# Getting each row: the <tr> tags sit at the odd indices of tbody's .contents (the even
# indices appear to be the whitespace between tags), so step the last index by 2 over
# the range [1, 199] to move from one row to the next. Index 199 is the last row.
soup.find_all(class_ = 'metrics__list_box_body')[0].contents[1].contents[3].contents[199]

<tr>
<td style="padding-right:2rem"><a href="../stock/DNOW/">DNOW</a></td>
<td onclick="if (!window.__cfRLUnblockHandlers) return false; ">2024-12-28</td>
<td onclick="if (!window.__cfRLUnblockHandlers) return false; ">$257</td>
<td onclick="if (!window.__cfRLUnblockHandlers) return false; ">General Services Administration</td>
<td onclick="if (!window.__cfRLUnblockHandlers) return false; ">PEN, BALL-POINT: ITEM NAME PEN, BALL-POINT PEN POINT SZ 0.800 MM PEN POINT TYPE BOLD PEN POINT RETRACTABILITY NON-RETRACTABLE UNIT TYPE DESK OR POCKET UNIT DESIGN NON-CORROSIVE ROLLERBALL INK GRADE GEL INK COLOR BLACK INK SUPPLY VISIBILITY VISIBLE IN</td>
</tr>

In [173]:
# Getting the cell values for a single row: the <td> tags sit at the odd indices of the
# row's .contents, so step the last index by 2 over the range [1, 9] to move from one
# cell to the next. Index 7 is the funding agency cell of the row at index 199.
soup.find_all(class_ = 'metrics__list_box_body')[0].contents[1].contents[3].contents[199].contents[7]

<td onclick="if (!window.__cfRLUnblockHandlers) return false; ">General Services Administration</td>

Index of each field within a row's `.contents`:
* i = 1: Stock ticker recipient
* i = 3: Date awarded
* i = 5: Amount of money awarded
* i = 7: Funding agency
* i = 9: Description

# Getting the Data into Lists

In [185]:
# Locate the <tbody> of the contracts table by tag name rather than by position in
# .contents, so the lookup does not depend on the whitespace text nodes between tags.
contracts_box = soup.find(class_='metrics__list_box_body')
tbody = contracts_box.find('tbody') if contracts_box is not None else None
if tbody is None:
    # Fail with a clear message instead of an opaque IndexError/AttributeError,
    # e.g. when the page did not load fully or its layout changed.
    raise ValueError("Could not find <tbody> under the 'metrics__list_box_body' element")

In [243]:
# One empty list per column of the contracts table:
# recipient ticker, date awarded, amount, funding agency, description
recipients, dates_awarded, amounts, funding_agencies, descs = [], [], [], [], []

In [169]:
# Number of child nodes directly under <tbody>.
# NOTE(review): despite the name this counts nodes, not cells -- the <tr> rows sit at the
# odd indices (see the row-indexing cell above) and the remaining entries appear to be
# whitespace text nodes, so the number of rows is roughly half of this value.
number_of_cells = len(soup.find_all(class_ = 'metrics__list_box_body')[0].contents[1].contents[3].contents)

In [199]:
# Sanity check of the indexing scheme: first row (.contents[1]), then its first
# cell (.contents[1]), which holds the recipient's stock ticker
tbody.contents[1].contents[1]

<td style="padding-right:2rem"><a href="../stock/AAPL/">AAPL</a></td>

In [245]:
# Build the five column lists from the rows of the contracts table.
# The lists are rebuilt from scratch here, so re-running this cell does not append
# duplicate rows to lists left over from a previous run.
recipients, dates_awarded, amounts, funding_agencies, descs = [], [], [], [], []

# Rows and cells are looked up by tag name (direct children only) instead of by odd
# positions in .contents, so the result does not depend on whitespace text nodes.
for row in tbody.find_all('tr', recursive=False):
    cells = row.find_all('td', recursive=False)
    # Column order: recipient, date awarded, amount, funding agency, description.
    # The unpacking raises ValueError if a row has fewer than five cells.
    ticker, date_awarded, amount, agency, desc = (cell.get_text() for cell in cells[:5])
    recipients.append(ticker)
    dates_awarded.append(date_awarded)
    amounts.append(amount)
    funding_agencies.append(agency)
    descs.append(desc)

In [259]:
# Check that every column list holds the same number of entries:
# the set of their lengths must collapse to a single value
len({len(column) for column in (recipients, amounts, dates_awarded, funding_agencies, descs)}) == 1

True

# Putting the Data into a Python Dictionary

In [263]:
# Map each future DataFrame column name to a (shallow) copy of the matching scraped list
contracts_dict = {
    'ticker': list(recipients),
    'date_awarded': list(dates_awarded),
    'amount': list(amounts),
    'funding_agency': list(funding_agencies),
    'description': list(descs),
}

# Converting the Data into a pandas DataFrame

In [265]:
# Build the DataFrame: each dictionary key becomes a column, each list its values
government_contracts_df = pd.DataFrame(contracts_dict)

In [267]:
# Preview the first five rows of the scraped table
government_contracts_df.head(n=5)

Unnamed: 0,ticker,date_awarded,amount,funding_agency,description
0,AAPL,2025-01-01,$299,Department of Justice,APPLE DEVELOPER ACCOUNT RENEWAL
1,A,2024-12-31,"$387,492",Department of Health and Human Services,SERVICE/MAINTENANCE AGREEMENT FOR THE AGILENT ...
2,ARAY,2024-12-31,"$376,500",Department of Veterans Affairs,ACCURAY CYBERKNIFE MAINTENANCE
3,CDW,2024-12-31,"$17,205",Department of Health and Human Services,PROCUREMENT OF IT SUPPORT AND COMMODITIES.
4,DNB,2024-12-31,"$17,176",Department of Transportation,"DUN & BRADSTREET, INC. (D&B) FINANCE ANALYTICS..."


# Bibliography
1. K. Mejia. “How to scrape a dashboard with Python,” Towards Data Science. [Online]. Available: https://towardsdatascience.com/how-to-scrape-a-dashboard-with-python-8b088f6cecf3. [Accessed: Jan. 2, 2025].

2. L. T. Vo. “Mining Social Media: Finding Stories in Internet Data,” No Starch Press. [Print]. Available: https://nostarch.com/miningsocialmedia. [Accessed: Dec. 25, 2024].

3. GeeksforGeeks, “Implementing Web Scraping in Python with Beautiful Soup,” GeeksforGeeks. [Online]. Available: https://www.geeksforgeeks.org/implementing-web-scraping-python-beautiful-soup/. [Accessed: Jan. 3, 2025].

4. Beautiful Soup Documentation, “Beautiful Soup Documentation,” Beautiful Soup 4. [Online]. Available: https://beautiful-soup-4.readthedocs.io/en/latest/. [Accessed: Jan. 1, 2025].