# Top 10k pages data

We take the top 10,000 pages reported by analytics.usa.gov over the last 30 days and run each one through the Chrome UX Report (CrUX) API to get page-level performance information.

In [13]:
"""
Load the site-scanning, top-10k pages, and sampled CrUX CSV files into
DataFrames and analyze the results.
"""

import os
import pandas as pd
from utils.load_results_to_dataframe import load_results_to_dataframe

# Current working directory of the kernel. It is passed to
# load_results_to_dataframe as `notebook_dir` by the loading cells.
# NOTE(review): presumably used as the base for the relative "data/..." paths,
# which would make the notebook sensitive to where the kernel was started —
# confirm in utils.load_results_to_dataframe.
notebook_dir = os.getcwd()

In [14]:
# Weekly site-scanning snapshot (2024-08-28 file).
site_scanning = load_results_to_dataframe(
    file_name="data/site-scanning-weekly-snapshot-20240828.csv",
    notebook_dir=notebook_dir,
)

# Top-10,000 pages-and-screens report for 30 days (2024-08-28 file).
top_10k = load_results_to_dataframe(
    file_name="data/top-10000-pages-and-screens-30-days-20240828.csv",
    notebook_dir=notebook_dir,
)

# Build the full page URL column-wise instead of with a row-wise
# `apply(axis=1)`: one Python call per value rather than one Series
# construction per row, and it still works when the frame is empty
# (row-wise apply on an empty frame returns a DataFrame, which cannot be
# assigned to a single column). `map(str)` keeps the original per-element
# `str()` conversion, so missing values are rendered exactly as before.
top_10k["url"] = (
    "https://" + top_10k["domain"].map(str) + top_10k["pagePath"].map(str)
)

top_10k.head()

Unnamed: 0,page_title,domain,pagePath,pageviews,url
0,National Institute of Standards and Technology...,www.time.gov,/,71938105,https://www.time.gov/
1,National Institute of Standards and Technology...,time.gov,/,66873558,https://time.gov/
2,NWS Radar,radar.weather.gov,/,12205408,https://radar.weather.gov/
3,USPS.com® - USPS Tracking® Results,tools.usps.com,/go/trackconfirmaction,12093137,https://tools.usps.com/go/trackconfirmaction
4,Search Public Sex Offender Registries | Dru Sj...,www.nsopw.gov,/search-public-sex-offender-registries,11036250,https://www.nsopw.gov/search-public-sex-offend...


In [15]:
# Join the scan results to the top-10k report where the scan's final URL equals
# the report's page URL (default inner join), then keep only the first row for
# each URL.
common_urls = site_scanning.merge(
    top_10k,
    left_on="final_url",
    right_on="url",
).drop_duplicates(subset="url", keep="first")
common_urls.head()

Unnamed: 0,target_url,target_url_domain,target_url_top_level_domain,target_url_redirects,final_url,final_url_domain,final_url_top_level_domain,final_url_website,final_url_live,final_url_status_code,...,uswds_string,uswds_string_in_css,uswds_semantic_version,uswds_version,uswds_count,page_title,domain,pagePath,pageviews,url
0,fec.gov,fec.gov,.gov,True,https://www.fec.gov/,fec.gov,gov,www.fec.gov,True,200,...,0,0,,0,40,Home | FEC,www.fec.gov,/,1654,https://www.fec.gov/
1,monahrq.ahrq.gov,ahrq.gov,.gov,True,https://www.ahrq.gov/,ahrq.gov,gov,www.ahrq.gov,True,200,...,0,20,,0,100,Home | Agency for Healthcare Research and Quality,www.ahrq.gov,/,689,https://www.ahrq.gov/
2,www.spoc.spaceforce.mil,spaceforce.mil,.mil,False,https://www.spoc.spaceforce.mil/,spaceforce.mil,mil,www.spoc.spaceforce.mil,True,200,...,0,0,,0,0,Home,www.spoc.spaceforce.mil,/,666,https://www.spoc.spaceforce.mil/
3,www.fema.gov,fema.gov,.gov,False,https://www.fema.gov/,fema.gov,gov,www.fema.gov,True,200,...,39,20,,0,166,Home | FEMA.gov,www.fema.gov,/,26228,https://www.fema.gov/
4,redirector.nlm.nih.gov,nih.gov,.gov,True,https://www.nlm.nih.gov/,nih.gov,gov,www.nlm.nih.gov,True,200,...,11,20,,0,151,National Library of Medicine - National Instit...,www.nlm.nih.gov,/,3054,https://www.nlm.nih.gov/


In [16]:
# Number of de-duplicated URLs present in both reports.
common_url_count = len(common_urls)
print(
    f"There are {common_url_count} urls from the site scanning report "
    "that also appear in the top-10000 pages and screens report from analytics.usa.gov."
)

There are 447 urls from the site scanning report that also appear in the top-10000 pages and screens report from analytics.usa.gov.


In [17]:
# Unique values of the `domain` column, in order of first appearance.
domain_column = top_10k["domain"]
distinct_domains = domain_column.unique()
domain_count = len(distinct_domains)
print(f"There are {domain_count} distinct domains in the top 10k pages report.")

There are 1253 distinct domains in the top 10k pages report.


In [18]:
# Load the sampled CrUX results and drop rows that have no
# largest_contentful_paint value.
crux_sample = load_results_to_dataframe(
    notebook_dir=notebook_dir,
    file_name="data/sampled-crux-data-20240828.csv",
).dropna(subset=["largest_contentful_paint"])

# NOTE(review): the recorded output ends at index 1403, so 5000 appears to be
# above the row count and this effectively displays the whole frame —
# consider `.head()` plus `len(crux_sample)` instead.
crux_sample.head(5000)

Unnamed: 0,page_title,domain,pagePath,pageviews,url,time_to_first_byte,first_contentful_paint,largest_contentful_paint,cumulative_layout_shift,interaction_to_next_paint
0,Diabetes - Search Results - PubMed,pubmed.ncbi.nlm.nih.gov,/,922,https://pubmed.ncbi.nlm.nih.gov/,973.0,1312.0,1397.0,0.05,243.0
1,Live | The White House,www.whitehouse.gov,/live/,17294,https://www.whitehouse.gov/live/,385.0,1073.0,1733.0,0.00,153.0
2,NWS Radar,radar.weather.gov,/station/kfsx/standard,1752,https://radar.weather.gov/station/kfsx/standard,154.0,531.0,1402.0,0.01,112.0
3,Military | USCIS,www.uscis.gov,/military/military,1930,https://www.uscis.gov/military/military,298.0,845.0,1287.0,0.03,131.0
4,Important Visa Information - U.S. Embassy and ...,ng.usembassy.gov,/visas/important-visa-information/,2882,https://ng.usembassy.gov/visas/important-visa-...,1833.0,2601.0,3257.0,0.30,259.0
...,...,...,...,...,...,...,...,...,...,...
1400,NASA Citizen Scientists Spot Object Moving 1 M...,science.nasa.gov,/get-involved/citizen-science/nasa-citizen-sci...,2496,https://science.nasa.gov/get-involved/citizen-...,414.0,1146.0,1129.0,0.00,91.0
1401,"Storm Prediction Center Aug 12, 2024 1630 UTC ...",www.spc.noaa.gov,/products/outlook/day1otlk.html,16285,https://www.spc.noaa.gov/products/outlook/day1...,188.0,548.0,735.0,0.02,313.0
1402,Search Results | InertFinder | Pesticides | US...,ordspub.epa.gov,/ords/pesticides/f,4750,https://ordspub.epa.gov/ords/pesticides/f,2989.0,4811.0,4616.0,0.00,147.0
1403,Southern Region Headquarters,www.weather.gov,/srh/,852,https://www.weather.gov/srh/,815.0,1371.0,1546.0,0.04,402.0
