# Top 10k pages data

We take the 10,000 most-viewed pages reported by analytics.usa.gov over the last 30 days and run each of them through the Chrome User Experience Report (CrUX) API to get page-level performance information.

In [1]:
"""
Load the site-scanning, top-10k pages, and CrUX sample results into DataFrames and analyze them.
"""

import os
import pandas as pd
from utils.load_results_to_dataframe import load_results_to_dataframe

# Working directory of the running kernel. It is passed as `notebook_dir` to
# every load_results_to_dataframe call below, together with a relative
# "data/..." file name — presumably the loader joins the two; confirm in
# utils.load_results_to_dataframe. Note that os.getcwd() reflects where the
# kernel was started, which is not necessarily the folder holding this notebook.
notebook_dir = os.getcwd()

In [2]:
# Site-scanning weekly snapshot (dated 2024-08-28 per the file name). Its
# `final_url` column is what a later cell joins against the analytics URLs.
site_scanning = load_results_to_dataframe(
    file_name="data/site-scanning-weekly-snapshot-20240828.csv",
    notebook_dir=notebook_dir,
)

# Top-10,000 pages and screens report for a 30-day window (per the file name).
top_10k = load_results_to_dataframe(
    file_name="data/top-10000-pages-and-screens-30-days-20240828.csv",
    notebook_dir=notebook_dir,
)

# Build an absolute URL for every row: "https://" + domain + pagePath.
# Each value goes through str() element-wise, so the resulting text matches
# what a per-row str(domain) + str(pagePath) would give, but the concatenation
# is done column-wise instead of calling a Python lambda once per row.
top_10k["url"] = (
    "https://" + top_10k["domain"].map(str) + top_10k["pagePath"].map(str)
)

top_10k.head()

Unnamed: 0,page_title,domain,pagePath,pageviews,url
0,National Institute of Standards and Technology...,www.time.gov,/,71938105,https://www.time.gov/
1,National Institute of Standards and Technology...,time.gov,/,66873558,https://time.gov/
2,NWS Radar,radar.weather.gov,/,12205408,https://radar.weather.gov/
3,USPS.com® - USPS Tracking® Results,tools.usps.com,/go/trackconfirmaction,12093137,https://tools.usps.com/go/trackconfirmaction
4,Search Public Sex Offender Registries | Dru Sj...,www.nsopw.gov,/search-public-sex-offender-registries,11036250,https://www.nsopw.gov/search-public-sex-offend...


In [3]:
# Inner-join the site-scanning rows to the top-10k rows wherever the scanned
# site's `final_url` equals the analytics `url`, then keep only the first
# joined row for each distinct `url`.
common_urls = site_scanning.merge(
    top_10k,
    left_on="final_url",
    right_on="url",
).drop_duplicates(subset="url", keep="first")
common_urls.head()

Unnamed: 0,target_url,target_url_domain,target_url_top_level_domain,target_url_redirects,final_url,final_url_domain,final_url_top_level_domain,final_url_website,final_url_live,final_url_status_code,...,uswds_string,uswds_string_in_css,uswds_semantic_version,uswds_version,uswds_count,page_title,domain,pagePath,pageviews,url
0,fec.gov,fec.gov,.gov,True,https://www.fec.gov/,fec.gov,gov,www.fec.gov,True,200,...,0,0,,0,40,Home | FEC,www.fec.gov,/,1654,https://www.fec.gov/
1,monahrq.ahrq.gov,ahrq.gov,.gov,True,https://www.ahrq.gov/,ahrq.gov,gov,www.ahrq.gov,True,200,...,0,20,,0,100,Home | Agency for Healthcare Research and Quality,www.ahrq.gov,/,689,https://www.ahrq.gov/
2,www.spoc.spaceforce.mil,spaceforce.mil,.mil,False,https://www.spoc.spaceforce.mil/,spaceforce.mil,mil,www.spoc.spaceforce.mil,True,200,...,0,0,,0,0,Home,www.spoc.spaceforce.mil,/,666,https://www.spoc.spaceforce.mil/
3,www.fema.gov,fema.gov,.gov,False,https://www.fema.gov/,fema.gov,gov,www.fema.gov,True,200,...,39,20,,0,166,Home | FEMA.gov,www.fema.gov,/,26228,https://www.fema.gov/
4,redirector.nlm.nih.gov,nih.gov,.gov,True,https://www.nlm.nih.gov/,nih.gov,gov,www.nlm.nih.gov,True,200,...,11,20,,0,151,National Library of Medicine - National Instit...,www.nlm.nih.gov,/,3054,https://www.nlm.nih.gov/


In [4]:
# Report how many de-duplicated URLs are present in both reports.
common_urls_summary = (
    f"There are {len(common_urls)} urls from the site scanning "
    f"report that also appear in the top-10000 pages and screens report from analytics.usa.gov."
)
print(common_urls_summary)

There are 447 urls from the site scanning report that also appear in the top-10000 pages and screens report from analytics.usa.gov.


In [5]:
# Distinct values of the `domain` column, in order of first appearance.
# Hostnames are compared as-is, so "www.time.gov" and "time.gov" count separately.
distinct_domains = pd.unique(top_10k["domain"])
distinct_domains_summary = (
    f"There are {len(distinct_domains)} distinct domains in the top 10k pages report."
)
print(distinct_domains_summary)

There are 1253 distinct domains in the top 10k pages report.


In [6]:
# Load the sampled CrUX results and, in the same expression, discard every row
# that has no `largest_contentful_paint` value.
crux_sample = load_results_to_dataframe(
    notebook_dir=notebook_dir,
    file_name="data/sampled-crux-random-2k-20240828.csv",
).dropna(subset=["largest_contentful_paint"])

# head(5000) asks for more rows than the "2k" sample in the file name suggests,
# so this effectively shows the whole filtered frame (pandas abbreviates the
# rendering) — NOTE(review): confirm the sample size.
crux_sample.head(5000)

Unnamed: 0,page_title,domain,pagePath,pageviews,url,time_to_first_byte,first_contentful_paint,largest_contentful_paint,cumulative_layout_shift,interaction_to_next_paint
0,Bringing a Dog into the U.S. | Importation | CDC,www.cdc.gov,/importation/dogs/index.html,4868,https://www.cdc.gov/importation/dogs/index.html,533.0,1019.0,1086.0,0.00,95.0
1,Home — TreasuryDirect,treasurydirect.gov,/,34027,https://treasurydirect.gov/,635.0,1618.0,1953.0,0.00,93.0
2,"Charleston, WV",www.weather.gov,/rlx/,4739,https://www.weather.gov/rlx/,489.0,1007.0,1341.0,0.01,333.0
3,Sector Images: Gulf of Mexico - NOAA / NESDIS ...,www.star.nesdis.noaa.gov,/goes/sector.php,4916,https://www.star.nesdis.noaa.gov/goes/sector.php,849.0,1320.0,2056.0,0.01,225.0
4,Biden-Harris administration announces proposed...,www.osha.gov,/news/newsreleases/national/07022024,2524,https://www.osha.gov/news/newsreleases/nationa...,759.0,1591.0,1922.0,0.00,83.0
...,...,...,...,...,...,...,...,...,...,...
809,The Constitution of the United States: A Trans...,www.archives.gov,/founding-docs/constitution-transcript,976,https://www.archives.gov/founding-docs/constit...,558.0,1448.0,2052.0,0.03,204.0
810,Site Index Search | Internal Revenue Service,www.irs.gov,/es/site-index-search,1103,https://www.irs.gov/es/site-index-search,997.0,1354.0,1350.0,0.00,134.0
811,Department of Justice | Privacy Policy,www.justice.gov,/doj/privacy-policy,35160,https://www.justice.gov/doj/privacy-policy,1032.0,2122.0,2321.0,0.00,186.0
812,ECAP — FBI,www.fbi.gov,/wanted/ecap,1025,https://www.fbi.gov/wanted/ecap,398.0,474.0,500.0,0.00,103.0
