# Is there variance in Site Scanning and the Chrome User Experience Report (CrUX)?

Here, we look at the difference between CrUX data that reports the p75 values of the Core Web Vitals and the results from the synthetic performance testing data from the Site Scanning report.

In [54]:
import os
import pandas as pd

def load_results_to_dataframe(**kwargs):
    file_path = os.path.join(kwargs["notebook_dir"], kwargs["file_name"])
    df = pd.read_csv(file_path)
    return df

In [55]:
crux = load_results_to_dataframe(
    file_name="data/crux-data-202406.csv",
    notebook_dir=os.getcwd()
)

sitescanning = load_results_to_dataframe(
    file_name="data/site-scanning-weekly-snapshot-20240722.csv",
    notebook_dir=os.getcwd()
) 

sitescanning['final_url_website'] = 'https://' + sitescanning['final_url_website']

merged_df = pd.merge(sitescanning, crux, left_on='final_url_website', right_on='origin')
required_columns = ['origin', 'p75_lcp', 'largest_contentful_paint', 'p75_cls', 'cumulative_layout_shift', 'p75_inp']
merged_df = merged_df[required_columns].dropna()
merged_df.rename(columns={'p75_lcp': 'lcp_crux_p75', 'p75_cls': 'cls_crux_p75', 'largest_contentful_paint': 'lcp_site_scanning', 'cumulative_layout_shift': 'cls_site_scanning'}, inplace=True)
merged_df['lcp_difference_crux_vs_scan'] = merged_df['lcp_crux_p75'] - merged_df['lcp_site_scanning']
merged_df['cls_difference_crux_vs_scan'] = merged_df['cls_crux_p75'] - merged_df['cls_site_scanning']
merged_df['lcp_abs_difference_crux_vs_scan'] = abs(merged_df['lcp_crux_p75'] - merged_df['lcp_site_scanning'])
merged_df['cls_abs_difference_crux_vs_scan'] = abs(merged_df['cls_crux_p75'] - merged_df['cls_site_scanning'])

merged_df.sort_values(by='lcp_abs_difference_crux_vs_scan', ascending=False).head(1500)

Unnamed: 0,origin,lcp_crux_p75,lcp_site_scanning,cls_crux_p75,cls_site_scanning,p75_inp,lcp_difference_crux_vs_scan,cls_difference_crux_vs_scan,lcp_abs_difference_crux_vs_scan,cls_abs_difference_crux_vs_scan
5437,https://apps.nea.gov,25000.0,107.100,0.00,0.000000,175.0,24892.900,0.000000,24892.900,0.000000
6851,https://invitation.nasa.gov,4900.0,26472.400,0.05,0.000045,175.0,-21572.400,0.049955,21572.400,0.049955
1143,https://passport.intelink.gov,20600.0,1286.399,0.00,0.038292,25.0,19313.601,-0.038292,19313.601,0.038292
8523,https://crg.health.mil,5400.0,23345.399,0.00,0.000000,75.0,-17945.399,0.000000,17945.399,0.000000
2223,https://cce-datasharing.gsfc.nasa.gov,4100.0,21531.400,0.00,0.000000,50.0,-17431.400,0.000000,17431.400,0.000000
...,...,...,...,...,...,...,...,...,...,...
9069,https://espanol.stopbullying.gov,2600.0,404.000,0.00,0.016287,125.0,2196.000,-0.016287,2196.000,0.016287
5654,https://www.cancer.gov,2600.0,404.300,0.40,0.018398,125.0,2195.700,0.381602,2195.700,0.381602
808,https://www.jec.senate.gov,2600.0,404.399,0.00,0.073967,50.0,2195.601,-0.073967,2195.601,0.073967
4994,https://pumas.nasa.gov,5700.0,7895.300,0.00,0.005571,75.0,-2195.300,-0.005571,2195.300,0.005571


## Data differences

A positive value for the mean or median difference in the below means that the values reported by the CrUX dataset were higher than the Site Scanning data, while a negative value for those columns indicates that the value reported by the Site Scanning data was higher.  

In [58]:
data = {
    'Measure': ['LCP', 'CLS'],
    'Mean Difference (abs)': [merged_df['lcp_abs_difference_crux_vs_scan'].mean(), merged_df['cls_abs_difference_crux_vs_scan'].mean()],
    'Mean Difference': [merged_df['lcp_difference_crux_vs_scan'].mean(), merged_df['cls_difference_crux_vs_scan'].mean()],
    'Median Difference (abs)': [merged_df['lcp_abs_difference_crux_vs_scan'].median(), merged_df['cls_abs_difference_crux_vs_scan'].median()],
    'Median Difference': [merged_df['lcp_abs_difference_crux_vs_scan'].mean(), merged_df['cls_abs_difference_crux_vs_scan'].mean()],
    'Standard Deviation (abs)': [merged_df['lcp_abs_difference_crux_vs_scan'].std(), merged_df['cls_abs_difference_crux_vs_scan'].std()],
    'Standard Deviation': [merged_df['lcp_difference_crux_vs_scan'].std(), merged_df['cls_difference_crux_vs_scan'].std()]
}

df = pd.DataFrame(data)
df.head()

Unnamed: 0,Measure,Mean Difference (abs),Mean Difference,Median Difference (abs),Median Difference,Standard Deviation (abs),Standard Deviation
0,LCP,1474.350764,1331.969751,1224.399,1474.350764,1255.940027,1406.057555
1,CLS,0.142335,-0.060149,0.049996,0.142335,0.21807,0.253373
