In [32]:
from PythonPSI.api import PSI
import pandas as pd
import time
from ipynb.fs.full.functions import *
import glob

### Notes
This function is slow: every URL needs its own API request, followed by a 12-second pause to stay within the API's rate limit, so the run time grows with the number of URLs. With 50 URLs, it takes about `20 minutes` to run.

Not every URL returns usable performance data, since the API has some restrictions. Some webpages require too many queries, or the request simply fails. With 50 URLs, about `30 URLs` return usable data, although the results can vary between runs.

This function will need some refactoring at some point to enhance performance.

In [47]:
def GetSpecificPerformanceData(df):
    """Extract the headline field metrics of one website from its performance frame.

    Args:
        df: DataFrame indexed by metric name with an "id" column (the page URL)
            and a "metrics" column holding one mapping per metric. Each mapping
            must convert to a DataFrame that has a "percentile" column.

    Returns:
        dict with the keys "URL", "FCP", "LCP", "FID" and "CLS", where each
        metric value is the first "percentile" entry of the matching row.
    """
    def first_percentile(metric_name):
        # Expand the metric mapping into a frame and read its first percentile value.
        metric_frame = pd.DataFrame(df.loc[metric_name, "metrics"])
        return metric_frame["percentile"].iloc[0]

    # The URL is read from the CLS row, as every row carries the same "id".
    page_url = df.loc["CUMULATIVE_LAYOUT_SHIFT_SCORE", "id"]
    layout_shift = first_percentile("CUMULATIVE_LAYOUT_SHIFT_SCORE")
    first_paint = first_percentile("FIRST_CONTENTFUL_PAINT_MS")
    input_delay = first_percentile("FIRST_INPUT_DELAY_MS")
    largest_paint = first_percentile("LARGEST_CONTENTFUL_PAINT_MS")

    return {
        "URL": page_url,
        "FCP": first_paint,
        "LCP": largest_paint,
        "FID": input_delay,
        "CLS": layout_shift,
    }

def FileExists(filename):
    """Report whether a cached result file is present for `filename`.

    Args:
        filename: cache file stem; the path checked is
            ``websitespeed/<filename>.csv`` relative to the working directory.

    Returns:
        True when the file can be opened for reading, False when opening it
        raises an OSError (missing file or directory, no permission, ...).
    """
    try:
        # Opening the file (instead of only testing the path) means an
        # unreadable file is reported as absent as well.
        with open(f"websitespeed/{filename}.csv"):
            return True
    except OSError:
        return False

def _SpeedCacheName(keyword, url):
    """Return the cache file stem (no directory, no extension) for a keyword/URL pair."""
    return "{}-{}".format(keyword, url.replace('.', '').replace('/', '').replace(':', ''))


def _FetchAndCacheSpeed(keyword, url):
    """Request the performance data of one URL and cache it as a one-row CSV.

    Returns a status string:
        "skipped"    - a cache file already exists, or the API call raised
        "stored"     - data was written to websitespeed/<cache name>.csv
        "incomplete" - the response only contained 'initial_url'; nothing stored
        "error"      - the response only contained 'error'; nothing stored
    """
    filename = _SpeedCacheName(keyword, url)
    if FileExists(filename):
        return "skipped"

    try:
        # NOTE(review): the 'stratergy' keyword is kept exactly as the existing
        # call spells it - confirm against the PythonPSI signature before renaming.
        data = PSI(url, category='performance', locale='en', stratergy='desktop', metrics='loadingExperience')
    except Exception:
        # Best effort: a failing request must not abort the whole run.
        # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt working.
        return "skipped"

    data_keys = list(data.keys())
    if data_keys == ['error']:
        return "error"
    if data_keys == ['initial_url']:
        return "incomplete"

    performance = GetSpecificPerformanceData(pd.DataFrame(data))
    # Write to the same path that FileExists() checks, so the cache is found next run.
    pd.DataFrame([performance]).to_csv("websitespeed/{}.csv".format(filename))
    return "stored"


def GetWebsiteSpeed(keyword, df, delay=12):
    """Collect website performance data for every URL in `df` and return it as a frame.

    Each URL is requested once; URLs whose response was an explicit API error
    are retried once after the first pass. Every usable result is cached as
    ``websitespeed/<keyword>-<sanitised url>.csv`` and URLs that already have
    a cache file are not requested again. The ``websitespeed`` directory is
    expected to exist.

    Args:
        keyword: search keyword the URLs belong to; used as cache file prefix.
        df: DataFrame with a "Ur" column containing the URLs to measure.
        delay: seconds to wait after each completed API request, because the
            API restricts the number of calls per minute (default 12).

    Returns:
        DataFrame with the columns "URL", "FCP", "LCP", "FID" and "CLS", built
        from all cache files of `keyword` (including files from earlier runs).
        URLs without usable data are absent. The frame is empty when no cache
        file exists.
    """
    retry_urls = []

    for url in df["Ur"]:
        status = _FetchAndCacheSpeed(keyword, url)
        if status == "skipped":
            # no request completed, so no need to wait
            continue
        if status == "error":
            retry_urls.append(url)
            print('one failed url added')
        # API restricts from too many calls per minute, so wait a bit
        time.sleep(delay)

    # second chance for the URLs the API answered with an error
    for url in retry_urls:
        if _FetchAndCacheSpeed(keyword, url) != "skipped":
            time.sleep(delay)

    columns = ["URL", "FCP", "LCP", "FID", "CLS"]
    # escape the keyword so glob metacharacters in it are matched literally
    files = glob.glob("websitespeed/{}-*.csv".format(glob.escape(keyword)))
    if not files:
        return pd.DataFrame(columns=columns)
    complete_df = pd.concat([pd.read_csv(f) for f in files])
    return complete_df[columns]

In [48]:
# test run
# Every entry must be a single absolute URL. The bouwmaterialenzeeland entry
# previously read "domain,url" (a pasted CSV pair), which is not a valid URL.
d = {'Ur': ["https://www.bouwmaat.com/bouwmaterialen",
            "https://www.bouwbestel.nl/bouwmaterialen.html",
            "https://www.hornbach.nl/shop/Bouwmateriaal/S4471/artikeloverzicht.html",
            "https://www.online-bouwmaterialen.nl/",
            "https://www.bouwonline.com/",
            "https://www.hetjagershuis.com/c-4257901/bouwmaterialen/",
            "https://www.3mnederland.nl/3M/nl_NL/p/c/bouwmaterialen/",
            "https://www.bouwmaterialenzeeland.nl/",
            "https://www.boer-staphorst.nl/klussen-bouwen/bouwmaterialen",
            "https://www.eco-bouwmaterialen.nl/",
            "https://www.flexbouwmateriaal.nl/",
            "https://www.breukers.nl/bouwmaterialen",
            "https://www.bouwhof.nl/bouw/bouwmaterialen/",
            "https://www.kombibouwmaterialen.nl/producten/bouwmaterialen/",
            "https://www.ggoedkoop.nl/",
            "https://www.groenebouwmaterialen.nl/"
           ]}

# df = pd.DataFrame(data=d)
# keyword = 'bouwmaterialen'
# joe = GetWebsiteSpeed(keyword, df)
#joe

file already exists
file already exists
file already exists
file already exists
file already exists
file already exists
file already exists
one failed url added
file already exists
file already exists
file already exists
file already exists
file already exists
file already exists
file already exists
file already exists


Unnamed: 0,URL,FCP,LCP,FID,CLS
0,https://www.debouwmarktshop.nl/,2414,2940,6,68
0,https://www.bouwhof.nl,1235,1785,3,7
0,https://www.wienerberger.nl,885,1587,2,11
0,https://www.boer-staphorst.nl,1627,2697,4,9
0,https://www.houthandelonline.nl/bouwmaterialen,378,514,2,6
0,https://www.ggoedkoop.nl/,994,1565,4,6
0,https://www.bouwcenter.nl,927,1248,3,9
0,https://www.bakkerdehouthandel.nl,1132,1256,3,4
0,https://www.karwei.nl/assortiment/k/bouwmateri...,700,1096,3,3
0,https://www.stmiddelkoop.nl,1380,1898,3,1
