In [None]:
import requests
import json, re, time
from datetime import datetime
import pandas as pd

with open('api.key','r') as f:
    APIKEY = f.read()

HEADERS = {'apikey': APIKEY,
            'accept': 'application/json',
            'content-type': 'application/json',
            'cookie': 'ASP.NET_SessionId=aiggen1ccck0gq141dgq1sip; ASP.NET_SessionId=aiggen1ccck0gq141dgq1sip'
          }

API_ENDPOINT = 'https://kong.speedcheckerapi.com:8443/ProbeAPIv2/'



In [75]:
def retrievePageLoadTestResults(testID):
    
    url = API_ENDPOINT + "GetPageLoadResults?apikey=" + APIKEY + "&testID=" + testID
    print(url)
    try:    
        r = requests.get(url, headers=HEADERS)
    except requests.exceptions.RequestException as e:  # This is the correct syntax
        return []
    
    res = json.loads(r.text)
    print(res)
    return res['PageLoadTestResults']

In [None]:
df_tests_url = pd.read_csv('data/url_testID_3.csv', sep='|', encoding='utf-8', keep_default_na=False)
df_tests_url = df_tests_url.loc[df_tests_url.testID!='FAILED']
df_tests_url

In [None]:
df_results = pd.DataFrame(columns=['ProbeInfo',
                                       'TestDateTime',
                                       'StatusCode',
                                       'StatusText',
                                       'DNSLookupTime', 
                                       'Destination', 
                                       'HTTPStatus', 
                                       'InitialConnection', 
                                       'NumberOfRequests', 
                                       'PageLoadTime', 
                                       'SSLNegotiationTime', 
                                       'TTFB', 
                                       'TotalDownloadedBytes'
                                      ])

for index, row in df_tests_url.iterrows():
    
    testID = row['testID']
    res =  retrievePageLoadTestResults(testID)

    if (res is None or len(res) < 1):
        continue
    
    probeInfo = res[0]['ProbeInfo']
    testDateTime = res[0]['TestDateTime']

    #extract the epoch
    m = re.search('/Date\((\d+)\+0000\)/', testDateTime)
    if m:
        testDateTime = m.group(1)[:-3]

    #convert into human readable format
    testDateTime = datetime.fromtimestamp(int(testDateTime)).strftime('%Y-%m-%d')
    statusCode = res[0]['TestStatus']['StatusCode']
    statusText = res[0]['TestStatus']['StatusText']
    dnsLookupTime = res[0]['DNSLookupTime']
    destination =  res[0]['Destination']
    hTTPStatus = res[0]['HTTPStatus']
    initialConnection = res[0]['InitialConnection']
    numberOfRequests =  res[0]['NumberOfRequests'] 
    pageLoadTime =  res[0]['PageLoadTime'] 
    sslNegotiationTime = res[0]['SSLNegotiationTime'] 
    ttfb = res[0]['TTFB']
    totalDownloadedBytes = res[0]['TotalDownloadedBytes']

    df_results = df_results.append({'ProbeInfo':probeInfo,
        'TestDateTime': testDateTime,
        'StatusCode':statusCode,
        'StatusText':statusText,
        'DNSLookupTime':dnsLookupTime,
        'Destination':destination,
        'HTTPStatus':hTTPStatus,
        'InitialConnection':initialConnection,
        'NumberOfRequests':numberOfRequests,
        'PageLoadTime':pageLoadTime,
        'SSLNegotiationTime':sslNegotiationTime,
        'TTFB':ttfb,
        'TotalDownloadedBytes':totalDownloadedBytes
    }, ignore_index=True)

df_results

In [None]:
df_results.to_csv('data/url_results_3.csv', sep='|', encoding='utf-8', index=False)


In [None]:
df_results.loc[df_results.HTTPStatus=='200']

In [None]:
df_url_results1 = pd.read_csv('data/url_results_1.csv', sep='|', encoding='utf-8', keep_default_na=False)

In [None]:
df_url_links_sample1 = pd.read_csv('data/url_links_sample1.csv', sep='|', encoding='utf-8', keep_default_na=False)
df_url_links_sample2 = pd.read_csv('data/url_links_sample2.csv', sep='|', encoding='utf-8', keep_default_na=False)

In [None]:
df_url_links_sample = pd.concat([df_url_links_sample1.iloc[:,[0,1,3]], df_url_links_sample2.iloc[:,[0,1,3]]], ignore_index=True)


In [None]:
df_url_links_sample = df_url_links_sample.drop_duplicates()

In [None]:
df_url_links_sample.groupby('domain')['url'].nunique()

In [None]:
df_domains = pd.read_csv('data/domain.csv', sep='|', encoding='latin1', keep_default_na=False)

In [None]:
df_links = pd.read_csv('data/links.csv', sep='|', encoding='latin1', keep_default_na=False)
df_links

In [None]:
len(df_url_links_sample)

In [None]:
df_links.groupby('cc')['domain'].nunique()

In [None]:
df_links.loc[df_links.cc=='ZA'].domain.drop_duplicates()

In [80]:
print(retrievePageLoadTestResults('db943c53-5844-4398-b78d-b38d89874c86'))

https://kong.speedcheckerapi.com:8443/ProbeAPIv2/GetPageLoadResults?apikey=a374d018-7235-4fd8-84f2-e97b80233544&testID=db943c53-5844-4398-b78d-b38d89874c86
{'ResponseStatus': {'StatusCode': '502', 'StatusText': 'Test failed, not enough probes'}, 'PageLoadTestResults': [{'ProbeInfo': {'CountryCode': 'ZW', 'Latitude': -17.8178, 'Longitude': 31.0447, 'Platform': 'PC', 'ProbeID': 'ee9579c4-65b0-4022-a765-a5105b208137'}, 'TestDateTime': '/Date(1531204298922+0000)/', 'TestStatus': {'StatusCode': '200', 'StatusText': 'OK'}, 'DNSLookupTime': 885, 'Destination': 'https://www.google.com/amp/s/www.voazimbabwe.com/amp/4452468.html', 'HTTPStatus': '200', 'InitialConnection': 130, 'NumberOfRequests': 100, 'PageLoadTime': 33362, 'SSLNegotiationTime': 79, 'TTFB': 3474, 'TotalDownloadedBytes': 1398217}]}
[{'ProbeInfo': {'CountryCode': 'ZW', 'Latitude': -17.8178, 'Longitude': 31.0447, 'Platform': 'PC', 'ProbeID': 'ee9579c4-65b0-4022-a765-a5105b208137'}, 'TestDateTime': '/Date(1531204298922+0000)/', 'Tes

In [None]:
df_url_results1 = pd.read_csv('data/url_results_1.csv', sep='|', encoding='utf-8', keep_default_na=False)
df_url_results2 = pd.read_csv('data/url_results_2.csv', sep='|', encoding='utf-8', keep_default_na=False)
df_url_results3 = pd.read_csv('data/url_results_3.csv', sep='|', encoding='utf-8', keep_default_na=False)

df_url_results = pd.concat([df_url_results1, df_url_results2, df_url_results3], ignore_index=True)
df_url_results = df_url_results.loc[df_url_results.StatusCode==200]

In [None]:
df_url_results.to_csv('data/url_results.csv', sep='|', encoding='utf-8', index=False)

In [None]:
df_url_links_sample

In [None]:
df_ori_amp_results = pd.read_csv('data/ori_amp_url_results.csv', sep='|', encoding='utf-8', keep_default_na=False)
df_amp_viewer_results = pd.read_csv('data/amp_viewer_url_results.csv', sep='|', encoding='utf-8', keep_default_na=False)

In [None]:
len(df_amp_viewer_results['domain'].drop_duplicates())

In [None]:
df_amp_viewer_results = pd.read_csv('data/amp_viewer_url_results.csv', sep='|', encoding='utf-8', keep_default_na=False)
# df_amp_viewer_results = df_amp_viewer_results.loc[df_amp_viewer_results['StatusText'] == 'OK']
# df_amp_viewer_results
df_amp_viewer_results = df_amp_viewer_results[(df_amp_viewer_results.HTTPStatus == 200) & (df_amp_viewer_results['StatusText'] == 'OK')]
df_amp_viewer_results = df_amp_viewer_results[(df_amp_viewer_results.InitialConnection > 0) & (df_amp_viewer_results.SSLNegotiationTime > 0) & (df_amp_viewer_results.DNSLookupTime > 0)]
df_amp_viewer_results

In [None]:
df_amp_viewer_results[df_amp_viewer_results.StatusCode==200]

In [None]:
df_amp_cdn_results = pd.read_csv('data/amp_cdn_url_results.csv', sep='|', encoding='utf-8', keep_default_na=False)
df_amp_cdn_results = df_amp_cdn_results[(df_amp_cdn_results.HTTPStatus=='200') & 
                                        (df_amp_cdn_results.StatusText=='OK') & 
                                        (df_amp_cdn_results.InitialConnection > 0)]
df_amp_cdn_results

In [61]:
df_url_results = pd.read_csv('data/url_results.csv', sep='|', encoding='utf-8', keep_default_na=False)
df_url_results = df_url_results[(df_url_results.HTTPStatus==200) & (df_url_results.StatusText=='OK')]

In [65]:
df_ori_amp_results = pd.read_csv('data/ori_amp_url_results.csv', sep='|', encoding='utf-8', 
                                 keep_default_na=False, 
                                 converters={'InitialConnection':int, 'DNSLookupTime':int})
df_ori_amp_results = df_ori_amp_results[(df_ori_amp_results.HTTPStatus == '200') & 
                                        (df_ori_amp_results.StatusText == 'OK') &
                                        (df_ori_amp_results['InitialConnection'] > 0) &
                                        (df_ori_amp_results['DNSLookupTime'] > 0)]

In [70]:
df_amp_viewer_results = pd.read_csv('data/amp_viewer_url_results.csv', sep='|', encoding='utf-8', 
                                    keep_default_na=False,
                                    converters={'InitialConnection':int, 'DNSLookupTime':int})                                  
df_amp_viewer_results = df_amp_viewer_results[(df_amp_viewer_results.HTTPStatus == 200) & 
                                              (df_amp_viewer_results.StatusText == 'OK') &
                                              (df_amp_viewer_results['InitialConnection'] > 0) &
                                              (df_amp_viewer_results['DNSLookupTime'] > 0)]

In [89]:
df_amp_viewer_results.loc[df_amp_viewer_results['TotalDownloadedBytes'] > 10000000]

Unnamed: 0,domain,cc,ProbeInfo,TestDateTime,StatusCode,StatusText,DNSLookupTime,Destination,HTTPStatus,InitialConnection,NumberOfRequests,PageLoadTime,SSLNegotiationTime,TTFB,TotalDownloadedBytes
103,kemptonexpress.co.za,ZA,"{'ASN': 10474, 'CityName': 'Cape Town', 'Conne...",2018-07-06,200,OK,52,https://www.google.com/amp/s/kemptonexpress.co...,200,113,346,35544,88,1132,5289926
174,www.zimbabwesituation.com,ZW,"{'ASN': 37183, 'ConnectionType': 'Ethernet', '...",2018-07-06,200,OK,55,https://www.google.com/amp/s/www.zimbabwesitua...,200,507,232,37595,278,2876,5271402
195,germistoncitynews.co.za,ZA,"{'ASN': 37105, 'CityName': 'Johannesburg', 'Co...",2018-07-06,200,OK,79,https://www.google.com/amp/s/germistoncitynews...,200,227,380,29811,149,4138,6141063
287,kemptonexpress.co.za,ZA,"{'ASN': 10474, 'CityName': 'Cape Town', 'Conne...",2018-07-06,200,OK,51,https://www.google.com/amp/s/kemptonexpress.co...,200,111,430,30731,88,1245,7058178
288,kemptonexpress.co.za,ZA,"{'ASN': 10474, 'CityName': 'Cape Town', 'Conne...",2018-07-06,200,OK,35,https://www.google.com/amp/s/kemptonexpress.co...,200,101,435,31634,68,3845,6511389
290,kemptonexpress.co.za,ZA,"{'ASN': 37105, 'CityName': 'Johannesburg', 'Co...",2018-07-06,200,OK,81,https://www.google.com/amp/s/kemptonexpress.co...,200,236,351,29314,150,3281,5477851
313,risingsunchatsworth.co.za,ZA,"{'ASN': 37105, 'CityName': 'Johannesburg', 'Co...",2018-07-06,200,OK,26,https://www.google.com/amp/s/risingsunoverport...,200,112,460,22287,89,2790,7362134
314,risingsunchatsworth.co.za,ZA,"{'ASN': 37105, 'CityName': 'Johannesburg', 'Co...",2018-07-06,200,OK,85,https://www.google.com/amp/s/risingsunoverport...,200,287,484,31931,137,3188,6968988
317,risingsunchatsworth.co.za,ZA,"{'ASN': 10474, 'CityName': 'Cape Town', 'Conne...",2018-07-06,200,OK,41,https://www.google.com/amp/s/risingsunoverport...,200,109,338,24925,70,3799,5399531
320,www.standardmedia.co.ke,KE,"{'ASN': 328271, 'ConnectionType': 'Ethernet', ...",2018-07-06,200,OK,5,https://www.google.com/amp/s/www.sde.co.ke/amp...,200,208,153,17829,130,2761,6416553


In [104]:
df_links = pd.read_csv('data/links_sample.csv', sep='|', encoding='latin1', keep_default_na=False)
len(df_links)

455

In [106]:
len(df_links['cc'].drop_duplicates())

41

In [None]:
df