# Experiments

In [2]:
# Importing necessary packages
import json
import requests
import pandas as pd
from urllib.parse import urlparse
from IPython.display import display
from portal_handler import add_prefixes, validate_list, analyze_list, extract_working_apis

## Validating "www." URLs with and without "www."

The experiments carried out below should support the decision-making process regarding the deduplication of URLs in the portal handler. Since a minority of URLs are appearing in the list twice, once with and once without the "www." prefix, the question is whether this prefix can be removed in the early deduplication step in the portal handler or if this would cause problems, e.g. a large number of sites not responding to requests anymore.

### Preparation

In [None]:
# Loading URLs
df = pd.read_csv("data/4_prefixed_portals.csv")
print("Total sites:", len(df))

# Keeping only active sites; the explicit copy gives an independent dataframe so that the new
# column below is not written to a filtered view of the loaded data
df = df[df["active"] == True].copy()
print("Active sites:", len(df))

# Adding column for netloc (= part of URL after the protocol prefix)
# Only URLs starting with "http" are parsed; every other row (including rows with a missing URL)
# gets NaN, and the column is created in one step so it exists even if no URL matches
has_protocol = df["url"].str.startswith("http", na = False)
df["netloc"] = df["url"].where(has_protocol).map(lambda url: urlparse(url).netloc, na_action = "ignore")

### Finding duplicates by removing "www." from URLs

In [None]:
# Working on a copy so that the dataframe from the preparation step stays untouched
all_portals = df.copy()

# Removing "www." from the netloc and saving in a new column
# The column is assigned in one step instead of cell by cell, so it also exists for an empty dataframe
all_portals["netloc_without_www"] = [urlparse(url).netloc.removeprefix("www.") for url in all_portals["url"]]

# Finding all duplicate URLs and keeping both the first occurrences and the duplicates
duplicates = all_portals[all_portals.duplicated("netloc_without_www", keep = False)].sort_values("netloc_without_www")

# Counting the number of duplicates (without the first occurrences)
# Summing the boolean mask gives the same number as filtering and sorting the rows first
print("Duplicate sites after removing \"www.\":", int(all_portals.duplicated("netloc_without_www").sum()))

# Showing all the duplicates (with the first occurrences)
# with pd.option_context('display.max_rows', None, 'display.max_columns', None):
#     display(duplicates)
display(duplicates)

### Validating the URLs

In [None]:
# Keeping only portals with "www."
# Rows without a netloc (NaN) are treated as "no www." instead of producing a NaN in the boolean
# mask, which pandas refuses to index with; the copy keeps the edits below away from df
www_portals = df[df["netloc"].str.startswith("www.", na = False)].copy()
www_portals = www_portals.sort_values(by=["url"])
print("URLs with \"www.\" (including duplicates, see above):", len(www_portals))
# www_portals.drop(columns = "netloc").to_csv("experimental_data/www/with_www_prefixed.csv", index = None)

# Removing "www."
# Each URL is parsed once and rebuilt from scheme and netloc only, so any path, query or fragment is dropped
parsed_urls = www_portals["url"].map(urlparse)
www_portals["url"] = [f"{parsed.scheme}://{parsed.netloc.removeprefix('www.')}" for parsed in parsed_urls]
# www_portals.drop(columns = "netloc").to_csv("experimental_data/www/www_removed_prefixed.csv", index = None)

# with pd.option_context('display.max_rows', None, 'display.max_columns', None):
#     display(www_portals)

# Validating the "www." URLs
# validate_list(input_list = "experimental_data/www/with_www_prefixed.csv",
#             output_list = "experimental_data/www/with_www_validated.csv",
#             output_markers = "experimental_data/www/with_www_validated_sites.json")

# Validating the same URLs after removing "www."
# validate_list(input_list = "experimental_data/www/www_removed_prefixed.csv",
#             output_list = "experimental_data/www/www_removed_validated.csv",
#             output_markers = "experimental_data/www/www_removed_validated_sites.json")

### Analyzing validation results

In [None]:
# Running the analysis for both validation runs (with "www." first, then with "www." removed);
# export = True is passed so that the results are added to the statistics CSV file
for validated_file in (
    "experimental_data/www/with_www_validated.csv",
    "experimental_data/www/www_removed_validated.csv",
):
    analyze_list(validated_file, show = False, export = True)

In [40]:
# Reading the validation statistics CSV; the bare last expression renders it as the cell output
validation_statistics = pd.read_csv("data/validation_statistics.csv")
validation_statistics

Unnamed: 0,file,total,active,inactive,validated,unvalidated,subpage_endpoints,no_markers,ckan_suspected,ckan_working,opendatasoft_suspected,opendatasoft_working,socrata_suspected,socrata_working,timestamp
0,data/5_validated_portals_retry.csv,5539,4380,1159,4377,1162,4,3609,276,233,362,356,130,128,2023-07-31 19:28:07.676967
1,experimental_data/www/with_www_validated.csv,1193,1193,0,1180,13,0,1126,29,20,17,16,8,8,2023-08-01 00:37:52.636677
2,experimental_data/www/www_removed_validated.csv,1193,1193,0,992,201,0,956,26,17,6,6,4,4,2023-08-01 00:49:28.683073
3,experimental_data/railway/railway_validated.csv,8,8,0,8,0,1,2,3,2,3,3,0,0,2023-08-01 01:07:45.890909
4,experimental_data/universities/universities_va...,9,9,0,8,1,1,6,2,2,0,0,0,0,2023-08-01 01:09:59.786191
5,experimental_data/portalwatch/2_validated_port...,267,177,90,176,91,267,0,97,61,10,9,69,46,2023-09-11 18:43:57.437129


Duplicate URLs that appear once with and once without "www" compared to all URLs with "www": ~11% (130/1193)

All results below contain duplicates. <br>

URLs that are broken after removing "www" compared to validated URLs with "www": ~16% (188/1180) <br>
URLs that are broken after removing "www" compared to all validated URLs: ~4% (188/4377) <br>
API URLs that are broken after removing "www" compared to working API URLs with "www": <br>
* CKAN: ~15% (3/20)
* Opendatasoft: ~63% (10/16)
* Socrata: 50% (4/8)

API URLs that are broken after removing "www" compared to all working API URLs:
* CKAN: ~1% (3/233)
* Opendatasoft: ~3% (10/356)
* Socrata: ~3% (4/128)

In [None]:
# Reading the per-site validation results of both runs
with_www_validated = pd.read_csv("experimental_data/www/with_www_validated.csv")
www_removed_validated = pd.read_csv("experimental_data/www/www_removed_validated.csv")

# Putting "before" and "after" side by side, one line per site. The rows are paired by index,
# so this relies on both files listing the sites in the same order.
merged = pd.merge(with_www_validated, www_removed_validated, left_index = True, right_index = True)

# A site counts as broken if its API worked before and, after removing "www.", it either
# could not be validated (no value for api_working) or its API did not work anymore
worked_before = merged["api_working_x"] == True
broken_after = merged["api_working_y"].isna() | (merged["api_working_y"] == False)

# Restricting to the broken sites and to the relevant columns in the desired order
relevant_columns = ["url_x", "url_y", "validated_x", "validated_y", "suspected_api_x", "api_working_x", "api_working_y", "api_version_x", "error_type_y"]
merged = merged.loc[worked_before & broken_after, relevant_columns]

print("Portals with working APIs that were broken after removing the \"www.\" prefix:", len(merged))

# Showing the full dataframe - columns ending in "_x" hold the values from before removing "www.",
# columns ending in "_y" the values from after removing it
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(merged)

Based on the results above, the decision was made not to remove the "www." prefix in the early deduplication step. Taking into account the small share of the described duplicates among all URLs, the benefits derived from this deduplication (reducing the number of HTTP requests during validation, avoiding any duplicate sites in the list) do not outweigh the disadvantages ("losing" 10-20 Open Data portals with working APIs), especially since one main goal is to collect as many Open Data portals as possible and there are some interesting portals in the list of portals broken by the prefix removal, like the Open Data portals of Bahrain, Wallonia, Corsica and the City of Dallas.

Instead, after the validation step, any remaining duplicates that appear with and without "www." are removed from the list of portals with working APIs. See the portal handler for details.

An alternative solution would have been to request every URL with and without the "www." prefix, similar to the already implemented function that performs an HTTPS request and, if necessary, an HTTP request to determine the best available protocol. However, this would have led to multiple additional requests for many of the approximately 5000 sites, as multiple combinations of HTTP or HTTPS and WWW or no WWW would have had to be tried out, and would have vastly exceeded the reduction in requests from removing just over 100 duplicates.

## Validating Opendatasoft file export formats

Since Opendatasoft is specifically designed for structured datasets only (https://help.opendatasoft.com/faq-glossary/en/faq_index.html), it could be hypothesized that every dataset on every Opendatasoft portal might be available in the CSV format. If true, this would allow a simplification of the Opendatasoft crawling function(s) and would reduce the required crawling time and number of HTTP requests, as the availability of the CSV file format would not have to be checked in every single instance.

Technically, the hypothesis would have to be the other way around: "Not every Opendatasoft portal offers every dataset in CSV". We could show that there is not sufficient evidence to prove this hypothesis if the number of datasets was the same as the number of supported datasets for every single portal which was successfully checked. While this would not prove that every Opendatasoft portal offers every dataset in CSV, it would show that the opposite cannot be proven and would be a very strong indicator that for the purposes of simplification, the existence of the CSV format can be assumed without major information loss.

To test this, the crawl_opendatasoft_v2 function from the portal_crawler checked every dataset on every Opendatasoft portal for the existence of a CSV file export option. Below are the portal statistics created by the function.

In [13]:
# Reading the 4 result files (PART_1 to PART_4), one dataframe per part
part_1, part_2, part_3, part_4 = (
    pd.read_csv(f"experimental_data/opendatasoft/portal_statistics_opendatasoft_PART_{part_number}.csv")
    for part_number in range(1, 5)
)

# Stacking the parts under a fresh index and dropping rows that are exact duplicates
opendatasoft = pd.concat([part_1, part_2, part_3, part_4], ignore_index = True)
opendatasoft = opendatasoft.drop_duplicates()
display(opendatasoft)

Unnamed: 0,url,api_software,number_of_datasets,number_of_supported_datasets,timestamp
0,https://acoss.opendatasoft.com,Opendatasoft,110.0,110.0,2023-09-02 23:20:50
1,https://aix-en-provence.opendatasoft.com,Opendatasoft,53.0,53.0,2023-09-02 23:21:58
2,https://analisis.datosabiertos.jcyl.es,Opendatasoft,355.0,355.0,2023-09-02 23:29:24
3,https://analyzejerseycity.opendatasoft.com,Opendatasoft,1275.0,1275.0,2023-09-03 00:00:50
4,https://angersloiremetropole.opendatasoft.com,Opendatasoft,149.0,149.0,2023-09-03 00:04:05
...,...,...,...,...,...
350,https://www.opendata.corsica,Opendatasoft,260.0,260.0,2023-09-03 13:57:53
351,https://www.opendata56.fr,Opendatasoft,85.0,85.0,2023-09-03 13:58:26
352,https://www.projets-environnement.gouv.fr,Opendatasoft,4.0,4.0,2023-09-03 13:58:27
353,https://zabal-agriculture.opendata-paysbasque.fr,Opendatasoft,163.0,163.0,2023-09-03 13:59:12


In [14]:
# Selecting the portals whose dataset count differs from their supported dataset count.
# Rows in which both counts are missing (NaN) compare as unequal and are therefore shown as well.
counts_differ = opendatasoft["number_of_datasets"].ne(opendatasoft["number_of_supported_datasets"])
display(opendatasoft[counts_differ])

Unnamed: 0,url,api_software,number_of_datasets,number_of_supported_datasets,timestamp
114,https://data.opendatasoft.com,Opendatasoft,31891.0,4999.0,2023-09-03 03:58:17
118,https://data.randwick.nsw.gov.au,Opendatasoft,,,2023-09-03 04:01:35
216,https://mav-technology-geelongvic.opendatasoft...,Opendatasoft,,,2023-09-03 11:58:46
223,https://navitia.opendatasoft.com,Opendatasoft,,,2023-09-03 12:08:27
238,https://opendata-iles-ponant.edf.fr,Opendatasoft,,,2023-09-03 12:25:03
240,https://opendata-newcastlenswiar.opendatasoft.com,Opendatasoft,,,2023-09-03 12:25:16
255,https://opendata.dormagen.de,Opendatasoft,,,2023-09-03 12:40:56
299,https://prive.data.occitanie.education.gouv.fr,Opendatasoft,,,2023-09-03 13:07:29
313,https://smartregionidf.opendatasoft.com,Opendatasoft,8872.0,4999.0,2023-09-03 13:35:33


As we can see in the dataframe, out of 355 total Opendatasoft portals, the only ones for which the two numbers do not match are 7 portals which could not be reached and 2 portals which offer so many datasets that the daily API call rate was exceeded by the function. Thus, it can be concluded that currently, every dataset on every Opendatasoft portal is available in CSV format.

Based on these results, the crawl_opendatasoft_v2 function was designed to always assume the availability of CSV and build the dataset URL using only the dataset ID taken from the metadata catalog and the standard structure of export URLs. Due to this simplification compared to the experimental code, the number of requests per execution of the crawling function is reduced by 1 request per dataset (around 91000 as of August 2023), each taking about 0.5 to 1.5 seconds depending on whether a 1 second delay is used to lighten the load on the used APIs. This reduction also prevents the issue of reaching the API call limit of 5000 requests per portal per day which was encountered during the experiment and would make it much more complicated to regularly check every dataset on Opendatasoft portals with more than 5000 datasets.

## Validating railway and university portals

Below are the results from the validation of different Open Data portals of railway companies and universities, provided for interested colleagues.

In [17]:
# Validation of the railway sites (kept commented out; only the stored output is loaded below)
# validate_list(input_list = "experimental_data/railway/railway_prefixed.csv",
#             output_list = "experimental_data/railway/railway_validated.csv",
#             output_markers = "experimental_data/railway/railway_validated_sites.json")

# Analysis of the results incl. export to the statistics CSV file (kept commented out as well)
# analyze_list("experimental_data/railway/railway_validated.csv", show = False, export = True)

# Reading the stored validation results; the bare last expression renders them as the cell output
railway_validated = pd.read_csv("experimental_data/railway/railway_validated.csv")
railway_validated

Unnamed: 0,url,active,validated,manually_checked_api,suspected_api,api_working,api_version,error_type
0,https://data.deutschebahn.com,True,True,,CKAN,False,,['JSONDecodeError']
1,https://data.oebb.at,True,True,,Unknown,,,
2,https://data.overheid.nl/data,True,True,CKAN,CKAN,True,2.8.2,
3,https://data.renfe.com,True,True,,CKAN,True,2.8.2,
4,https://data.sbb.ch,True,True,,OpenDataSoft,True,"['v1.0', 'v2.0', 'v2.1']",
5,https://data.sncf.com,True,True,,OpenDataSoft,True,"['v1.0', 'v2.0', 'v2.1']",
6,https://opendata.infrabel.be,True,True,,OpenDataSoft,True,"['v1.0', 'v2.0', 'v2.1']",
7,https://prorailnl.hub.arcgis.com,True,True,,Unknown,,,


The validation of the given railway portals showed some working CKAN and Opendatasoft portals which are now included in our main portal list. The DB portal is supposed to be based on CKAN, but the endpoints don’t work. The ÖBB portal is not using CKAN, Opendatasoft or Socrata. For Prorail, there is a portal that is based on ArcGIS and thus out of scope currently, but there is also some Prorail data in the Dutch government’s Open Data portal (https://data.overheid.nl/data) which has a working CKAN API but contains much more than just railway data.

In [18]:
# Validation of the universities' sites (kept commented out; only the stored output is loaded below)
# validate_list(input_list = "experimental_data/universities/universities_prefixed.csv",
#             output_list = "experimental_data/universities/universities_validated.csv",
#             output_markers = "experimental_data/universities/universities_validated_sites.json")

# Analysis of the results incl. export to the statistics CSV file (kept commented out as well)
# analyze_list("experimental_data/universities/universities_validated.csv", show = False, export = True)

# Reading the stored validation results; the bare last expression renders them as the cell output
universities_validated = pd.read_csv("experimental_data/universities/universities_validated.csv")
universities_validated

Unnamed: 0,url,active,validated,manually_checked_api,suspected_api,api_working,api_version,error_type
0,http://opendata.calstate.edu,True,True,,CKAN,True,2.4.0,
1,https://data.ed.gov,True,True,,CKAN,True,2.9.9,
2,https://data.ox.ac.uk,True,True,,Unknown,,,
3,https://data.southampton.ac.uk,True,True,,Unknown,,,
4,https://datashare.ed.ac.uk,True,True,,Unknown,,,
5,https://deepblue.lib.umich.edu/data,True,True,Unknown,Unknown,,,
6,https://stanfordopendata.org,True,True,,Unknown,,,
7,https://transparencia.ua.es,True,False,,,,,['SSLError']
8,https://ucopendata.netlify.app,True,True,,Unknown,,,


Only two of the given organizations have a working API that’s based on one of the portal software options we support (CKAN, Opendatasoft, Socrata). Of those two, one is the US Department of Education, the other is California State University and both of them use CKAN.

In [39]:
# Reading the validation statistics CSV; the bare last expression renders it as the cell output
validation_statistics = pd.read_csv("data/validation_statistics.csv")
validation_statistics

Unnamed: 0,file,total,active,inactive,validated,unvalidated,subpage_endpoints,no_markers,ckan_suspected,ckan_working,opendatasoft_suspected,opendatasoft_working,socrata_suspected,socrata_working,timestamp
0,data/5_validated_portals_retry.csv,5539,4380,1159,4377,1162,4,3609,276,233,362,356,130,128,2023-07-31 19:28:07.676967
1,experimental_data/www/with_www_validated.csv,1193,1193,0,1180,13,0,1126,29,20,17,16,8,8,2023-08-01 00:37:52.636677
2,experimental_data/www/www_removed_validated.csv,1193,1193,0,992,201,0,956,26,17,6,6,4,4,2023-08-01 00:49:28.683073
3,experimental_data/railway/railway_validated.csv,8,8,0,8,0,1,2,3,2,3,3,0,0,2023-08-01 01:07:45.890909
4,experimental_data/universities/universities_va...,9,9,0,8,1,1,6,2,2,0,0,0,0,2023-08-01 01:09:59.786191
5,experimental_data/portalwatch/2_validated_port...,267,177,90,176,91,267,0,97,61,10,9,69,46,2023-09-11 18:43:57.437129


## Validating the old Portal Watch list

First, the old list has to be cleaned and prepared for the new functions:

In [28]:
# Reading the old Portal Watch list
old = pd.read_csv("data/portalwatch_portals.csv")

# CKAN DCAT portals: using the portal URL as API URL (this removes the "catalog.ttl" path) and
# relabeling them as CKAN so that all CKAN portals are included in the check
is_ckan_dcat = old["api_software"] == "CKANDCAT"
old.loc[is_ckan_dcat, "api_url"] = old.loc[is_ckan_dcat, "portal_url"]
old.loc[is_ckan_dcat, "api_software"] = "CKAN"

# Restricting the list to the three portal software options and to the relevant columns
old = old[old["api_software"].isin(["CKAN", "OpenDataSoft", "Socrata"])]
old = old[["api_url", "api_software", "active"]].rename(columns = {"api_url" : "url", "api_software" : "manually_checked_api"})

# Reducing every URL to its netloc plus path (without trailing slashes), which drops the
# protocol prefix and any URL parameters
parsed_urls = [urlparse(url) for url in old["url"]]
old["url"] = [parsed.netloc + parsed.path.rstrip("/") for parsed in parsed_urls]

# Keeping only the first row for every URL and renumbering the index
old = old.drop_duplicates(subset = "url", keep = "first", ignore_index = True)

# Printing the number of portals labeled "active" per portal software
labeled_active = old["active"] == True
for printed_name, software_label in (("CKAN", "CKAN"), ("Opendatasoft", "OpenDataSoft"), ("Socrata", "Socrata")):
    print(f"{printed_name} portals labeled active:", len(old[(old["manually_checked_api"] == software_label) & labeled_active]))

# Writing the cleaned list (URL and software label only) to a CSV file
old = old[["url", "manually_checked_api"]]
old.to_csv("experimental_data/portalwatch/0_extended_portals.csv", index = False)

# Showing the cleaned dataframe
display(old)

CKAN portals labeled active: 115
Opendatasoft portals labeled active: 11
Socrata portals labeled active: 65


Unnamed: 0,url,manually_checked_api
0,ckan.publishing.service.gov.uk,CKAN
1,beta.avoindata.fi,CKAN
2,dados.gov.br,CKAN
3,data.gov.md/ckan,CKAN
4,catalog.data.gov,CKAN
...,...,...
262,www.dallasopendata.com,Socrata
263,www.data.vic.gov.au,CKAN
264,www.dati.friuliveneziagiulia.it,Socrata
265,www.opendataphilly.org,CKAN


Then, the relevant functions can be called:

In [4]:
# NOTE(review): all four steps below are commented out, so running this cell does nothing.
# Presumably their outputs are already stored under experimental_data/portalwatch/ - confirm
# before uncommenting, since the steps write to the files named in their arguments.

# Step 1
# add_prefixes(extended_portals_file = "experimental_data/portalwatch/0_extended_portals.csv", output_file = "experimental_data/portalwatch/1_prefixed_portals.csv")

# Step 2
# validate_list(input_list = "experimental_data/portalwatch/1_prefixed_portals.csv",
#             output_list = "experimental_data/portalwatch/2_validated_portals.csv",
#             output_markers = "experimental_data/portalwatch/2_validated_sites.json")

# Step 3
# analyze_list(validated_portals_file = "experimental_data/portalwatch/2_validated_portals.csv", show = True, export = True)

# Step 4
# extract_working_apis(validated_portals_file = "experimental_data/portalwatch/2_validated_portals.csv", output_file = "experimental_data/portalwatch/working_portals.csv")

Note that the "0_extended_portals.csv" portal list used above was created using the add_api_endpoints function from the portal handler. Therefore, the portals will be treated as manually validated portals and counted as "suspected" for their respective portal software in the validation statistics. The number of "suspected" portals in this case does **not** indicate sites on which HTML markers were found (see portal_handler.ipynb and thesis).

In [41]:
# Reading the validation statistics CSV; the bare last expression renders it as the cell output
validation_statistics = pd.read_csv("data/validation_statistics.csv")
validation_statistics

Unnamed: 0,file,total,active,inactive,validated,unvalidated,subpage_endpoints,no_markers,ckan_suspected,ckan_working,opendatasoft_suspected,opendatasoft_working,socrata_suspected,socrata_working,timestamp
0,data/5_validated_portals_retry.csv,5539,4380,1159,4377,1162,4,3609,276,233,362,356,130,128,2023-07-31 19:28:07.676967
1,experimental_data/www/with_www_validated.csv,1193,1193,0,1180,13,0,1126,29,20,17,16,8,8,2023-08-01 00:37:52.636677
2,experimental_data/www/www_removed_validated.csv,1193,1193,0,992,201,0,956,26,17,6,6,4,4,2023-08-01 00:49:28.683073
3,experimental_data/railway/railway_validated.csv,8,8,0,8,0,1,2,3,2,3,3,0,0,2023-08-01 01:07:45.890909
4,experimental_data/universities/universities_va...,9,9,0,8,1,1,6,2,2,0,0,0,0,2023-08-01 01:09:59.786191
5,experimental_data/portalwatch/2_validated_port...,267,177,90,176,91,267,0,97,61,10,9,69,46,2023-09-11 18:43:57.437129


## Checking false positives of marker validation

On some sites, validation markers can be found in the first part of the validation step, but no working API is located subsequently. Given the chosen approach of only testing the API functionality on sites for which the validation marker search has been successful, these cases, which could be described as false positives, are worth investigating.

In [22]:
# Reading the validated portal list
validated_portals = pd.read_csv("data/5_validated_portals_retry.csv")

# Sites on which CKAN is suspected ...
ckan_markers_portals = validated_portals.loc[validated_portals["suspected_api"].eq("CKAN")]

# ... but whose API was recorded as not working (sites without an api_working value are not included)
ckan_not_working_api_portals = ckan_markers_portals.loc[ckan_markers_portals["api_working"].eq(False)]

display(ckan_not_working_api_portals)

Unnamed: 0,url,active,validated,manually_checked_api,suspected_api,api_working,api_version,error_type
659,http://daten.buergernetz.bz.it,True,True,,CKAN,False,,['JSONDecodeError']
996,http://www.governoaberto.sp.gov.br,True,True,,CKAN,False,,['JSONDecodeError']
1032,http://www.nosdonnees.fr,True,True,,CKAN,False,,['JSONDecodeError']
1089,https://academy.opendatasoft.com,True,True,,CKAN,False,,['JSONDecodeError']
1254,https://canwin-datahub.ad.umanitoba.ca,True,True,,CKAN,False,,['JSONDecodeError']
1315,https://ckan.de,True,True,,CKAN,False,,['JSONDecodeError']
1320,https://ckan.en.softonic.com,True,True,,CKAN,False,,['KeyError']
1321,https://ckan.github.io,True,True,,CKAN,False,,['JSONDecodeError']
1326,https://ckan.odpt.org,True,True,,CKAN,False,,['JSONDecodeError']
1327,https://ckan.org,True,True,,CKAN,False,,['JSONDecodeError']


## Checking DCAT extension on CKAN portals

CKAN offers an extension that enables the retrieval of metadata using the Data Catalog Vocabulary (DCAT). The code below shows how to check the availability of this extension and the TTL / RDF catalog for all CKAN portals on the list. More information: https://extensions.ckan.org/extension/dcat/

In [None]:
# Loading the new portal list
portals = pd.read_csv("data/portals.csv")

# Keeping only CKAN portals; the explicit copy makes sure the result columns are added to an
# independent dataframe and not to a filtered view of the full portal list
portals = portals[portals["api_software"] == "CKAN"].copy()

# Upper bound in seconds for every HTTP request. Without a timeout, requests waits indefinitely,
# so a single unresponsive portal could stall the whole check.
request_timeout = 30

# Iterating through all CKAN portals
for index, portal in portals.iterrows():
    portal_api_base_url = portal["url"]
    portal_api_status_url = portal_api_base_url + "/api/3/action/status_show"
    portal_api_catalog_url = portal_api_base_url + "/catalog.ttl"
    print(portal_api_base_url)

    try:
        # Checking DCAT extension availability: "dcat" has to be listed among the extensions
        # reported by the status endpoint
        status_response = json.loads(requests.get(portal_api_status_url, timeout = request_timeout).text)
        dcat_available = "dcat" in status_response["result"]["extensions"]
        print("DCAT available!" if dcat_available else "DCAT not available!")

        # Checking TTL catalog availability: the response has to contain "@prefix"
        catalog_response = requests.get(portal_api_catalog_url, timeout = request_timeout).text
        ttl_catalog_available = "@prefix" in catalog_response
        print(("TTL catalog available!" if ttl_catalog_available else "TTL catalog not available!") + "\n")
    except Exception as exception:
        # Deliberately broad (best effort): connection errors, timeouts, invalid JSON and unexpected
        # response structures all mark BOTH checks of this portal as failed, even if the first
        # check had already succeeded
        print(exception)
        dcat_available = "Check failed"
        ttl_catalog_available = "Check failed"

    # Storing the outcome once per portal (True / False / "Check failed")
    portals.loc[index, "dcat_available"] = dcat_available
    portals.loc[index, "ttl_catalog_available"] = ttl_catalog_available

# Exporting the dataframe
# portals.to_csv("experimental_data/dcat/ckan_dcat_check.csv", index = False)

In [28]:
# Loading the results
results = pd.read_csv("experimental_data/dcat/ckan_dcat_check.csv")
display(results)

# The result columns hold True / False / "Check failed". pandas reads them as strings only while
# at least one "Check failed" entry exists; otherwise the column is parsed as boolean and a
# comparison with the string "True" would silently count 0. Converting to strings first makes the
# counts independent of the inferred dtype.
dcat_status = results["dcat_available"].astype(str)
ttl_catalog_status = results["ttl_catalog_available"].astype(str)

print("Number of CKAN portals:", len(results[results["api_software"] == "CKAN"]))
print("Number of CKAN portals for which the check failed:", len(results[dcat_status == "Check failed"]))
print("Number of CKAN portals with DCAT extension:", len(results[dcat_status == "True"]))
print("Number of CKAN portals with TTL catalog:", len(results[ttl_catalog_status == "True"]))

Unnamed: 0,url,api_working,api_software,api_version,dcat_available,ttl_catalog_available
0,http://101.79.9.128,True,CKAN,2.9.4,False,False
1,http://103.231.194.8,True,CKAN,2.7.7,False,False
2,http://116.203.208.239,True,CKAN,2.8.2,False,False
3,http://129.194.213.24,True,CKAN,2.5.2,False,False
4,http://130.179.67.140,True,CKAN,2.2.1,False,False
...,...,...,...,...,...,...
219,https://www.offenedaten.frankfurt.de,True,CKAN,2.9.7,True,True
220,https://www.offenesdatenportal.de,True,CKAN,2.8.2,True,True
221,https://www.opendata.nhs.scot,True,CKAN,2.8.4,False,False
222,https://www.opentourism.net,True,CKAN,2.9.7,False,False


Number of CKAN portals: 224
Number of CKAN portals for which the check failed: 7
Number of CKAN portals with DCAT extension: 77
Number of CKAN portals with TTL catalog: 68
