In [227]:
from pathlib import Path

import pandas as pd
import requests as rq

In [228]:
# Load the cached metadata sample when it exists. When it does not, leave
# `dataset` as None so that the `if dataset is None:` download cell rebuilds it
# (pd.read_csv raises FileNotFoundError on a missing file; it never returns None).
dataset = pd.read_csv("dataset.csv") if Path("dataset.csv").exists() else None

In [229]:
# Hard-coded record total; the download cell divides it by 10 to get the number
# of result pages to walk.
total = 1_815_240

In [230]:
if dataset is None:
    # No cached sample is available: rebuild it from the data.europa.eu search
    # API, keeping the first record of evenly spaced result pages.
    count = 2000  # approximate number of records to collect (pages / step)

    metadatasList = []

    headers = {
        "Accept": "application/json"
    }

    # The page arithmetic assumes 10 results per page — TODO confirm against the API.
    page_count = total // 10
    # range() rejects a zero step, which would happen when count*10 > total.
    page_step = max(1, total // (count * 10))

    for i in range(0, page_count, page_step):
        url = f"https://data.europa.eu/api/hub/search/search?q=&filter=dataset&includes=id,title.en,description.en,languages,modified,issued,catalog.id,catalog.title,catalog.country.id,distributions.id,distributions.format.label,distributions.format.id,distributions.license,categories.label,publisher&page={i}"
        # requests applies no timeout by default; without one a stalled
        # connection would hang the whole harvest.
        response = rq.get(url, headers=headers, timeout=30)
        print(i*10, response)
        # Fail with a clear HTTP error rather than an obscure JSON/KeyError below.
        response.raise_for_status()
        results = response.json()["result"]["results"]
        if not results:
            # An empty page has no first record to keep.
            continue
        metadatasList.append(results[0])

    dataset = pd.json_normalize(metadatasList)

    # Assign the result back: calling fillna(inplace=True) on a column selection
    # is chained assignment and does not reliably update the frame.
    dataset["categories"] = dataset["categories"].fillna("")

    def _first_category_label(categories):
        """Return the English label of the first category, or None if unavailable."""
        if not isinstance(categories, list) or not categories:
            return None
        label = categories[0].get("label") or {}
        return label.get("en")

    dataset.loc[:, "category"] = dataset.loc[:, "categories"].apply(_first_category_label)

    # The original cell called dataset["description.en"].fillna("") without using
    # the result, which had no effect. Missing descriptions are deliberately left
    # as NaN: later cells detect them with pd.isna, and an empty string would be
    # read back from the CSV as NaN anyway.

    dataset.to_csv("dataset.csv")

In [231]:
# Derive each record's portal page URL from its id.
dataset.loc[:, "url"] = dataset.loc[:, "id"].map(
    "https://data.europa.eu/data/datasets/{}?locale=en".format
)

In [232]:
# Spot-check the generated URL of the row labelled 0.
dataset.at[0, "url"]

'https://data.europa.eu/data/datasets/025856e5-1a50-4cf1-8091-73ca6dfd9a6b?locale=en'

In [233]:
# List the columns available on the loaded (and url-enriched) frame.
dataset.columns

Index(['Unnamed: 0', 'keywords', 'modified', 'id', 'categories', 'issued',
       'distributions', 'catalog.id', 'catalog.title.de', 'catalog.country.id',
       'description.en', 'title.en', 'publisher.name', 'publisher.type',
       'publisher.email', 'publisher.homepage', 'catalog.title.en',
       'publisher.resource', 'catalog.title.nl', 'catalog.title.hr',
       'catalog.title.sv', 'catalog.title.sl', 'catalog.title.fr',
       'catalog.title.es', 'catalog.title.ga', 'catalog.title.sk',
       'catalog.title.it', 'catalog.title.lv', 'catalog.title.ro',
       'catalog.title.pt', 'catalog.title.fi', 'catalog.title.hu',
       'catalog.title.da', 'catalog.title.no', 'catalog.title.et',
       'catalog.title.pl', 'category', 'url'],
      dtype='object')

In [234]:
import spacy as sp

In [235]:
# Load the spaCy pipeline used for every similarity computation in this notebook.
# NOTE(review): spaCy documents that the small `_sm` packages ship without static
# word vectors, which makes `.similarity()` less reliable — confirm whether a
# `_md`/`_lg` package should be used instead.
nlp = sp.load("en_core_web_sm")

In [236]:
# Sanity check: similarity between the descriptions of rows 0 and 1.
nlp(dataset["description.en"][0]).similarity(nlp(dataset["description.en"][1]))

  nlp(dataset["description.en"][0]).similarity(nlp(dataset["description.en"][1]))


0.8351060152053833

In [237]:
# Number of rows in the description column; used as the side length of the
# similarity matrices built further down.
pr_count = dataset["description.en"].size

In [238]:
# pr_count x pr_count nested list pre-filled with -1 (each row is its own list).
s = [[-1] * pr_count for _ in range(pr_count)]

In [239]:
import numpy as np

In [240]:
# Indices that would sort the first row of `s` in ascending order.
np.argsort(s[0])

array([5039, 5038, 5037, ...,   16, 5041,    0])

In [241]:
# Text columns whose pairwise similarities are combined into the final score.
columns = [
    "title.en",
    "description.en",
    "category",
]

In [242]:
# Load the cached similarity matrix when it exists. When it does not, leave `fs`
# as None so that the `if fs is None:` cell recomputes it (np.load raises on a
# missing file; it never returns None).
fs = np.load("similarity.npy") if Path("similarity.npy").exists() else None

In [243]:
if fs is None:
    # No cached matrix: compute the symmetric pairwise score. For each pair of
    # rows the score is the sum, over `columns`, of the squared spaCy similarity
    # of the two values. The diagonal keeps its initial -1.

    def _parse_values(values):
        """Return one spaCy Doc per value, or None for missing/empty values.

        NaN is truthy in Python, so a plain truthiness test would let missing
        values through and compare them as the literal string "nan".
        """
        return [nlp(str(value)) if pd.notna(value) and value else None for value in values]

    # Parse every text once up front instead of re-parsing both texts for each
    # of the ~pr_count**2 / 2 pairs.
    docs_by_column = {column: _parse_values(dataset[column]) for column in columns}

    fs = np.full((pr_count, pr_count), -1, dtype='d')
    for i in range(pr_count):
        for j in range(i+1, pr_count):
            sim = 0
            for column in columns:
                doc_i, doc_j = docs_by_column[column][i], docs_by_column[column][j]
                # A column contributes 0 when either side is missing or empty.
                column_sim = doc_i.similarity(doc_j) if doc_i is not None and doc_j is not None else 0
                sim += column_sim*column_sim
            fs[i][j] = sim
            fs[j][i] = sim
        # Report progress once per row rather than once per pair.
        print(i)

    np.save("similarity.npy", fs)

In [244]:
# Recommendation cut-off, expressed as a percentage: later cells compare scores
# against threshold*3/100.
threshold = 80

In [245]:
# Show how many rows take part in the similarity matrix.
pr_count

5042

In [246]:
# Pair each of the first (at most) 5000 rows with the records of every row whose
# score in `fs` exceeds the cut-off.
# `fs` stores a sum of one squared similarity per entry of `columns` (three), so
# threshold*3/100 rescales the percentage to that sum — assuming each individual
# similarity lies in [0, 1].
min_score = threshold*3/100
sample = []
# Never index past the end of the frame or of the matrix.
for i in range(min(5000, len(dataset), len(fs))):
    main_dataset = dataset.iloc[i]
    # np.nonzero yields positions, so select rows positionally with iloc.
    recommended_positions = np.nonzero(fs[i] > min_score)[0]
    recommended_datasets = dataset.iloc[recommended_positions].to_dict('records')
    sample.append((main_dataset, recommended_datasets))


In [247]:
# Distinct categories among the main datasets of `sample`.
{main["category"] for main, _ in sample}

{'Agriculture, fisheries, forestry and food',
 'Economy and finance',
 'Education, culture and sport',
 'Energy',
 'Environment',
 'Government and public sector',
 'Health',
 'International issues',
 'Justice, legal system and public safety',
 'Population and society',
 'Provisional data',
 'Regions and cities',
 'Science and technology',
 'Transport',
 nan}

In [248]:
# Keep only entries with strictly more than five recommendations and a non-missing
# category and description; truncate each recommendation list to five.
survey_sample = [
    (main, recommended[:5])
    for main, recommended in sample
    if len(recommended) > 5
    and not pd.isna(main["category"])
    and not pd.isna(main["description.en"])
]

In [449]:
# Distinct categories that survive the survey filter.
{main["category"] for main, _ in survey_sample}

{'Agriculture, fisheries, forestry and food',
 'Economy and finance',
 'Education, culture and sport',
 'Energy',
 'Environment',
 'Government and public sector',
 'Health',
 'Justice, legal system and public safety',
 'Population and society',
 'Provisional data',
 'Regions and cities',
 'Science and technology',
 'Transport'}

In [450]:
# Target number of survey entries; it sets the stride used to thin out survey_sample.
survey_count = 30

In [451]:
# Take every k-th entry of survey_sample, with k = len(survey_sample) // survey_count.
survey = survey_sample[::len(survey_sample) // survey_count]

In [677]:
# Cap the survey at survey_count entries. Unlike survey[:-1], this is idempotent:
# re-running the cell does not drop one more entry each time.
survey = survey[:survey_count]

In [678]:
# Check the final number of survey entries.
len(survey)

30

In [679]:
# Records of the rows whose *squared* fs[1] entry exceeds threshold*3/100.
# NOTE(review): the sample-building cell compares fs[i] itself (not squared) —
# confirm the extra squaring here is intentional. This also prints every matching
# record in full.
dataset.loc[np.nonzero(fs[1]*fs[1]>threshold*3/100)].to_dict('records')

[{'Unnamed: 0': 0,
  'keywords': "[{'language': 'de', 'id': 'bauleitplanung', 'label': 'Bauleitplanung'}, {'language': 'de', 'id': 'bebauungsplan', 'label': 'Bebauungsplan'}, {'language': 'de', 'id': 'infofeatureaccessservice', 'label': 'infoFeatureAccessService'}, {'language': 'de', 'id': 'inspireidentifiziert', 'label': 'inspireidentifiziert'}]",
  'modified': '2021-11-23',
  'id': '025856e5-1a50-4cf1-8091-73ca6dfd9a6b',
  'categories': "[{'label': {'nn': 'Jordbruk, fiskeri, skogbruk og mat', 'de': 'Landwirtschaft, Fischerei, Forstwirtschaft und Nahrungsmittel', 'no': 'Jordbruk, fiskeri, skogbruk og mat', 'fi': 'Maatalous, kalastus, metsätalous ja elintarvikkeet', 'pt': 'Agricultura, pesca, silvicultura e alimentação', 'bg': 'Селско стопанство, рибарство, горско стопанство и храни', 'lt': 'Žemės ūkis, žuvininkystė, miškininkystė ir maistas', 'lv': 'Lauksaimniecība, zivsaimniecība, mežsaimniecība un pārtika', 'hr': 'Poljoprivreda, ribarstvo, šumarstvo i hrana', 'fr': 'Agriculture, pêc

In [680]:
import jinja2 as jj

In [681]:
# Template loader rooted at the local "templates" directory.
fsl = jj.FileSystemLoader("templates")

In [682]:
# Jinja environment that resolves templates through `fsl`.
# NOTE(review): autoescape is not passed, so Jinja2's default applies. The values
# rendered later come from downloaded metadata — confirm the template escapes
# them or enable autoescaping.
env = jj.Environment(loader=fsl)

In [683]:
# Show which templates the loader can see.
fsl.list_templates()

['template.jinja']

In [684]:
# Fetch the template that is rendered into index.html further down.
template = env.get_template("template.jinja")

In [685]:
# NOTE(review): `recs` is not referenced by any other visible cell — possibly a leftover.
recs = []

In [686]:
# Inspect the description of the first survey entry's main dataset.
survey[0][0]["description.en"]

'Building plan “Schafhof II and Schafhof III (Change)” of the city of Kirchheim under Teck based on an XPlanung dataset in version 5.0.'

In [687]:
# Render the survey page and encode it as UTF-8 bytes for writing to disk.
indexHtml = template.render(
    survey=survey,
    count=len(survey),
).encode("utf-8")

In [688]:
# Write the rendered page. The context manager closes the handle even if the
# write raises, which the manual open()/close() pair did not guarantee.
with open("index.html", "wb") as file:
    file.write(indexHtml)

In [689]:
# Title of the third survey entry's main dataset.
survey[2][0]["title.en"]

'WMS XPlanung BPL “Seestraße (original)”'

In [690]:
# Title of the first recommendation attached to the third survey entry.
survey[2][1][0]["title.en"]

'WMS XPlanung BPL “Riedwiesen (3\xa0rd change)”'

In [691]:
# Median score of each row of the similarity matrix.
medians = [np.median(row) for row in fs]

In [692]:
# Display the per-row medians.
# NOTE(review): this prints the whole list (one value per row of `fs`); a slice
# such as medians[:10] would keep the output readable.
medians

[np.float64(1.1869719947246242),
 np.float64(1.521190836883525),
 np.float64(1.1496375814385171),
 np.float64(1.3949063825964925),
 np.float64(1.498857703202228),
 np.float64(1.5303229567857937),
 np.float64(1.534217594459161),
 np.float64(1.4673908117502883),
 np.float64(1.3911541771100606),
 np.float64(1.0337227995955147),
 np.float64(1.5605571504026727),
 np.float64(0.5800663773013641),
 np.float64(0.8499355973573711),
 np.float64(1.5961039322031714),
 np.float64(0.5528551660825579),
 np.float64(1.0200035962742717),
 np.float64(0.4929867027058279),
 np.float64(0.9795519295338053),
 np.float64(1.4494408258541043),
 np.float64(1.353692935292861),
 np.float64(1.4187890746973615),
 np.float64(0.646683133025459),
 np.float64(1.074479356307906),
 np.float64(1.239782405930058),
 np.float64(1.4494209005687435),
 np.float64(1.4207541853105603),
 np.float64(1.167509802773142),
 np.float64(1.2538851272973988),
 np.float64(1.5741601692574427),
 np.float64(1.2694883311537701),
 np.float64(1.5497

In [693]:
# Two thirds of the median of the per-row medians.
np.median(medians)*2/3

np.float64(0.8885225167772163)