In [1]:
from pathlib import Path

import pandas as pd
import requests as rq

In [2]:
# Load the cached metadata sample when it is present. When the cache file is
# missing, leave `dataset` as None so the `if dataset is None:` crawl cell
# rebuilds it, instead of this cell stopping with FileNotFoundError.
dataset_cache = Path("dataset-5000.csv")
dataset = pd.read_csv(dataset_cache) if dataset_cache.exists() else None

In [3]:
# Hard-coded total; the crawl cell divides it by 10 to get its page range.
total = 1_815_240

In [4]:
# Rebuild the metadata sample from the data.europa.eu search API. This branch
# only runs when no dataset has been loaded yet (`dataset` is None).
if dataset is None:
    count = 2000  # target number of sampled pages; sets the page stride below

    metadatasList = []

    headers = {
        "Accept": "application/json"
    }

    # Visit evenly spaced result pages and keep the first hit of each page.
    for i in range(0, total // 10, total // (count * 10)):
        url = f"https://data.europa.eu/api/hub/search/search?q=&filter=dataset&includes=id,title.en,description.en,languages,modified,issued,catalog.id,catalog.title,catalog.country.id,distributions.id,distributions.format.label,distributions.format.id,distributions.license,categories.label,publisher&page={i}"
        response = rq.get(url, headers=headers, timeout=60)
        print(i*10, response)
        # Stop with the HTTP status rather than an opaque JSON/KeyError below.
        response.raise_for_status()
        results = response.json()["result"]["results"]
        if not results:
            # An empty page has nothing to sample; skip it instead of raising IndexError.
            continue
        metadatasList.append(results[0])

    dataset = pd.json_normalize(metadatasList)

    # Assign the filled column back: in-place fillna on a selected column is
    # chained assignment and is not guaranteed to modify `dataset`.
    dataset["categories"] = dataset["categories"].fillna("")

    # First category's English label; None when the entry is missing ("") or empty ([]).
    dataset["category"] = dataset["categories"].apply(lambda x: x[0]['label']['en'] if x else None)
    dataset = dataset.drop(columns=["modified", "issued", "distributions", "categories"])
    # fillna returns a new Series; the result has to be stored to take effect.
    dataset["description.en"] = dataset["description.en"].fillna("")

    dataset.to_csv("dataset.csv")

In [5]:
import spacy as sp

In [6]:
# Load the spaCy pipeline used by every `.similarity()` call in this notebook.
# NOTE(review): small spaCy pipelines reportedly ship without static word
# vectors, which would weaken `.similarity()` scores — confirm whether a
# larger model is intended.
nlp = sp.load("en_core_web_sm")

In [7]:
# Sanity check: similarity between the descriptions of rows 0 and 1.
descriptions = dataset["description.en"]
nlp(descriptions[0]).similarity(nlp(descriptions[1]))

  nlp(dataset["description.en"][0]).similarity(nlp(dataset["description.en"][1]))


0.8351060152053833

In [8]:
# Number of entries in the description column; sizes the similarity structures.
pr_count = dataset["description.en"].shape[0]

In [9]:
# Square pr_count x pr_count list-of-lists filled with -1 (each row is its own list).
s = [[-1] * pr_count for _ in range(pr_count)]

In [10]:
import numpy as np

In [11]:
# Scratch check: indices that would sort the first row of `s`.
np.argsort(s[0])

array([5023, 5022, 5021, ..., 5037, 5038,    0])

In [12]:
# Text columns whose pairwise similarities are combined into one score.
columns = [
    "title.en",
    "description.en",
    "category",
]

In [13]:
# Load the cached pairwise similarity matrix when it is present. When the file
# is missing, leave `fs` as None so the `if fs is None:` cell recomputes it,
# instead of this cell stopping with FileNotFoundError.
similarity_cache = Path("similarity-5000.npy")
fs = np.load(similarity_cache) if similarity_cache.exists() else None

In [14]:
# Build the symmetric pairwise similarity matrix when no cached matrix is loaded.
# Entry [i][j] is the sum over `columns` of the squared spaCy similarity between
# rows i and j; the diagonal keeps its initial value of -1.
if fs is None:
    def _as_doc(value):
        """Return a spaCy Doc for `value`, or None when it is missing or empty."""
        # NaN is truthy, so a plain truthiness test would let missing values
        # through and compare them as the literal text "nan".
        if pd.isna(value) or not value:
            return None
        return nlp(str(value))

    # Parse every text once up front; parsing inside the pair loop would run
    # the pipeline O(n^2) times on the same strings.
    docs = {
        column: [_as_doc(value) for value in dataset[column].tolist()]
        for column in columns
    }

    fs = np.full((pr_count, pr_count), -1, dtype='d')
    for i in range(pr_count):
        for j in range(i+1, pr_count):
            sim = 0
            for column in columns:
                d1, d2 = docs[column][i], docs[column][j]
                # A pair contributes 0 for this column when either side is missing.
                score = d1.similarity(d2) if d1 is not None and d2 is not None else 0
                sim += score*score
            fs[i][j] = sim
            fs[j][i] = sim
        # One progress line per row instead of one per pair.
        print(i)

    np.save("similarity.npy", fs)

In [15]:
# Similarity cutoff expressed as a percentage; the sampling cell compares
# matrix entries against threshold*3/100.
threshold = 80

In [16]:
# Display the number of description entries counted earlier.
pr_count

5039

In [17]:
# Pair every dataset with the datasets whose combined similarity exceeds the cutoff.
# Each entry of `sample` is (row as a Series, list of recommended rows as dicts).
cutoff = threshold*3/100
# Cap the loop at the data actually available so a smaller dataset or matrix
# does not fail with an out-of-range lookup; 5000 stays the upper bound.
sample_size = min(5000, len(fs), len(dataset))
sample = []
for i in range(sample_size):
    # Matrix indices are row positions, so index the frame positionally.
    main_dataset = dataset.iloc[i]
    recommended_rows = np.flatnonzero(fs[i] > cutoff)
    recommended_datasets = dataset.iloc[recommended_rows].to_dict('records')
    sample.append((main_dataset, recommended_datasets))


In [18]:
# Distinct categories among the main datasets in `sample`.
{main["category"] for main, _ in sample}

{'Agriculture, fisheries, forestry and food',
 'Economy and finance',
 'Education, culture and sport',
 'Energy',
 'Environment',
 'Government and public sector',
 'Health',
 'International issues',
 'Justice, legal system and public safety',
 'Population and society',
 'Provisional data',
 'Regions and cities',
 'Science and technology',
 'Transport',
 nan}

In [19]:
# Keep only entries with more than five recommendations and a non-missing
# category and description; truncate each recommendation list to five.
survey_sample = [
    (main, recommended[:5])
    for main, recommended in sample
    if len(recommended) > 5
    and not pd.isna(main["category"])
    and not pd.isna(main["description.en"])
]

In [20]:
# Distinct categories that remain after filtering into `survey_sample`.
{main["category"] for main, _ in survey_sample}

{'Agriculture, fisheries, forestry and food',
 'Economy and finance',
 'Education, culture and sport',
 'Environment',
 'Government and public sector',
 'Health',
 'Justice, legal system and public safety',
 'Population and society',
 'Provisional data',
 'Regions and cities',
 'Science and technology',
 'Transport'}

In [21]:
# Divisor used to derive the stride when picking survey entries from `survey_sample`.
survey_count = 30

In [22]:
# Take evenly spaced entries from `survey_sample`. The stride is clamped to at
# least 1: with fewer than `survey_count` entries the integer division gives 0,
# and a zero step is an error. The result can hold slightly more than
# `survey_count` entries when the length is not an exact multiple.
survey_step = max(1, len(survey_sample) // survey_count)
survey = survey_sample[::survey_step]

In [23]:
# Scratch query: rows whose squared similarity to row 1 exceeds the cutoff.
# NOTE(review): this squares `fs` again, unlike the sampling cell which compares
# `fs[i]` directly against threshold*3/100 — confirm which form is intended.
dataset.loc[np.nonzero(fs[1]*fs[1]>threshold*3/100)].to_dict('records')

[{'Unnamed: 0': 0,
  'id': '025856e5-1a50-4cf1-8091-73ca6dfd9a6b',
  'catalog.id': 'gdi-de',
  'catalog.title.de': 'GDI-DE',
  'catalog.country.id': 'de',
  'description.en': 'WFS Development Plan ‘Gänberg + Simplified Change Gänsberg’ of the city of Herrenberg.',
  'title.en': 'WFS Gänberg + Simplified change Gänsberg',
  'publisher.name': nan,
  'publisher.type': nan,
  'publisher.email': nan,
  'publisher.homepage': nan,
  'catalog.title.en': nan,
  'publisher.resource': nan,
  'catalog.title.nl': nan,
  'catalog.title.no': nan,
  'catalog.title.sv': nan,
  'catalog.title.sl': nan,
  'catalog.title.fr': nan,
  'catalog.title.ga': nan,
  'catalog.title.it': nan,
  'catalog.title.es': nan,
  'catalog.title.sk': nan,
  'catalog.title.lv': nan,
  'catalog.title.ro': nan,
  'catalog.title.pt': nan,
  'catalog.title.fi': nan,
  'catalog.title.hu': nan,
  'catalog.title.da': nan,
  'catalog.title.cs': nan,
  'catalog.title.hr': nan,
  'catalog.title.pl': nan,
  'category': 'Agriculture, fi

In [333]:
import jinja2 as jj

In [334]:
# Template loader rooted at the relative "templates" directory.
fsl = jj.FileSystemLoader(searchpath="templates")

In [335]:
# Jinja environment for the survey page. Autoescaping is switched on explicitly
# because the rendered values (titles, descriptions) come from external metadata
# and may contain characters such as <, > or &. The template's ".jinja"
# extension would not be matched by extension-based autoescape selection.
env = jj.Environment(loader=fsl, autoescape=True)

In [336]:
# Show which templates the loader can see.
fsl.list_templates()

['template.jinja']

In [337]:
# Fetch the survey page template from the environment.
template = env.get_template("template.jinja")

In [338]:
# NOTE(review): `recs` is not read by any cell visible in this notebook export — confirm it is still needed.
recs = []

In [339]:
# Inspect the description of the first survey entry's main dataset.
survey[0][0]["description.en"]

'Building plan “Schafhof II and Schafhof III (Change)” of the city of Kirchheim under Teck based on an XPlanung dataset in version 5.0.'

In [340]:
# Render the survey page and encode it as UTF-8 bytes for writing in binary mode.
indexHtml = template.render(survey=survey).encode("utf-8")

In [341]:
# Write the rendered page to disk; the context manager closes the file even
# when the write raises.
with open("index.html", "wb") as file:
    file.write(indexHtml)

In [342]:
# Inspect the title of the third survey entry's main dataset.
survey[2][0]["title.en"]

'WMS INSPIRE BPL In the Lettenäckern (original)'

In [343]:
# Inspect the title of the first recommendation for the third survey entry.
survey[2][1][0]["title.en"]

'WMS INSPIRE BPL On the go'

In [344]:
# Median similarity score of every row of the matrix, kept as a plain list.
medians = [np.median(similarity_row) for similarity_row in fs]

In [345]:
# Display the per-row medians (one value per matrix row, so the output is long).
medians

[np.float64(1.0088750649709421),
 np.float64(1.3282734651921828),
 np.float64(0.9784070426821952),
 np.float64(1.2227146440710577),
 np.float64(1.2027553321946591),
 np.float64(1.2492258497378264),
 np.float64(1.2423604792275207),
 np.float64(1.2867583910977807),
 np.float64(1.0981768238209009),
 np.float64(0.938752836627101),
 np.float64(1.2510839026182579),
 np.float64(0.47364234873166555),
 np.float64(0.7293601833954468),
 np.float64(1.2960067975204732),
 np.float64(0.5056541979858953),
 np.float64(0.6846447013965415),
 np.float64(0.16307750059552084),
 np.float64(0.6499080217221833),
 np.float64(1.2731944635337913),
 np.float64(1.1780591309811632),
 np.float64(1.3207545256497673),
 np.float64(0.5374827368918825),
 np.float64(0.8912290296435327),
 np.float64(1.054487892164042),
 np.float64(1.273781228190897),
 np.float64(1.2648895495443195),
 np.float64(0.9772717924557278),
 np.float64(1.0855555313526575),
 np.float64(1.2934542479831397),
 np.float64(1.0920806918506616),
 np.float64

In [346]:
# Two thirds of the median of the per-row medians.
# NOTE(review): presumably explored as a data-driven alternative to the fixed
# `threshold` cutoff — confirm the intent of the 2/3 factor.
np.median(medians)*2/3

np.float64(0.7443318103299843)