In [1]:
import requests as rq
import pandas as pd

In [2]:
# Load the cached metadata sample if it exists. A missing cache is mapped to
# None so that the download cell (guarded by `if dataset is None`) can rebuild
# it, instead of the notebook stopping here with FileNotFoundError.
try:
    dataset = pd.read_csv("dataset.csv")
except FileNotFoundError:
    dataset = None

In [3]:
# Catalogue size used by the download cell to spread its sampled pages
# (that cell divides this value by 10 to obtain the page range).
# NOTE(review): presumably the dataset count reported by the search API when
# the notebook was written - confirm before re-downloading.
total = 1_815_240

In [4]:
if dataset is None:
    # No cached CSV: visit evenly spaced result pages of the data.europa.eu
    # search API and keep the first record of each visited page.
    count = 500

    metadatasList = []

    headers = {
        "Accept": "application/json"
    }

    # The loop uses `total/10` as the upper bound of the page index and steps
    # so that roughly `count` pages are visited. max(1, ...) keeps range()
    # from receiving a zero step when `total` is small relative to `count`.
    page_total = int(total/10)
    page_step = max(1, int(total/(count*10)))

    for i in range(0, page_total, page_step):
        url = f"https://data.europa.eu/api/hub/search/search?q=&filter=dataset&includes=id,title.en,description.en,languages,modified,issued,catalog.id,catalog.title,catalog.country.id,distributions.id,distributions.format.label,distributions.format.id,distributions.license,categories.label,publisher&page={i}"
        # A timeout stops one stalled request from hanging the whole run.
        response = rq.get(url, headers=headers, timeout=60)
        print(i*10, response)
        # Fail loudly on HTTP errors instead of trying to parse an error body.
        response.raise_for_status()
        data = response.json()
        results = data["result"]["results"]
        if not results:
            # An empty page has no first record to sample; skip it.
            continue
        metadatasList.append(results[0])

    dataset = pd.json_normalize(metadatasList)

    # Assign the filled column back: fillna(..., inplace=True) on a column
    # selection is not guaranteed to modify the parent frame.
    dataset["categories"] = dataset["categories"].fillna("")

    # Keep only the English label of the first category. Records without a
    # category (filled with "" above, or carrying an empty list) get None.
    dataset["category"] = dataset["categories"].apply(lambda x: x[0]['label']['en'] if x else None)
    dataset = dataset.drop(columns=["modified", "issued", "distributions", "categories"])
    # Missing descriptions are deliberately left as NaN. The previous
    # `dataset["description.en"].fillna("")` discarded its result (a no-op),
    # and later cells detect missing descriptions with pd.isna.

    # index=False keeps the row index out of the file; otherwise it comes back
    # from read_csv as an extra "Unnamed: 0" column.
    dataset.to_csv("dataset.csv", index=False)

In [5]:
import spacy as sp

In [6]:
# spaCy pipeline used for every similarity score computed below.
# NOTE(review): the cached matrix loaded later is called "similarity-md.npy",
# which suggests it may have been produced with a different model - confirm.
spacy_model_name = "en_core_web_sm"
nlp = sp.load(spacy_model_name)

In [7]:
# Smoke test: similarity between the descriptions of the first two datasets.
descriptions = dataset["description.en"]
first_doc = nlp(descriptions[0])
second_doc = nlp(descriptions[1])
first_doc.similarity(second_doc)

  nlp(dataset["description.en"][0]).similarity(nlp(dataset["description.en"][1]))


0.6227189302444458

In [8]:
# Number of rows in the description column; used as the side length of the
# square similarity matrices built below.
pr_count = dataset["description.en"].shape[0]

In [9]:
# pr_count x pr_count list-of-lists filled with -1; every row is its own list.
s = [[-1] * pr_count for _ in range(pr_count)]

In [10]:
import numpy as np

In [11]:
# Indices that would sort the first row of the placeholder matrix.
np.asarray(s[0]).argsort()

array([495, 494, 493, 492, 491, 490, 489, 488, 487, 486, 485, 484, 483,
       482, 481, 480,  15,  14,  13,  12,  11,  10,   9,   8,   7,   6,
         5,   4,   3,   2,   1, 496,  47,  46,  45,  44,  43,  42,  41,
        40,  39,  38,  37,  36,  35,  34,  33,  32,  63,  62,  61,  60,
        59,  58,  57,  56,  55,  54,  53,  52,  51,  50,  49,  48,  79,
        78,  77,  76,  75,  74,  73,  72,  71,  70,  69,  68,  67,  66,
        65,  64,  95,  94,  93,  92,  91,  90,  89,  88,  87,  86,  85,
        84,  83,  82,  81,  80, 111, 110, 109, 108, 107, 106, 105, 104,
       103, 102, 101, 100,  99,  98,  97,  96, 127, 126, 125, 124, 123,
       122, 121, 120, 119, 118, 117, 116, 115, 114, 113, 112, 143, 142,
       141, 140, 139, 138, 137, 136, 135, 134, 133, 132, 131, 130, 129,
       128, 159, 158, 157, 156, 155, 154, 153, 152, 151, 150, 149, 148,
       147, 146, 145, 144, 175, 174, 173, 172, 171, 170, 169, 168, 167,
       166, 165, 164, 163, 162, 161, 160, 191, 190, 189, 188, 18

In [12]:
# Text columns compared pairwise when the similarity matrix is computed.
columns = [
    "title.en",
    "description.en",
    "category",
]

In [None]:
# Load the cached similarity matrix if present. A missing cache is mapped to
# None so that the computation cell (guarded by `if fs is None`) can run,
# instead of the notebook stopping here with FileNotFoundError.
try:
    fs = np.load("similarity-md.npy")
except FileNotFoundError:
    fs = None

In [14]:
if fs is None:
    # No cached matrix: score every unordered pair of datasets. The score of a
    # pair is the sum, over `columns`, of the squared spaCy similarity of the
    # two cell values. The diagonal is never written and stays at -1.
    fs = np.full((pr_count, pr_count), -1, dtype='d')

    # Parse every text once up front instead of re-parsing both texts for
    # every pair. Missing values (NaN/None) and empty strings map to None;
    # NaN is truthy, so a plain truthiness test would let missing values be
    # compared as the literal text "nan".
    docs = {
        column: [
            nlp(str(value)) if pd.notna(value) and value else None
            for value in dataset[column]
        ]
        for column in columns
    }

    for i in range(pr_count):
        for j in range(i+1, pr_count):
            sim = 0
            for column in columns:
                doc_i, doc_j = docs[column][i], docs[column][j]
                # A column contributes 0 when either side has no text.
                if doc_i is not None and doc_j is not None:
                    # Local name chosen so the global `s` matrix is not clobbered.
                    column_sim = doc_i.similarity(doc_j)
                    sim += column_sim*column_sim
            # The matrix is symmetric: fill both triangles.
            fs[i, j] = sim
            fs[j, i] = sim
        # One progress line per row rather than one per pair.
        print(i)

    # NOTE(review): this writes "similarity.npy", while the load cell reads
    # "similarity-md.npy", so this cache is not the file that gets loaded on
    # the next run - confirm which name is intended.
    np.save("similarity.npy", fs)

In [15]:
# Recommendation cut-off, expressed as a percentage: later cells compare
# similarity scores against `threshold*3/100`.
threshold = 65

In [16]:
sample_count = 500
# Visit every `sample_step`-th dataset. max(1, ...) keeps range() from being
# given a zero step (ValueError) when there are fewer than `sample_count` rows.
sample_step = max(1, pr_count // sample_count)
# Score a pair must exceed to count as a recommendation.
# NOTE(review): the factor 3 matches the number of entries in `columns`,
# presumably the maximum combined score - confirm.
cutoff = threshold*3/100
sample = []
for i in range(0, pr_count, sample_step):
    main_dataset = dataset.loc[i]
    # Boolean-mask selection picks rows by their position in `fs`, so it does
    # not depend on row labels happening to equal positions.
    # NOTE(review): recommendations keep dataset order, not similarity order.
    recommended_datasets = dataset.loc[fs[i] > cutoff].to_dict('records')
    sample.append((main_dataset, recommended_datasets))


In [17]:
# Keep only entries that have more than five recommendations and whose main
# dataset has both a category and a description; trim each kept entry to its
# first five recommendations.
survey_sample = [
    (main, recommended[:5])
    for main, recommended in sample
    if len(recommended) > 5
    and not pd.isna(main["category"])
    and not pd.isna(main["description.en"])
]

In [18]:
# Titles of the main datasets that made it into the survey sample.
[entry[0]["title.en"] for entry in survey_sample]

['WFS Gänberg + Simplified change Gänsberg',
 'WMS XPlanung BPL “Lotenbergstraße 3.Change”',
 'WMS XPlanung BPL “Amendment 2 stations”',
 'WMS At the Beer Cellar II',
 'Altenburg: Development plan “Lidl Kauerndorfer Allee”',
 'WFS XPlanung BPL “Building Plan Kochsbuehn”',
 'WMS to the development plan Interior Regulations IV (Sulingen) Wiesenweg (origin plan)',
 'WFS INSPIRE BPL zw_den_Wegen_II',
 'XPlanung dataset BPL “Zellerstraße zw. Strohberg- u. Römerstr.»',
 'WFS XPlanung BPL “City Core IV”',
 'WFS) — Development plan — At the elementary school (origin plan), City of Twistringen',
 'Inspire data set BPL “Scab baker part B 2. change”',
 'INSPIRE dataset BPL "Lugauf II, Outer Kirchäcker - 1st Amendment"',
 'WFS traffic calmed area Griesstr. — Josefstr.',
 'WFS XPlanung BPL ‘Ochsenstall (Origin Plan)’',
 'WMS INSPIRE BPL Mühlwiesen – 2nd Amendment',
 'WFS INSPIRE BPL Hintere Hofäcker',
 'WMS XPlanung BPL “Hungerberg (Deckblatt 1968)”',
 'WMS XPlanung BPL “Halden”',
 'WMS XPlanung BP

In [19]:
# Target size of the survey; the following cell derives its sampling step
# from len(survey_sample)/survey_count.
survey_count = 30

In [20]:
# Thin `survey_sample` down by taking every `survey_step`-th entry.
# max(1, ...) avoids a zero step, which made range() raise ValueError whenever
# `survey_sample` held fewer than `survey_count` entries (or was empty).
# Because the step is rounded down, the result can contain more than
# `survey_count` entries.
survey_step = max(1, len(survey_sample) // survey_count)
survey = survey_sample[::survey_step]

In [21]:
# Exploration: rows whose *squared* score against dataset 1 clears the cut-off.
squared_scores = fs[1] * fs[1]
matching_rows = np.nonzero(squared_scores > threshold * 3 / 100)
dataset.loc[matching_rows].to_dict("records")

[{'Unnamed: 0': 11,
  'id': 'ff0463ce-4404-454f-970c-0de4ea661586',
  'catalog.id': 'gdi-de',
  'catalog.title.de': 'GDI-DE',
  'catalog.country.id': 'de',
  'description.en': 'Atomic feed on the development plan 081-4 No. 50-1 of the municipality of Wardenburg',
  'title.en': 'Atomic feed on the development plan 081-4 No. 50-1 of the municipality of Wardenburg',
  'publisher.name': nan,
  'publisher.type': nan,
  'publisher.email': nan,
  'publisher.resource': nan,
  'catalog.title.nl': nan,
  'catalog.title.sv': nan,
  'publisher.homepage': nan,
  'catalog.title.en': nan,
  'catalog.title.fr': nan,
  'catalog.title.it': nan,
  'catalog.title.lv': nan,
  'catalog.title.es': nan,
  'catalog.title.ga': nan,
  'catalog.title.hu': nan,
  'catalog.title.pt': nan,
  'catalog.title.fi': nan,
  'catalog.title.sk': nan,
  'catalog.title.no': nan,
  'catalog.title.sl': nan,
  'catalog.title.ro': nan,
  'catalog.title.hr': nan,
  'category': nan},
 {'Unnamed: 0': 15,
  'id': '61ea6d1b-e676-55c7-

In [22]:
import jinja2 as jj

In [23]:
# Jinja loader that resolves template names against the "templates" directory.
template_dir = "templates"
fsl = jj.FileSystemLoader(template_dir)

In [24]:
# The survey page is rendered from titles and descriptions downloaded from an
# external API, so escape every rendered value: with autoescaping off, any
# markup inside a title or description would be emitted into index.html as
# live HTML. autoescape=True is used rather than select_autoescape() because
# the template file has a ".jinja" extension.
env = jj.Environment(loader=fsl, autoescape=True)

In [25]:
# Check which templates the loader can see.
available_templates = fsl.list_templates()
available_templates

['template.jinja']

In [26]:
# Template used to render the survey page.
template_name = "template.jinja"
template = env.get_template(template_name)

In [27]:
# Empty list; NOTE(review): not referenced by any other cell visible here -
# confirm whether it is still needed.
recs = list()

In [28]:
# Peek at the description of the first survey entry's main dataset.
first_entry_main = survey[0][0]
first_entry_main["description.en"]

'WFS Development Plan ‘Gänberg + Simplified Change Gänsberg’ of the city of Herrenberg.'

In [29]:
# Render the survey page, then encode it as UTF-8 bytes for writing to disk.
rendered_page = template.render(survey=survey)
indexHtml = rendered_page.encode("utf-8")

In [30]:
# Write the rendered page to disk. The context manager closes the file even
# if the write raises, which the manual open()/close() pair did not.
with open("index.html", "wb") as file:
    file.write(indexHtml)

In [31]:
# Title of the third survey entry's main dataset.
third_entry_main = survey[2][0]
third_entry_main["title.en"]

'WMS to the development plan Interior Regulations IV (Sulingen) Wiesenweg (origin plan)'

In [32]:
# Title of the first recommendation attached to the third survey entry.
third_entry_recommendations = survey[2][1]
third_entry_recommendations[0]["title.en"]

'Altenburg: Development plan “Lidl Kauerndorfer Allee”'

In [33]:
# Median score of each row of the similarity matrix (one value per dataset).
medians = [np.median(row) for row in fs]

In [34]:
# Display the per-row medians computed in the previous cell.
medians

[np.float64(0.7668898523725081),
 np.float64(0.7047961634524511),
 np.float64(0.7868099761510381),
 np.float64(0.7975495802198234),
 np.float64(0.7409180063255474),
 np.float64(0.7420217694787148),
 np.float64(0.8390401090376085),
 np.float64(0.8208938122873595),
 np.float64(0.6655311624688638),
 np.float64(0.6201438492198594),
 np.float64(0.5478701608463672),
 np.float64(0.7147090826041077),
 np.float64(0.6914060436373619),
 np.float64(0.7082997149744488),
 np.float64(0.8055266145979412),
 np.float64(0.6932870888778777),
 np.float64(0.8679796877374264),
 np.float64(0.8572667000396952),
 np.float64(0.9692664178322556),
 np.float64(0.8779804829302345),
 np.float64(0.7873577752279977),
 np.float64(0.8768156820903299),
 np.float64(0.7860146413586593),
 np.float64(0.7998513667792411),
 np.float64(0.6048912190068736),
 np.float64(0.887551757585368),
 np.float64(0.7491248545391831),
 np.float64(0.796563267935984),
 np.float64(0.9212138504969012),
 np.float64(0.8064034040845276),
 np.float64(

In [35]:
# Median of the per-row medians.
overall_median = np.median(medians)
overall_median

np.float64(0.7493373789475124)

In [36]:
# NOTE(review): 0.74 looks like the overall median from the previous cell,
# truncated, and the division by 3 presumably yields a per-column share
# (three compared columns) - confirm the intent.
0.74/3

0.24666666666666667