Run this notebook after manually populating the dev dataset with the notebook `1_curated_dataset_creation.ipynb`. It completes the dataset by adding the extracted plain text, the detected language, and the URL of each sample.

In [1]:
import json
from pathlib import Path

import os

from fasttext.FastText import _FastText
import pandas as pd

from automated_dataset_completion_helpers import extract_html_plain_text, extract_gdpr_plain_text_if_gdpr_website, fasttext_language_predict

# Load Config

In [2]:
# Load the config for the dev dataset creation process from the current working directory.
# The context manager guarantees the file is closed even if JSON parsing fails.
with open(Path.cwd() / "dev_dataset_creation_config.json") as config_file:
    config = json.load(config_file)

# Get plain text from HTMLs in each subfolder

In [3]:
# Collect the numeric names of the sample folders directly below the dataset root.
# Only immediate child directories are listed, so nested folders inside a sample
# and non-numeric folders (e.g. ".ipynb_checkpoints") no longer break the int()
# conversion, and the result does not depend on the path separator or on a
# trailing slash in config['dev_dataset_path'].
with os.scandir(config['dev_dataset_path']) as dataset_entries:
    subfolder_names = sorted(
        int(entry.name)
        for entry in dataset_entries
        if entry.is_dir() and entry.name.isdecimal()
    )

for sample_index in subfolder_names:
    sample_dir_path = f"{config['dev_dataset_path']}/{sample_index}"

    # NOTE(review): presumably writes the sample's plain text (content.txt, read
    # by the language detection cell) into the sample folder - confirm in helpers.
    extract_html_plain_text(sample_dir_path)

    # NOTE(review): per its name, only acts on GDPR websites - confirm in helpers.
    extract_gdpr_plain_text_if_gdpr_website(sample_dir_path)


# Detect Languages
Determine the language of the extracted text of each website.
- To use the TF-IDF method, which is based on a vocabulary, we need to handle German and English websites separately.

In [4]:
# Load the pretrained fastText language identification model once for all samples.
ft_model = _FastText(model_path="./pretrained_language_detection_model/lid.176.bin")

# One language label per sample, in the order of subfolder_names.
languages = []
for sample_index in subfolder_names:
    content_file = Path(f"{config['dev_dataset_path']}/{sample_index}") / "content.txt"
    prediction = fasttext_language_predict(content_file.read_text(), ft_model)
    # Keep only the first innermost element of the helper's nested result.
    languages.append(prediction[0][0][0])

# Persist the labels next to the samples.
languages_df = pd.DataFrame(languages)
languages_df.to_json(f"{config['dev_dataset_path']}/dev_dataset_languages.json")

# Add URLs

In [5]:
# Load the table of all seed websites from the "Companies" sheet.
# The context manager closes the workbook handle once the sheet has been read.
with pd.ExcelFile(f"{config['input_data_path']}/companies_enc.xlsx") as file:
    seeds_df = pd.read_excel(file, sheet_name='Companies')

In [6]:
# Look up the URL of every dev dataset sample; the sample folder number is used
# as the row label in the seed table.
urls_of_dev_dateset = [
    seeds_df.loc[int(sample_index), 'url'] for sample_index in subfolder_names
]

# Persist the URLs next to the samples.
dev_dataset_urls_df = pd.DataFrame(urls_of_dev_dateset)
dev_dataset_urls_df.to_json(f"{config['dev_dataset_path']}/dev_dataset_urls.json")

The dev dataset is now complete and can be accessed with the `GDPRDataset` PyTorch `Dataset` class.