In [1]:
import json
from pathlib import Path
from GDPRDataset import GDPRDataset
import os
import pandas as pd
import random

# Load config

In [2]:
# Load config for dev dataset creation process.
# The context manager closes the file handle even if JSON parsing raises.
config_path = Path.cwd() / "dev_dataset_refinement_config.json"
with open(config_path) as config_file:
    config = json.load(config_file)

# Load dev dataset

In [3]:
# Build a GDPRDataset from the configured dev dataset path and keep its DataFrame.
dev_dataset_df = GDPRDataset(config['dev_dataset_path']).dataframe

## Remove non-German samples (e.g. English)

In [4]:
# Keep only the rows whose 'language' column equals "__label__de".
is_german = dev_dataset_df['language'].eq("__label__de")
dev_dataset_refined_df = dev_dataset_df.loc[is_german]

## Remove samples with empty HTML

In [5]:
# Drop rows whose HTML is the placeholder "-1", then renumber the rows from 0.
has_html = dev_dataset_refined_df['page_source_html'].ne("-1")
dev_dataset_refined_df = dev_dataset_refined_df.loc[has_html].reset_index(drop=True)

## Store refined dataset at this point

In [6]:
# Persist the filtered frame: create a GDPRDataset for the configured intermediate
# directory, attach the filtered DataFrame to it, then call store().
intermediate_ref_dataset = GDPRDataset(dataset_dir=config['dev_dataset_refined_intermediate_path'])
intermediate_ref_dataset.dataframe = dev_dataset_refined_df
# NOTE(review): presumably writes the attached frame under dataset_dir — confirm in GDPRDataset.store.
intermediate_ref_dataset.store()

In [7]:
# Load a GDPRDataset from the intermediate path used above; the following cells
# work on this reloaded DataFrame.
dev_dataset_refined_loaded_df = GDPRDataset(config['dev_dataset_refined_intermediate_path']).dataframe

## Remove newlines and non-breaking spaces from texts

In [8]:
def remove_unicodes(text_nullable):
    """Strip newlines and normalise non-breaking spaces in a text value.

    Newline characters are deleted (not replaced by a space) and every
    non-breaking space (U+00A0) is replaced by a regular space.

    Args:
        text_nullable: The text to clean, or a missing-value marker such as
            ``None`` or a float ``NaN``.

    Returns:
        The cleaned string, or ``text_nullable`` unchanged when it is not a
        string.
    """
    # A float NaN is truthy and has no ``.replace``, so a plain truthiness check
    # would crash on it; pass every non-string value through untouched instead.
    if not isinstance(text_nullable, str):
        return text_nullable
    return text_nullable.replace("\n", "").replace("\xa0", " ")

# Clean both text columns with the same helper.
for text_column in ('content', 'content_removed'):
    dev_dataset_refined_loaded_df[text_column] = dev_dataset_refined_loaded_df[text_column].apply(remove_unicodes)

## Check if words of removed GDPR content appear in HTML
By doing so, we can spot websites that we cannot use for training, e.g. because a #shadow-root hides the GDPR content from the downloaded HTML.

In [9]:
def get_words_in_removed_gdpr_that_are_not_in_page_source_html(html, removed_content):
    """List the words of the removed GDPR text that do not occur in the page HTML.

    The removed text is split on single spaces and each token is checked with a
    plain substring test against ``html``, so a token counts as present if it
    appears anywhere in the HTML string.

    Args:
        html: The page source HTML as a string.
        removed_content: The removed GDPR text, or a missing-value marker such
            as ``None`` or a float ``NaN``.

    Returns:
        The tokens missing from ``html``, in their original order with
        duplicates kept. Returns an empty list when ``removed_content`` is not
        a string.
    """
    # Treat any non-string (None, NaN, ...) as "no removed content" rather than
    # crashing on ``.split``.
    if not isinstance(removed_content, str):
        return []
    # Empty tokens produced by consecutive spaces are never reported, because the
    # empty string is a substring of every string.
    return [word for word in removed_content.split(" ") if word not in html]

In [10]:
# Count and print every sample whose removed GDPR text contains words that are
# missing from its downloaded HTML.
c = 0
# Iterate by index label so the printed number is the label of the row itself.
for i, row in dev_dataset_refined_loaded_df.iterrows():
    # Compute the missing words once per row and reuse them for the check and the printout.
    missing_words = get_words_in_removed_gdpr_that_are_not_in_page_source_html(
        row['page_source_html'], row['content_removed']
    )
    if missing_words:
        c += 1
        print("")
        print(i)
        print(row['url'])
        print(missing_words)
        print("")
print(c)


5
http://www.vsb.de/
['Selecting', 'immediately', 'Privatsphäre-Einstellungen', 'einwilligungsbedürftige', 'Cookies', 'Drittunternehmen', 'Integration', 'bestimmter', 'Funktionen.', 'Wenn', 'Button', '"Alles', 'akzeptieren"', 'klicken,', 'Funktionen', 'aktiviert', '(Einwilligung).', 'Nach', 'Einwilligung', 'verarbeiten', 'betroffenen', 'Drittunternehmen', 'Ihre', 'personenbezogenen', 'verschiedene', 'Zwecke.', 'Detaillierte', 'Informationen', 'Zweck,', 'Rechtsgrundlagen,', 'Drittunternehmen', 'Button', '"Mehr"', 'Datenschutzerklärung', 'einsehen.', 'Ihre', 'Einwilligung', 'widerrufen.', 'Datenschutzerklärung', 'eRecht24', 'Mehr', 'Alles', 'akzeptieren']

42
http://www.cadraw.de/
['Microsoft', 'CLID', 'user’s', 'behavior', 'compile', 'reports', 'heatmaps', 'owner.', '_clck', 'user’s', 'behavior', 'compile', 'reports', 'heatmaps', 'owner.', '_clsk', '_cltk', 'Microsoft', 'MR', 'multiple', 'websites,', 'based', "visitor's", 'preferences.', 'MUID', 'widely', 'Microsoft', 'ID.', 'synchroni

## Collect indices of websites that need to be removed:
See the ReadMe for an explanation of why these websites were removed.

In [12]:
# Row labels of the websites to drop from the refined dataset.
remove_idx = [7, 60, 89, 116, 134, 139, 141, 196, 197, 229]

## Remove problematic websites from refined dataset.

In [13]:
# Drop the flagged rows by index label, then renumber the remaining rows from 0.
# drop=True discards the old index instead of inserting it as an extra "index"
# column, matching the other reset_index calls in this notebook.
# NOTE: this cell is not idempotent — after renumbering, re-running it would drop
# a different set of rows. Re-run from the reload cell instead.
dev_dataset_refined_loaded_df = dev_dataset_refined_loaded_df.drop(remove_idx).reset_index(drop=True)
print(len(dev_dataset_refined_loaded_df))

300


## Store refined dev dataset
Make sure to delete all subfolders from dev_dataset_refined before storing the new dataset from which some samples were removed. Otherwise, the removed samples will remain on disk.

In [14]:
# Store the refined frame under the configured refined dev dataset path.
# NOTE(review): store_refined_dataset is not defined in any cell visible here (the
# execution counts skip In [11]) — confirm it is defined or imported before this
# cell, otherwise Restart & Run All fails with a NameError.
store_refined_dataset(dev_dataset_refined_loaded_df, dataset_path=config['dev_dataset_refined_path'])

## Split into train+validation set and test set
I have 300 samples of valid German websites. 250 form my train/validation set; 50 are set aside to be used only for final testing.

In [15]:
TEST_SET_SIZE = 50  # number of samples held out for final testing only
SPLIT_SEED = 420    # fixed seed so the shuffle, and therefore the split, is reproducible

df = GDPRDataset(dataset_dir=config['dev_dataset_refined_path']).dataframe
# With TEST_SET_SIZE rows or fewer, the slicing below would silently yield an empty
# train set, so fail loudly instead.
assert len(df) > TEST_SET_SIZE, f"need more than {TEST_SET_SIZE} samples to split, got {len(df)}"

# Split into train and test set: shuffle the row positions, then cut off the tail.
indices = list(range(len(df)))
random.seed(SPLIT_SEED)
random.shuffle(indices)
train_indices = indices[:-TEST_SET_SIZE]
test_indices = indices[-TEST_SET_SIZE:]
# Chained, non-inplace reset_index avoids mutating a slice of df in place.
train_df = df.iloc[train_indices].reset_index(drop=True)
test_df = df.iloc[test_indices].reset_index(drop=True)

# Every sample must land in exactly one of the two splits.
assert len(train_df) + len(test_df) == len(df)

In [16]:
# Store each split under its own configured dataset path (train first, then test).
for split_df, path_key in (
    (train_df, 'train_dataset_refined_path'),
    (test_df, 'test_dataset_refined_path'),
):
    store_refined_dataset(split_df, dataset_path=config[path_key])

In [17]:
# Number of test rows selected by the 'contains_GDPR' column.
test_df.loc[test_df['contains_GDPR']].shape[0]

29