# Tiltify
Project to test out Natural Language Processing on labeled privacy policies. The policies are annotated according to the TILT schema. The goal is to infer TILT labels for a given privacy policy and thus automate the annotation of these policies.

In [None]:
# get env variables

In [None]:
import yaml

# Credentials and the database name are read from a local YAML file
# instead of being hard-coded in the notebook.
config_file_path = 'config.yml'
# Decode explicitly as UTF-8 so the result does not depend on the
# platform's default locale encoding.
with open(config_file_path, "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)
username = config['username']
password = config['password']
database = config['database']

In [None]:
# connect to database
import os
from urllib.parse import quote_plus
from pymongo import MongoClient

# Reserved characters in the credentials (e.g. '@', ':', '/', '%') must be
# percent-escaped, otherwise the MongoDB URI is rejected or parsed wrongly.
# str() keeps numeric values loaded from YAML working, as the plain f-string did.
escaped_username = quote_plus(str(username))
escaped_password = quote_plus(str(password))

client = MongoClient(f'mongodb://{escaped_username}:{escaped_password}@127.0.0.1:27017')
db = client[database]

In [34]:
# getting examples
# Bucket every "Right to ..." annotation by the number of spaces in its text:
# more than 10 -> good example, fewer than 3 -> bad example, otherwise ignored.
right_to_filter = {'label': {'$regex': 'Right to.*'}}
annotation_cursor = db.annotation.find(right_to_filter)
good_examples, bad_examples, bad_examples_ids = [], [], []

for annotation in annotation_cursor:
    n_spaces = annotation['text'].count(' ')
    if n_spaces > 10:
        good_examples.append(annotation)
        continue
    if n_spaces >= 3:
        continue
    bad_examples.append(annotation)
    # remember each task that owns a bad example only once
    if annotation['task'] not in bad_examples_ids:
        bad_examples_ids.append(annotation['task'])

# Resolve task names: tasks owning a bad example go to one list; of the
# remaining tasks, those without a 'parent' key go to the grey-zone list.
task_cursor = db.task.find()
bad_examples_names, grey_zone_examples_names = [], []
for task in task_cursor:
    if task['_id'] in bad_examples_ids:
        target_names = bad_examples_names
    elif 'parent' not in task:
        target_names = grey_zone_examples_names
    else:
        continue
    target_names.append(task['name'])

In [35]:
# HackMD format
# Print one table row per grey-zone task name.
for task_name in grey_zone_examples_names:
    row = f'|{task_name}|grey|-|'
    print(row)

|facebook|grey|-|
|bnp_paribas|grey|-|
|tesco|grey|-|
|carrefour|grey|-|
|signal|grey|-|
|siemens|grey|-|
|verivox|grey|-|
|webde|grey|-|
|stihl|grey|-|
|vw|grey|-|
|xing|grey|-|
|h&m|grey|-|
|takeda|grey|-|
|whatsapp|grey|-|
|dhl|grey|-|
|ryanair|grey|-|
|qwant|grey|-|
|telefonica|grey|-|
|ltur|grey|-|
|viessmann|grey|-|
|bvg|grey|-|
|amazon|grey|-|
|google|grey|-|
|Amazon Alexa Terms of Use|grey|-|
|Google Assistant|grey|-|
|Siemens AG|grey|-|
|Zoom|grey|-|
|Cisco|grey|-|
|Deutsche Bahn|grey|-|
|ARD Mediathek|grey|-|
|Dropbox|grey|-|
|Github|grey|-|
|Spiegel Online|grey|-|
|Discord|grey|-|
|Twitter|grey|-|
|SPD|grey|-|
|GRÜNE|grey|-|
|Mircosoft Teams|grey|-|
|Fitbit|grey|-|
|Runtastic|grey|-|
|Niantic (Pokemon Go)|grey|-|
|Fiducia|grey|-|
|Duden|grey|-|
|Chefkoch|grey|-|
|Setting.io|grey|-|
|Shopify|grey|-|
|Tagesschau App|grey|-|
|Twitch|grey|-|
|Adobe|grey|-|
|giki|grey|-|
|endcitizensunited|grey|-|


In [None]:
# good examples
# Show the first five good examples followed by the total count.
separator = "-------------------------------------------------------------------"
for example in good_examples[:5]:
    label, text = example['label'], example['text']
    print(f"Label: {label}\nText: {text}")
    print(separator)
print(f"#goodexamples: {len(good_examples)}")

In [None]:
# bad examples
# Show the first five bad examples followed by the total count.
separator = "-------------------------------------------------------------------"
for example in bad_examples[:5]:
    label, text = example['label'], example['text']
    print(f"Label: {label}\nText: {text}")
    print(separator)
print(f"#badexamples: {len(bad_examples)}")

In [None]:
# names of tasks with bad examples
# Heading and list go on separate lines, then the number of affected tasks.
print('bad examples names:', bad_examples_names, sep='\n')
n_bad_names = len(bad_examples_names)
print(f"#bad examples names: {n_bad_names}")

In [None]:
# iterate through all tasks and get sentence dataset
#
# For every task that has at least one "Right to ..." annotation, the task
# text is split into sentences. Each English or German sentence is stored
# once per overlapping annotation (with that annotation's label), or once
# with NO_LABEL when no annotation overlaps it. Sentences detected as any
# other language are skipped.
import spacy
from tqdm import tqdm
from spacy_langdetect import LanguageDetector

nlp = spacy.load('de_core_news_lg')
nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)
task_cursor = db.task.find({})

de_data = []
en_data = []
counter = 0  # not used in this cell; kept in case another cell reads it

# Label stored for sentences that no annotation overlaps. Previously these
# rows silently received the label of whichever annotation was iterated last.
NO_LABEL = None

# Route each sentence to the dataset of its detected language.
data_by_language = {'en': en_data, 'de': de_data}

for task in tqdm(task_cursor):

    # get text and id of task
    text = task['text']
    doc_id = task['_id']

    # find all annotations of this task and extract them
    annotation_cursor = db.annotation.find({'task': doc_id, 'label': {'$regex': 'Right to.*'}})
    annotations = [
        {'start': found['start'], 'end': found['end'], 'label': found['label'], 'text': found['text']}
        for found in annotation_cursor
    ]

    if not annotations:
        continue

    # iterate over text and save sentences
    doc = nlp(text)
    for sentence in doc.sents:
        # The detector stores {'language': <code>, 'score': <float>}; read the
        # code once per sentence instead of indexing the dict by language code.
        language = sentence._.language['language']
        target = data_by_language.get(language)
        if target is None:
            continue

        sentence_text = str(sentence)
        # Use spaCy's own character offsets rather than searching the text
        # for the sentence string, which always found the first occurrence
        # of a repeated sentence and was off by one.
        sentence_start = sentence.start_char
        sentence_end = sentence.end_char

        # Inclusive interval overlap, equivalent to the original four-way check.
        # NOTE(review): assumes annotation 'start'/'end' are character offsets
        # into task['text'] -- confirm; if 'end' is exclusive, switch to '<'.
        matching_labels = [
            annotation['label']
            for annotation in annotations
            if annotation['start'] <= sentence_end and sentence_start <= annotation['end']
        ]

        for label in matching_labels or [NO_LABEL]:
            target.append({'sentence': sentence_text, 'label': label})

In [None]:
# create dataframe
import pandas

# One row per collected sentence, with the columns 'sentence' and 'label'.
de_df = pandas.DataFrame(de_data)
print(de_df)

en_df = pandas.DataFrame(en_data)
print(en_df)

In [None]:
# create csv files
# Write the German and the English sentence dataset to its own CSV file.
for frame, csv_path in (
    (de_df, '~/Documents/DaSKITA/playground/tiltify/data/de_sentence_data.csv'),
    (en_df, '~/Documents/DaSKITA/playground/tiltify/data/en_sentence_data.csv'),
):
    frame.to_csv(csv_path)