In [1]:
# general imports
import os
import coreferee
import re
import spacy
import pandas as pd
# own path/ class imports
from file_paths import *

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
## Application Selection ########################################START
# Central configuration cell: method switches, case study, set-up and similarity thresholds.
# choose method 
direct_s_bert = True #if True --> no clustering or other means are implemented, all sentences are compared with each other via S-BERT
legal_s_bert = False #if True --> implementation like S-BERT but based on legal BERT instead of BERT
clustering = False #if True --> 2 approaches calculated: a) topic model + word2vec + cosine sim; b) BERT embeddings + k-means and word2vec + cosine sim
key_phrase = False # if True --> step one is performed on only key phrases (identified by tf-idf), instead of whole sentences
# choose case study
### GDPR adjusted, ISO not!
iso = False #if False --> running with GDPR setup
# choose set up
rea_only_signal = False #if False --> GDPR realization input is not filtered to contain only sentences with signalwords
# choose thresholds:
gamma_s_bert = 0.7 #0.67 #used for sentence mapping 
gamma_grouping = 0.9 #used for sentence mapping in k-means & topic model approach
gamma_key_phrase = 0.92 #used for key phrase extraction
gamma_one = 0.26 #used for subject phrase mapping
gamma_two = 0.23 #used for verb phrase mapping
gamma_three = 0.2 #used for object phrase mapping
################################################################# END

In [3]:
# Create the nlp object (shared by all preprocessing functions below)
nlp = spacy.load('en_core_web_trf')
# coreferee is added with its default config; apply_coreference_resolution
# reads the doc._.coref_chains it is presumed to provide
nlp.add_pipe('coreferee', config={}) # resolves coreferences

<coreferee.manager.CorefereeBroker at 0x7f4303bd6680>

In [4]:
## parse defined lists of constraint signalwords, sequencemarkers and stopwords ########################### START
def read_defined_lists(directory):
  '''Load one of the predefined word lists (constraint signalwords,
  sequencemarkers or stopwords) from a plain text file.

  Input: path to a .txt file with one entry per line
  Output: list of the file's lines without line terminators

  A missing file is reported on stdout and the session is ended via quit().'''
  try:
    handle = open(directory)
  except FileNotFoundError:
    print("Wrong file or file path.")
    quit()
  # the file is closed again as soon as its content has been split into lines
  with handle:
    return handle.read().splitlines()

# Load the word lists of the selected case study (see the `iso` switch).
# NOTE(review): the *_SIGNALWORDS / *_REA_SPEZIFICATION* path constants are
# presumably provided by the wildcard import from file_paths -- verify.
if iso:
  signalwords = read_defined_lists(ISO_SIGNALWORDS)
  ISMS_words = read_defined_lists(ISO_REA_SPEZIFICATION1)
  top_management_words = read_defined_lists(ISO_REA_SPEZIFICATION2)
else:
  signalwords = read_defined_lists(GDPR_SIGNALWORDS)
  controller_words = read_defined_lists(GDPR_REA_SPEZIFICATION1)
  data_protection_officer_words = read_defined_lists(GDPR_REA_SPEZIFICATION2)
  management_words = read_defined_lists(GDPR_REA_SPEZIFICATION3)

################################################################# END

In [5]:

## parse documents ############################################ START
def read_documents(directory): 
  '''Read all .txt files of a directory (regulatory or realization documents).

  Input: path to a directory holding multiple .txt files (each a document article)
  Output: dictionary with the article name (file name without the trailing
          ".txt") as key and the article text as value

  A missing directory or file is reported on stdout and the session is ended
  via quit().'''
  suffix = '.txt'
  doc_dict = dict()
  try:
    # listing the directory happens inside the try so that a wrong directory
    # path is reported by the handler below as well
    for fi in os.listdir(directory):
      if fi.endswith(suffix):
        with open(os.path.join(directory, fi), 'r') as f:
          # strip only the trailing extension; ".txt" elsewhere in the name stays
          doc_dict[fi[:-len(suffix)]] = f.read()
  except FileNotFoundError:
    print("Wrong file or file path to dir.")
    quit()
  return doc_dict

# Read the raw .txt articles of the selected case study; each call returns a
# dict mapping article name -> article text (see read_documents).
if iso:
  reg_paragraphs = read_documents(ISO_REGULATION_INPUT_DIRECTORY) 
  rea_paragraphs = read_documents(ISO_REALIZATION_INPUT_DIRECTORY) 

else: 
  reg_paragraphs = read_documents(GDPR_REGULATION_INPUT_DIRECTORY) 
  rea_paragraphs = read_documents(GDPR_REALIZATION_INPUT_DIRECTORY) 
################################################################# END

In [6]:
# One row per regulation article: dict key -> 'reg_title', dict value -> 'reg_text'.
df = pd.DataFrame(reg_paragraphs.items(), columns=['reg_title', 'reg_text'])

In [7]:
def apply_coreference_resolution(text):
    '''Replace referring tokens by the tokens they refer to.

    The text is run through the global nlp pipeline. Every token for which
    doc._.coref_chains.resolve() yields referents is substituted by the
    referents' texts, each followed by a single space (the original token and
    its trailing whitespace are dropped). All other tokens are kept unchanged
    including their whitespace.

    Input: raw text (str)
    Output: text with resolved coreferences (str)'''
    doc = nlp(text)
    pieces = [token.text_with_ws for token in doc]
    for position, token in enumerate(doc):
        referents = doc._.coref_chains.resolve(token)
        # None means the token is not part of a resolvable coreference chain
        if referents:
            pieces[position] = "".join(referent.text + " " for referent in referents)
    return "".join(pieces)

In [8]:
# Resolve coreferences article by article; runs the nlp pipeline once per row.
df['reg_text_resolved'] = df.apply(lambda row : apply_coreference_resolution(row['reg_text']), axis = 1)

In [9]:
# Display setting only: show up to 2000 characters per cell so long sentences are not truncated.
pd.options.display.max_colwidth= 2000

In [10]:
def clean_text(text):  
    '''Normalise an article text before it is split into sentences.

    Steps:
      1. every ";" becomes "." (list items in the regulation should count as
         separate sentences),
      2. a standalone "or" / "and" directly followed by two or three newlines
         (conjunction at the end of a list item) is removed,
      3. the remaining line breaks are removed without inserting a space.

    Input: text (str)
    Output: cleaned text (str)'''
    cleaned_text = text.replace(";", ".") #in reg there are many ; which should be counted as separate sentences
    # \b restricts the match to whole words, so that words merely ending in
    # "or"/"and" (e.g. "processor", "demand") are no longer truncated
    cleaned_text = re.sub(r"\b(?:or|and)\n{2,3}", "", cleaned_text)
    # order matters: longer newline runs have to be removed before shorter ones
    for newline_run in ("\n\n\n", "\n\n", "\n \n", "\n"):
        cleaned_text = cleaned_text.replace(newline_run, "")
    return cleaned_text 

In [11]:
# Clean the coreference-resolved text (";" -> ".", line breaks removed, see clean_text).
df['reg_text_cleaned'] = df.apply(lambda row : clean_text(row['reg_text_resolved']), axis = 1)

In [12]:
# The intermediate column is no longer needed once 'reg_text_cleaned' exists.
df = df.drop(['reg_text_resolved'], axis=1)

In [13]:
def ensure_word_embeddings(text):
    '''Delete words which have no vector in the spaCy vocab - would lead to
    problems later if not done.

    The text is tokenized with the global nlp tokenizer and rebuilt from those
    tokens (including their whitespace) for which nlp.vocab.has_vector() is
    True. If the loaded pipeline has an empty vector table, there is nothing
    to filter against and the text is returned unchanged.

    Input: text (str)
    Output: text without out-of-vocabulary tokens (str)'''
    # NOTE(review): transformer pipelines such as 'en_core_web_trf' are
    # expected to have an empty vector table - without this guard every
    # token would be removed; confirm against the installed model.
    if nlp.vocab.vectors.shape[0] == 0:
        return text
    # tokenization is sufficient here, so the remaining pipeline is skipped
    doc = nlp.make_doc(text)
    # look up the actual token text (not the literal string "token.text") and
    # drop whole tokens instead of replacing substrings all over the text
    return "".join(token.text_with_ws for token in doc if nlp.vocab.has_vector(token.text))

In [14]:
# Apply the vocabulary filter (see ensure_word_embeddings) to the cleaned text.
df['reg_text_cleaned_2'] = df.apply(lambda row : ensure_word_embeddings(row['reg_text_cleaned']), axis = 1)

In [15]:
# Keep only the latest text version ('reg_text_cleaned_2').
df = df.drop(['reg_text_cleaned'], axis=1)

In [16]:
from pandas import Series
# Split every article at "." and create one row per sentence.
# explode() keeps the article's index label on each of its sentences, so the
# result lines up with df's index and can be joined back directly.
s = df['reg_text_cleaned_2'].str.split('.').explode()
s.name = 'reg_sent' # needs a name to join
df_new = df.join(s)

In [17]:
# Only title and sentence are needed from here on; the full article texts are dropped.
df_new = df_new.drop(['reg_text','reg_text_cleaned_2'], axis=1)
df_new.head()

Unnamed: 0,reg_title,reg_sent
0,Lawfulness of processing,Processing shall be lawful only if and to the extent that the data subject has given consent to the processing of subject or her personal data for one or more specific purposes
0,Lawfulness of processing,Processing shall be lawful only if and to the extent that processing is necessary for the performance of a contract to which the data subject is party or in order to take steps at the request of the data subject prior to entering into a contract
0,Lawfulness of processing,Processing shall be lawful only if and to the extent that processing is necessary for compliance with a legal obligation to which the controller is subject
0,Lawfulness of processing,Processing shall be lawful only if and to the extent that processing is necessary in order to protect the vital interests of the data subject or of another natural person
0,Lawfulness of processing,Processing shall be lawful only if and to the extent that processing is necessary for the performance of a task carried out in the public interest or in the exercise of official authority vested in the controller


In [18]:
def keep_only_signalword_sentences(text):
    '''Flag whether a sentence contains at least one signalword.

    The sentence is processed with the global nlp pipeline and every token
    text is looked up (exact match) in the global signalwords list.

    Input: sentence (str)
    Output: 1 if any token is a signalword, otherwise 0'''
    doc = nlp(text)
    contains_signalword = any(token.text in signalwords for token in doc)
    return 1 if contains_signalword else 0

In [19]:
# 1/0 flag per sentence: does it contain at least one signalword?
df_new['contains_signalword'] = df_new.apply(lambda row : keep_only_signalword_sentences(row['reg_sent']), axis = 1)

In [20]:
# Keep only sentences flagged as containing a signalword, then drop the helper flag.
df_new = df_new[df_new.contains_signalword != 0]
df_new = df_new.drop(['contains_signalword'], axis=1)


In [23]:
"""
def join_texts(a,b):
    c = a + " " + b
    return c
"""

In [25]:
"""
df_new['reg_text_total'] = df_new.apply(lambda row : join_texts(row['reg_title'], row['reg_sent']), axis =1)
df_new.head()
"""

Unnamed: 0,reg_title,reg_sent,reg_text_total
0,Lawfulness of processing,Processing shall be lawful only if and to the extent that the data subject has given consent to the processing of subject or her personal data for one or more specific purposes,Lawfulness of processing Processing shall be lawful only if and to the extent that the data subject has given consent to the processing of subject or her personal data for one or more specific purposes
0,Lawfulness of processing,Processing shall be lawful only if and to the extent that processing is necessary for the performance of a contract to which the data subject is party or in order to take steps at the request of the data subject prior to entering into a contract,Lawfulness of processing Processing shall be lawful only if and to the extent that processing is necessary for the performance of a contract to which the data subject is party or in order to take steps at the request of the data subject prior to entering into a contract
0,Lawfulness of processing,Processing shall be lawful only if and to the extent that processing is necessary for compliance with a legal obligation to which the controller is subject,Lawfulness of processing Processing shall be lawful only if and to the extent that processing is necessary for compliance with a legal obligation to which the controller is subject
0,Lawfulness of processing,Processing shall be lawful only if and to the extent that processing is necessary in order to protect the vital interests of the data subject or of another natural person,Lawfulness of processing Processing shall be lawful only if and to the extent that processing is necessary in order to protect the vital interests of the data subject or of another natural person
0,Lawfulness of processing,Processing shall be lawful only if and to the extent that processing is necessary for the performance of a task carried out in the public interest or in the exercise of official authority vested in the controller,Lawfulness of processing Processing shall be lawful only if and to the extent that processing is necessary for the performance of a task carried out in the public interest or in the exercise of official authority vested in the controller


In [26]:
# extract sent keyphrase with RAKE
import pandas as pd
from rake_nltk import Rake
import re
import os

In [27]:
def RAKE_Keyword_Extraction(text, stop_word_path, threshold):
    '''Extract up to five key phrases from a text with RAKE.

    Input:
      text: the text (a single sentence or title) to extract key phrases from
      stop_word_path: path to a .txt stop word list with one stop word per line
      threshold: only phrases with a RAKE score above this value are kept
    Output: the string representation of the list of selected key phrases,
            e.g. "['personal data', 'given consent']"

    If no phrase scores above the threshold, the (at most five) best ranked
    phrases are returned regardless of their score.'''
    # limits handed to Rake as max_length / min_length
    max_len = 5
    min_len = 1
    # at most this many key phrases are returned
    max_keywords = 5
    # uncustomized stopwordlist
    with open(stop_word_path, 'r') as f:
        stop_words = [line.strip() for line in f]
    # initialize the Rake keyword extractor
    r = Rake(stopwords=stop_words, max_length=max_len, min_length=min_len)
    r.extract_keywords_from_sentences([text])
    # rank the extracted keywords: list of (score, phrase), best first
    scored_phrases = r.get_ranked_phrases_with_scores()
    # exclude phrases of a single character and phrases with scores not above
    # the threshold; the membership test runs against the growing result list,
    # so repeated phrases are really kept only once
    selected = []
    for score, phrase in scored_phrases:
        if len(phrase) > 1 and score > threshold and phrase not in selected:
            selected.append(phrase)
    if selected:
        keywords = selected[:max_keywords]
    else:
        # nothing passed the threshold: fall back to the best ranked phrases
        keywords = r.get_ranked_phrases()[:max_keywords]
    # callers expect the list rendered as one string (see join_keywords)
    return str(keywords)

In [30]:
# output of function = one concatenated keyword string per sentence
# (stop words from GDPR_STOPWORDS_RAKE, score threshold 3)
df_new['keywords_sent'] = df_new.apply(lambda row : RAKE_Keyword_Extraction(row['reg_sent'], GDPR_STOPWORDS_RAKE, 3), axis = 1)
df_new.head()

Unnamed: 0,reg_title,reg_sent,reg_text_total,keywords,keywords_sent
0,Lawfulness of processing,Processing shall be lawful only if and to the extent that the data subject has given consent to the processing of subject or her personal data for one or more specific purposes,Lawfulness of processing Processing shall be lawful only if and to the extent that the data subject has given consent to the processing of subject or her personal data for one or more specific purposes,"['specific purposes', 'personal data', 'given consent', 'data subject', 'processing processing']","['specific purposes', 'personal data', 'given consent', 'data subject']"
0,Lawfulness of processing,Processing shall be lawful only if and to the extent that processing is necessary for the performance of a contract to which the data subject is party or in order to take steps at the request of the data subject prior to entering into a contract,Lawfulness of processing Processing shall be lawful only if and to the extent that processing is necessary for the performance of a contract to which the data subject is party or in order to take steps at the request of the data subject prior to entering into a contract,"['data subject prior', 'data subject', 'take steps', 'processing processing']","['data subject prior', 'data subject', 'take steps']"
0,Lawfulness of processing,Processing shall be lawful only if and to the extent that processing is necessary for compliance with a legal obligation to which the controller is subject,Lawfulness of processing Processing shall be lawful only if and to the extent that processing is necessary for compliance with a legal obligation to which the controller is subject,"['legal obligation', 'processing processing']",['legal obligation']
0,Lawfulness of processing,Processing shall be lawful only if and to the extent that processing is necessary in order to protect the vital interests of the data subject or of another natural person,Lawfulness of processing Processing shall be lawful only if and to the extent that processing is necessary in order to protect the vital interests of the data subject or of another natural person,"['another natural person', 'vital interests', 'data subject', 'processing processing']","['another natural person', 'vital interests', 'data subject']"
0,Lawfulness of processing,Processing shall be lawful only if and to the extent that processing is necessary for the performance of a task carried out in the public interest or in the exercise of official authority vested in the controller,Lawfulness of processing Processing shall be lawful only if and to the extent that processing is necessary for the performance of a task carried out in the public interest or in the exercise of official authority vested in the controller,"['official authority vested', 'task carried', 'public interest', 'processing processing']","['official authority vested', 'task carried', 'public interest']"


### Joint keyword extraction (title + sentence in one text) doesn't seem to work well: the title is not really reflected in the keywords; instead key phrases like "processing processing" are created, which don't make sense --> try extracting the title keywords separately?

In [31]:
# Same extraction as for the sentences, applied to the article title alone.
df_new['keywords_title'] = df_new.apply(lambda row : RAKE_Keyword_Extraction(row['reg_title'], GDPR_STOPWORDS_RAKE, 3), axis = 1)
df_new.head()

Unnamed: 0,reg_title,reg_sent,reg_text_total,keywords,keywords_sent,keywords_title
0,Lawfulness of processing,Processing shall be lawful only if and to the extent that the data subject has given consent to the processing of subject or her personal data for one or more specific purposes,Lawfulness of processing Processing shall be lawful only if and to the extent that the data subject has given consent to the processing of subject or her personal data for one or more specific purposes,"['specific purposes', 'personal data', 'given consent', 'data subject', 'processing processing']","['specific purposes', 'personal data', 'given consent', 'data subject']","['processing', 'lawfulness']"
0,Lawfulness of processing,Processing shall be lawful only if and to the extent that processing is necessary for the performance of a contract to which the data subject is party or in order to take steps at the request of the data subject prior to entering into a contract,Lawfulness of processing Processing shall be lawful only if and to the extent that processing is necessary for the performance of a contract to which the data subject is party or in order to take steps at the request of the data subject prior to entering into a contract,"['data subject prior', 'data subject', 'take steps', 'processing processing']","['data subject prior', 'data subject', 'take steps']","['processing', 'lawfulness']"
0,Lawfulness of processing,Processing shall be lawful only if and to the extent that processing is necessary for compliance with a legal obligation to which the controller is subject,Lawfulness of processing Processing shall be lawful only if and to the extent that processing is necessary for compliance with a legal obligation to which the controller is subject,"['legal obligation', 'processing processing']",['legal obligation'],"['processing', 'lawfulness']"
0,Lawfulness of processing,Processing shall be lawful only if and to the extent that processing is necessary in order to protect the vital interests of the data subject or of another natural person,Lawfulness of processing Processing shall be lawful only if and to the extent that processing is necessary in order to protect the vital interests of the data subject or of another natural person,"['another natural person', 'vital interests', 'data subject', 'processing processing']","['another natural person', 'vital interests', 'data subject']","['processing', 'lawfulness']"
0,Lawfulness of processing,Processing shall be lawful only if and to the extent that processing is necessary for the performance of a task carried out in the public interest or in the exercise of official authority vested in the controller,Lawfulness of processing Processing shall be lawful only if and to the extent that processing is necessary for the performance of a task carried out in the public interest or in the exercise of official authority vested in the controller,"['official authority vested', 'task carried', 'public interest', 'processing processing']","['official authority vested', 'task carried', 'public interest']","['processing', 'lawfulness']"


In [32]:
def join_keywords(a,b):
    '''Merge two keyword strings into one comma separated string.

    Leading and trailing square brackets are stripped from both inputs
    before they are concatenated with a single "," in between.

    Input: two keyword strings, e.g. "['x', 'y']" and "['z']"
    Output: joined string, e.g. "'x', 'y','z'"'''
    stripped_parts = [part.strip("[]") for part in (a, b)]
    return ",".join(stripped_parts)

In [33]:
import re
# Combine sentence and title key phrases into one comma separated string per row.
df_new['reg_kw_total'] = df_new.apply(lambda row : join_keywords(row['keywords_sent'], row['keywords_title']), axis =1)
df_new.head()

Unnamed: 0,reg_title,reg_sent,reg_text_total,keywords,keywords_sent,keywords_title,reg_kw_total
0,Lawfulness of processing,Processing shall be lawful only if and to the extent that the data subject has given consent to the processing of subject or her personal data for one or more specific purposes,Lawfulness of processing Processing shall be lawful only if and to the extent that the data subject has given consent to the processing of subject or her personal data for one or more specific purposes,"['specific purposes', 'personal data', 'given consent', 'data subject', 'processing processing']","['specific purposes', 'personal data', 'given consent', 'data subject']","['processing', 'lawfulness']","'specific purposes', 'personal data', 'given consent', 'data subject','processing', 'lawfulness'"
0,Lawfulness of processing,Processing shall be lawful only if and to the extent that processing is necessary for the performance of a contract to which the data subject is party or in order to take steps at the request of the data subject prior to entering into a contract,Lawfulness of processing Processing shall be lawful only if and to the extent that processing is necessary for the performance of a contract to which the data subject is party or in order to take steps at the request of the data subject prior to entering into a contract,"['data subject prior', 'data subject', 'take steps', 'processing processing']","['data subject prior', 'data subject', 'take steps']","['processing', 'lawfulness']","'data subject prior', 'data subject', 'take steps','processing', 'lawfulness'"
0,Lawfulness of processing,Processing shall be lawful only if and to the extent that processing is necessary for compliance with a legal obligation to which the controller is subject,Lawfulness of processing Processing shall be lawful only if and to the extent that processing is necessary for compliance with a legal obligation to which the controller is subject,"['legal obligation', 'processing processing']",['legal obligation'],"['processing', 'lawfulness']","'legal obligation','processing', 'lawfulness'"
0,Lawfulness of processing,Processing shall be lawful only if and to the extent that processing is necessary in order to protect the vital interests of the data subject or of another natural person,Lawfulness of processing Processing shall be lawful only if and to the extent that processing is necessary in order to protect the vital interests of the data subject or of another natural person,"['another natural person', 'vital interests', 'data subject', 'processing processing']","['another natural person', 'vital interests', 'data subject']","['processing', 'lawfulness']","'another natural person', 'vital interests', 'data subject','processing', 'lawfulness'"
0,Lawfulness of processing,Processing shall be lawful only if and to the extent that processing is necessary for the performance of a task carried out in the public interest or in the exercise of official authority vested in the controller,Lawfulness of processing Processing shall be lawful only if and to the extent that processing is necessary for the performance of a task carried out in the public interest or in the exercise of official authority vested in the controller,"['official authority vested', 'task carried', 'public interest', 'processing processing']","['official authority vested', 'task carried', 'public interest']","['processing', 'lawfulness']","'official authority vested', 'task carried', 'public interest','processing', 'lawfulness'"


In [None]:
# save preprocessed reg to excel
# NOTE(review): `join` and INTERMEDIATE_DIRECTORY are not defined in this
# notebook; presumably both come from the wildcard import of file_paths - verify.
pd.DataFrame(df_new).to_excel(join(INTERMEDIATE_DIRECTORY, "gdpr_reg_preprocessed_optionb.xlsx"))  

In [None]:
# check if embedding of "specific purposes" possible
# check if sim to apple would be same as sim of specific and purposes 

In [None]:
# apply to rea

# calculate sim with word2vec?

# prepare 2-4 more Data protection policies?

In [None]:
# Adds spaCy's "merge_entities" component to the shared pipeline; this affects
# every later nlp(...) call, so re-running earlier cells afterwards may give
# different tokens.
nlp.add_pipe("merge_entities")

In [None]:
# concat title + key phrase 

In [None]:
# apply everything for rea (incl. specific word change)

In [None]:
if rea:
    
def substitude_specific_realization_formulations(self):
    '''Map realization specific wording onto the general terms used in the
    regulation, like "Group Company" onto "controller".

    Three consecutive passes are made, each reading the result list of the
    previous one and appending to the next:
      cleaned_paragraphs_list5 -> list6: management_words -> 'management'
      cleaned_paragraphs_list6 -> list7: data_protection_officer_words -> 'data protection officer'
      cleaned_paragraphs_list7 -> list8: controller_words -> 'controller'
    Nothing is returned; the target lists on self are extended in place.'''
    # pass 1: management synonyms
    for paragraph in self.cleaned_paragraphs_list5:
        for variant in self.management_words:
            paragraph = paragraph.replace(variant, 'management')
        self.cleaned_paragraphs_list6.append(paragraph)
    # pass 2: data protection officer synonyms
    for paragraph in self.cleaned_paragraphs_list6:
        for variant in self.data_protection_officer_words:
            paragraph = paragraph.replace(variant, 'data protection officer')
        self.cleaned_paragraphs_list7.append(paragraph)
    # pass 3: controller synonyms
    for paragraph in self.cleaned_paragraphs_list7:
        for variant in self.controller_words:
            paragraph = paragraph.replace(variant, 'controller')
        self.cleaned_paragraphs_list8.append(paragraph)

In [None]:
# calculate # reg sent and # rea sent

# calculate sim between newly generated key phrases

# assign to each reg sentence the rea sentence(s) whose key phrases are most similar?