In [1]:
# general imports
import os
import coreferee
import re
import spacy
import pandas as pd
# own path/ class imports
from file_paths import *

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
## Application Selection ########################################START
# choose method 
direct_s_bert = True #if True --> no clustering or other means are implemented, all sentences are comapred with each other via S-Bert
legal_s_bert = False #if True --> implementation like S-BERT but based on legal BERT instead of BERT
clustering = False #if True --> 2 approaches calculated: a) topic model + word2vec + cosine sim; b) bert embeddings + kmeans and word2vev + cosine sim
key_phrase = False # if True --> setp one is performed on only key phrases (identified by tfidf), instead of whole sentences
# choose case study
### GDPR adjusted, ISO not!
iso = False #if False --> running with gdpr setup
# choose set up
rea_only_signal = False #if False --> gdpr realization input is not filtered to contain only sentences with signalwords
# choose thresholds:
gamma_s_bert = 0.7 #0.67 #used for sentence mapping 
gamma_grouping = 0.9 #used for sentence mapping in k-means & topic Model approach
gamma_key_phrase = 0.92 #used for key phrase extraction
gamma_one = 0.26 #used for subject phrase mapping
gamma_two = 0.23 #used for verb phrase mapping
gamma_three = 0.2 #used for object phrase mapping
################################################################# END

In [3]:
# Create the nlp object
nlp = spacy.load('en_core_web_trf')
nlp.add_pipe('coreferee', config={}) # resolves coreferences

<coreferee.manager.CorefereeBroker at 0x7fb4f45edcf0>

In [4]:
## parse defined lists of constraint signalwords, sequencemarkers and stopwords ########################### START
def read_defined_lists(directory): 
  '''reads in defined txts of constraint signalwords, sequencemarkers and stopwords as lists
  Input: .txt
  Output: list'''
  try:
    with open(directory) as f:
      defined_list = f.read().splitlines()
  except FileNotFoundError:
      print("Wrong file or file path.")
      quit()
  return defined_list

if iso:
  signalwords = read_defined_lists(ISO_SIGNALWORDS)
  ISMS_words = read_defined_lists(ISO_REA_SPEZIFICATION1)
  top_management_words = read_defined_lists(ISO_REA_SPEZIFICATION2)
else:
  signalwords = read_defined_lists(GDPR_SIGNALWORDS)
  controller_words = read_defined_lists(GDPR_REA_SPEZIFICATION1)
  data_protection_officer_words = read_defined_lists(GDPR_REA_SPEZIFICATION2)
  management_words = read_defined_lists(GDPR_REA_SPEZIFICATION3)

################################################################# END

In [5]:

## parse documents ############################################ START
def read_documents(directory): 
  '''reads in txts of regulatory and realization documents
  Input: multiple .txt (each a document article)
  Output: dictionary with article name as key and article text as value'''
  doc_dict = dict()
  files = os.listdir(directory)
  try:
    for fi in files:
        if fi.endswith('.txt'):
          with open(directory+'/'+fi,'r') as f:
              doc_dict[re.sub('\.txt', '', fi)] = f.read()
  except FileNotFoundError:
    print("Wrong file or file path to dir.")
    quit()
  return doc_dict

# reading the raw .txt text
if iso:
  reg_paragraphs = read_documents(ISO_REGULATION_INPUT_DIRECTORY) 
  rea_paragraphs = read_documents(ISO_REALIZATION_INPUT_DIRECTORY) 

else: 
  reg_paragraphs = read_documents(GDPR_REGULATION_INPUT_DIRECTORY) 
  rea_paragraphs = read_documents(GDPR_REALIZATION_INPUT_DIRECTORY) 
################################################################# END

In [6]:
df = pd.DataFrame(rea_paragraphs.items(), columns=['rea_title', 'rea_text'])

In [7]:
def apply_coreference_resolution(text):
    doc = nlp(text)
    # split text in tokens
    list_tokens = list(token.text_with_ws for token in doc)
    for index, _ in enumerate(list_tokens):
        #check if token an identified coreference token
        if None != doc._.coref_chains.resolve(doc[index]):
            new_token = ""
            #extract those tokens that are identified via index by coreferee and replace with best refrence token
            for resolved_token in doc._.coref_chains.resolve(doc[index]):
                new_token = new_token + resolved_token.text + " "
                list_tokens[index] = new_token
    resolved_text = "".join(list_tokens)
    return resolved_text

In [8]:
df['rea_text_resolved'] = df.apply(lambda row : apply_coreference_resolution(row['rea_text']), axis = 1)

In [9]:
pd.options.display.max_colwidth= 2000

In [10]:
def clean_text(text):  
    '''cleans texts'''
    cleaned_text = text.replace(";", ".") #in reg there are many ; which should be counted as seperate senteces
    cleaned_text = cleaned_text.replace("or\n\n\n", "")
    cleaned_text = cleaned_text.replace("or\n\n", "")
    cleaned_text = cleaned_text.replace("and\n\n\n", "")
    cleaned_text = cleaned_text.replace("and\n\n", "")
    cleaned_text = cleaned_text.replace("\n\n\n", "")
    cleaned_text = cleaned_text.replace("\n\n", "")
    cleaned_text = cleaned_text.replace("\n \n", "")
    cleaned_text = cleaned_text.replace("\n", "")
    return cleaned_text 

In [11]:
df['rea_text_cleaned'] = df.apply(lambda row : clean_text(row['rea_text_resolved']), axis = 1)

In [12]:
df = df.drop(['rea_text_resolved'], axis=1)

In [13]:
def substitude_specific_realization_formulations(text):
    '''replaces realization specific words with a general term from regulation
    like "Group Company" with "controller"'''
    resolved_text = text
    for item in management_words:
        resolved_text  = resolved_text.replace(item, 'management')
    for item in data_protection_officer_words:
        resolved_text  = resolved_text.replace(item, 'data protection officer')
    for item in controller_words:
        resolved_text = resolved_text.replace(item, 'controller')
    return resolved_text

In [14]:
df.head()

Unnamed: 0,rea_title,rea_text,rea_text_cleaned
0,Responsibility,"The members of managing bodies of the Group Companies are responsible for data processing in their area of responsibility. Therefore, they are required to ensure that the legal requirements, and those contained in this Data Protection Policy EU, for data protection are met (e. g. national reporting duties). Within their area of responsibility, management staff is responsible for ensuring that organizational, HR and technical measures are in place so that any data processing is carried out in accordance with data protection requirements. Compliance with these requirements is the responsibility of the relevant employees. If public authorities perform data protection checks, the Chief Officer Corporate Data Protection must be informed immediately.","The members of managing bodies of the Group Companies are responsible for data processing in members area of responsibility. Therefore, members are required to ensure that the legal requirements, and those contained in this Data Protection Policy EU, for data protection are met (e. g. national reporting duties). Within duties area of responsibility, management staff is responsible for ensuring that organizational, HR and technical measures are in place so that any data processing is carried out in accordance with data protection requirements. Compliance with these requirements is the responsibility of the relevant employees. If public authorities perform data protection checks, the Chief Officer Corporate Data Protection must be informed immediately."
1,(Further) Transmission outside the Daimler Group,"Transmission of personal data to recipients outside or inside the Group Companies is subject to the authorization requirements for processing personal data under this Section 5. The data recipient must be required to use the data only for defined purposes. In the event of a cross-border transmission of personal data (including granting access from another country), the relevant national requirements for the transfer of personal data abroad must be fulfilled. In particular, personal data from the EU/ EEA may only be processed outside the Group Companies in a third country if the recipient can prove that it has a data protection level equivalent to this Policy. Suitable tools can be: Agreement on EU standard contractual clauses, Participation of the recipient in an EU-accredited certification system for ensuring an adequate level of data protection, or Recognition of binding corporate rules of the recipient to create an adequate level of data protection by the responsible supervisory authorities.\n\nTransfers of personal data to any public authority cannot be massive, disproportionate and indiscriminate in a manner that would go beyond what is necessary in a democratic society. In the event of conflicts between these and public authority requirements, Daimler AG will work with the responsible Group Company to find a practical solution that fulfills the purpose of this Policy (Section 14.3).\n\nAll duties listed in this Section 5 are third party beneficiary rights for the data subject.","Transmission of personal data to recipients outside or inside the Group Companies is subject to the authorization requirements for processing personal data under this Section 5. The data recipient must be required to use the data only for defined purposes. In the event of a cross-border transmission of personal data (including granting access from another country), the relevant national requirements for the transfer of personal data abroad must be fulfilled. In particular, personal data from the EU/ EEA may only be processed outside the Group Companies in a third country if the recipient can prove that recipient has a data protection level equivalent to this Policy. Suitable tools can be: Agreement on EU standard contractual clauses, Participation of the recipient in an EU-accredited certification system for ensuring an adequate level of data protection, or Recognition of binding corporate rules of the recipient to create an adequate level of data protection by the responsible supervisory authorities.Transfers of personal data to any public authority cannot be massive, disproportionate and indiscriminate in a manner that would go beyond what is necessary in a democratic society. In the event of conflicts between these and public authority requirements, Daimler AG will work with the responsible Group AG to find a practical solution that fulfills the purpose of this Policy (Section 14.3).All duties listed in this Section 5 are third party beneficiary rights for the data subject."
2,Joint Controllership,"In the event that multiple Group Companies jointly define the means and purposes of processing personal data (along with one or more third parties, if applicable) (joint controllers), the companies must conclude an agreement that stipulates their duties and responsibilities to the data subject whose data they process. The contract templates provided by the Chief Officer Corporate Data Protection must be observed.","In the event that multiple Group Companies jointly define the means and purposes of processing personal data (along with one or more third parties, if applicable) (joint controllers), the companies must conclude an agreement that stipulates companies duties and responsibilities to the data subject whose data companies process. The contract templates provided by the Chief Officer Corporate Data Protection must be observed."
3,Place of Jurisdiction,"The data subject may bring an action before the courts at the establishment of the controller or processor or at his habitual residence.\n\nThe data subject who claims an infringement of this Policy in the context of a third country processing can assert his legal claims against both the data importing and the data exporting company in the EU/ EEA. Therefore, the data subject may bring the alleged infringement and the resulting legal claims before the competent courts and regulatory authorities either at the establishment of the controller or at his habitual residence.\n\nThe provisions on liability and place of jurisdiction in this Section are third party beneficiary rights for the data subject.","The data subject may bring an action before the courts at the establishment of the controller or processor or at processor habitual residence.The data subject who claims an infringement of this Policy in the context of a third country processing can assert subject legal claims against both the data importing and the data exporting company in the EU/ EEA. Therefore, the data subject may bring the alleged infringement and the resulting legal claims before the competent courts and regulatory authorities either at the establishment of the controller or at subject habitual residence.The provisions on liability and place of jurisdiction in this Section are third party beneficiary rights for the data subject."
4,Legal basis Customer and Partner Data,"Personal data of the prospective customer, customer, or partner can be processed to establish, perform and terminate a contract. \n\nThis also includes advisory services for the customer or partner under the contract if this is related to the contractual purpose. \n\nPrior to a contract, personal data can be processed to prepare bids or purchase orders or to fulfill other requests of the prospective customer relating to contract conclusion. \n\nProspective customers can be contacted during the contract preparation process using the information that they have provided. \n\nAny restrictions requested by the prospective customers must be complied with. \n\nIf the data subject contacts a Group Company with a request for information (e. g. request to receive information material about a product), processing of personal data to meet this request is permitted. \n\nCustomer loyalty or advertising measures are subject to further legal requirements. \n\nPersonal data can be processed for advertising purposes or market and opinion research, provided that this is consistent with the purpose for which the data was originally collected. \n\nThe data subject must be informed in advance about the use of his/her personal data for advertising purposes. \n\nIf personal data is collected only for advertising purposes, the data subject can choose whether to provide this data. \n\nThe data subject shall be informed that providing data for this purpose is voluntary. \n\nAs part of the communication process, consent should be obtained from the data subject. \n\nWhen giving consent, the data subject should be given a choice among available forms of contact, such as e-mail and phone (consent see Section 5.2.3). \n\nIf the data subject objects to the use of his/her data for advertising purposes, it can no longer be used for these purposes and must be restricted or blocked from use for these purposes. Any other restrictions from specific countries regarding the use of data for advertising ...","Personal data of the prospective customer, customer, or partner can be processed to establish, perform and terminate a contract. This also includes advisory services for the customer or partner under the contract if this is related to the contractual purpose. Prior to a contract, personal data can be processed to prepare bids or purchase orders or to fulfill other requests of the prospective customer relating to contract conclusion. Prospective customers can be contacted during the contract preparation process using the information that customers have provided. Any restrictions requested by the prospective customers must be complied with. If the data subject contacts a Group Company with a request for information (e. g. request to receive information material about a product), processing of personal data to meet this request is permitted. Customer loyalty or advertising measures are subject to further legal requirements. Personal data can be processed for advertising purposes or market and opinion research, provided that this is consistent with the purpose for which the data was originally collected. The data subject must be informed in advance about the use of Customer /subject personal data for advertising purposes. If personal data is collected only for advertising purposes, the data subject can choose whether to provide this data. The data subject shall be informed that providing data for this purpose is voluntary. As part of the communication process, consent should be obtained from the data subject. When giving consent, the data subject should be given a choice among available forms of contact, such as e-mail and phone (consent see Section 5.2.3). If the data subject objects to the use of Customer /subject data for advertising purposes, it can no longer be used for these purposes and must be restricted or blocked from use for these purposes . Any other restrictions from specific countries regarding the use of data for advertising purposes must be observed...."


In [15]:
df['rea_exchanged'] = df.apply(lambda row : substitude_specific_realization_formulations(row['rea_text_cleaned']), axis = 1)
df.head()

Unnamed: 0,rea_title,rea_text,rea_text_cleaned,rea_exchanged
0,Responsibility,"The members of managing bodies of the Group Companies are responsible for data processing in their area of responsibility. Therefore, they are required to ensure that the legal requirements, and those contained in this Data Protection Policy EU, for data protection are met (e. g. national reporting duties). Within their area of responsibility, management staff is responsible for ensuring that organizational, HR and technical measures are in place so that any data processing is carried out in accordance with data protection requirements. Compliance with these requirements is the responsibility of the relevant employees. If public authorities perform data protection checks, the Chief Officer Corporate Data Protection must be informed immediately.","The members of managing bodies of the Group Companies are responsible for data processing in members area of responsibility. Therefore, members are required to ensure that the legal requirements, and those contained in this Data Protection Policy EU, for data protection are met (e. g. national reporting duties). Within duties area of responsibility, management staff is responsible for ensuring that organizational, HR and technical measures are in place so that any data processing is carried out in accordance with data protection requirements. Compliance with these requirements is the responsibility of the relevant employees. If public authorities perform data protection checks, the Chief Officer Corporate Data Protection must be informed immediately.","The members of managing bodies of the controller are responsible for data processing in members area of responsibility. Therefore, members are required to ensure that the legal requirements, and those contained in this Data Protection Policy EU, for data protection are met (e. g. national reporting duties). Within duties area of responsibility, management staff is responsible for ensuring that organizational, HR and technical measures are in place so that any data processing is carried out in accordance with data protection requirements. Compliance with these requirements is the responsibility of the relevant employees. If public authorities perform data protection checks, the data protection officer must be informed immediately."
1,(Further) Transmission outside the Daimler Group,"Transmission of personal data to recipients outside or inside the Group Companies is subject to the authorization requirements for processing personal data under this Section 5. The data recipient must be required to use the data only for defined purposes. In the event of a cross-border transmission of personal data (including granting access from another country), the relevant national requirements for the transfer of personal data abroad must be fulfilled. In particular, personal data from the EU/ EEA may only be processed outside the Group Companies in a third country if the recipient can prove that it has a data protection level equivalent to this Policy. Suitable tools can be: Agreement on EU standard contractual clauses, Participation of the recipient in an EU-accredited certification system for ensuring an adequate level of data protection, or Recognition of binding corporate rules of the recipient to create an adequate level of data protection by the responsible supervisory authorities.\n\nTransfers of personal data to any public authority cannot be massive, disproportionate and indiscriminate in a manner that would go beyond what is necessary in a democratic society. In the event of conflicts between these and public authority requirements, Daimler AG will work with the responsible Group Company to find a practical solution that fulfills the purpose of this Policy (Section 14.3).\n\nAll duties listed in this Section 5 are third party beneficiary rights for the data subject.","Transmission of personal data to recipients outside or inside the Group Companies is subject to the authorization requirements for processing personal data under this Section 5. The data recipient must be required to use the data only for defined purposes. In the event of a cross-border transmission of personal data (including granting access from another country), the relevant national requirements for the transfer of personal data abroad must be fulfilled. In particular, personal data from the EU/ EEA may only be processed outside the Group Companies in a third country if the recipient can prove that recipient has a data protection level equivalent to this Policy. Suitable tools can be: Agreement on EU standard contractual clauses, Participation of the recipient in an EU-accredited certification system for ensuring an adequate level of data protection, or Recognition of binding corporate rules of the recipient to create an adequate level of data protection by the responsible supervisory authorities.Transfers of personal data to any public authority cannot be massive, disproportionate and indiscriminate in a manner that would go beyond what is necessary in a democratic society. In the event of conflicts between these and public authority requirements, Daimler AG will work with the responsible Group AG to find a practical solution that fulfills the purpose of this Policy (Section 14.3).All duties listed in this Section 5 are third party beneficiary rights for the data subject.","Transmission of personal data to recipients outside or inside the controller is subject to the authorization requirements for processing personal data under this Section 5. The data recipient must be required to use the data only for defined purposes. In the event of a cross-border transmission of personal data (including granting access from another country), the relevant national requirements for the transfer of personal data abroad must be fulfilled. In particular, personal data from the EU/ EEA may only be processed outside the controller in a third country if the recipient can prove that recipient has a data protection level equivalent to this Policy. Suitable tools can be: Agreement on EU standard contractual clauses, Participation of the recipient in an EU-accredited certification system for ensuring an adequate level of data protection, or Recognition of binding corporate rules of the recipient to create an adequate level of data protection by the responsible supervisory authorities.Transfers of personal data to any public authority cannot be massive, disproportionate and indiscriminate in a manner that would go beyond what is necessary in a democratic society. In the event of conflicts between these and public authority requirements, controller will work with the responsible Group AG to find a practical solution that fulfills the purpose of this Policy (Section 14.3).All duties listed in this Section 5 are third party beneficiary rights for the data subject."
2,Joint Controllership,"In the event that multiple Group Companies jointly define the means and purposes of processing personal data (along with one or more third parties, if applicable) (joint controllers), the companies must conclude an agreement that stipulates their duties and responsibilities to the data subject whose data they process. The contract templates provided by the Chief Officer Corporate Data Protection must be observed.","In the event that multiple Group Companies jointly define the means and purposes of processing personal data (along with one or more third parties, if applicable) (joint controllers), the companies must conclude an agreement that stipulates companies duties and responsibilities to the data subject whose data companies process. The contract templates provided by the Chief Officer Corporate Data Protection must be observed.","In the event that multiple controller jointly define the means and purposes of processing personal data (along with one or more third parties, if applicable) (joint controllers), the companies must conclude an agreement that stipulates companies duties and responsibilities to the data subject whose data companies process. The contract templates provided by the data protection officer must be observed."
3,Place of Jurisdiction,"The data subject may bring an action before the courts at the establishment of the controller or processor or at his habitual residence.\n\nThe data subject who claims an infringement of this Policy in the context of a third country processing can assert his legal claims against both the data importing and the data exporting company in the EU/ EEA. Therefore, the data subject may bring the alleged infringement and the resulting legal claims before the competent courts and regulatory authorities either at the establishment of the controller or at his habitual residence.\n\nThe provisions on liability and place of jurisdiction in this Section are third party beneficiary rights for the data subject.","The data subject may bring an action before the courts at the establishment of the controller or processor or at processor habitual residence.The data subject who claims an infringement of this Policy in the context of a third country processing can assert subject legal claims against both the data importing and the data exporting company in the EU/ EEA. Therefore, the data subject may bring the alleged infringement and the resulting legal claims before the competent courts and regulatory authorities either at the establishment of the controller or at subject habitual residence.The provisions on liability and place of jurisdiction in this Section are third party beneficiary rights for the data subject.","The data subject may bring an action before the courts at the establishment of the controller or processor or at processor habitual residence.The data subject who claims an infringement of this Policy in the context of a third country processing can assert subject legal claims against both the data importing and the data exporting company in the EU/ EEA. Therefore, the data subject may bring the alleged infringement and the resulting legal claims before the competent courts and regulatory authorities either at the establishment of the controller or at subject habitual residence.The provisions on liability and place of jurisdiction in this Section are third party beneficiary rights for the data subject."
4,Legal basis Customer and Partner Data,"Personal data of the prospective customer, customer, or partner can be processed to establish, perform and terminate a contract. \n\nThis also includes advisory services for the customer or partner under the contract if this is related to the contractual purpose. \n\nPrior to a contract, personal data can be processed to prepare bids or purchase orders or to fulfill other requests of the prospective customer relating to contract conclusion. \n\nProspective customers can be contacted during the contract preparation process using the information that they have provided. \n\nAny restrictions requested by the prospective customers must be complied with. \n\nIf the data subject contacts a Group Company with a request for information (e. g. request to receive information material about a product), processing of personal data to meet this request is permitted. \n\nCustomer loyalty or advertising measures are subject to further legal requirements. \n\nPersonal data can be processed for advertising purposes or market and opinion research, provided that this is consistent with the purpose for which the data was originally collected. \n\nThe data subject must be informed in advance about the use of his/her personal data for advertising purposes. \n\nIf personal data is collected only for advertising purposes, the data subject can choose whether to provide this data. \n\nThe data subject shall be informed that providing data for this purpose is voluntary. \n\nAs part of the communication process, consent should be obtained from the data subject. \n\nWhen giving consent, the data subject should be given a choice among available forms of contact, such as e-mail and phone (consent see Section 5.2.3). \n\nIf the data subject objects to the use of his/her data for advertising purposes, it can no longer be used for these purposes and must be restricted or blocked from use for these purposes. Any other restrictions from specific countries regarding the use of data for advertising ...","Personal data of the prospective customer, customer, or partner can be processed to establish, perform and terminate a contract. This also includes advisory services for the customer or partner under the contract if this is related to the contractual purpose. Prior to a contract, personal data can be processed to prepare bids or purchase orders or to fulfill other requests of the prospective customer relating to contract conclusion. Prospective customers can be contacted during the contract preparation process using the information that customers have provided. Any restrictions requested by the prospective customers must be complied with. If the data subject contacts a Group Company with a request for information (e. g. request to receive information material about a product), processing of personal data to meet this request is permitted. Customer loyalty or advertising measures are subject to further legal requirements. Personal data can be processed for advertising purposes or market and opinion research, provided that this is consistent with the purpose for which the data was originally collected. The data subject must be informed in advance about the use of Customer /subject personal data for advertising purposes. If personal data is collected only for advertising purposes, the data subject can choose whether to provide this data. The data subject shall be informed that providing data for this purpose is voluntary. As part of the communication process, consent should be obtained from the data subject. When giving consent, the data subject should be given a choice among available forms of contact, such as e-mail and phone (consent see Section 5.2.3). If the data subject objects to the use of Customer /subject data for advertising purposes, it can no longer be used for these purposes and must be restricted or blocked from use for these purposes . Any other restrictions from specific countries regarding the use of data for advertising purposes must be observed....","Personal data of the prospective customer, customer, or partner can be processed to establish, perform and terminate a contract. This also includes advisory services for the customer or partner under the contract if this is related to the contractual purpose. Prior to a contract, personal data can be processed to prepare bids or purchase orders or to fulfill other requests of the prospective customer relating to contract conclusion. Prospective customers can be contacted during the contract preparation process using the information that customers have provided. Any restrictions requested by the prospective customers must be complied with. If the data subject contacts a controller with a request for information (e. g. request to receive information material about a product), processing of personal data to meet this request is permitted. Customer loyalty or advertising measures are subject to further legal requirements. Personal data can be processed for advertising purposes or market and opinion research, provided that this is consistent with the purpose for which the data was originally collected. The data subject must be informed in advance about the use of Customer /subject personal data for advertising purposes. If personal data is collected only for advertising purposes, the data subject can choose whether to provide this data. The data subject shall be informed that providing data for this purpose is voluntary. As part of the communication process, consent should be obtained from the data subject. When giving consent, the data subject should be given a choice among available forms of contact, such as e-mail and phone (consent see Section 5.2.3). If the data subject objects to the use of Customer /subject data for advertising purposes, it can no longer be used for these purposes and must be restricted or blocked from use for these purposes . Any other restrictions from specific countries regarding the use of data for advertising purposes must be observed. Pe..."


In [16]:
def ensure_word_embeddings(text):
    '''delete words which are not in spacy vocab - would lead to problems later if not done''' 
    doc = nlp(text) 
    new_para = text
    for token in doc:
        if nlp.vocab.has_vector("token.text"):
            continue
        else:
            new_para = new_para.replace("token.text", "")
    return new_para

In [17]:
df['rea_text_cleaned_2'] = df.apply(lambda row : ensure_word_embeddings(row['rea_exchanged']), axis = 1)
df.head()

Unnamed: 0,rea_title,rea_text,rea_text_cleaned,rea_exchanged,rea_text_cleaned_2
0,Responsibility,"The members of managing bodies of the Group Companies are responsible for data processing in their area of responsibility. Therefore, they are required to ensure that the legal requirements, and those contained in this Data Protection Policy EU, for data protection are met (e. g. national reporting duties). Within their area of responsibility, management staff is responsible for ensuring that organizational, HR and technical measures are in place so that any data processing is carried out in accordance with data protection requirements. Compliance with these requirements is the responsibility of the relevant employees. If public authorities perform data protection checks, the Chief Officer Corporate Data Protection must be informed immediately.","The members of managing bodies of the Group Companies are responsible for data processing in members area of responsibility. Therefore, members are required to ensure that the legal requirements, and those contained in this Data Protection Policy EU, for data protection are met (e. g. national reporting duties). Within duties area of responsibility, management staff is responsible for ensuring that organizational, HR and technical measures are in place so that any data processing is carried out in accordance with data protection requirements. Compliance with these requirements is the responsibility of the relevant employees. If public authorities perform data protection checks, the Chief Officer Corporate Data Protection must be informed immediately.","The members of managing bodies of the controller are responsible for data processing in members area of responsibility. Therefore, members are required to ensure that the legal requirements, and those contained in this Data Protection Policy EU, for data protection are met (e. g. national reporting duties). Within duties area of responsibility, management staff is responsible for ensuring that organizational, HR and technical measures are in place so that any data processing is carried out in accordance with data protection requirements. Compliance with these requirements is the responsibility of the relevant employees. If public authorities perform data protection checks, the data protection officer must be informed immediately.","The members of managing bodies of the controller are responsible for data processing in members area of responsibility. Therefore, members are required to ensure that the legal requirements, and those contained in this Data Protection Policy EU, for data protection are met (e. g. national reporting duties). Within duties area of responsibility, management staff is responsible for ensuring that organizational, HR and technical measures are in place so that any data processing is carried out in accordance with data protection requirements. Compliance with these requirements is the responsibility of the relevant employees. If public authorities perform data protection checks, the data protection officer must be informed immediately."
1,(Further) Transmission outside the Daimler Group,"Transmission of personal data to recipients outside or inside the Group Companies is subject to the authorization requirements for processing personal data under this Section 5. The data recipient must be required to use the data only for defined purposes. In the event of a cross-border transmission of personal data (including granting access from another country), the relevant national requirements for the transfer of personal data abroad must be fulfilled. In particular, personal data from the EU/ EEA may only be processed outside the Group Companies in a third country if the recipient can prove that it has a data protection level equivalent to this Policy. Suitable tools can be: Agreement on EU standard contractual clauses, Participation of the recipient in an EU-accredited certification system for ensuring an adequate level of data protection, or Recognition of binding corporate rules of the recipient to create an adequate level of data protection by the responsible supervisory authorities.\n\nTransfers of personal data to any public authority cannot be massive, disproportionate and indiscriminate in a manner that would go beyond what is necessary in a democratic society. In the event of conflicts between these and public authority requirements, Daimler AG will work with the responsible Group Company to find a practical solution that fulfills the purpose of this Policy (Section 14.3).\n\nAll duties listed in this Section 5 are third party beneficiary rights for the data subject.","Transmission of personal data to recipients outside or inside the Group Companies is subject to the authorization requirements for processing personal data under this Section 5. The data recipient must be required to use the data only for defined purposes. In the event of a cross-border transmission of personal data (including granting access from another country), the relevant national requirements for the transfer of personal data abroad must be fulfilled. In particular, personal data from the EU/ EEA may only be processed outside the Group Companies in a third country if the recipient can prove that recipient has a data protection level equivalent to this Policy. Suitable tools can be: Agreement on EU standard contractual clauses, Participation of the recipient in an EU-accredited certification system for ensuring an adequate level of data protection, or Recognition of binding corporate rules of the recipient to create an adequate level of data protection by the responsible supervisory authorities.Transfers of personal data to any public authority cannot be massive, disproportionate and indiscriminate in a manner that would go beyond what is necessary in a democratic society. In the event of conflicts between these and public authority requirements, Daimler AG will work with the responsible Group AG to find a practical solution that fulfills the purpose of this Policy (Section 14.3).All duties listed in this Section 5 are third party beneficiary rights for the data subject.","Transmission of personal data to recipients outside or inside the controller is subject to the authorization requirements for processing personal data under this Section 5. The data recipient must be required to use the data only for defined purposes. In the event of a cross-border transmission of personal data (including granting access from another country), the relevant national requirements for the transfer of personal data abroad must be fulfilled. In particular, personal data from the EU/ EEA may only be processed outside the controller in a third country if the recipient can prove that recipient has a data protection level equivalent to this Policy. Suitable tools can be: Agreement on EU standard contractual clauses, Participation of the recipient in an EU-accredited certification system for ensuring an adequate level of data protection, or Recognition of binding corporate rules of the recipient to create an adequate level of data protection by the responsible supervisory authorities.Transfers of personal data to any public authority cannot be massive, disproportionate and indiscriminate in a manner that would go beyond what is necessary in a democratic society. In the event of conflicts between these and public authority requirements, controller will work with the responsible Group AG to find a practical solution that fulfills the purpose of this Policy (Section 14.3).All duties listed in this Section 5 are third party beneficiary rights for the data subject.","Transmission of personal data to recipients outside or inside the controller is subject to the authorization requirements for processing personal data under this Section 5. The data recipient must be required to use the data only for defined purposes. In the event of a cross-border transmission of personal data (including granting access from another country), the relevant national requirements for the transfer of personal data abroad must be fulfilled. In particular, personal data from the EU/ EEA may only be processed outside the controller in a third country if the recipient can prove that recipient has a data protection level equivalent to this Policy. Suitable tools can be: Agreement on EU standard contractual clauses, Participation of the recipient in an EU-accredited certification system for ensuring an adequate level of data protection, or Recognition of binding corporate rules of the recipient to create an adequate level of data protection by the responsible supervisory authorities.Transfers of personal data to any public authority cannot be massive, disproportionate and indiscriminate in a manner that would go beyond what is necessary in a democratic society. In the event of conflicts between these and public authority requirements, controller will work with the responsible Group AG to find a practical solution that fulfills the purpose of this Policy (Section 14.3).All duties listed in this Section 5 are third party beneficiary rights for the data subject."
2,Joint Controllership,"In the event that multiple Group Companies jointly define the means and purposes of processing personal data (along with one or more third parties, if applicable) (joint controllers), the companies must conclude an agreement that stipulates their duties and responsibilities to the data subject whose data they process. The contract templates provided by the Chief Officer Corporate Data Protection must be observed.","In the event that multiple Group Companies jointly define the means and purposes of processing personal data (along with one or more third parties, if applicable) (joint controllers), the companies must conclude an agreement that stipulates companies duties and responsibilities to the data subject whose data companies process. The contract templates provided by the Chief Officer Corporate Data Protection must be observed.","In the event that multiple controller jointly define the means and purposes of processing personal data (along with one or more third parties, if applicable) (joint controllers), the companies must conclude an agreement that stipulates companies duties and responsibilities to the data subject whose data companies process. The contract templates provided by the data protection officer must be observed.","In the event that multiple controller jointly define the means and purposes of processing personal data (along with one or more third parties, if applicable) (joint controllers), the companies must conclude an agreement that stipulates companies duties and responsibilities to the data subject whose data companies process. The contract templates provided by the data protection officer must be observed."
3,Place of Jurisdiction,"The data subject may bring an action before the courts at the establishment of the controller or processor or at his habitual residence.\n\nThe data subject who claims an infringement of this Policy in the context of a third country processing can assert his legal claims against both the data importing and the data exporting company in the EU/ EEA. Therefore, the data subject may bring the alleged infringement and the resulting legal claims before the competent courts and regulatory authorities either at the establishment of the controller or at his habitual residence.\n\nThe provisions on liability and place of jurisdiction in this Section are third party beneficiary rights for the data subject.","The data subject may bring an action before the courts at the establishment of the controller or processor or at processor habitual residence.The data subject who claims an infringement of this Policy in the context of a third country processing can assert subject legal claims against both the data importing and the data exporting company in the EU/ EEA. Therefore, the data subject may bring the alleged infringement and the resulting legal claims before the competent courts and regulatory authorities either at the establishment of the controller or at subject habitual residence.The provisions on liability and place of jurisdiction in this Section are third party beneficiary rights for the data subject.","The data subject may bring an action before the courts at the establishment of the controller or processor or at processor habitual residence.The data subject who claims an infringement of this Policy in the context of a third country processing can assert subject legal claims against both the data importing and the data exporting company in the EU/ EEA. Therefore, the data subject may bring the alleged infringement and the resulting legal claims before the competent courts and regulatory authorities either at the establishment of the controller or at subject habitual residence.The provisions on liability and place of jurisdiction in this Section are third party beneficiary rights for the data subject.","The data subject may bring an action before the courts at the establishment of the controller or processor or at processor habitual residence.The data subject who claims an infringement of this Policy in the context of a third country processing can assert subject legal claims against both the data importing and the data exporting company in the EU/ EEA. Therefore, the data subject may bring the alleged infringement and the resulting legal claims before the competent courts and regulatory authorities either at the establishment of the controller or at subject habitual residence.The provisions on liability and place of jurisdiction in this Section are third party beneficiary rights for the data subject."
4,Legal basis Customer and Partner Data,"Personal data of the prospective customer, customer, or partner can be processed to establish, perform and terminate a contract. \n\nThis also includes advisory services for the customer or partner under the contract if this is related to the contractual purpose. \n\nPrior to a contract, personal data can be processed to prepare bids or purchase orders or to fulfill other requests of the prospective customer relating to contract conclusion. \n\nProspective customers can be contacted during the contract preparation process using the information that they have provided. \n\nAny restrictions requested by the prospective customers must be complied with. \n\nIf the data subject contacts a Group Company with a request for information (e. g. request to receive information material about a product), processing of personal data to meet this request is permitted. \n\nCustomer loyalty or advertising measures are subject to further legal requirements. \n\nPersonal data can be processed for advertising purposes or market and opinion research, provided that this is consistent with the purpose for which the data was originally collected. \n\nThe data subject must be informed in advance about the use of his/her personal data for advertising purposes. \n\nIf personal data is collected only for advertising purposes, the data subject can choose whether to provide this data. \n\nThe data subject shall be informed that providing data for this purpose is voluntary. \n\nAs part of the communication process, consent should be obtained from the data subject. \n\nWhen giving consent, the data subject should be given a choice among available forms of contact, such as e-mail and phone (consent see Section 5.2.3). \n\nIf the data subject objects to the use of his/her data for advertising purposes, it can no longer be used for these purposes and must be restricted or blocked from use for these purposes. Any other restrictions from specific countries regarding the use of data for advertising ...","Personal data of the prospective customer, customer, or partner can be processed to establish, perform and terminate a contract. This also includes advisory services for the customer or partner under the contract if this is related to the contractual purpose. Prior to a contract, personal data can be processed to prepare bids or purchase orders or to fulfill other requests of the prospective customer relating to contract conclusion. Prospective customers can be contacted during the contract preparation process using the information that customers have provided. Any restrictions requested by the prospective customers must be complied with. If the data subject contacts a Group Company with a request for information (e. g. request to receive information material about a product), processing of personal data to meet this request is permitted. Customer loyalty or advertising measures are subject to further legal requirements. Personal data can be processed for advertising purposes or market and opinion research, provided that this is consistent with the purpose for which the data was originally collected. The data subject must be informed in advance about the use of Customer /subject personal data for advertising purposes. If personal data is collected only for advertising purposes, the data subject can choose whether to provide this data. The data subject shall be informed that providing data for this purpose is voluntary. As part of the communication process, consent should be obtained from the data subject. When giving consent, the data subject should be given a choice among available forms of contact, such as e-mail and phone (consent see Section 5.2.3). If the data subject objects to the use of Customer /subject data for advertising purposes, it can no longer be used for these purposes and must be restricted or blocked from use for these purposes . Any other restrictions from specific countries regarding the use of data for advertising purposes must be observed....","Personal data of the prospective customer, customer, or partner can be processed to establish, perform and terminate a contract. This also includes advisory services for the customer or partner under the contract if this is related to the contractual purpose. Prior to a contract, personal data can be processed to prepare bids or purchase orders or to fulfill other requests of the prospective customer relating to contract conclusion. Prospective customers can be contacted during the contract preparation process using the information that customers have provided. Any restrictions requested by the prospective customers must be complied with. If the data subject contacts a controller with a request for information (e. g. request to receive information material about a product), processing of personal data to meet this request is permitted. Customer loyalty or advertising measures are subject to further legal requirements. Personal data can be processed for advertising purposes or market and opinion research, provided that this is consistent with the purpose for which the data was originally collected. The data subject must be informed in advance about the use of Customer /subject personal data for advertising purposes. If personal data is collected only for advertising purposes, the data subject can choose whether to provide this data. The data subject shall be informed that providing data for this purpose is voluntary. As part of the communication process, consent should be obtained from the data subject. When giving consent, the data subject should be given a choice among available forms of contact, such as e-mail and phone (consent see Section 5.2.3). If the data subject objects to the use of Customer /subject data for advertising purposes, it can no longer be used for these purposes and must be restricted or blocked from use for these purposes . Any other restrictions from specific countries regarding the use of data for advertising purposes must be observed. Pe...","Personal data of the prospective customer, customer, or partner can be processed to establish, perform and terminate a contract. This also includes advisory services for the customer or partner under the contract if this is related to the contractual purpose. Prior to a contract, personal data can be processed to prepare bids or purchase orders or to fulfill other requests of the prospective customer relating to contract conclusion. Prospective customers can be contacted during the contract preparation process using the information that customers have provided. Any restrictions requested by the prospective customers must be complied with. If the data subject contacts a controller with a request for information (e. g. request to receive information material about a product), processing of personal data to meet this request is permitted. Customer loyalty or advertising measures are subject to further legal requirements. Personal data can be processed for advertising purposes or market and opinion research, provided that this is consistent with the purpose for which the data was originally collected. The data subject must be informed in advance about the use of Customer /subject personal data for advertising purposes. If personal data is collected only for advertising purposes, the data subject can choose whether to provide this data. The data subject shall be informed that providing data for this purpose is voluntary. As part of the communication process, consent should be obtained from the data subject. When giving consent, the data subject should be given a choice among available forms of contact, such as e-mail and phone (consent see Section 5.2.3). If the data subject objects to the use of Customer /subject data for advertising purposes, it can no longer be used for these purposes and must be restricted or blocked from use for these purposes . Any other restrictions from specific countries regarding the use of data for advertising purposes must be observed. Pe..."


In [18]:
df = df.drop(['rea_text_cleaned','rea_exchanged'], axis=1)

In [19]:
df.head()

Unnamed: 0,rea_title,rea_text,rea_text_cleaned_2
0,Responsibility,"The members of managing bodies of the Group Companies are responsible for data processing in their area of responsibility. Therefore, they are required to ensure that the legal requirements, and those contained in this Data Protection Policy EU, for data protection are met (e. g. national reporting duties). Within their area of responsibility, management staff is responsible for ensuring that organizational, HR and technical measures are in place so that any data processing is carried out in accordance with data protection requirements. Compliance with these requirements is the responsibility of the relevant employees. If public authorities perform data protection checks, the Chief Officer Corporate Data Protection must be informed immediately.","The members of managing bodies of the controller are responsible for data processing in members area of responsibility. Therefore, members are required to ensure that the legal requirements, and those contained in this Data Protection Policy EU, for data protection are met (e. g. national reporting duties). Within duties area of responsibility, management staff is responsible for ensuring that organizational, HR and technical measures are in place so that any data processing is carried out in accordance with data protection requirements. Compliance with these requirements is the responsibility of the relevant employees. If public authorities perform data protection checks, the data protection officer must be informed immediately."
1,(Further) Transmission outside the Daimler Group,"Transmission of personal data to recipients outside or inside the Group Companies is subject to the authorization requirements for processing personal data under this Section 5. The data recipient must be required to use the data only for defined purposes. In the event of a cross-border transmission of personal data (including granting access from another country), the relevant national requirements for the transfer of personal data abroad must be fulfilled. In particular, personal data from the EU/ EEA may only be processed outside the Group Companies in a third country if the recipient can prove that it has a data protection level equivalent to this Policy. Suitable tools can be: Agreement on EU standard contractual clauses, Participation of the recipient in an EU-accredited certification system for ensuring an adequate level of data protection, or Recognition of binding corporate rules of the recipient to create an adequate level of data protection by the responsible supervisory authorities.\n\nTransfers of personal data to any public authority cannot be massive, disproportionate and indiscriminate in a manner that would go beyond what is necessary in a democratic society. In the event of conflicts between these and public authority requirements, Daimler AG will work with the responsible Group Company to find a practical solution that fulfills the purpose of this Policy (Section 14.3).\n\nAll duties listed in this Section 5 are third party beneficiary rights for the data subject.","Transmission of personal data to recipients outside or inside the controller is subject to the authorization requirements for processing personal data under this Section 5. The data recipient must be required to use the data only for defined purposes. In the event of a cross-border transmission of personal data (including granting access from another country), the relevant national requirements for the transfer of personal data abroad must be fulfilled. In particular, personal data from the EU/ EEA may only be processed outside the controller in a third country if the recipient can prove that recipient has a data protection level equivalent to this Policy. Suitable tools can be: Agreement on EU standard contractual clauses, Participation of the recipient in an EU-accredited certification system for ensuring an adequate level of data protection, or Recognition of binding corporate rules of the recipient to create an adequate level of data protection by the responsible supervisory authorities.Transfers of personal data to any public authority cannot be massive, disproportionate and indiscriminate in a manner that would go beyond what is necessary in a democratic society. In the event of conflicts between these and public authority requirements, controller will work with the responsible Group AG to find a practical solution that fulfills the purpose of this Policy (Section 14.3).All duties listed in this Section 5 are third party beneficiary rights for the data subject."
2,Joint Controllership,"In the event that multiple Group Companies jointly define the means and purposes of processing personal data (along with one or more third parties, if applicable) (joint controllers), the companies must conclude an agreement that stipulates their duties and responsibilities to the data subject whose data they process. The contract templates provided by the Chief Officer Corporate Data Protection must be observed.","In the event that multiple controller jointly define the means and purposes of processing personal data (along with one or more third parties, if applicable) (joint controllers), the companies must conclude an agreement that stipulates companies duties and responsibilities to the data subject whose data companies process. The contract templates provided by the data protection officer must be observed."
3,Place of Jurisdiction,"The data subject may bring an action before the courts at the establishment of the controller or processor or at his habitual residence.\n\nThe data subject who claims an infringement of this Policy in the context of a third country processing can assert his legal claims against both the data importing and the data exporting company in the EU/ EEA. Therefore, the data subject may bring the alleged infringement and the resulting legal claims before the competent courts and regulatory authorities either at the establishment of the controller or at his habitual residence.\n\nThe provisions on liability and place of jurisdiction in this Section are third party beneficiary rights for the data subject.","The data subject may bring an action before the courts at the establishment of the controller or processor or at processor habitual residence.The data subject who claims an infringement of this Policy in the context of a third country processing can assert subject legal claims against both the data importing and the data exporting company in the EU/ EEA. Therefore, the data subject may bring the alleged infringement and the resulting legal claims before the competent courts and regulatory authorities either at the establishment of the controller or at subject habitual residence.The provisions on liability and place of jurisdiction in this Section are third party beneficiary rights for the data subject."
4,Legal basis Customer and Partner Data,"Personal data of the prospective customer, customer, or partner can be processed to establish, perform and terminate a contract. \n\nThis also includes advisory services for the customer or partner under the contract if this is related to the contractual purpose. \n\nPrior to a contract, personal data can be processed to prepare bids or purchase orders or to fulfill other requests of the prospective customer relating to contract conclusion. \n\nProspective customers can be contacted during the contract preparation process using the information that they have provided. \n\nAny restrictions requested by the prospective customers must be complied with. \n\nIf the data subject contacts a Group Company with a request for information (e. g. request to receive information material about a product), processing of personal data to meet this request is permitted. \n\nCustomer loyalty or advertising measures are subject to further legal requirements. \n\nPersonal data can be processed for advertising purposes or market and opinion research, provided that this is consistent with the purpose for which the data was originally collected. \n\nThe data subject must be informed in advance about the use of his/her personal data for advertising purposes. \n\nIf personal data is collected only for advertising purposes, the data subject can choose whether to provide this data. \n\nThe data subject shall be informed that providing data for this purpose is voluntary. \n\nAs part of the communication process, consent should be obtained from the data subject. \n\nWhen giving consent, the data subject should be given a choice among available forms of contact, such as e-mail and phone (consent see Section 5.2.3). \n\nIf the data subject objects to the use of his/her data for advertising purposes, it can no longer be used for these purposes and must be restricted or blocked from use for these purposes. Any other restrictions from specific countries regarding the use of data for advertising ...","Personal data of the prospective customer, customer, or partner can be processed to establish, perform and terminate a contract. This also includes advisory services for the customer or partner under the contract if this is related to the contractual purpose. Prior to a contract, personal data can be processed to prepare bids or purchase orders or to fulfill other requests of the prospective customer relating to contract conclusion. Prospective customers can be contacted during the contract preparation process using the information that customers have provided. Any restrictions requested by the prospective customers must be complied with. If the data subject contacts a controller with a request for information (e. g. request to receive information material about a product), processing of personal data to meet this request is permitted. Customer loyalty or advertising measures are subject to further legal requirements. Personal data can be processed for advertising purposes or market and opinion research, provided that this is consistent with the purpose for which the data was originally collected. The data subject must be informed in advance about the use of Customer /subject personal data for advertising purposes. If personal data is collected only for advertising purposes, the data subject can choose whether to provide this data. The data subject shall be informed that providing data for this purpose is voluntary. As part of the communication process, consent should be obtained from the data subject. When giving consent, the data subject should be given a choice among available forms of contact, such as e-mail and phone (consent see Section 5.2.3). If the data subject objects to the use of Customer /subject data for advertising purposes, it can no longer be used for these purposes and must be restricted or blocked from use for these purposes . Any other restrictions from specific countries regarding the use of data for advertising purposes must be observed. Pe..."


In [20]:
def split_into_sent(text):
    doc = nlp(text) 
    sentences = doc.sents
    sent_list = []
    for sentence in sentences:
        sent_list.append(sentence.text.strip())
    return sent_list

In [21]:
df['rea_sent'] = df.apply(lambda row : split_into_sent(row['rea_text_cleaned_2']), axis = 1)
df.head()

Unnamed: 0,rea_title,rea_text,rea_text_cleaned_2,rea_sent
0,Responsibility,"The members of managing bodies of the Group Companies are responsible for data processing in their area of responsibility. Therefore, they are required to ensure that the legal requirements, and those contained in this Data Protection Policy EU, for data protection are met (e. g. national reporting duties). Within their area of responsibility, management staff is responsible for ensuring that organizational, HR and technical measures are in place so that any data processing is carried out in accordance with data protection requirements. Compliance with these requirements is the responsibility of the relevant employees. If public authorities perform data protection checks, the Chief Officer Corporate Data Protection must be informed immediately.","The members of managing bodies of the controller are responsible for data processing in members area of responsibility. Therefore, members are required to ensure that the legal requirements, and those contained in this Data Protection Policy EU, for data protection are met (e. g. national reporting duties). Within duties area of responsibility, management staff is responsible for ensuring that organizational, HR and technical measures are in place so that any data processing is carried out in accordance with data protection requirements. Compliance with these requirements is the responsibility of the relevant employees. If public authorities perform data protection checks, the data protection officer must be informed immediately.","[The members of managing bodies of the controller are responsible for data processing in members area of responsibility., Therefore, members are required to ensure that the legal requirements, and those contained in this Data Protection Policy EU, for data protection are met (e. g. national reporting duties)., Within duties area of responsibility, management staff is responsible for ensuring that organizational, HR and technical measures are in place so that any data processing is carried out in accordance with data protection requirements., Compliance with these requirements is the responsibility of the relevant employees., If public authorities perform data protection checks, the data protection officer must be informed immediately.]"
1,(Further) Transmission outside the Daimler Group,"Transmission of personal data to recipients outside or inside the Group Companies is subject to the authorization requirements for processing personal data under this Section 5. The data recipient must be required to use the data only for defined purposes. In the event of a cross-border transmission of personal data (including granting access from another country), the relevant national requirements for the transfer of personal data abroad must be fulfilled. In particular, personal data from the EU/ EEA may only be processed outside the Group Companies in a third country if the recipient can prove that it has a data protection level equivalent to this Policy. Suitable tools can be: Agreement on EU standard contractual clauses, Participation of the recipient in an EU-accredited certification system for ensuring an adequate level of data protection, or Recognition of binding corporate rules of the recipient to create an adequate level of data protection by the responsible supervisory authorities.\n\nTransfers of personal data to any public authority cannot be massive, disproportionate and indiscriminate in a manner that would go beyond what is necessary in a democratic society. In the event of conflicts between these and public authority requirements, Daimler AG will work with the responsible Group Company to find a practical solution that fulfills the purpose of this Policy (Section 14.3).\n\nAll duties listed in this Section 5 are third party beneficiary rights for the data subject.","Transmission of personal data to recipients outside or inside the controller is subject to the authorization requirements for processing personal data under this Section 5. The data recipient must be required to use the data only for defined purposes. In the event of a cross-border transmission of personal data (including granting access from another country), the relevant national requirements for the transfer of personal data abroad must be fulfilled. In particular, personal data from the EU/ EEA may only be processed outside the controller in a third country if the recipient can prove that recipient has a data protection level equivalent to this Policy. Suitable tools can be: Agreement on EU standard contractual clauses, Participation of the recipient in an EU-accredited certification system for ensuring an adequate level of data protection, or Recognition of binding corporate rules of the recipient to create an adequate level of data protection by the responsible supervisory authorities.Transfers of personal data to any public authority cannot be massive, disproportionate and indiscriminate in a manner that would go beyond what is necessary in a democratic society. In the event of conflicts between these and public authority requirements, controller will work with the responsible Group AG to find a practical solution that fulfills the purpose of this Policy (Section 14.3).All duties listed in this Section 5 are third party beneficiary rights for the data subject.","[Transmission of personal data to recipients outside or inside the controller is subject to the authorization requirements for processing personal data under this Section 5., The data recipient must be required to use the data only for defined purposes., In the event of a cross-border transmission of personal data (including granting access from another country), the relevant national requirements for the transfer of personal data abroad must be fulfilled., In particular, personal data from the EU/ EEA may only be processed outside the controller in a third country if the recipient can prove that recipient has a data protection level equivalent to this Policy., Suitable tools can be: Agreement on EU standard contractual clauses, Participation of the recipient in an EU-accredited certification system for ensuring an adequate level of data protection, or Recognition of binding corporate rules of the recipient to create an adequate level of data protection by the responsible supervisory authorities., Transfers of personal data to any public authority cannot be massive, disproportionate and indiscriminate in a manner that would go beyond what is necessary in a democratic society., In the event of conflicts between these and public authority requirements, controller will work with the responsible Group AG to find a practical solution that fulfills the purpose of this Policy (Section 14.3).All duties listed in this Section 5 are third party beneficiary rights for the data subject.]"
2,Joint Controllership,"In the event that multiple Group Companies jointly define the means and purposes of processing personal data (along with one or more third parties, if applicable) (joint controllers), the companies must conclude an agreement that stipulates their duties and responsibilities to the data subject whose data they process. The contract templates provided by the Chief Officer Corporate Data Protection must be observed.","In the event that multiple controller jointly define the means and purposes of processing personal data (along with one or more third parties, if applicable) (joint controllers), the companies must conclude an agreement that stipulates companies duties and responsibilities to the data subject whose data companies process. The contract templates provided by the data protection officer must be observed.","[In the event that multiple controller jointly define the means and purposes of processing personal data (along with one or more third parties, if applicable) (joint controllers), the companies must conclude an agreement that stipulates companies duties and responsibilities to the data subject whose data companies process., The contract templates provided by the data protection officer must be observed.]"
3,Place of Jurisdiction,"The data subject may bring an action before the courts at the establishment of the controller or processor or at his habitual residence.\n\nThe data subject who claims an infringement of this Policy in the context of a third country processing can assert his legal claims against both the data importing and the data exporting company in the EU/ EEA. Therefore, the data subject may bring the alleged infringement and the resulting legal claims before the competent courts and regulatory authorities either at the establishment of the controller or at his habitual residence.\n\nThe provisions on liability and place of jurisdiction in this Section are third party beneficiary rights for the data subject.","The data subject may bring an action before the courts at the establishment of the controller or processor or at processor habitual residence.The data subject who claims an infringement of this Policy in the context of a third country processing can assert subject legal claims against both the data importing and the data exporting company in the EU/ EEA. Therefore, the data subject may bring the alleged infringement and the resulting legal claims before the competent courts and regulatory authorities either at the establishment of the controller or at subject habitual residence.The provisions on liability and place of jurisdiction in this Section are third party beneficiary rights for the data subject.","[The data subject may bring an action before the courts at the establishment of the controller or processor or at processor habitual residence., The data subject who claims an infringement of this Policy in the context of a third country processing can assert subject legal claims against both the data importing and the data exporting company in the EU/ EEA., Therefore, the data subject may bring the alleged infringement and the resulting legal claims before the competent courts and regulatory authorities either at the establishment of the controller or at subject habitual residence., The provisions on liability and place of jurisdiction in this Section are third party beneficiary rights for the data subject.]"
4,Legal basis Customer and Partner Data,"Personal data of the prospective customer, customer, or partner can be processed to establish, perform and terminate a contract. \n\nThis also includes advisory services for the customer or partner under the contract if this is related to the contractual purpose. \n\nPrior to a contract, personal data can be processed to prepare bids or purchase orders or to fulfill other requests of the prospective customer relating to contract conclusion. \n\nProspective customers can be contacted during the contract preparation process using the information that they have provided. \n\nAny restrictions requested by the prospective customers must be complied with. \n\nIf the data subject contacts a Group Company with a request for information (e. g. request to receive information material about a product), processing of personal data to meet this request is permitted. \n\nCustomer loyalty or advertising measures are subject to further legal requirements. \n\nPersonal data can be processed for advertising purposes or market and opinion research, provided that this is consistent with the purpose for which the data was originally collected. \n\nThe data subject must be informed in advance about the use of his/her personal data for advertising purposes. \n\nIf personal data is collected only for advertising purposes, the data subject can choose whether to provide this data. \n\nThe data subject shall be informed that providing data for this purpose is voluntary. \n\nAs part of the communication process, consent should be obtained from the data subject. \n\nWhen giving consent, the data subject should be given a choice among available forms of contact, such as e-mail and phone (consent see Section 5.2.3). \n\nIf the data subject objects to the use of his/her data for advertising purposes, it can no longer be used for these purposes and must be restricted or blocked from use for these purposes. Any other restrictions from specific countries regarding the use of data for advertising ...","Personal data of the prospective customer, customer, or partner can be processed to establish, perform and terminate a contract. This also includes advisory services for the customer or partner under the contract if this is related to the contractual purpose. Prior to a contract, personal data can be processed to prepare bids or purchase orders or to fulfill other requests of the prospective customer relating to contract conclusion. Prospective customers can be contacted during the contract preparation process using the information that customers have provided. Any restrictions requested by the prospective customers must be complied with. If the data subject contacts a controller with a request for information (e. g. request to receive information material about a product), processing of personal data to meet this request is permitted. Customer loyalty or advertising measures are subject to further legal requirements. Personal data can be processed for advertising purposes or market and opinion research, provided that this is consistent with the purpose for which the data was originally collected. The data subject must be informed in advance about the use of Customer /subject personal data for advertising purposes. If personal data is collected only for advertising purposes, the data subject can choose whether to provide this data. The data subject shall be informed that providing data for this purpose is voluntary. As part of the communication process, consent should be obtained from the data subject. When giving consent, the data subject should be given a choice among available forms of contact, such as e-mail and phone (consent see Section 5.2.3). If the data subject objects to the use of Customer /subject data for advertising purposes, it can no longer be used for these purposes and must be restricted or blocked from use for these purposes . Any other restrictions from specific countries regarding the use of data for advertising purposes must be observed. Pe...","[Personal data of the prospective customer, customer, or partner can be processed to establish, perform and terminate a contract., This also includes advisory services for the customer or partner under the contract if this is related to the contractual purpose., Prior to a contract, personal data can be processed to prepare bids or purchase orders or to fulfill other requests of the prospective customer relating to contract conclusion., Prospective customers can be contacted during the contract preparation process using the information that customers have provided., Any restrictions requested by the prospective customers must be complied with., If the data subject contacts a controller with a request for information (e. g. request to receive information material about a product), processing of personal data to meet this request is permitted., Customer loyalty or advertising measures are subject to further legal requirements., Personal data can be processed for advertising purposes or market and opinion research, provided that this is consistent with the purpose for which the data was originally collected., The data subject must be informed in advance about the use of Customer /subject personal data for advertising purposes., If personal data is collected only for advertising purposes, the data subject can choose whether to provide this data., The data subject shall be informed that providing data for this purpose is voluntary., As part of the communication process, consent should be obtained from the data subject., When giving consent, the data subject should be given a choice among available forms of contact, such as e-mail and phone (consent see Section 5.2.3)., If the data subject objects to the use of Customer /subject data for advertising purposes, it can no longer be used for these purposes and must be restricted or blocked from use for these purposes ., Any other restrictions from specific countries regarding the use of data for advertising purposes must ..."


In [22]:
df_new = df.explode(['rea_sent'])
df_new.head()

Unnamed: 0,rea_title,rea_text,rea_text_cleaned_2,rea_sent
0,Responsibility,"The members of managing bodies of the Group Companies are responsible for data processing in their area of responsibility. Therefore, they are required to ensure that the legal requirements, and those contained in this Data Protection Policy EU, for data protection are met (e. g. national reporting duties). Within their area of responsibility, management staff is responsible for ensuring that organizational, HR and technical measures are in place so that any data processing is carried out in accordance with data protection requirements. Compliance with these requirements is the responsibility of the relevant employees. If public authorities perform data protection checks, the Chief Officer Corporate Data Protection must be informed immediately.","The members of managing bodies of the controller are responsible for data processing in members area of responsibility. Therefore, members are required to ensure that the legal requirements, and those contained in this Data Protection Policy EU, for data protection are met (e. g. national reporting duties). Within duties area of responsibility, management staff is responsible for ensuring that organizational, HR and technical measures are in place so that any data processing is carried out in accordance with data protection requirements. Compliance with these requirements is the responsibility of the relevant employees. If public authorities perform data protection checks, the data protection officer must be informed immediately.",The members of managing bodies of the controller are responsible for data processing in members area of responsibility.
0,Responsibility,"The members of managing bodies of the Group Companies are responsible for data processing in their area of responsibility. Therefore, they are required to ensure that the legal requirements, and those contained in this Data Protection Policy EU, for data protection are met (e. g. national reporting duties). Within their area of responsibility, management staff is responsible for ensuring that organizational, HR and technical measures are in place so that any data processing is carried out in accordance with data protection requirements. Compliance with these requirements is the responsibility of the relevant employees. If public authorities perform data protection checks, the Chief Officer Corporate Data Protection must be informed immediately.","The members of managing bodies of the controller are responsible for data processing in members area of responsibility. Therefore, members are required to ensure that the legal requirements, and those contained in this Data Protection Policy EU, for data protection are met (e. g. national reporting duties). Within duties area of responsibility, management staff is responsible for ensuring that organizational, HR and technical measures are in place so that any data processing is carried out in accordance with data protection requirements. Compliance with these requirements is the responsibility of the relevant employees. If public authorities perform data protection checks, the data protection officer must be informed immediately.","Therefore, members are required to ensure that the legal requirements, and those contained in this Data Protection Policy EU, for data protection are met (e. g. national reporting duties)."
0,Responsibility,"The members of managing bodies of the Group Companies are responsible for data processing in their area of responsibility. Therefore, they are required to ensure that the legal requirements, and those contained in this Data Protection Policy EU, for data protection are met (e. g. national reporting duties). Within their area of responsibility, management staff is responsible for ensuring that organizational, HR and technical measures are in place so that any data processing is carried out in accordance with data protection requirements. Compliance with these requirements is the responsibility of the relevant employees. If public authorities perform data protection checks, the Chief Officer Corporate Data Protection must be informed immediately.","The members of managing bodies of the controller are responsible for data processing in members area of responsibility. Therefore, members are required to ensure that the legal requirements, and those contained in this Data Protection Policy EU, for data protection are met (e. g. national reporting duties). Within duties area of responsibility, management staff is responsible for ensuring that organizational, HR and technical measures are in place so that any data processing is carried out in accordance with data protection requirements. Compliance with these requirements is the responsibility of the relevant employees. If public authorities perform data protection checks, the data protection officer must be informed immediately.","Within duties area of responsibility, management staff is responsible for ensuring that organizational, HR and technical measures are in place so that any data processing is carried out in accordance with data protection requirements."
0,Responsibility,"The members of managing bodies of the Group Companies are responsible for data processing in their area of responsibility. Therefore, they are required to ensure that the legal requirements, and those contained in this Data Protection Policy EU, for data protection are met (e. g. national reporting duties). Within their area of responsibility, management staff is responsible for ensuring that organizational, HR and technical measures are in place so that any data processing is carried out in accordance with data protection requirements. Compliance with these requirements is the responsibility of the relevant employees. If public authorities perform data protection checks, the Chief Officer Corporate Data Protection must be informed immediately.","The members of managing bodies of the controller are responsible for data processing in members area of responsibility. Therefore, members are required to ensure that the legal requirements, and those contained in this Data Protection Policy EU, for data protection are met (e. g. national reporting duties). Within duties area of responsibility, management staff is responsible for ensuring that organizational, HR and technical measures are in place so that any data processing is carried out in accordance with data protection requirements. Compliance with these requirements is the responsibility of the relevant employees. If public authorities perform data protection checks, the data protection officer must be informed immediately.",Compliance with these requirements is the responsibility of the relevant employees.
0,Responsibility,"The members of managing bodies of the Group Companies are responsible for data processing in their area of responsibility. Therefore, they are required to ensure that the legal requirements, and those contained in this Data Protection Policy EU, for data protection are met (e. g. national reporting duties). Within their area of responsibility, management staff is responsible for ensuring that organizational, HR and technical measures are in place so that any data processing is carried out in accordance with data protection requirements. Compliance with these requirements is the responsibility of the relevant employees. If public authorities perform data protection checks, the Chief Officer Corporate Data Protection must be informed immediately.","The members of managing bodies of the controller are responsible for data processing in members area of responsibility. Therefore, members are required to ensure that the legal requirements, and those contained in this Data Protection Policy EU, for data protection are met (e. g. national reporting duties). Within duties area of responsibility, management staff is responsible for ensuring that organizational, HR and technical measures are in place so that any data processing is carried out in accordance with data protection requirements. Compliance with these requirements is the responsibility of the relevant employees. If public authorities perform data protection checks, the data protection officer must be informed immediately.","If public authorities perform data protection checks, the data protection officer must be informed immediately."


In [23]:
df_new = df_new.drop(['rea_text','rea_text_cleaned_2'], axis=1)
df_new.head()

Unnamed: 0,rea_title,rea_sent
0,Responsibility,The members of managing bodies of the controller are responsible for data processing in members area of responsibility.
0,Responsibility,"Therefore, members are required to ensure that the legal requirements, and those contained in this Data Protection Policy EU, for data protection are met (e. g. national reporting duties)."
0,Responsibility,"Within duties area of responsibility, management staff is responsible for ensuring that organizational, HR and technical measures are in place so that any data processing is carried out in accordance with data protection requirements."
0,Responsibility,Compliance with these requirements is the responsibility of the relevant employees.
0,Responsibility,"If public authorities perform data protection checks, the data protection officer must be informed immediately."


In [24]:
# extract sent keyphrase with RAKE
import pandas as pd
from rake_nltk import Rake
import re
import os

In [25]:
def RAKE_Keyword_Extraction(text, stop_word_path, threshold):

    # our extracted keywords, min 1, max 5.
    keywords = []
    phrases = []
    phrases1 = []
    phrases2 = []
    max_len = 5
    min_len = 1
    # uncustomized stopwordlist
    stop_words = []
    with open(stop_word_path, 'r') as f:
        for w in f.readlines():
            stop_words.append(w.strip())
        f.close()
    # initialize the Rake keyword extractor
    r = Rake(stopwords=stop_words, max_length=max_len, min_length=min_len)
    #text = re.sub('[^a-zA-Z]', ' ', text)
    r.extract_keywords_from_sentences([text])
    # rank the extracted keywords
    phrases = r.get_ranked_phrases_with_scores()
    # exclude keywords, with scores lower than the threshold
    phrases2.extend([p[1] for p in phrases if len(p[1]) > 1 and p[0] > threshold and p[1] not in phrases2])
    if len(phrases2) >= 5:  # maximal 5 keywords
        keywords.append(phrases2[:5])
    elif 0 < len(phrases2) < 5:  # take the rest
        keywords.append(phrases2)
    else:
        phrases1 = r.get_ranked_phrases()
        if len(phrases) >= 5:
            keywords.append(phrases1[:5])
        else:
            keywords.append(phrases1)
    keyword_list = ' '.join(map(str, keywords))
    return keyword_list

In [26]:
#output of function = one concated keyword string
df_new['keywords_sent'] = df_new.apply(lambda row : RAKE_Keyword_Extraction(row['rea_sent'], GDPR_STOPWORDS_RAKE, 3), axis = 1)
df_new.head()

Unnamed: 0,rea_title,rea_sent,keywords_sent
0,Responsibility,The members of managing bodies of the controller are responsible for data processing in members area of responsibility.,"['managing bodies', 'data processing', 'members area']"
0,Responsibility,"Therefore, members are required to ensure that the legal requirements, and those contained in this Data Protection Policy EU, for data protection are met (e. g. national reporting duties).","['national reporting duties ).', 'data protection policy eu', 'data protection', 'legal requirements']"
0,Responsibility,"Within duties area of responsibility, management staff is responsible for ensuring that organizational, HR and technical measures are in place so that any data processing is carried out in accordance with data protection requirements.","['data protection requirements', 'data processing', 'technical measures', 'management staff', 'duties area']"
0,Responsibility,Compliance with these requirements is the responsibility of the relevant employees.,['relevant employees']
0,Responsibility,"If public authorities perform data protection checks, the data protection officer must be informed immediately.","['data protection officer must', 'informed immediately']"


In [27]:
df_new['keywords_title'] = df_new.apply(lambda row : RAKE_Keyword_Extraction(row['rea_title'], GDPR_STOPWORDS_RAKE, 3), axis = 1)
df_new.head()

Unnamed: 0,rea_title,rea_sent,keywords_sent,keywords_title
0,Responsibility,The members of managing bodies of the controller are responsible for data processing in members area of responsibility.,"['managing bodies', 'data processing', 'members area']",['responsibility']
0,Responsibility,"Therefore, members are required to ensure that the legal requirements, and those contained in this Data Protection Policy EU, for data protection are met (e. g. national reporting duties).","['national reporting duties ).', 'data protection policy eu', 'data protection', 'legal requirements']",['responsibility']
0,Responsibility,"Within duties area of responsibility, management staff is responsible for ensuring that organizational, HR and technical measures are in place so that any data processing is carried out in accordance with data protection requirements.","['data protection requirements', 'data processing', 'technical measures', 'management staff', 'duties area']",['responsibility']
0,Responsibility,Compliance with these requirements is the responsibility of the relevant employees.,['relevant employees'],['responsibility']
0,Responsibility,"If public authorities perform data protection checks, the data protection officer must be informed immediately.","['data protection officer must', 'informed immediately']",['responsibility']


In [28]:
def join_keywords(a,b):
    string = re.sub(r'[^\w\s]','',a)
    string2 = re.sub(r'[^\w\s]','',b)
    c = string + " " + string2
    return c

In [29]:
import re
df_new['rea_kw_total'] = df_new.apply(lambda row : join_keywords(row['keywords_sent'], row['keywords_title']), axis =1)
df_new.head()

Unnamed: 0,rea_title,rea_sent,keywords_sent,keywords_title,rea_kw_total
0,Responsibility,The members of managing bodies of the controller are responsible for data processing in members area of responsibility.,"['managing bodies', 'data processing', 'members area']",['responsibility'],managing bodies data processing members area responsibility
0,Responsibility,"Therefore, members are required to ensure that the legal requirements, and those contained in this Data Protection Policy EU, for data protection are met (e. g. national reporting duties).","['national reporting duties ).', 'data protection policy eu', 'data protection', 'legal requirements']",['responsibility'],national reporting duties data protection policy eu data protection legal requirements responsibility
0,Responsibility,"Within duties area of responsibility, management staff is responsible for ensuring that organizational, HR and technical measures are in place so that any data processing is carried out in accordance with data protection requirements.","['data protection requirements', 'data processing', 'technical measures', 'management staff', 'duties area']",['responsibility'],data protection requirements data processing technical measures management staff duties area responsibility
0,Responsibility,Compliance with these requirements is the responsibility of the relevant employees.,['relevant employees'],['responsibility'],relevant employees responsibility
0,Responsibility,"If public authorities perform data protection checks, the data protection officer must be informed immediately.","['data protection officer must', 'informed immediately']",['responsibility'],data protection officer must informed immediately responsibility


In [30]:
# save preprocessed reg to excel
pd.DataFrame(df_new).to_excel(join(INTERMEDIATE_DIRECTORY, "gdpr_rea_preprocessed_optiona.xlsx"))  