# Read Chunk Clean
The following notebook imports GDPR Data from a PDF File and performs the following operations:
- Chunks the data so as to have a dictionary of the following form: dict[Chapter][Article]
- Cleans the data by removing punctuation, stopwords and extra whitespace, and by lemmatizing each token

Finally, the cleaned data is saved as a JSON file for later use.

In [1]:
#pip install typer==0.3.0

In [None]:
import fitz 
import re
import json 
from collections import defaultdict
import spacy
import nltk
from nltk.corpus import stopwords, words
import string 
from tqdm import tqdm 

In [None]:
# Extract the raw text of every page and join the pages with a space.
# The context manager closes the PDF handle once the text has been read.
with fitz.open('../../data/original_data/gdpr_text.pdf') as doc:
    print('Number of Pages available:', doc.page_count)
    full_text = ' '.join(page.get_text() for page in doc)
# len() of a string counts characters, not tokens
print('Number of Characters:', len(full_text))

Number of Pages available: 88
Number of Tokens: 360940


In [None]:
#Identify Chapters and Articles available in the text
all_chapters = re.findall(r'CHAPTER .+\n', full_text)
all_articles = re.findall(r'Article [0-9]+ \n', full_text)
gdpr_chunked = defaultdict(dict)

# Headings are located with str.find (literal matching). Passing them to
# re.search unescaped would treat characters such as '(' or '.' inside a
# heading as regex syntax.
# NOTE: the article pattern also matches in-text references to an article
# that happen to end a line, so spurious entries can be produced; one of
# them is deleted by hand in a later cell.
chapter_i_start = full_text.find(all_chapters[0])
chapter_i_pos = (chapter_i_start, chapter_i_start + len(all_chapters[0]))
# Everything before the first chapter heading is stored as a single entry.
# The first 36 characters are skipped (presumably a page header - TODO confirm).
gdpr_chunked['Legislative acts']['Regulation']=full_text[36:chapter_i_pos[0]].replace('\n', '').strip()
previous_start = chapter_i_pos[1]

#For each chapter identify its start and finish and concatenate its text
for ch in range(len(all_chapters)):
    if ch != len(all_chapters)-1:
        end_part = full_text.find(all_chapters[ch+1])
        next_start = end_part + len(all_chapters[ch+1])
        relevant_text = full_text[previous_start:end_part]
    else:
        # Last chapter: it runs to the end of the document
        next_start = len(full_text)
        relevant_text = full_text[previous_start:]

    # The text up to the first newline of the chapter body is used as the chapter key
    chapt_name_idx = relevant_text.find('\n')
    chapter_name = relevant_text[:chapt_name_idx].strip()

    #On each relevant text section look for the articles available in it
    # (every detected heading is visited, including the last one)
    for art in range(len(all_articles)):
        start_art_numb = relevant_text.find(all_articles[art])
        if start_art_numb == -1:
            continue
        start_part_art = start_art_numb + len(all_articles[art])

        # The article ends where the next detected heading starts; when that
        # heading is not in this chapter (or there is none) the article runs
        # to the end of the chapter.
        end_part_art = -1
        if art != len(all_articles)-1:
            end_part_art = relevant_text.find(all_articles[art+1])
        if end_part_art != -1:
            art_text = relevant_text[start_part_art:end_part_art]
        else:
            art_text = relevant_text[start_part_art:]

        art_name_idx = art_text.find('\n')
        #Create a clean article name
        art_key = relevant_text[start_art_numb:start_part_art]+'- '+art_text[:art_name_idx].strip()
        art_key = art_key.replace('\n','').strip()
        gdpr_chunked[chapter_name][art_key]=art_text[art_name_idx:].replace('\n', '').strip()

    previous_start = next_start

In [None]:
# The spaCy model is installed separately with: python -m spacy download en_core_web_sm
# Fetch the NLTK stopword corpus only when it is not already available
# locally, so re-running the notebook does not require network access.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

In [None]:
#Create stopwords list
nlp = spacy.load("en_core_web_sm", disable=['ner', 'textcat'])
# Keep the negation 'not' in the text. The word has to be wrapped in a set
# literal: set('not') builds {'n', 'o', 't'} and leaves 'not' a stopword.
clean_stopwords = set(stopwords.words('english')).difference({'not'})
# Punctuation characters to delete; '(', ')' and '/' are kept.
# Sorted so the resulting string is identical on every run.
clean_punctuation = ''.join(sorted(set(string.punctuation).difference({'(', ')', '/'})))

In [None]:
#Lemmatization helper
def lemmatize(text):
    """Run `text` through the `nlp` pipeline and return the token lemmas joined by single spaces."""
    lemmas = [tok.lemma_ for tok in nlp(text)]
    return ' '.join(lemmas)

In [None]:
#Create a clean gdpr dictionary
gdpr_clean = defaultdict(dict)
# Translation table that deletes every character in clean_punctuation;
# built once because it does not change between articles
punctuation_table = str.maketrans('', '', clean_punctuation)
for k in tqdm(gdpr_chunked.keys()):
    for sub_k in gdpr_chunked[k].keys():
        text = gdpr_chunked[k][sub_k]
        #Remove punctuation
        text = text.translate(punctuation_table)
        #Lemmatize and lowercase first so that capitalised stopwords are dropped too,
        #then remove stopwords and tokens shorter than 2 characters
        tokens = (token.lower() for token in lemmatize(text).split())
        text = ' '.join(token for token in tokens if token not in clean_stopwords and len(token)>=2)
        #remove header
        text = re.sub(r'452016 119/[0-9]+ official journal european union en', '', text)
        #Collapse the extra spaces left where a header was removed
        text = ' '.join(text.split())
        gdpr_clean[k][sub_k] = text

100%|██████████| 12/12 [00:04<00:00,  2.42it/s]


In [None]:
#Show the article titles stored under each chapter as a sanity check
for chapter_articles in gdpr_clean.values():
    print(chapter_articles.keys())

dict_keys(['Regulation'])
dict_keys(['Article 1 - Subject-matter and objectives', 'Article 2 - Material scope', 'Article 3 - Territorial scope', 'Article 4 - Definitions'])
dict_keys(['Article 5 - Principles relating to processing of personal data', 'Article 6 - Lawfulness of processing', 'Article 7 - Conditions for consent', "Article 8 - Conditions applicable to child's consent in relation to information society services", 'Article 9 - Processing of special categories of personal data', 'Article 10 - Processing of personal data relating to criminal convictions and offences', 'Article 11 - Processing which does not require identification'])
dict_keys(['Article 12 - Transparent information, communication and modalities for the exercise of the rights of the data', 'Article 13 - Information to be provided where personal data are collected from the data subject', 'Article 14 - Information to be provided where personal data have not been obtained from the data subject', 'Article 15 - Right 

In [None]:
#Eliminate wrong article
# This key appears to be an in-text reference that was picked up as an
# article heading. pop() with a default keeps the cell safe to re-run, and
# .get() avoids creating the chapter key in the defaultdict when it is absent.
gdpr_clean.get('Independent supervisory authorities', {}).pop('Article 41 - and of a certification body pursuant to Article 43;', None)

In [None]:
#Write the cleaned dictionary to disk as JSON
with open('../../data/chunked_data/gdpr_clean.json', 'w') as f:
    serialised = json.dumps(gdpr_clean)
    f.write(serialised)