# Retrieve Meta Data from Treaties

In [10]:
import pandas as pd
from nltk.corpus import stopwords
import pdftotext
import docx2txt
from os import listdir
import fuzzyset
import time
import difflib
import regex as re

# Load the metadata that was web-scraped earlier and saved as CSV files.
df_meta_treaty = pd.read_csv("Meta Data Treaties.csv")
df_meta_treaty_decision = pd.read_csv("Meta Data Treaty Decisions.csv")

# All treaty titles; these are the names searched for in the documents later.
treaty_names = list(df_meta_treaty["Title"])

# Build the stop-word lookup once. Calling stopwords.words() for every single
# word is needlessly slow, and membership tests on a set are O(1) instead of
# scanning a list. Spanish is included because some documents are Spanish.
stop_words = set(stopwords.words("english")) | set(stopwords.words("spanish"))

# Drop the filler words from every title, keeping the original casing and
# word order of the remaining words (joined by single spaces).
treaty_names = [
    " ".join(word for word in treaty.split() if word.lower() not in stop_words)
    for treaty in treaty_names
]
    

# Convert Docx/PDFs to txt files - Can be skipped

In [None]:
# Convert every PDF / DOCX file found in the listed subfolders of "Downloads/"
# into a UTF-8 text file stored next to the original, and report how many
# conversions succeeded per folder and overall.

list_of_subfolders = ["Decisions", "Bilateral Agreements", "Multilateral Agreements", "Official Documents", "Other", "Recommendations", "Resolutions", "Treaties"]

# Counters over all subfolders, used for the statistics printed at the end.
successes = 0
fails = 0

for subfolder in list_of_subfolders:
    # Per-folder counters.
    sub_fails = 0
    sub_suc = 0
    list_of_files = listdir("Downloads/" + subfolder)

    for file in list_of_files:
        lowered = file.lower()  # accept ".PDF" / ".DOCX" as well
        try:
            # Best effort: some source files are corrupted, and one broken
            # file must not abort the whole batch.
            if lowered.endswith(".pdf"):
                with open(f"Downloads/{subfolder}/{file}", "rb") as f:
                    pdf = pdftotext.PDF(f)

                # Strip the full ".pdf" suffix (dot included) so the result is
                # "name.txt" rather than "name..txt".
                with open(f"Downloads/{subfolder}/{file[:-4]}.txt", "w", encoding='utf-8') as f:
                    f.write(" ".join(pdf))
                successes += 1
                sub_suc += 1

            elif lowered.endswith(".docx"):
                doc_text = docx2txt.process(f'Downloads/{subfolder}/{file}')
                # Strip the full ".docx" suffix (dot included).
                with open(f"Downloads/{subfolder}/{file[:-5]}.txt", "w", encoding='utf-8') as f:
                    f.write(doc_text)
                successes += 1
                sub_suc += 1

        except Exception as error:
            # Catch Exception rather than everything, so Ctrl-C still stops
            # the run, and say which file failed and why.
            print(f"Could not convert {subfolder}/{file}: {error}")
            fails += 1
            sub_fails += 1

    print("subfolder ", subfolder, "done")
    # A folder may contain no PDF/DOCX files at all; avoid dividing by zero.
    sub_total = sub_suc + sub_fails
    sub_rate = sub_suc / sub_total if sub_total else 0.0
    print(f"All complete in folder {subfolder}, with a success rate of {sub_rate} on {sub_total} files")

total = successes + fails
rate = successes / total if total else 0.0
print("All complete, with a success rate of ", rate, " on ", total, " files")


# Analyze Text 

In [None]:
# Subfolders of "Downloads/" whose text files are analysed.
# The list can be extended; for now only one small folder is used.
list_of_subfolders = [
    "Train Data",
]


def _clean_document(raw_text):
    """Return the lower-cased words of *raw_text* without tags, digits or punctuation."""
    text = raw_text.lower()
    text = re.sub(r"</?.*?>", " <> ", text)  # remove tags
    text = re.sub(r"(\d|\W)+", " ", text).strip()  # remove special characters and digits
    return text.split()


def _fuzzy_score(words, treaty):
    """Return the best fuzzy-search score of *treaty* over sliding windows of *words*.

    Fuzzy search works poorly when the compared strings differ widely in
    length, so the document is scanned in half-overlapping windows that are
    two words longer than the treaty name, and the highest score is kept.
    """
    if not words:
        return 0

    window = len(treaty.split()) + 2
    step = window // 2  # always >= 1 because window >= 2

    # The set depends only on the treaty name, so build it once per treaty
    # instead of once per window.
    search_query = fuzzyset.FuzzySet()
    search_query.add(treaty)

    # Include the final window, and give documents shorter than one window a
    # single window covering everything.
    last_start = max(len(words) - window, 0)
    starts = list(range(0, last_start + 1, step))
    if starts[-1] != last_start:
        starts.append(last_start)

    high_score = 0
    for start in starts:
        # Join with spaces: the treaty name contains spaces too, so gluing
        # the words together would depress every score.
        outcome = search_query.get(" ".join(words[start:start + window]))
        # get() may return None when nothing matches.
        if outcome is not None and outcome[0][0] > high_score:
            high_score = outcome[0][0]
    return high_score


def _sequence_score(joined_text, treaty):
    """Return the share of *treaty*'s characters found in matching blocks of *joined_text*."""
    if not treaty:
        return 0.0  # a name made only of stop words would otherwise divide by zero
    # The document text is lower-cased, so compare against the lower-cased name.
    matcher = difflib.SequenceMatcher(None, joined_text, treaty.lower())
    matched = sum(block.size for block in matcher.get_matching_blocks())
    return matched / float(len(treaty))


def text_mining(treaty_names):
    """Score how strongly each treaty name is mentioned in every text document.

    Every ``.txt`` file in the subfolders named by the module-level
    ``list_of_subfolders`` (below ``Downloads/``) is cleaned and compared
    against each treaty name with two methods: a windowed fuzzy search and
    ``difflib.SequenceMatcher``. Both result tables (one row per document,
    one column per treaty name) are written to CSV after every document, so
    partial results survive an interrupted run.

    Args:
        treaty_names: list of treaty names to look for.

    Returns:
        Tuple ``(df_fuzzysearch, df_sequencematch)`` with the score tables.
    """
    # Result tables, filled row by row.
    df_fuzzysearch = pd.DataFrame(columns = treaty_names)
    df_sequencematch = pd.DataFrame(columns = treaty_names)

    for subfolder in list_of_subfolders:
        for filename in listdir("Downloads/" + subfolder):
            if not filename.endswith(".txt"):
                continue  # only text documents are analysed

            with open("Downloads/" + subfolder + "/" + filename, encoding = "utf8") as f:
                words = _clean_document(f.read())

            # Label the row with the file name minus ".txt". The last three
            # characters would be "txt" for every file, so each document
            # would overwrite the previous one's row.
            row_label = filename[:-len(".txt")]

            # VERSION 1: fuzzy search over sliding windows.
            start_time = time.time()  # to see the speed of the method
            list_of_outcomes = [_fuzzy_score(words, treaty) for treaty in treaty_names]
            df_fuzzysearch.loc[row_label] = list_of_outcomes
            df_fuzzysearch.to_csv('Downloads/Fuzzy Search Outcomes.csv')
            print('time FuzzySearch: ', time.time() - start_time)

            # VERSION 2: SequenceMatcher, a different library that needs no
            # windowing. The joined text is the same for every treaty, so it
            # is built once.
            start_time = time.time()
            joined_text = " ".join(words)
            list_of_outcomes = [_sequence_score(joined_text, treaty) for treaty in treaty_names]
            print('time SequenceMatcher: ', time.time() - start_time)
            df_sequencematch.loc[row_label] = list_of_outcomes
            df_sequencematch.to_csv('Downloads/Seqeunce Match Outcomes.csv')

    return df_fuzzysearch, df_sequencematch

                
            
                
    
# Run the analysis for all cleaned treaty names, then signal completion.
text_mining(treaty_names=treaty_names)
print("ENDE")

time FuzzySearch:  201.60133409500122
time SequenceMatcher:  91.03226804733276
