# Retrieve Meta Data from Treaties

In [20]:
import pandas as pd
from nltk.corpus import stopwords
import pdftotext
import docx2txt
from os import listdir
import fuzzyset
import time
import difflib
import regex as re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity 
import ray 
import numpy as np


#get the meta data - webscraped and saved as csv file
df_meta_treaty = pd.read_csv("Meta Data Treaties.csv")
df_meta_treaty_decision = pd.read_csv("Meta Data Treaty Decisions.csv")

#create list with all treaty names / These names in the list will be searched later
treaty_names = list(df_meta_treaty["Title"])

#delete fill words
#the stop word lists are loaded once and merged into a set; before, stopwords.words()
#was evaluated twice for every single word of every title
fill_words = set(stopwords.words("english")) | set(stopwords.words("spanish")) #some documents are spanish

#keep only the words that are not fill words (compared in lower case), joined by single spaces
treaty_names = [
    " ".join(word for word in treaty.split() if word.lower() not in fill_words)
    for treaty in treaty_names
]
    

# Convert Docx/PDFs to txt files - Can be skipped

In [None]:
#converts every pdf/docx file below Downloads/<subfolder> into a utf-8 txt file that is
#stored next to the source file; files that cannot be converted are counted as fails


list_of_subfolders = ["Decisions", "Bilateral Agreements", "Multilateral Agreements", "Official Documents", "Other", "Recommendations", "Resolutions", "Treaties"]


#for statistics later
successes = 0
fails = 0


#loop over different subfolders
for subfolder in list_of_subfolders:
    sub_fails = 0
    sub_suc = 0
    #get list of all files in one folder
    list_of_files = listdir("Downloads/" + subfolder)

    #loop over different files
    for file in list_of_files:
        try: #try to avoid crashing the programm, as some pdfs are corrupted
            if file.endswith(".pdf"):
                with open(f"Downloads/{subfolder}/{file}", "rb") as f:
                    pdf = pdftotext.PDF(f)

                #cut off the complete ".pdf" suffix; cutting only "pdf" produced "name..txt"
                with open(f"Downloads/{subfolder}/{file[:-len('.pdf')]}.txt", "w", encoding='utf-8') as f:
                    f.write(" ".join(pdf))

            elif file.endswith(".docx"):
                doc_text = docx2txt.process(f'Downloads/{subfolder}/{file}')
                #cut off the complete ".docx" suffix (see pdf branch)
                with open(f"Downloads/{subfolder}/{file[:-len('.docx')]}.txt", "w", encoding='utf-8') as f:
                    f.write(doc_text)

            else:
                continue #neither pdf nor docx: not counted at all

        except Exception as error: #still best effort, but report which file failed and why
            fails += 1
            sub_fails += 1
            print(f"could not convert Downloads/{subfolder}/{file}: {error!r}")

        else: #only reached when a conversion finished without an exception
            successes += 1
            sub_suc += 1

    print("subfolder ", subfolder, "done")
    sub_total = sub_suc + sub_fails
    if sub_total: #a folder without pdf/docx files would otherwise divide by zero
        print(f"All complete in folder {subfolder}, with a success rate of {sub_suc/sub_total} on {sub_total} files")
    else:
        print(f"All complete in folder {subfolder}, no pdf or docx files found")

total = successes + fails
if total:
    #the arguments are passed separately; a misplaced parenthesis used to print them as one tuple
    print("All complete, with a success rate of ", successes/total, " on ", total, " files")
else:
    print("All complete, no pdf or docx files found")


# Analyze Text 

In [2]:
list_of_subfolders = ["Train Data"] #can be extended, but only use one small folder for now


def text_mining(treaty_names):
    '''
    Score every txt document of the configured subfolders against every treaty name.

    Each *.txt file below Downloads/<subfolder> (subfolders are taken from the module-level
    list_of_subfolders) is reduced to lower-case words and compared with two methods:

    VERSION 1: fuzzy search
        As fuzzysearch doesn't work well if the string lengths differ widely, the text is
        split into overlapping windows of (words in treaty name + 2) words and the highest
        window score is kept.
    VERSION 2: SequenceMatcher
        Sum of the sizes of all difflib matching blocks between the whole text and the
        treaty name, divided by the length of the treaty name.

    treaty_names: list of treaty names; they become the columns of both result tables
    returns: None - the tables are written to 'Downloads/Fuzzy Search Outcomes.csv' and
             'Downloads/Seqeunce Match Outcomes.csv' after every document
    '''

    #create dataframes for later saving data
    df_fuzzysearch = pd.DataFrame(columns = treaty_names)
    df_sequencematch = pd.DataFrame(columns = treaty_names)

    for subfolder in list_of_subfolders: #loop over subfolders
        for filename in listdir("Downloads/" + subfolder): #go through files in subfolder

            if not filename.endswith(".txt"): #only txt documents are analyzed
                continue

            with open("Downloads/" + subfolder + "/" + filename, encoding = "utf8") as f:
                document_text = f.readlines()

            #one row per document; the old label filename[-3:] was always "txt",
            #so every document overwrote the row of the previous one
            row_label = filename[:-len(".txt")]

            #clean the document
            text = " ".join(document_text).lower() #create long lower-case string
            text = re.sub("</?.*?>"," <> ", text) # remove tags
            text = re.sub("(\\d|\\W)+"," ", text).strip() # remove special characters and digits
            words = text.split()

            #VERSION 1: fuzzy search
            list_of_outcomes = []
            start_time = time.time() #to later see the speed of the programm
            for treaty in treaty_names:
                high_score = 0
                window_len = len(treaty.split()) + 2 #treaty name length plus two extra words
                step = window_len // 2 #windows overlap by half; always >= 1 as window_len >= 2

                #the search set only depends on the treaty, so it is built once per treaty
                Search_Query = fuzzyset.FuzzySet()
                Search_Query.add(treaty)

                if words:
                    #include the final window; a text shorter than one window is compared as a whole
                    last_start = max(len(words) - window_len, 0)
                    for num in range(0, last_start + 1, step):
                        #join with spaces - the treaty names contain spaces as well
                        outcome = Search_Query.get(" ".join(words[num:num + window_len]))

                        #get() yields None without a match; outcome[0][0] is read as the score of the first match
                        if outcome is not None and outcome[0][0] > high_score:
                            high_score = outcome[0][0] #update highscore
                list_of_outcomes.append(high_score)

            #add outcomes to df, save df
            df_fuzzysearch.loc[row_label] = list_of_outcomes
            df_fuzzysearch.to_csv('Downloads/Fuzzy Search Outcomes.csv')
            print('time FuzzySearch: ', time.time() - start_time)

            #VERSION 2: SequenceMatcher
            list_of_outcomes = []
            start_time = time.time()
            joined_text = " ".join(words) #built once, with spaces so that multi word names can match
            for treaty in treaty_names:
                if not treaty: #an empty name would divide by zero below
                    list_of_outcomes.append(0)
                    continue

                #the text is lower case, therefore the name has to be lower case as well
                s = difflib.SequenceMatcher(None, joined_text, treaty.lower())
                matched_characters = sum(block.size for block in s.get_matching_blocks())
                list_of_outcomes.append(matched_characters / float(len(treaty)))

            #add outcomes to df, save df
            print('time SequenceMatcher: ', time.time() - start_time)
            df_sequencematch.loc[row_label] = list_of_outcomes
            df_sequencematch.to_csv('Downloads/Seqeunce Match Outcomes.csv')

                
            
                
'''
Uncomment to run code
'''   
#text_mining(treaty_names)
#print("ENDE")

'\nUncomment to run code\n'

# Analyze Text in Parallel
### roughly 15x faster

In [8]:
def clean_text(text):
    '''
    Turn the lines of a document into one lower-case string of plain words.

    text: iterable of strings (e.g. the list returned by readlines()); the items are
          joined with a single space, so a plain string should not be passed directly
    returns: the cleaned string, words separated by single spaces

    All patterns are raw strings: sequences such as "\\(" inside a normal string literal
    are invalid escape sequences and trigger warnings on current Python versions.
    '''
    #clean the document
    text = " ".join(text).lower() #create long lower-case string
    text = re.sub(r"</?.*?>", " ", text) # remove tags
    text = re.sub(r"\(.\)| .\)", "", text) #remove a) and (b) mentions
    text = re.sub(r"\w*\d\w*", "", text).strip() #remove mistranslations such as '-' to 'xe23xo0'; basically remove any words that have numbers in it
    text = re.sub(r"(\d|\W)+", " ", text).strip() # remove special characters and digits
    return text
                
            
def clean_treaty_names(name):
    '''
    Shorten a treaty name to its longer plain words.

    name: treaty name as one string
    returns: the name without words of three or fewer characters, without words that
             contain a digit or a dot, and with every remaining run of non-word
             characters replaced by a single space

    The length filter runs before punctuation is removed, so attached punctuation
    counts towards the length of a word.
    '''
    long_words = [n for n in name.split() if len(n) > 3]
    name = re.sub(r'\w*\d\w*', '', ' '.join(long_words)).strip() #remove mistranslations such as '-' to 'xe23xo0'; basically remove any words that have numbers in it
    name = re.sub(r'\w*\.\w*', '', name).strip() #remove word parts around a dot, e.g. abbreviations
    name = re.sub(r"(\d|\W)+", " ", name).strip() # remove special characters and digits
    return name



def adjust_to_name_length(name, score):
    '''
    Shift a match score according to the number of words in the treaty name.

    For every word the name is longer than the fixed average of 9.73 words the score
    is raised by .01, for every word it is shorter the score is lowered by .01.

    Author's rationale: short treaty names receive higher matches, which this offset
    is meant to balance. The call inside the fuzzy search is commented out, as the
    adjustment can also be applied to the csv output afterwards.

    name: treaty name as one string
    score: match score to adjust
    returns: the adjusted score
    '''

    average = 9.73
    word_count = len(name.split())
    deviation = word_count - average
    return score + deviation * .01
    

#update treaty names | every name is passed through clean_treaty_names, which deletes certain words and numbers to shorten the name
new_treaty_names = [clean_treaty_names(treaty) for treaty in treaty_names]
#treaty_names gets its own list object, new_treaty_names stays available as well
treaty_names = list(new_treaty_names)

    
@ray.remote #indicates parallel programming
def fuzzy_search_treaty_decision(treaty_decision_text, treaty_decision_name, treaty_names, step_size, added_words):
    '''
    FUZZY SEARCH of one document against all treaty names.

    As fuzzysearch doesn't work well if the string lengths differ widely, the cleaned
    text is split into overlapping windows and the highest window score is kept.

    treaty_decision_text: lines of the document, cleaned with clean_text
    treaty_decision_name: label of the document; printed and returned unchanged
    treaty_names: list of treaty names to look for
    step_size: divisor for the step between two windows: step = window length // step_size (at least 1)
    added_words: number of words a window is longer than the treaty name
    returns: tuple (treaty_decision_name, list with one score per treaty name)
    '''

    print(treaty_decision_name)
    list_of_outcomes = []
    text = clean_text(treaty_decision_text).split()
    for treaty in treaty_names:
        high_score = 0
        name_len = len(treaty.split()) #see how long the treaty name is
        window_len = name_len + added_words
        treaty_specific_step_size = max(window_len // step_size, 1)

        #the search set only depends on the treaty, so it is filled once and not in every iteration
        Search_Query = fuzzyset.FuzzySet()
        Search_Query.add(treaty)

        if text:
            #the window length (not a fixed 2) defines the last start position; the final
            #window is included and a text shorter than one window is compared as a whole
            last_start = max(len(text) - window_len, 0)
            for num in range(0, last_start + 1, treaty_specific_step_size):
                outcome = Search_Query.get(" ".join(text[num:num + window_len]))

                #get() yields None without a match; outcome[0][0] is read as the score of the first match
                if outcome is not None and outcome[0][0] > high_score:
                    high_score = outcome[0][0] #update highscore

        #uncomment line below, to adjust for treaty length
        #high_score = adjust_to_name_length(treaty, high_score)
        list_of_outcomes.append(high_score)

    return treaty_decision_name, list_of_outcomes
    
    
def fuzzy_search_final(step_size = 4, added_words = 2, list_of_subfolders = ('Train Data/',)):
    '''
    Run the fuzzy search in parallel for all txt documents of the given subfolders.

    For every subfolder below Downloads/ one ray task per txt document is started
    (fuzzy_search_treaty_decision) and the scores are written to
    'Downloads/Fuzzy Search Outcomes Subfolder <subfolder>.csv', one row per document
    and one column per entry of the module-level treaty_names.

    step_size: passed on to fuzzy_search_treaty_decision
    added_words: passed on to fuzzy_search_treaty_decision
    list_of_subfolders: subfolder names, with or without a trailing "/"
    returns: 0
    '''

    #use cfuzzyset for 15% performance increase
    #check documentation

    for subfolder in list_of_subfolders: #loop over subfolders
        start_time = time.time()
        folder_name = subfolder.rstrip("/") #slicing off the last character broke names without trailing slash

        #fresh containers per subfolder, otherwise the documents of earlier subfolders
        #are searched again and end up in the csv of every later subfolder
        df = pd.DataFrame(columns = treaty_names)
        list_of_treaty_decision_texts = []
        list_of_treaty_decision_names = []

        for filename in listdir("Downloads/" + folder_name): #go through files in subfolder
            if filename.endswith(".txt"):  #check if it is a txt document
                with open(f"Downloads/{folder_name}/{filename}", encoding = "utf8") as f:
                    list_of_treaty_decision_texts.append(f.readlines())
                #the extension is removed exactly once, here; it used to be cut a second
                #time when the row was written, which removed characters of the name
                list_of_treaty_decision_names.append(filename[:-len(".txt")])

        #restart ray, then queue one task per document; the tasks run in parallel
        ray.shutdown()
        ray.init()
        list_of_tasks = [
            fuzzy_search_treaty_decision.remote(document_text, document_name, treaty_names, step_size, added_words)
            for document_text, document_name in zip(list_of_treaty_decision_texts, list_of_treaty_decision_names)
        ]
        answers = ray.get(list_of_tasks)

        #every answer is a tuple of the document name and the list of scores
        for document_name, outcomes in answers:
            df.loc[document_name] = outcomes
        df.to_csv(f'Downloads/Fuzzy Search Outcomes Subfolder {folder_name}.csv')

        print(f'time FuzzySearch for Subfolder {folder_name} ', time.time() - start_time)

    return 0



In [9]:
fuzzy_search_final()

 pid=30304)[0m COP411 Cooperation with the European Union.
 pid=37840)[0m A paration for the followup to the Strategic Plan for Biodiversity 20112020 and the Strategic Plan for the Cartagena Protocol on Biosafety 20112020.pdf.
 pid=17876)[0m African cherry Prunus africana.
 pid=30304)[0m Hexachlorobutadiene.on
 pid=29112)[0m Management of Marine Debris.
 pid=17876)[0m Outcome of the UNCCD 1st Scientific Conference.pdf.
 pid=30304)[0m Preparation for the followup to the Strategic Plan for Biodiversity 20112020.pdf.
 pid=17876)[0m Process for aligning national reporting, assessment and review.pdf.
 pid=30304)[0m Pyu Ancient Cities Myanmar C 1444.
 pid=30304)[0m Review of annexes A and B.pdf.
 pid=37840)[0m Submission regarding Swaziland.
time FuzzySearch for Subfolder Train Data  764.1514472961426


0

In [34]:

def create_tfidf_features(corpus, max_features=20000, n_gram_range = (1,1), max_df=0.95, min_df=2):
    """ Fits a sklearn TfidfVectorizer on `corpus` and returns the tf-idf matrix
    together with the fitted vectorizer.

    `max_features` and `n_gram_range` are handed to the vectorizer.
    NOTE(review): `max_df` and `min_df` are accepted but not passed on, so the
    vectorizer uses its own defaults for them - confirm whether that is intended.
    """
    vectorizer_settings = dict(
        decode_error='replace',
        strip_accents='unicode',
        analyzer='word',
        stop_words='english',
        ngram_range=n_gram_range,
        max_features=max_features,
        norm='l2',
        use_idf=True,
        smooth_idf=True,
        sublinear_tf=True,
        vocabulary=None, #or use vocab treaty dict
    )
    tfidf_vectorizor = TfidfVectorizer(**vectorizer_settings)
    X = tfidf_vectorizor.fit_transform(corpus)
    print('tfidf matrix successfully created.')
    return X, tfidf_vectorizor

def calculate_similarity(X, vectorizor, queries, top_k=5):
    """ Vectorizes each entry of `queries` via `vectorizor` and calculates its cosine
    similarity to `X` (all the documents).

    Returns a list with one flattened array per query, in the order of `queries`.
    `top_k` is accepted but not used: all similarities are returned, not only the
    top ones. """

    # One transform + cosine_similarity call per query, each against all documents
    return [
        cosine_similarity(X, vectorizor.transform([query])).flatten()
        for query in queries
    ]


def tf_idf_final(max_feature = 100000, n_gram = (1,1), list_of_subfolders = ('Train Data/',)):
    '''
    VERSION 0: TF-IDF Matrix
    Super Quick, However, not very accurate so far

    For every subfolder below Downloads/ a tf-idf matrix is created from the cleaned
    txt documents, the cosine similarity of every module-level treaty name to every
    document is calculated and the result is written to
    'Downloads/TF-IDF Search Outcomes Subfolder <subfolder>.csv' (one row per document,
    one column per treaty name).

    max_feature: passed to create_tfidf_features as max_features
    n_gram: passed to create_tfidf_features as n_gram_range
    list_of_subfolders: subfolder names, with or without a trailing "/"
    returns: None
    '''

    for subfolder in list_of_subfolders: #loop over subfolders
        folder_name = subfolder.rstrip("/") #slicing off the last character broke names without trailing slash

        #fresh containers per subfolder, otherwise documents of earlier subfolders
        #end up in the matrix and the csv of every later subfolder
        df = pd.DataFrame(columns = treaty_names)
        list_of_treaty_decision_names = []
        list_of_treaty_decision_texts = []

        for filename in listdir("Downloads/" + folder_name): #go through files in subfolder
            if filename.endswith(".txt"):  #check if it is a txt document
                with open(f"Downloads/{folder_name}/{filename}", encoding = "utf8") as f:
                    document_text = f.readlines()

                list_of_treaty_decision_names.append(filename)
                list_of_treaty_decision_texts.append(clean_text(document_text))

        if not list_of_treaty_decision_texts: #nothing to vectorize in this subfolder
            print(f'no txt documents found in Downloads/{folder_name}')
            continue

        #the feature names were requested via get_feature_names() but never used; that
        #method is no longer available in current scikit-learn, so the call is dropped
        X, v = create_tfidf_features(list_of_treaty_decision_texts, max_features = max_feature, n_gram_range = n_gram)

        #calculate_similarity returns one array per treaty; transposed there is one row per document
        similarities = np.array(calculate_similarity(X, v, treaty_names)).T
        for filename, sims in zip(list_of_treaty_decision_names, similarities):
            #the row label is the file name without ".txt" (before, the dot was kept)
            df.loc[filename[:-len(".txt")]] = [float(sim) for sim in sims]
            print('filename: ', filename)
            print('Successful')
            print()
        df.to_csv(f'Downloads/TF-IDF Search Outcomes Subfolder {folder_name}.csv')
            
            

In [35]:
tf_idf_final()

tfidf matrix successfully created.
filename:  A paration for the followup to the Strategic Plan for Biodiversity 20112020 and the Strategic Plan for the Cartagena Protocol on Biosafety 20112020.pdf.txt
Successful

filename:  African cherry Prunus africana.txt
Successful

filename:  COP411 Cooperation with the European Union.txt
Successful

Successful

filename:  Hexachlorobutadiene.txt
Successful

filename:  Management of Marine Debris.txt
Successful

filename:  Outcome of the UNCCD 1st Scientific Conference.pdf.txt
Successful

filename:  Preparation for the followup to the Strategic Plan for Biodiversity 20112020.pdf.txt
Successful

filename:  Process for aligning national reporting, assessment and review.pdf.txt
Successful

filename:  Pyu Ancient Cities Myanmar C 1444.txt
Successful

filename:  Review of annexes A and B.pdf.txt
Successful

filename:  Submission regarding Swaziland.txt
Successful

