# Retrieve Meta Data from Treaties

In [1]:
#import packages
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
import pdftotext
import docx2txt
from os import listdir
import fuzzyset
import time
import difflib
import regex as re
import os
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity 
from fuzzywuzzy import fuzz

#get the meta data - webscraped and saved as csv file
df_meta_treaty = pd.read_csv("Meta Data Treaties.csv")
df_meta_treaty_decision = pd.read_csv("Meta Data Treaty Decisions.csv")

#create list with all treaty names / These names in the list will be searched later
treaty_names = list(df_meta_treaty["Title"])

#delete fill words
for number, treaty in enumerate(treaty_names):
    new_name = ""
    for word in treaty.split():
        if (word.lower() in stopwords.words("english")) or (word.lower() in stopwords.words("spanish")): #some documents are spanish
            pass
        else:
            new_name += (word + " ")
    treaty_names[number] = new_name[:-1]
    

# Analyze Text 

In [2]:
"""
GOES THROUGH THE TRAIN DATA WITH DIFFERENT ASSESSMENT METHODS
PRINTS TIME NEEDED AND THE FOUND TREATY DECISIONS WITH ASSOCIATED VALUES
DICT_OF_CORRECT_ANSWERS PROVIDES CORRECT ANSWERS MANUALLY SEARCHED BY CHRISTOPH KRUEGER
"""

list_of_subfolders = ["Train Data"] #can be extended, but only use one small folder for now

#create a dictionary with the right answers, to later check the accuracy of the algorithms
dict_of_correct_answers = {}
dict_of_correct_answers['Management of Marine Debris.txt'] = ['International Convention for the Prevention of Pollution From Ships','Convention on Biological Diversity', 'London Convention','Inter American Convention for the Protection and Conservation of Sea Turtles','Protocols to Regional Seas Conventions on Pollution from Land Based Sources']
dict_of_correct_answers['A paration for the followup to the Strategic Plan for Biodiversity 20112020 and the Strategic Plan for the Cartagena Protocol on Biosafety 20112020.pdf.txt'] = ['Convention on Biological Diversity','Cartagena Protocol on Biosafety', 'Nagoya – Kuala Lumpur Supplementary Protocol']
dict_of_correct_answers['African cherry Prunus africana.txt'] = ['None']
dict_of_correct_answers['COP411 Cooperation with the European Union.txt'] = ['None']
dict_of_correct_answers['Environmental assessmentinformation system, monitoring and early warning xe2x80x93 Article 12 of the Carpathian Convention.txt'] = ['None'] #should reference Carpathian Convention, but the convention is not in ecolex
dict_of_correct_answers['Hexachlorobutadiene.txt'] = ['Stockholm Convention']
dict_of_correct_answers['Outcome of the UNCCD 1st Scientific Conference.pdf.txt'] = ['None']
dict_of_correct_answers['Preparation for the followup to the Strategic Plan for Biodiversity 20112020.pdf.txt'] = ['Convention on Biological Diversity', 'NAGOYA PROTOCOL ON ACCESS TO GENETIC RESOURCES', 'NAGOYA PROTOCOL ON ACCESS AND BENEFIT-SHARING'] 
dict_of_correct_answers['Process for aligning national reporting, assessment and review.pdf.txt'] = ['Convention on Biological Diversity', 'Rio conventions', 'Cartagena Protocol', 'Nagoya Protocol'] 
dict_of_correct_answers['Pyu Ancient Cities Myanmar C 1444.txt'] = ['None']
dict_of_correct_answers['Review of annexes A and B.pdf.txt'] = ['Minamata Convention on Mercury']
dict_of_correct_answers['Submission regarding Swaziland.txt'] = ['Basel Convention'] #ecolex name differs widely



#retrieve the best n names with their values frmo two lists
def find_top_X(values, names, n = 5):
    top_idx = np.argsort(values)[-n:]
    top_vals = [values[i] for i in top_idx]
    top_names = [names[i] for i in top_idx]
    return top_vals[::-1], top_names[::-1]



def create_tfidf_features(corpus, max_features=20000, max_df=0.95, min_df=2):
    #concept idea for the algorithm from https://github.com/sci2lab/ml_tutorial/blob/master/ml_tutorial/tfidf.py#L24
    """ Creates a tf-idf matrix for the `corpus` using sklearn. """
    tfidf_vectorizor = TfidfVectorizer(decode_error='replace', strip_accents='unicode', analyzer='word',
                                       stop_words='english', ngram_range=(1, 4), max_features=max_features,
                                       norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=True, vocabulary = None) #or use vocab treaty dict
    X = tfidf_vectorizor.fit_transform(corpus)
    print('tfidf matrix successfully created.')
    return X, tfidf_vectorizor

def preprocess(title, body=None):
    #concept idea for the algorithm from https://github.com/sci2lab/ml_tutorial/blob/master/ml_tutorial/tfidf.py#L24
    """ Preprocess the input, i.e. lowercase, remove html tags, special character and digits."""
    text = ''
    if body is None:
        text = title
    else:
        text = title + body
    # to lower case
    text = text.lower()

    # remove tags
    text = re.sub("</?.*?>"," <> ", text)

    # remove special characters and digits
    text = re.sub("(\\d|\\W)+"," ", text).strip()
    return text

def calculate_similarity(X, vectorizor, queries, top_k=5):
    #concept idea for the algorithm from https://github.com/sci2lab/ml_tutorial/blob/master/ml_tutorial/tfidf.py#L24
    """ Vectorizes the `query` via `vectorizor` and calculates the cosine similarity of
    the `query` and `X` (all the documents) and returns the `top_k` similar documents."""

    # Vectorize the query to the same length as documents
    similarities = []
    for query in queries:
        query_vec = vectorizor.transform([query])
        # Compute the cosine similarity between query_vec and all the documents
        cosine_similarities = cosine_similarity(X,query_vec).flatten()
        similarities.append(cosine_similarities)
        
    return similarities




def text_mining(treaty_names):
    
    #create dataframes for later saving data
    df_fuzzysearch = pd.DataFrame(columns = treaty_names)
    df_sequencematch = pd.DataFrame(columns = treaty_names)
    
    
    for subfolder in list_of_subfolders: #loop over subfolders
        list_of_files = listdir("Downloads/" + subfolder) #get all files
        
        for filename in list_of_files: #go through files in subfolder
            
            if (filename[-3:] == "txt"):  #check if it is a txt document
                with open("Downloads/" + subfolder + "/" + filename, encoding = "utf8") as f:
                    document_text = f.readlines()
                    
                print(dict_of_correct_answers)
                #clean the document
                text = " ".join(document_text) #create long string 
                text = text.lower()
                text = re.sub("</?.*?>"," <> ", text) # remove tags
                text = re.sub("(\\d|\\W)+"," ", text).strip() # remove special characters and digits
                text = text.split()
                
                
                list_of_outcomes = []
                start_time = time.time() #to later see the speed of the programm
                
                '''
                VERSION 0: TF-IDF Matrix
                Super Quick, However, not very accurate so far
                '''


                X,v = create_tfidf_features([' '.join(text)])
                features = v.get_feature_names()



                search_start = time.time()
                similarities = calculate_similarity(X, v, treaty_names)
                search_time = time.time() - search_start
                list_sim = []
                for sim in similarities:
                    list_sim.append(float(sim))
                print("time TF-IDF: ", search_time)
                print(find_top_X(list_sim, treaty_names, 10))
                #loop over treaty names, to see if it is mention in the document text
                for treaty in treaty_names: 
                    
                    #treaty = "Nagoya Kuala Lumpur Supplementary Protocol Liability Redress Cartagena Protocol Biosafety"
                    
                    
                    
                    
                    '''
                    VERSION 1: Use fuzzy search
                    As fuzzysearch doesn't work well if the string length differ widely,
                    the code loops over different overlaping parts of the text and saves the highest score
                    '''
                    high_score = 0
                    name_len = len(treaty.split()) #see how long the treaty name is
                    for num  in range(0, len(text) - 2 - name_len, (name_len + 2) // 2): #get the length of the text and get a step size that is according to the name length
                        
                        #use fuzzy search to see the match for the specific part of the text
                        Search_Query = fuzzyset.FuzzySet()
                        Search_Query.add(treaty)
                        outcome = Search_Query.get("".join(text[num:num + name_len + 2]))
                        
                        
                        if outcome != None: #check if the match is None (None will cause error)
                            if outcome[0][0] > high_score: #check if the current score is larger than the highest score
                                high_score = outcome[0][0] #update highscore
                    list_of_outcomes.append(high_score)
                
                #add outcomes to df, save df
                df_fuzzysearch.loc[filename[-3:]] = list_of_outcomes
                df_fuzzysearch.to_csv('Downloads/Fuzzy Search Outcomes.csv')
                print('time FuzzySearch: ', time.time() - start_time)
                print(find_top_X(list_of_outcomes, treaty_names, 10))
                
                #print closes match
                #print(filename, ' Match: ', treaty_names[list_of_outcomes.index(max(list_of_outcomes))], (max(list_of_outcomes)))
                
                    
                list_of_outcomes = []
                start_time = time.time()
                for treaty in treaty_names: 
                    '''
                    VERSION 2: SequenceMatcher
                    Different library - roughly three times faster
                    does not need to loop over 
                    Currently very inaccurate
                    '''
                    
                    
                    s = difflib.SequenceMatcher(None, ' '.join(text), treaty)
                    high_score = sum(n for i,j,n in s.get_matching_blocks()) / float(len(treaty))
                    list_of_outcomes.append(high_score)
                    
                #add outcomes to df, save df
                print('time SequenceMatcher: ', time.time() - start_time)
                print(find_top_X(list_of_outcomes, treaty_names, 10))
                df_sequencematch.loc[filename[-3:]] = list_of_outcomes
                df_sequencematch.to_csv('Downloads/Seqeunce Match Outcomes.csv')
                
                
                
                list_of_outcomes = []
                start_time = time.time()
                for treaty in treaty_names: 
                    '''
                    VERSION 3: FuzzyWuzzy
                    Similarly quick as fuzzysearch, but less accurate
                    '''
                    
                    
                    highscore = fuzz.ratio(" ".join(text), treaty)
                    list_of_outcomes.append(high_score/100)
                    
                #add outcomes to df, save df
                print('time SequenceMatcher: ', time.time() - start_time)
                print(find_top_X(list_of_outcomes, treaty_names, 10))
                df_sequencematch.loc[filename[-3:]] = list_of_outcomes
                df_sequencematch.to_csv('Downloads/Seqeunce Match Outcomes.csv')

                
            
                
    
text_mining(treaty_names)
print("ENDE")

tfidf matrix successfully created.
time TF-IDF:  4.288866281509399
([0.2671125351183198, 0.17852596239050794, 0.11812622109764166, 0.11642103921538015, 0.11642103921538015, 0.11546515452589383, 0.11546515452589383, 0.11546515452589383, 0.11265772277923669, 0.11265772277923669], ['Nagoya Kuala Lumpur Supplementary Protocol Liability Redress Cartagena Protocol Biosafety', 'Cartagena Protocol Biosafety Convention Biological Diversity', 'Protocol implementation Alpine Convention field town country planning sustainable development', 'Protocol Implementation SenegaloGambian Agreement Field Maritime Fisheries', 'Agreement LongTerm Economic, Cultural, Scientific, Technical Industrial Cooperation Government Republic Cyprus Government Republic Rwanda Protocol implementation Agreement', 'Protocol Sustainable Forest Management Framework Convention Protection Sustainable Development Carpathians', 'Protocol Sustainable Tourism Framework Convention Protection Sustainable Development Carpathians', 'Pr

time FuzzySearch:  44.97464561462402
([0.5510204081632653, 0.5510204081632653, 0.5434782608695652, 0.5294117647058824, 0.5, 0.5, 0.467741935483871, 0.4545454545454546, 0.44999999999999996, 0.4423076923076923], ['International Tropical Timber Agreement 2006', 'International Tropical Timber Agreement, 1994', 'Convention International Maritime Organization', 'International Convention Salvage', 'International Energy Charter', 'Constitution International Rice Commission', 'European Convention Protection Animals International Transport', 'Convention International Council Exploration', 'International Convention Safe Containers', 'Protocol International Convention Regulation Whaling'])
time SequenceMatcher:  25.880484104156494
([0.8888888888888888, 0.8837209302325582, 0.8823529411764706, 0.8823529411764706, 0.8780487804878049, 0.8780487804878049, 0.875, 0.875, 0.8666666666666667, 0.8620689655172413], ['Protocol Fisheries', 'Agreement regarding Monitoring Stratosphere', 'Protocol Forestry', 'In

time TF-IDF:  3.7522690296173096
([0.16724433679928244, 0.16724433679928244, 0.15087784557597175, 0.14817598327992137, 0.13933328478855622, 0.13933328478855622, 0.13933328478855622, 0.13933328478855622, 0.13933328478855622, 0.13933328478855622], ['Protocol implementation Alpine Convention field mountain agriculture', 'Protocol implementation Alpine Convention field mountain forests', 'Protocol Cypriot Republic European Union participation Cyprus European Environment Agency European Environment Information Observation Network N 5III2001', 'Revised Convention Establishment European Organisation Nuclear Research', 'Protocol implementation Alpine Convention 1991 field tourism', 'Convention European Space Agency', 'Protocol Implementation 1991 Alpine Convention Soil Protection', 'Protocol implementation Alpine Convention 1991 field energy', 'Convention Establishment European Mediterranean Plant Protection Organisation', 'Protocol implementation Alpine Convention field town country planning 

time TF-IDF:  5.78061842918396
([0.11514866314821248, 0.11331381642873574, 0.11022308643950021, 0.10946506266256083, 0.10799864593334116, 0.10639075460140096, 0.10423164633614207, 0.10221441759340358, 0.10185825432007767, 0.10122053841894557], ['Protocol Protection Marine Environment Black Land Based Sources Activities', 'Protocol concerning Pollution LandBased sources activities Convention Protection Development Marine Environment Wider Caribbean Region', 'International Convention Prevention Pollution Ships MARPOL Annex V Optional Garbage', 'Protocol Protection Caspian Pollution LandBased Sources Activities Framework Convention Protection Marine Environment Caspian', 'Kuwait Regional Convention Cooperation Protection Marine Environment Pollution', 'Protocol concerning Regional Preparedness, Response CoOperation combating Oil Pollution Incidents Framework Convention Protection Marine Environment Caspian', 'International Convention Prevention Pollution Ships MARPOL modified Protocol 197

time SequenceMatcher:  1.2400460243225098
([0.006571428571428572, 0.006571428571428572, 0.006571428571428572, 0.006571428571428572, 0.006571428571428572, 0.006571428571428572, 0.006571428571428572, 0.006571428571428572, 0.006571428571428572, 0.006571428571428572], ['Revised Convention Navigation Rhine', 'Protocol implementation Alpine Convention 1991 field energy', 'Agreement Government Russian Federation Portugal cooperation sphere civil protection, prevention mitigation emergency situations', 'Agreement Government Russian Federation Government Kazakhstan scientific technical cooperation realization experimental thermonuclear reactor', 'Northwest Wildland Fire Protection Agreement', 'Protocol establishing, period 1 July 1997 30 June 2000, fishing rights financial compensation provided Agreement European Economic Community Government Republic Equatorial Guinea fishing coast Equatorial Guinea', 'Agreement form Exchange Letters concerning provisional application Protocol establishing, pe

time TF-IDF:  5.352057456970215
([0.14910436775667163, 0.14910436775667163, 0.14015100837459707, 0.13463006496730578, 0.12824641721175503, 0.1217684624687011, 0.12140683267256872, 0.12137341234420651, 0.12137341234420651, 0.11731420914762294], ['Nagoya Protocol Access Genetic Resources Fair Equitable Sharing Benefits Arising Utilization Convention Biological Diversity', 'Cartagena Protocol Biosafety Convention Biological Diversity', 'Protocol Conservation Sustainable Use Biological Landscape Diversity Framework Convention Protection Sustainable Development Carpathians', 'Protocol implementation Alpine Convention field town country planning sustainable development', 'Black Biodiversity Landscape Conservation Protocol Convention Protection Black Pollution', 'Protocol Sustainable Forest Management Framework Convention Protection Sustainable Development Carpathians', 'Protocol Conservation Biological Diversity Framework Convention Protection Marine Environment Caspian', 'Protocol Sustainab

time SequenceMatcher:  0.6142129898071289
([0.0074285714285714285, 0.0074285714285714285, 0.0074285714285714285, 0.0074285714285714285, 0.0074285714285714285, 0.0074285714285714285, 0.0074285714285714285, 0.0074285714285714285, 0.0074285714285714285, 0.0074285714285714285], ['Revised Convention Navigation Rhine', 'Protocol implementation Alpine Convention 1991 field energy', 'Agreement Government Russian Federation Portugal cooperation sphere civil protection, prevention mitigation emergency situations', 'Agreement Government Russian Federation Government Kazakhstan scientific technical cooperation realization experimental thermonuclear reactor', 'Northwest Wildland Fire Protection Agreement', 'Protocol establishing, period 1 July 1997 30 June 2000, fishing rights financial compensation provided Agreement European Economic Community Government Republic Equatorial Guinea fishing coast Equatorial Guinea', 'Agreement form Exchange Letters concerning provisional application Protocol establ

time TF-IDF:  4.021511554718018
([0.15473500077247207, 0.15473500077247207, 0.15193604309276743, 0.1426165273286113, 0.13272963079964517, 0.13243985602595187, 0.13196732693278546, 0.13083704942094662, 0.13083704942094662, 0.1282501079017398], ['Amendment Basel Convention Control Transboundary Movements Hazardous Wastes Disposal', 'Basel Convention Control Transboundary Movements Hazardous Wastes Disposal', 'Agreement Government Republic South Africa, Government Kingdom Swaziland Government Peoples Republic Mozambique relative establishment Tripartite Permanent Technical Committee', 'Convention establishing Permanent InterState Drought Control Committee Sahel', 'Agreement Committee Standardization, Metrology Certification Belarus National Bureau Certification Cuba cooperation sphere certification', 'Environmental Cooperation Action Plan Government Canada Government Peoples Republic China', 'Agreement Action Plan Environmentally Sound Management Common Zambezi River System', 'Agreement S

# See accuracy of different methods
## Run Different Methods and compare them to the right answers

In [3]:
list_of_subfolders = ['Train Data']

def tf_idf():
    text_list = []
    name_list = []
    for subfolder in list_of_subfolders: #loop over subfolders
        list_of_files = listdir("Downloads/" + subfolder) #get all files

        for filename in list_of_files: #go through files in subfolder
            
            if (filename[-3:] == "txt"):  #check if it is a txt document
                with open("Downloads/" + subfolder + "/" + filename, encoding = "utf8") as f:
                    document_text = f.readlines()
                    
               
                #clean the document
                text = " ".join(document_text) #create long string 
                text = text.lower()
                text = re.sub("</?.*?>"," <> ", text) # remove tags
                text = re.sub("(\\d|\\W)+"," ", text).strip() # remove special characters and digits
                text = text.split()
                
                
                text_list.append(' '.join(text))
                name_list.append(filename)
                
        start_time = time.time()        
        '''
        VERSION 0: TF-IDF Matrix
        Super Quick, However, not very accurate so far
        '''


        X,v = create_tfidf_features(text_list)
        features = v.get_feature_names()



        
        similarities = calculate_similarity(X, v, treaty_names)
        search_time = time.time() - start_time
        similarities = np.array(similarities).T
        for num, sims in enumerate(similarities):
            list_sim = []
            for sim in sims:
                list_sim.append(float(sim))
            print("time TF-IDF: ", search_time)
            print('filename: ', name_list[num])
            print(find_top_X(list_sim, treaty_names, 10))
            print()
            print()
                

def fuzzy_search():
    

    for subfolder in list_of_subfolders: #loop over subfolders
        list_of_files = listdir("Downloads/" + subfolder) #get all files
        counter = 0
        for filename in list_of_files: #go through files in subfolder
            list_of_outcomes = []
            

            
            if (filename[-3:] == "txt"):  #check if it is a txt document
                with open("Downloads/" + subfolder + "/" + filename, encoding = "utf8") as f:
                    document_text = f.readlines()
                    
                
                #clean the document
                text = " ".join(document_text) #create long string 
                text = text.lower()
                text = re.sub("</?.*?>"," <> ", text) # remove tags
                text = re.sub("(\\d|\\W)+"," ", text).strip() # remove special characters and digits
                text = text.split()
                
                start_time = time.time()
            
                for treaty in treaty_names: 
                    '''
                    VERSION 1: Use fuzzy search
                    As fuzzysearch doesn't work well if the string length differ widely,
                    the code loops over different overlaping parts of the text and saves the highest score
                    '''
                    high_score = 0
                    name_len = len(treaty.split()) #see how long the treaty name is
                    for num  in range(0, len(text) - 2 - name_len, (name_len + 2) // 2): #get the length of the text and get a step size that is according to the name length

                        #use fuzzy search to see the match for the specific part of the text
                        Search_Query = fuzzyset.FuzzySet()
                        Search_Query.add(treaty)
                        outcome = Search_Query.get(" ".join(text[num:num + name_len + 2]))


                        if outcome != None: #check if the match is None (None will cause error)
                            if outcome[0][0] > high_score: #check if the current score is larger than the highest score
                                high_score = outcome[0][0] #update highscore
                    list_of_outcomes.append(high_score)

                #add outcomes to df, save df
                print('time FuzzySearch: ', time.time() - start_time)
                print('filename: ', filename)
                print(find_top_X(list_of_outcomes, treaty_names, 10))
                print()
                print()
                
                

def sequence_matcher():
    for subfolder in list_of_subfolders: #loop over subfolders
        list_of_files = listdir("Downloads/" + subfolder) #get all files

        for filename in list_of_files: #go through files in subfolder
            
            if (filename[-3:] == "txt"):  #check if it is a txt document
                with open("Downloads/" + subfolder + "/" + filename, encoding = "utf8") as f:
                    document_text = f.readlines()
                    
                
                #clean the document
                text = " ".join(document_text) #create long string 
                text = text.lower()
                text = re.sub("</?.*?>"," <> ", text) # remove tags
                text = re.sub("(\\d|\\W)+"," ", text).strip() # remove special characters and digits
                text = text.split()
                
                start_time = time.time()
                list_of_outcomes = []
                for treaty in treaty_names: 
                    '''
                    VERSION 2: SequenceMatcher
                    Different library - roughly three times faster
                    does not need to loop over 
                    '''
                    
                    
                    s = difflib.SequenceMatcher(None, ''.join(text), treaty)
                    high_score = sum(n for i,j,n in s.get_matching_blocks()) / float(len(treaty))
                    list_of_outcomes.append(high_score)
                    
                #add outcomes to df, save df
                print('time SequenceMatcher: ', time.time() - start_time)
                print('filename: ', filename)
                print(find_top_X(list_of_outcomes, treaty_names, 10))
                print()
                print()

                

In [103]:
print('CORRECT ANSWERS: \n')
for key in dict_of_correct_answers:
    print(key)
    print(dict_of_correct_answers[key])
    print()
    
tf_idf()

CORRECT ANSWERS: 

Management of Marine Debris.txt
['International Convention for the Prevention of Pollution From Ships', 'Convention on Biological Diversity', 'London Convention', 'Inter American Convention for the Protection and Conservation of Sea Turtles', 'Protocols to Regional Seas Conventions on Pollution from Land Based Sources']

A paration for the followup to the Strategic Plan for Biodiversity 20112020 and the Strategic Plan for the Cartagena Protocol on Biosafety 20112020.pdf.txt
['Convention on Biological Diversity', 'Cartagena Protocol on Biosafety', 'Nagoya – Kuala Lumpur Supplementary Protocol']

African cherry Prunus africana.txt
['None']

COP411 Cooperation with the European Union.txt
['None']

['None']

Hexachlorobutadiene.txt
['Stockholm Convention']

Outcome of the UNCCD 1st Scientific Conference.pdf.txt
['None']

Preparation for the followup to the Strategic Plan for Biodiversity 20112020.pdf.txt
['Convention on Biological Diversity', 'NAGOYA PROTOCOL ON ACCESS T

# Create Sensitivity Analysis Functions
## alter input variables, search input parameter grid
## score each parameter set, select best set afterwards

In [None]:
def score_result(filename, output, answers):
    #get the length of the treaties mention + 1 (in case a correct score is one down)
    
    correct_answers = answers[filename]
    len_ans = len(correct_answers)
    if correct_answers[0] == 'None': #if no treaty is mentioned, return 1 minus highest score, to reward a low highest score
        return 1 - output[0][0]
    output_treaty_names = output[1][0:len_ans + 1]
    output_treaty_score = output[0][0:len_ans + 1]
    
    #score the treaties found according to the treaties
    #check if the treaties are maybe not in the right order, so check over all found treaties
    #To find how strong the correlation is, multiply by the score it was given
    #to compare different methods of text search, one has to adjust for the average score -> however, if you run the best attributes again, you can manually see, which methods works better
    
    full_score = 0
    for correct_answer in correct_answers:
        for num, treaty in enumerate(output_treaty_names):
            Search_Query = fuzzyset.FuzzySet()
            Search_Query.add(correct_answer)
            outcome = Search_Query.get(treaty)
            if outcome != None:
                score_1 = outcome[0][0]
                score_2 = output_treaty_score[num]
                full_score += score_1 * score_2
    return full_score
     


def fuzzy_search_sensitivity(step_size = [2,3,4], added_words = [2,3,4]): #loop over the input variables
    overall_score = []
    attributes = []
        
    for step in step_size:
        for added_word in added_words:
            score = 0
            for subfolder in list_of_subfolders: #loop over subfolders
                list_of_files = listdir("Downloads/" + subfolder) #get all files
                counter = 0
                for filename in list_of_files: #go through files in subfolder
                    list_of_outcomes = []



                    if (filename[-3:] == "txt"):  #check if it is a txt document
                        with open("Downloads/" + subfolder + "/" + filename, encoding = "utf8") as f:
                            document_text = f.readlines()


                        #clean the document
                        text = " ".join(document_text) #create long string 
                        text = text.lower()
                        text = re.sub("</?.*?>"," <> ", text) # remove tags
                        text = re.sub("(\\d|\\W)+"," ", text).strip() # remove special characters and digits
                        text = text.split()

                        start_time = time.time()

                        for treaty in treaty_names: 
                            '''
                            VERSION 1: Use fuzzy search
                            As fuzzysearch doesn't work well if the string length differ widely,
                            the code loops over different overlaping parts of the text and saves the highest score
                            '''
                            high_score = 0
                            name_len = len(treaty.split()) #see how long the treaty name is
                            for num  in range(0, len(text) - 2 - name_len, (name_len + added_word) // step): #get the length of the text and get a step size that is according to the name length

                                #use fuzzy search to see the match for the specific part of the text
                                Search_Query = fuzzyset.FuzzySet()
                                Search_Query.add(treaty)
                                outcome = Search_Query.get(" ".join(text[num:num + name_len + added_word]))


                                if outcome != None: #check if the match is None (None will cause error)
                                    if outcome[0][0] > high_score: #check if the current score is larger than the highest score
                                        high_score = outcome[0][0] #update highscore
                            list_of_outcomes.append(high_score)

                        #add outcomes to df, save df
                        print('time FuzzySearch: ', time.time() - start_time)
                        print('filename: ', filename)
                        print('Correct Answers: ', dict_of_correct_answers[filename])
                        print(find_top_X(list_of_outcomes, treaty_names, 10))
                        print()
                        print()
                        score += score_result(filename, find_top_X(list_of_outcomes, treaty_names, 10), dict_of_correct_answers)
                        
            overall_score.append(score)
            attributes.append((step, added_word))
    return overall_score, attributes

                        




def tf_idf_sensitivity(max_features = [10000, 20000, 40000, 60000, 100000], n_gram_range = [(1,1), (1,2), (1,3), (1,4), (2,4)]):#loop over the input parameters
    text_list = []
    name_list = []
    overall_score = []
    attributes = []
    for subfolder in list_of_subfolders: #loop over subfolders
        list_of_files = listdir("Downloads/" + subfolder) #get all files

        for filename in list_of_files: #go through files in subfolder
            
            if (filename[-3:] == "txt"):  #check if it is a txt document
                with open("Downloads/" + subfolder + "/" + filename, encoding = "utf8") as f:
                    document_text = f.readlines()
                    
               
                #clean the document
                text = " ".join(document_text) #create long string 
                text = text.lower()
                text = re.sub("</?.*?>"," <> ", text) # remove tags
                text = re.sub("(\\d|\\W)+"," ", text).strip() # remove special characters and digits
                text = text.split()
                
                
                text_list.append(' '.join(text))
                name_list.append(filename)
                
        start_time = time.time()        
        '''
        VERSION 0: TF-IDF Matrix
        Super Quick, However, not very accurate so far
        '''

        for max_feature in max_features:
            for n_gram in n_gram_range:
                score = 0
            
                X,v = create_tfidf_features(text_list, max_features = max_feature, n_gram_range = n_gram)  
                features = v.get_feature_names()




                similarities = calculate_similarity(X, v, treaty_names)
                search_time = time.time() - start_time
                similarities = np.array(similarities).T
                for num, sims in enumerate(similarities):
                    list_sim = []
                    for sim in sims:
                        list_sim.append(float(sim))
                    print("time TF-IDF: ", search_time)
                    print('Attributes: ', max_feature, n_gram)
                    print('filename: ', name_list[num])
                    print('Correct Answers: ', dict_of_correct_answers[name_list[num]])
                    print(find_top_X(list_sim, treaty_names, 10))
                    score += score_result(name_list[num], find_top_X(list_sim, treaty_names, 10), dict_of_correct_answers)
                    print()
                    print()
                    
                    
                overall_score.append(score)
                attributes.append((max_feature, n_gram))
        return overall_score, attributes
            
            
            
            


def sequence_matcher_sensitivity(): #cannot loop over the input parameters, as there are none
    for subfolder in list_of_subfolders: #loop over subfolders
        list_of_files = listdir("Downloads/" + subfolder) #get all files
        
        overall_score = []
        attributes = []
        score = 0
        for filename in list_of_files: #go through files in subfolder
            
            if (filename[-3:] == "txt"):  #check if it is a txt document
                with open("Downloads/" + subfolder + "/" + filename, encoding = "utf8") as f:
                    document_text = f.readlines()
                    
                
                #clean the document
                text = " ".join(document_text) #create long string 
                text = text.lower()
                text = re.sub("</?.*?>"," <> ", text) # remove tags
                text = re.sub("(\\d|\\W)+"," ", text).strip() # remove special characters and digits
                text = text.split()
                
                start_time = time.time()
                list_of_outcomes = []
                for treaty in treaty_names: 
                    '''
                    VERSION 2: SequenceMatcher
                    Different library - roughly three times faster
                    does not need to loop over 
                    '''
                    
                    
                    s = difflib.SequenceMatcher(None, ''.join(text), treaty)
                    high_score = sum(n for i,j,n in s.get_matching_blocks()) / float(len(treaty))
                    list_of_outcomes.append(high_score)
                    
                #add outcomes to df, save df
                print('time SequenceMatcher: ', time.time() - start_time)
                print('filename: ', filename)
                print(find_top_X(list_of_outcomes, treaty_names, 10))
                print(score_result(filename, find_top_X(list_of_outcomes, treaty_names, 10), dict_of_correct_answers))
                score += score_result(filename, find_top_X(list_of_outcomes, treaty_names, 10), dict_of_correct_answers)
                print()
                print()
            overall_score.append(score)
            attributes.append('Dummy')
            return overall_score, attributes
        

In [None]:
'''
Fuzzy Search Sensitivity has a long running time, several hours
PRINTS ALL THE POSSIBLE PARAMETERS IN THE PARAMETER SPACE WITH ASSOCIATED CORRECTNESS VALUE
'''

outcome_tf_idf = tf_idf_sensitivity()
print('TF-IDF Done')

outcome_fuzzy_search = fuzzy_search_sensitivity()
print('Fuzzy Search Done')
print('ENDE')

In [None]:
print(outcome_tf_idf)
print()
print()
print(outcome_fuzzy_search)

# Show Results for the Best Parameters

In [None]:
tf_idf_sensitivity([100000], [(1,1)])

In [None]:
fuzzy_search_sensitivity([4],[2])

In [None]:
'''
USE IN FUTURE CLEANING OF TEXTS AND TREATIES
USE FOR ADJUSTING OUTCOME POSSIBLY
'''
def clean_text(text):          
    #clean the document
    text = " ".join(text) #create long string 
    text = text.lower()
    text = re.sub("</?.*?>"," ", text) # remove tags
    text = re.sub('\(.\)| .\)', '', text) #remove a) and (b) mentions
    text = re.sub(r'\w*\d\w*', '', text).strip() #remove mistranslations such as '-' to 'xe23xo0'; basically remove any words that have numbers in it
    text = re.sub("(\\d|\\W)+"," ", text).strip() # remove special characters and digits
    return text
                
            
def clean_treaty_names(name):
    name = [ n for n in name.split() if len(n)> 3]
    name = re.sub(r'\w*\d\w*', '', ' '.join(name)).strip() #remove mistranslations such as '-' to 'xe23xo0'; basically remove any words that have numbers in it
    name = re.sub(r'\w*\.\w*', '', name).strip()
    name = text = re.sub("(\\d|\\W)+"," ", name).strip() # remove special characters and digits
    return name

def adjust_to_name_length(name, score):
    ''' 
    Returns the difference of name length and average length times .01
    Needed, as short treaty names receive higher matches
    '''
    average = 9.73
    return score + ((len(name.split()) - average) * .01)

In [None]:
list_of_subfolders = ['Train Data/'] #declare subfolder in question

In [None]:
import matplotlib.pyplot as plt
"""
display histogram of the name length
"""
total = 0
lengts = []
for treaty in treaty_names:
    treaty = clean_treaty_names(treaty)
    total += len(treaty.split())
    lengts.append(len(treaty.split()))
    if len(treaty.split()) <2:
        print(treaty)
print(total/len(treaty_names))
    
    
plt.hist(lengts, bins = 50)
plt.show()

In [None]:
"""
ADJUST VALUES FOR TREATY NAME LENGTH, AS SHORT TREATIES HAVE HIGHER CHANCE OF HITS THEORETICALLY
CAN ALSO BE ADJUSTED AFTER RUNNING THE ANALYSIS REGULARLY
"""
def fuzzy_search_sensitivity_length_adjusted(step_size = [2,3,4], added_words = [2,3,4]):
    
    overall_score = []
    attributes = []
    start_time = time.time()   
    for step in step_size:
        for added_word in added_words:
            score = 0
            for subfolder in list_of_subfolders: #loop over subfolders
                list_of_files = listdir("Downloads/" + subfolder) #get all files
                counter = 0
                for filename in list_of_files: #go through files in subfolder
                    list_of_outcomes = []



                    if (filename[-3:] == "txt"):  #check if it is a txt document
                        with open("Downloads/" + subfolder + "/" + filename, encoding = "utf8") as f:
                            document_text = f.readlines()


                        #clean the document
                        text = " ".join(document_text) #create long string 
                        text = text.lower()
                        text = re.sub("</?.*?>"," <> ", text) # remove tags
                        text = re.sub("(\\d|\\W)+"," ", text).strip() # remove special characters and digits
                        text = text.split()

                        

                        for treaty in treaty_names: 
                            '''
                            VERSION 1: Use fuzzy search
                            As fuzzysearch doesn't work well if the string length differ widely,
                            the code loops over different overlaping parts of the text and saves the highest score
                            '''
                            high_score = 0
                            name_len = len(treaty.split()) #see how long the treaty name is
                            treaty_specific_step_size = (name_len + added_word) // step
                            if treaty_specific_step_size == 0:
                                treaty_specific_step_size = 1
                            for num  in range(0, len(text) - 2 - name_len, treaty_specific_step_size): #get the length of the text and get a step size that is according to the name length

                                #use fuzzy search to see the match for the specific part of the text
                                Search_Query = fuzzyset.FuzzySet()
                                Search_Query.add(treaty)
                                outcome = Search_Query.get(" ".join(text[num:num + name_len + added_word]))


                                if outcome != None: #check if the match is None (None will cause error)
                                    if outcome[0][0] > high_score: #check if the current score is larger than the highest score
                                        high_score = outcome[0][0] #update highscore
                                        
                            high_score = adjust_to_name_length(treaty, high_score)
                            list_of_outcomes.append(high_score)

                        #add outcomes to df, save df
                        '''
                        print('filename: ', filename)
                        print('Correct Answers: ', dict_of_correct_answers[filename])
                        print(find_top_X(list_of_outcomes, treaty_names, 10))
                        print()
                        print()
                        '''
                        score += score_result(filename, find_top_X(list_of_outcomes, treaty_names, 10), dict_of_correct_answers)
                        
            overall_score.append(score)
            attributes.append((step, added_word))
    print('time FuzzySearch: ', time.time() - start_time)
    return overall_score, attributes

In [None]:
"""
CLEANING OF TEXTS/NAMES AND ADJUSTING FOR TEXT LENGTHS
"""
def clean_text(text):          
    #clean the document
    text = " ".join(text) #create long string 
    text = text.lower()
    text = re.sub("</?.*?>"," ", text) # remove tags
    text = re.sub('\(.\)| .\)', '', text) #remove a) and (b) mentions
    text = re.sub(r'\w*\d\w*', '', text).strip() #remove mistranslations such as '-' to 'xe23xo0'; basically remove any words that have numbers in it
    text = re.sub("(\\d|\\W)+"," ", text).strip() # remove special characters and digits
    return text
                
            
def clean_treaty_names(name):
    name = [ n for n in name.split() if len(n)> 3]
    name = re.sub(r'\w*\d\w*', '', ' '.join(name)).strip() #remove mistranslations such as '-' to 'xe23xo0'; basically remove any words that have numbers in it
    name = re.sub(r'\w*\.\w*', '', name).strip()
    name = text = re.sub("(\\d|\\W)+"," ", name).strip() # remove special characters and digits
    return name

def adjust_to_name_length(name, score):
    ''' 
    Returns the difference of name length and average length times .01 
    0.01, as 10 is the average, with treaty names ranging between 2 and 25 words (max therefore -/+ .08)
    Needed, as short treaty names receive higher matches
    '''
    average = 9.73
    return score + ((len(name.split()) - average) * .01)



# Run TF-IDF Analysis with Vocab

In [18]:
'''
RUN DIFFERENT ANALYSIS WITH ADDED VOCABULARY AS PARAMETER FOR TF-IDF MATRIX
'''
def create_tfidf_features_vocab(corpus, max_features=20000, max_df=0.95, min_df=2, vocab = None, n_gram_range = (1,1)):
    """ Creates a tf-idf matrix for the `corpus` using sklearn. """
    tfidf_vectorizor = TfidfVectorizer(decode_error='replace', strip_accents='unicode', analyzer='word',
                                       stop_words='english', ngram_range=n_gram_range, max_features=max_features,
                                       norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=True, vocabulary = vocab) #or use vocab treaty dict
    X = tfidf_vectorizor.fit_transform(corpus)
    print('tfidf matrix successfully created.')
    return X, tfidf_vectorizor


def calculate_similarity(X, vectorizor, queries, top_k=5):
    """ Vectorizes the `query` via `vectorizor` and calculates the cosine similarity of
    the `query` and `X` (all the documents) and returns the `top_k` similar documents."""

    # Vectorize the query to the same length as documents
    similarities = []
    for query in queries:
        query_vec = vectorizor.transform([query])
        # Compute the cosine similarity between query_vec and all the documents
        cosine_similarities = cosine_similarity(X,query_vec).flatten()
        similarities.append(cosine_similarities)
        
    return similarities

def clean_text(text):          
    #clean the document
    text = " ".join(text) #create long string 
    text = text.lower()
    text = re.sub("</?.*?>"," ", text) # remove tags
    text = re.sub('\(.\)| .\)', '', text) #remove a) and (b) mentions
    text = re.sub(r'\w*\d\w*', '', text).strip() #remove mistranslations such as '-' to 'xe23xo0'; basically remove any words that have numbers in it
    text = re.sub("(\\d|\\W)+"," ", text).strip() # remove special characters and digits
    return text



test_vocab = []
for tret in treaty_names:
    test_vocab.extend(tret.split())
test_vocab = list(set(test_vocab))


In [None]:
'''
RUN DIFFERENT ANALYSIS WITH ADDED VOCABULARY AS PARAMETER FOR TF-IDF MATRIX
'''

def tf_idf_vocab(max_feature =  100000, n_gram = (1,1), vocab = None, list_of_subfolders = ['Train Data/']):
    text_list = []
    name_list = []
    overall_score = []
    attributes = []
    for subfolder in list_of_subfolders: #loop over subfolders
        list_of_files = listdir("Downloads/" + subfolder) #get all files

        for filename in list_of_files: #go through files in subfolder
            
            if (filename[-3:] == "txt"):  #check if it is a txt document
                with open("Downloads/" + subfolder + "/" + filename, encoding = "utf8") as f:
                    document_text = f.readlines()
                
                    

                text = clean_text(' '.join(document_text))
                
                text_list.append(text)
                name_list.append(filename)
                
                start_time = time.time()        
                '''
                VERSION 0: TF-IDF Matrix
                Super Quick, However, not very accurate so far
                '''

                
                X,v = create_tfidf_features_vocab(text_list, max_features = max_feature, n_gram_range = n_gram, vocab = vocab)  
                features = v.get_feature_names()




                similarities = calculate_similarity(X, v, treaty_names)
                search_time = time.time() - start_time
                similarities = np.array(similarities).T
                score = 0 
                for num, sims in enumerate(similarities):
                    list_sim = []
                    for sim in sims:
                        list_sim.append(float(sim))
                    print("time TF-IDF: ", search_time)
                    print('Attributes: ', max_feature, n_gram)
                    print('filename: ', name_list[num])
                    print('Correct Answers: ', dict_of_correct_answers[name_list[num]])
                    print(find_top_X(list_sim, treaty_names, 10))
                    score += score_result(name_list[num], find_top_X(list_sim, treaty_names, 10), dict_of_correct_answers)
                    print()
                    print()
                    
                    
              
        return score
    
#score_1 = tf_idf_vocab(vocab = test_vocab)
score_2 = tf_idf_vocab()

print(f'score one {score_1} vs score two {score_2}')
            