In [1]:
import re
import string
import collections
import decimal
import pandas as pd
import spacy
import string
import time
import dateparser
from operator import itemgetter
from nltk import sent_tokenize, word_tokenize
from utils.myutils import split_triple, load_object, save_object, short_name

In [2]:
type_int = "Int"
type_float = "Float"
type_date = "Date"

constants = (type_int, type_float, type_date)

#Parameter
improve_float_matching = True
convert_units = True
use_gold_standard_units = True
only_top20 = False


if convert_units:
    unit_dict = load_object('unit_dict')
    unit_conversion_dict = load_object('unit_conversion_dict')

In [3]:
def get_sentence_tokens_nltk(sentence):
    tokens = word_tokenize(sentence)
    tokens_extended = list()
    offset = 0
    
    for i, token in enumerate(tokens):
        offset = sentence.find(token, offset)
        tokens_extended.append((token, i, offset))
        offset += len(token)
        
    return tokens_extended

In [4]:
running_time = 0
def get_adjacent_tokens(sentence_tokens, match_pos, local_signs=('.',',','-')):
    #Parameter
    number_of_neighbor_tokens = 3
    
    decimal_sep = local_signs[0]
    thousands_sep = local_signs[1]
    minus_sign = local_signs[2]
    
    adjacent_tokens = list()
    token_directly_following = None
    for token in sentence_tokens:
        #---TRANSLATION---
        #token[0] is the word
        #token[1] is the index (token.i in SpaCy)
        #token[2] is the position/offset (token.idx in SpaCy)

        if match_pos > token[2]:
            continue

        #Usually both should match
        #But mismatches can occur if the number is not en entire token itself but just part of another token
        #In this case we take the previous token as this is the one our number-match is part of
        if match_pos != token[2]:
            token = sentence_tokens[token[1] - 1]

        #Tokens before
        i = token[1] - 1 #Index or i
        number_of_neighbor_tokens_tmp = number_of_neighbor_tokens
        while i >= 0 and i >= token[1] - number_of_neighbor_tokens_tmp:
            if re.sub('[0-9' + thousands_sep + decimal_sep + minus_sign + ']', '', sentence_tokens[i][0]) not in string.punctuation:
                #print("Token with index " + str(i) + " is adjacent.")
                adjacent_tokens.append(sentence_tokens[i][0].lower())
            else:
                #Skip tokens that are just punctuations or numbers
                number_of_neighbor_tokens_tmp += 1
            i -= 1

        #Tokens after
        i = token[1] + 1
        number_of_neighbor_tokens_tmp = number_of_neighbor_tokens
        while i <= (len(sentence_tokens) - 1) and i <= token[1] + number_of_neighbor_tokens_tmp:
            if re.sub('[0-9' + thousands_sep + decimal_sep + minus_sign + ']', '', sentence_tokens[i][0]) not in string.punctuation:
                #print("Token with index " + str(i) + " is adjacent.")
                adjacent_tokens.append(sentence_tokens[i][0].lower())
                if token_directly_following is None:
                    token_directly_following = sentence_tokens[i][0].lower()
            else:
                #Skip tokens that are just punctuations or numbers
                number_of_neighbor_tokens_tmp += 1
            i += 1

        #Remove duplicates
        adjacent_tokens = list(set(adjacent_tokens))

        #For ends automatically if token is found
        break
    
    if token_directly_following is None:
        token_directly_following = '#_NONE_#'
    
    return (adjacent_tokens, token_directly_following)

In [5]:
#punctuation_without_sep = '!"#$%&\'()*+-/:;<=>?@[\\]^_`{|}~'
#decimal_sep = '.'
#thousands_sep = ','
#minus_sign = '-'
#language = 'en'
#running_time = 0
local_signs = ('.',',','-')
date_dict = load_object('date_dict')


def abstract_get_literals(abstract, entity=None, local_signs=('.',',','-'), constants=('Int', 'Float', 'Date'), include_dates=False, date_dict=None):
    #global running_time
    
    decimal_sep, thousands_sep, minus_sign = local_signs
    type_int, type_float, type_date = constants

    punctuation_without_sep = re.sub('[' + decimal_sep + thousands_sep + minus_sign + ']','',string.punctuation)
    
    global_offset = 0
    entire_list = list()
    
    for sentence in sent_tokenize(abstract):
        #Remove all punctuations except for decimal_separator, thousands_separator and minus_sign
        #translation = sentence.maketrans('', '', punctuation_without_sep)
        #sentence_clean = sentence.translate(translation)
        
        sentence_clean = sentence
        
        #Required to find adjacent tokens
        start = time.clock()
        sentence_tokens = get_sentence_tokens_nltk(sentence)
        #running_time += ((time.clock() - start)*1000)
        
        literals_all = list()
        
        numbers = re.finditer(minus_sign + '?([0-9]+[' + thousands_sep + decimal_sep + ']*)+', sentence_clean)
        
        integers = list()
        floats = list()
        dates = list()
        
        #Numbers (Ints and Floats) are found by Regex
        for match in numbers:
            #print("Number " + str(match.group()) + " starts at position " + str(match.start()))
            
            #Remove dot and comma from beginning and ending
            match_number = str(match.group()).strip(decimal_sep + thousands_sep).replace(minus_sign, '-')
            match_pos = int(match.start())
            
            #Find adjacent tokens with NLTK-list (formerly SpaCy)
            adjacent_tokens, token_directly_following = get_adjacent_tokens(sentence_tokens, match_pos, local_signs)
            
            #For Info-Feature: Position with respect to the entire abstract
            this_global_offset = global_offset + match_pos
            
            #A number won't be an Int if it contains a decimal separator (unless if more than one -> then it is not a decimal separator)
            if match_number.count(decimal_sep) != 1:
                #print("Number " + match_number + " is Int")
                integers.append((re.sub('[' + decimal_sep + thousands_sep + ']', '', match_number), match_pos, type_int, adjacent_tokens, token_directly_following, this_global_offset))
            
            #A number won't be a Float if it has not exactly one decimal separator
            if match_number.count(decimal_sep) == 1:
                #print("Number " + match_number + " is Float")
                floats.append((re.sub('[' + thousands_sep + ']', '', match_number), match_pos, type_float, adjacent_tokens, token_directly_following, this_global_offset))
        
        #Dates are found (in advance) by SpaCy
        if include_dates:
            #Only take dates in this sentence
            start_idx = global_offset
            end_idx = global_offset + len(sentence)
            
            #DBpedia already has many relations 2 times (once for the entire date and once for the year only)
            for date_info in date_dict[entity]:
                if date_info[1] >= start_idx and date_info[1] < end_idx:
                    match_pos = date_info[1] - global_offset
                    adjacent_tokens, token_directly_following = get_adjacent_tokens(sentence_tokens, match_pos, local_signs)
                    dates.append((date_info[0], match_pos, type_date, adjacent_tokens, token_directly_following, date_info[1]))
    
        literals_all.extend(integers)
        literals_all.extend(floats)
        literals_all.extend(dates)
        
        #Sort by Position in Sentence
        literals_all = sorted(literals_all, key=itemgetter(1))
        entire_list.append(literals_all)
        
        global_offset += len(sentence) + 1 #(might rarly make minor errors when a sentence is separated by two whitespaces)
    
    return entire_list

In [6]:
test = 'He is 15.43 years old and has -4,463 friends on Facebook. 1,443,233.44 people live in the same city as him. He is 20.'
#--- TUPLE STRUCTURE ---
#0: Unified Number (as String but can be casted) or Date (as Tuple containing the unified date and the type)
#1: Position/Offset within the sentence
#2: Literal Type (Float, Int or Date)
#3: List of adjacent tokens within the sentence (depends on window size)
#4: Token directly following
#5: Position/Offset within the entire abstract
entire_list = abstract_get_literals(test)
print(entire_list)
#print(element_len(entire_list))
#fitting_list = [[t for t in sent if t[2] == type_float] for sent in entire_list]
#print(fitting_list)

[[('15.43', 6, 'Float', ['years', 'and', 'old', 'he', 'is'], 'years', 6), ('-4463', 30, 'Int', ['and', 'old', 'has', 'facebook', 'friends', 'on'], 'friends', 30)], [('1443233.44', 0, 'Float', ['people', 'live', 'in'], 'people', 58)], [('20', 6, 'Int', ['he', 'is'], '#_NONE_#', 114)]]


In [7]:
def matched_int(fact_number, abstract_number):
    fact_number = int(fact_number)
    abstract_number = int(abstract_number)

    if abstract_number == fact_number:
        return "Matched by equality"
    if abstract_number <= fact_number * 1.015 and abstract_number >= fact_number / 1.015:
        return "Matched by range"
    if fact_number > 1000 and abstract_number == round(fact_number / 100) * 100:
        return "Matched by Hundret round"
    if fact_number > 10000 and abstract_number == round(fact_number / 1000) * 1000:
        return "Matched by Thousand round"
    if fact_number > 100000 and abstract_number == round(fact_number / 10000) * 10000:
        return "Matched by 10K round"
    if fact_number > 1000000 and abstract_number == round(fact_number / 100000) * 100000:
        return "Matched by 100K round"
    if fact_number > 10000000 and abstract_number == round(fact_number / 1000000) * 1000000:
        return "Matched by Mio round"
    if fact_number > 100000000 and abstract_number == round(fact_number / 10000000) * 10000000:
        return "Matched by 10Mio round"
    if fact_number > 1000000000 and abstract_number == round(fact_number / 100000000) * 100000000:
        return "Matched by 100Mio round"
    if fact_number > 10000000000 and abstract_number == round(fact_number / 1000000000) * 1000000000:
        return "Matched by Mrd round"
    return "Not matched"

In [8]:
def matched_float(fact_number, abstract_number):    
    #Number of decimal places
    fact_decimals = decimal.Decimal(fact_number).as_tuple().exponent * (-1)
    
    fact_number = float(fact_number)
    abstract_number = float(abstract_number)
    
    #if fact_number.is_integer():
    #    return matched_int(fact_number, abstract_number)
    
    if abstract_number == fact_number:
        return "Matched by equality"
    if abstract_number <= fact_number * 1.015 and abstract_number >= fact_number / 1.015:
        return "Matched by range"
    #TODO: Würde man auch z.B. 1.23 auf 1.2 runden -> zu fein, oder?
    if fact_decimals >= 3 and abstract_number == round(fact_number, 2):
        return "Matched by 2-Decimal round"
    if fact_decimals >= 4 and abstract_number == round(fact_number, 3):
        return "Matched by 3-Decimal round"
    if fact_decimals >= 5 and abstract_number == round(fact_number, 4):
        return "Matched by 4-Decimal round"
    if fact_decimals >= 6 and abstract_number == round(fact_number, 5):
        return "Matched by 5-Decimal round"
    if fact_decimals >= 7 and abstract_number == round(fact_number, 6):
        return "Matched by 6-Decimal round"
    if fact_decimals >= 8 and abstract_number == round(fact_number, 7):
        return "Matched by 7-Decimal round"
    if fact_decimals >= 9 and abstract_number == round(fact_number, 8):
        return "Matched by 8-Decimal round"
    return "Not matched"

In [9]:
def matched_date(fact_date, abstract_date):
    
    #Match entire date (YMD) against everything
    #Match YM against YM and Y
    #Match Year (Y) only against Year
    if ( abstract_date[1] == 'ymd' and \
           ( fact_date == abstract_date[0] or fact_date == abstract_date[0][:-3] or fact_date == abstract_date[0][:-6] ) ) or \
       ( abstract_date[1] == 'ym' and \
           ( fact_date == abstract_date[0] or fact_date == abstract_date[0][:-3] ) ) or \
       ( abstract_date[1] == 'y' and \
           fact_date == abstract_date[0]):
        return "Matched by equality (date)"
    
    return "Not matched"

In [10]:
file_path_triples_filtered_sorted_by_relation = "dbpedia/mappingbased_literals_en_filtered_sorted_by_relation.ttl"

triples_file_filtered_sorted_by_relation = open(file_path_triples_filtered_sorted_by_relation, 'r', encoding='utf-8')

abstract_dict = load_object("abstract_dict")
relation_type_dict = load_object("relation_type_dict")
negative_dict = load_object('negative_dict')
types_int = load_object("types_int")
types_float = load_object("types_float")
types_date = load_object("types_date")

matched_cnt = collections.Counter()

In [11]:
relations_top20 = [
    'http://dbpedia.org/ontology/numberOfGoals',
    'http://dbpedia.org/ontology/numberOfMatches',
    'http://dbpedia.org/ontology/populationTotal',
    'http://dbpedia.org/ontology/runtime',
    'http://dbpedia.org/ontology/elevation', #5
    'http://dbpedia.org/ontology/squadNumber',
    'http://dbpedia.org/ontology/height',
    'http://dbpedia.org/ontology/areaTotal',
    'http://dbpedia.org/ontology/weight',
    'http://dbpedia.org/ontology/length', #10
    'http://dbpedia.org/ontology/populationDensity',
    'http://dbpedia.org/ontology/areaLand',
    'http://dbpedia.org/ontology/area',
    'http://dbpedia.org/ontology/areaWater',
    'http://dbpedia.org/ontology/maximumElevation', #15
    'http://dbpedia.org/ontology/minimumElevation',
    'http://dbpedia.org/ontology/numberOfEpisodes',
    'http://dbpedia.org/ontology/numberOfStudents',
    'http://dbpedia.org/ontology/shipBeam',
    'http://dbpedia.org/ontology/runwayLength' #20
]

In [12]:
df_cols = ['IntegersInAbstract', 'IntegersInSentence',
           'FloatsInAbstract', 'FloatsInSentence',
           'NumbersInAbstract', 'NumbersInSentence',
           'DatesInAbstract', 'DatesInSentence',
           'FittingPositionInAbstract', 'FittingPositionInSentence',
           'NumberPositionInAbstract', 'NumberPositionInSentence',
           'PositionInAbstract', 'PositionInSentence',
           'SentencePosition', 'StandardDeviationFactor',
           'TokenAround', 'Type', 'Label',
           'InfoEntity', 'InfoFactNumber',
           'InfoAbstractNumber', 'InfoAbstractNumberConverted',
           'InfoTokenAfter', 'InfoMatchingType', 'InfoGlobalOffset']
%store df_cols

def make_new_df(rows, columns):
    new_df = pd.DataFrame(rows, columns=columns)
    #TODO!!!
    #new_df = new_df.sample(frac=0.1).reset_index(drop=True)
    return new_df

Stored 'df_cols' (list)


In [13]:
#This counts the number of base elements in a nested list (e.g a list of a 2-element list and a 3-element list is 5)
def element_len(item):
    if type(item) == list:
        return sum(element_len(subitem) for subitem in item)
    else:
        return 1

In [14]:
instance_types_dict = load_object("instance_types_dict")
relation_stat_dict = load_object("relation_stat_dict")
token_around_counter = collections.Counter()
def create_record(int_list_abstract, \
                  float_list_abstract, \
                  number_list_abstract, \
                  date_list_abstract, \
                  int_list_sentence, \
                  float_list_sentence, \
                  number_list_sentence, \
                  date_list_sentence, \
                  index_abstract, \
                  index_sentence, \
                  index_sentence_fitting, \
                  index_sentence_number, \
                  flat_index, \
                  flat_index_fitting, \
                  flat_index_number, \
                  tokens_around, \
                  token_directly_following, \
                  found,\
                  entity, \
                  relation, \
                  fact_number, \
                  abstract_number, \
                  abstract_number_converted, \
                  matching_type, \
                  global_offset, \
                  relation_type, \
                  instance_types_dict, \
                  relation_stat_dict, 
                  track_token_around_counter):
    
    if track_token_around_counter:
        global token_around_counter
        #Store the frequencies to filter on minimum frequency later on
        for token in tokens_around:
            token_around_counter[token] += 1
    
    record_dict = dict()
    
    record_dict['IntegersInAbstract'] = element_len(int_list_abstract)
    record_dict['IntegersInSentence'] = len(int_list_sentence)
    record_dict['FloatsInAbstract'] = element_len(float_list_abstract)
    record_dict['FloatsInSentence'] = len(float_list_sentence)
    record_dict['NumbersInAbstract'] = element_len(number_list_abstract)
    record_dict['NumbersInSentence'] = len(number_list_sentence)
    record_dict['DatesInAbstract'] = element_len(date_list_abstract)
    record_dict['DatesInSentence'] = len(date_list_sentence)
    record_dict['FittingPositionInAbstract'] = flat_index_fitting
    record_dict['FittingPositionInSentence'] = index_sentence_fitting
    record_dict['NumberPositionInAbstract'] = flat_index_number
    record_dict['NumberPositionInSentence'] = index_sentence_number
    record_dict['PositionInAbstract'] = flat_index
    record_dict['PositionInSentence'] = index_sentence
    record_dict['SentencePosition'] = index_abstract
    if relation in relation_stat_dict:
        record_dict['StandardDeviationFactor'] = (relation_stat_dict[relation][0] - float(abstract_number)) / relation_stat_dict[relation][1] #x times stdev
    else:
        record_dict['StandardDeviationFactor'] = 0
    record_dict['TokenAround'] = tokens_around
    record_dict['Type'] = instance_types_dict[entity]
    record_dict['Label'] = found
    #Administrative Features - will be automatically dropped before classification
    record_dict['InfoEntity'] = entity
    record_dict['InfoFactNumber'] = fact_number
    record_dict['InfoAbstractNumber'] = abstract_number
    record_dict['InfoAbstractNumberConverted'] = abstract_number_converted
    record_dict['InfoTokenAfter'] = token_directly_following
    record_dict['InfoMatchingType'] = matching_type
    record_dict['InfoGlobalOffset'] = global_offset
    
    #return pd.Series(record_dict)
    return record_dict

In [15]:
def clean_df(df, save_set, save_name, predefined_token_set=None):
    print("Cleaning df...")
    
    global token_around_counter
    
    token_set = set()
    
    #Parameters
    min_occurrences_token_around = 10
    top_n_only = 500
    
    if predefined_token_set is None:
        if top_n_only == -1:
            top_n_tokens = [t[0] for t in token_around_counter.most_common()]
        else:
            top_n_tokens = [t[0] for t in token_around_counter.most_common(top_n_only)]
    
    def delete_below_frequency(cell):
        tokens_filtered = [word for word in cell if word in top_n_tokens and token_around_counter[word] >= min_occurrences_token_around]
        
        #Empty list not supported -> add NoneToken
        if not tokens_filtered:
            tokens_filtered = ['#_NONE_#']
        
        token_set.update(tokens_filtered)
        return tokens_filtered
    
    def delete_by_predefined_token_set(cell):
        tokens_filtered = [word for word in cell if word in predefined_token_set]
        #Empty list not supported -> add NoneToken
        if not tokens_filtered:
            tokens_filtered = ['#_NONE_#']
        return tokens_filtered
    
    if predefined_token_set is None:
        df['TokenAround'] = df['TokenAround'].apply(delete_below_frequency)
    else:
        df['TokenAround'] = df['TokenAround'].apply(delete_by_predefined_token_set)
    
    if predefined_token_set is None:
        token_around_counter = collections.Counter()
    
    if save_set:
        save_object(token_set, 'data_info/' + save_name + '_tokens')
    
    return df

In [16]:
def save_df(df, save_name):
    print("Saving df...")
    #name = relation_name.split("/")[-1]
    try:
        df.to_csv('data/' + save_name + '.csv',
                  encoding = 'utf-8',
                  sep = ',',
                  index = False)
        save_object(df.dtypes.to_dict(), 'data_info/' + save_name + '_dtypes')
        print("Saved successfully")
    except PermissionError:
        print("ERROR! Could not save file " + name + ".csv due to Permission Error!")

In [17]:
def convert_to_df_with_dummies(in_df):
    
    if in_df.empty:
        return in_df
    
    try:

        #TODO: Kann man beides vereinheitlichen?

        #see https://stackoverflow.com/questions/37381862/get-dummies-for-pandas-column-containing-list
        #Coding relies on that no record has an empty list (works for dense DataFrames but not for sparse ones!)
        
        print('To sparse...')
        in_df = pd.SparseDataFrame(in_df)
        
        print("apply tuple...")
        in_df['Type'] = in_df['Type'].apply(tuple)

        #print(in_df)
        #print(in_df['TokenAround'].apply(pd.Series))

        print("calc group by columns...")
        group_by_cols = list(in_df.columns.values)
        group_by_cols.remove('TokenAround')
        print("calc dummies...")
        out_df = pd.get_dummies(
          in_df.join(pd.SparseSeries(in_df['TokenAround'].apply(pd.Series).stack().reset_index(1, drop=True),
                            name='TokenAround1')).drop('TokenAround', axis=1).rename(columns={'TokenAround1': 'TokenAround'}),
          columns=['TokenAround']).groupby(by=group_by_cols, as_index=False).sum()
        
        print("apply list...")
        out_df['Type'] = out_df['Type'].apply(list)

        group_by_cols = list(out_df.columns.values)
        group_by_cols.remove('Type')
        out_df = pd.get_dummies(
          out_df.join(pd.SparseSeries(out_df['Type'].apply(pd.Series).stack().reset_index(1, drop=True),
                            name='Type1')).drop('Type', axis=1).rename(columns={'Type1': 'Type'}),
          columns=['Type']).groupby(by=group_by_cols, as_index=False).sum()

        print('To dense...')
        out_df = out_df.to_dense()
    except MemoryError:
        print("MEMORY ERROR! Deleting Columns!")
        out_df = in_df.to_dense().drop(['TokenAround', 'Type'], axis=1)
    
    return out_df

In [18]:
def handle_units(relation, type_searched, abstract_record, convert_units=True, use_gold_standard_units=True, constants=('Int', 'Float', 'Date'), unit_dict=None, unit_conversion_dict=None, negative_dict=None):
    
    type_int, type_float, type_date = constants
    
    if convert_units and relation in unit_dict:
        
        try:
            to_unit = unit_dict[relation]
            if use_gold_standard_units:
                factor = unit_conversion_dict[(abstract_record[4], to_unit, True)]
            elif unit_conversion_dict[(abstract_record[4], to_unit, False)] != False:
                factor = unit_conversion_dict[(abstract_record[4], to_unit, False)][0]
            else:
                raise KeyError

            abstract_number_converted_float = float(abstract_record[0]) * factor

            #if type_searched == type_int: -> Update to ListType
            if type_int in type_searched:
                if not abstract_number_converted_float.is_integer():
                    #We wanted an int but after converting we (still) got a float
                    return None
                abstract_number_converted = str(int(abstract_number_converted_float))
            #elif type_searched == type_float: -> Update to ListType
            elif type_float in type_searched:
                if abstract_number_converted_float.is_integer():
                    #We wanted a float but after converting we (still) got an int
                    return None
                abstract_number_converted = str(abstract_number_converted_float)
            else:
                print('There was some error that should not occur...')
                return None
            #print(abstract_record[0] + ' ' + abstract_record[4] + ' converted to ' + abstract_number_converted + ' ' + to_unit + ' (fact: ' + fact_number + ')')
        except (KeyError, TypeError):
            abstract_number_converted = abstract_record[0]
            #if abstract_record[2] != type_searched: -> Update to ListType
            if abstract_record[2] not in type_searched:
                #We did not apply unit conversion: Check hard if type still fits
                return None
    else:
        abstract_number_converted = abstract_record[0]
        #if abstract_record[2] != type_searched: -> Update to ListType
        if abstract_record[2] not in type_searched:
            #We did not apply unit conversion: Check hard if type still fits
            return None
    
    #Allow negatives only if relation allows negatives
    try:
        if negative_dict[relation] == False and float(abstract_number_converted) < 0:
            return None
    except (KeyError, ValueError):
        pass

    return abstract_number_converted

In [19]:
start_line = 0

token_around_counter = collections.Counter()
triples_file_filtered_sorted_by_relation.seek(0)
prev = None
for i, line in enumerate(triples_file_filtered_sorted_by_relation):    
    if i < start_line:
        continue
    
    triple = split_triple(line)
    
    if only_top20 and triple[1][1:-1] not in relations_top20:
        continue
    
    if triple[1] != prev:
        if prev != None:
            save_name = short_name(prev[1:-1])
            
            print("Make new df...")
            df = make_new_df(rows, df_cols)
            print("Clean df...")
            df = clean_df(df, True, save_name, None)
            print("Convert dummies...")
            df = convert_to_df_with_dummies(df)
            
            print("Save df...")
            #print(df)
            save_df(df, save_name)
        #df = get_new_df()
        rows = list()
        print("Now solving " + triple[1])
    
    
    fact_number = triple[2].split("^^")[0]
    found = False
    prev = triple[1]
    try:
        abstract = abstract_dict[triple[0]]
    except KeyError:
        #print('KeyError for ' + triple[0])
        continue
    
    abstract_literals = abstract_get_literals(abstract, triple[0], local_signs, constants, True, date_dict)
    
    #print(abstract)
    #print(abstract_literals)
    
    if relation_type_dict[triple[1][1:-1]] in types_int:
        type_searched = type_int
    elif relation_type_dict[triple[1][1:-1]] in types_float:
        #Special case: If fact number is typed as double but is actually integer (e.g. 3E6)
        if improve_float_matching and float(fact_number).is_integer():
            fact_number = str(int(float(fact_number)))
            type_searched = type_int
        else:
            type_searched = type_float
    elif relation_type_dict[triple[1][1:-1]] in types_date:
        type_searched = type_date
    else:
        print('There was some error that should not occur...')
        continue
        
    #print("Looking for " + fact_number + "(" + type_searched + ")")
        
    found_somewhere = False
    
    int_list_abstract = [[t for t in sent if t[2] == type_int] for sent in abstract_literals]
    float_list_abstract = [[t for t in sent if t[2] == type_float] for sent in abstract_literals]
    number_list_abstract = [[t for t in sent if t[2] == type_int or t[2] == type_float] for sent in abstract_literals]
    date_list_abstract = [[t for t in sent if t[2] == type_date] for sent in abstract_literals]
        
    flat_index = 0 #-1
    flat_index_number = 0 #-1
    flat_index_fit = 0 #-1
    
    
    for j, sentence in enumerate(abstract_literals):
        
        int_list_sentence = int_list_abstract[j]
        float_list_sentence = float_list_abstract[j]
        number_list_sentence = number_list_abstract[j]
        date_list_sentence = date_list_abstract[j]
        
        k_fit = 0 #-1
        k_number = 0 #-1
        
        for k, abstract_record in enumerate(sentence):

            abstract_number_converted = handle_units(triple[1], type_searched, abstract_record, convert_units, use_gold_standard_units, constants, unit_dict, unit_conversion_dict, negative_dict)
            
            #None is returned whenever the type in abstract number does not fit the searched type (afer possible unit conversions)
            if abstract_number_converted is not None:
                
                #print(str(abstract_number_converted) + ' is valid.')

                found = False
                if type_searched == type_int:
                    match_result = matched_int(fact_number, abstract_number_converted)
                elif type_searched == type_float:
                    match_result = matched_float(fact_number, abstract_number_converted)
                elif type_searched == type_date:
                    match_result = matched_date(fact_number, abstract_number_converted)
                else:
                    raise ValueError('Invalid Type: ' + type_searched)


                if match_result != 'Not matched':
                    found = True
                    found_somewhere = True
                    matched_cnt[match_result] += 1                      

                rows.append(create_record(int_list_abstract = int_list_abstract, \
                                          float_list_abstract = float_list_abstract, \
                                          number_list_abstract = number_list_abstract, \
                                          date_list_abstract = date_list_abstract, \
                                          int_list_sentence = int_list_sentence, \
                                          float_list_sentence = float_list_sentence, \
                                          number_list_sentence = number_list_sentence, \
                                          date_list_sentence = date_list_sentence, \
                                          index_abstract = j, \
                                          index_sentence = k, \
                                          index_sentence_fitting = k_fit, \
                                          index_sentence_number = k_number, \
                                          flat_index = flat_index, \
                                          flat_index_fitting = flat_index_fit, \
                                          flat_index_number = flat_index_number, \
                                          tokens_around = abstract_record[3], \
                                          token_directly_following = abstract_record[4], \
                                          found = found, \
                                          entity = triple[0], \
                                          relation = triple[1], \
                                          fact_number = fact_number, \
                                          abstract_number = abstract_record[0], \
                                          abstract_number_converted = abstract_number_converted, \
                                          matching_type = match_result, \
                                          global_offset = abstract_record[5], \
                                          relation_type = type_searched, \
                                          instance_types_dict = instance_types_dict, \
                                          relation_stat_dict = relation_stat_dict,
                                          track_token_around_counter = True))
            
            flat_index += 1
            
            if abstract_record[2] == type_int or abstract_record[2] == type_float:
                k_number +=1
                flat_index_number += 1
            
            if abstract_record[2] == type_searched:
                k_fit += 1
                flat_index_fit += 1
                
    if not found_somewhere:
        matched_cnt['Not matched'] += 1
        
    if i % 1000 == 0:
        print(str(i) + " entries processed")
        #print("Running Time for sentence parsing (nlp): " + str(running_time) + "ms")

save_name = short_name(prev[1:-1])
df = make_new_df(rows, df_cols)
df = clean_df(df, True, save_name, None)
df = convert_to_df_with_dummies(df)
save_df(df, save_name)   
print("Finished")

Now solving <http://dbpedia.org/ontology/absoluteMagnitude>
0 entries processed
1000 entries processed
2000 entries processed
3000 entries processed
Make new df...
Clean df...
Cleaning df...
Convert dummies...
To sparse...
apply tuple...
calc group by columns...
calc dummies...
apply list...
To dense...
Save df...
Saving df...
Saved successfully
Now solving <http://dbpedia.org/ontology/acceleration>
Make new df...
Clean df...
Cleaning df...
Convert dummies...
Save df...
Saving df...
Saved successfully
Now solving <http://dbpedia.org/ontology/acquirementDate>
5000 entries processed
6000 entries processed
7000 entries processed
8000 entries processed
9000 entries processed
Make new df...
Clean df...
Cleaning df...
Convert dummies...
To sparse...
apply tuple...
calc group by columns...
calc dummies...
apply list...
To dense...
Save df...
Saving df...
Saved successfully
Now solving <http://dbpedia.org/ontology/activeYearsEndDate>
14000 entries processed
23000 entries processed
27000 entrie

477000 entries processed
479000 entries processed
480000 entries processed
481000 entries processed
482000 entries processed
483000 entries processed
484000 entries processed
485000 entries processed
487000 entries processed
488000 entries processed
489000 entries processed
490000 entries processed
491000 entries processed
492000 entries processed
493000 entries processed
494000 entries processed
495000 entries processed
497000 entries processed
498000 entries processed
499000 entries processed
500000 entries processed
501000 entries processed
502000 entries processed
503000 entries processed
504000 entries processed
505000 entries processed
506000 entries processed
507000 entries processed
508000 entries processed
509000 entries processed
510000 entries processed
511000 entries processed
512000 entries processed
513000 entries processed
514000 entries processed
516000 entries processed
517000 entries processed
518000 entries processed
519000 entries processed
520000 entries processed


810000 entries processed
811000 entries processed
812000 entries processed
813000 entries processed
814000 entries processed
815000 entries processed
816000 entries processed
817000 entries processed
818000 entries processed
819000 entries processed
820000 entries processed
821000 entries processed
822000 entries processed
823000 entries processed
824000 entries processed
825000 entries processed
826000 entries processed
827000 entries processed
828000 entries processed
829000 entries processed
830000 entries processed
831000 entries processed
832000 entries processed
833000 entries processed
834000 entries processed
835000 entries processed
836000 entries processed
837000 entries processed
838000 entries processed
839000 entries processed
840000 entries processed
841000 entries processed
842000 entries processed
843000 entries processed
844000 entries processed
845000 entries processed
846000 entries processed
847000 entries processed
848000 entries processed
849000 entries processed


apply list...
To dense...
Save df...
Saving df...
Saved successfully
Now solving <http://dbpedia.org/ontology/averageSpeed>
Make new df...
Clean df...
Cleaning df...
Convert dummies...
To sparse...
apply tuple...
calc group by columns...
calc dummies...
apply list...
To dense...
Save df...
Saving df...
Saved successfully
Now solving <http://dbpedia.org/ontology/barPassRate>
1055000 entries processed
Make new df...
Clean df...
Cleaning df...
Convert dummies...
To sparse...
apply tuple...
calc group by columns...
calc dummies...
apply list...
To dense...
Save df...
Saving df...
Saved successfully
Now solving <http://dbpedia.org/ontology/beatifiedDate>
Make new df...
Clean df...
Cleaning df...
Convert dummies...
To sparse...
apply tuple...
calc group by columns...
calc dummies...
apply list...
To dense...
Save df...
Saving df...
Saved successfully
Now solving <http://dbpedia.org/ontology/bedCount>
1056000 entries processed
1057000 entries processed
Make new df...
Clean df...
Cleaning df..

1317000 entries processed
1318000 entries processed
1319000 entries processed
1320000 entries processed
1321000 entries processed
1322000 entries processed
1323000 entries processed
1324000 entries processed
1325000 entries processed
1326000 entries processed
1327000 entries processed
1328000 entries processed
1329000 entries processed
1330000 entries processed
1331000 entries processed
1332000 entries processed
1333000 entries processed
1334000 entries processed
1335000 entries processed
1336000 entries processed
1337000 entries processed
1338000 entries processed
1339000 entries processed
1340000 entries processed
1341000 entries processed
1342000 entries processed
1343000 entries processed
1344000 entries processed
1345000 entries processed
1346000 entries processed
1347000 entries processed
1348000 entries processed
1349000 entries processed
1350000 entries processed
1351000 entries processed
1352000 entries processed
1353000 entries processed
1354000 entries processed
1355000 entr

1633000 entries processed
1634000 entries processed
1635000 entries processed
1636000 entries processed
1637000 entries processed
1638000 entries processed
1639000 entries processed
1640000 entries processed
1641000 entries processed
1642000 entries processed
1643000 entries processed
1644000 entries processed
1645000 entries processed
1646000 entries processed
1647000 entries processed
1648000 entries processed
1649000 entries processed
1650000 entries processed
1651000 entries processed
1652000 entries processed
1653000 entries processed
1654000 entries processed
1655000 entries processed
1656000 entries processed
1657000 entries processed
1658000 entries processed
1659000 entries processed
1660000 entries processed
1661000 entries processed
1662000 entries processed
1663000 entries processed
1664000 entries processed
1665000 entries processed
1666000 entries processed
1667000 entries processed
1668000 entries processed
1669000 entries processed
1670000 entries processed
1671000 entr

1874000 entries processed
1875000 entries processed
1876000 entries processed
1877000 entries processed
1878000 entries processed
1879000 entries processed
1880000 entries processed
1881000 entries processed
1882000 entries processed
1883000 entries processed
1884000 entries processed
1885000 entries processed
1886000 entries processed
Make new df...
Clean df...
Cleaning df...
Convert dummies...
To sparse...
apply tuple...
calc group by columns...
calc dummies...
apply list...
To dense...
Save df...
Saving df...
Saved successfully
Now solving <http://dbpedia.org/ontology/capacityFactor>
Make new df...
Clean df...
Cleaning df...
Convert dummies...
To sparse...
apply tuple...
calc group by columns...
calc dummies...
apply list...
To dense...
Save df...
Saving df...
Saved successfully
Now solving <http://dbpedia.org/ontology/capitalElevation>
Make new df...
Clean df...
Cleaning df...
Convert dummies...
To sparse...
apply tuple...
calc group by columns...
calc dummies...
apply list...
To d

1952000 entries processed
1953000 entries processed
1954000 entries processed
1955000 entries processed
Make new df...
Clean df...
Cleaning df...
Convert dummies...
To sparse...
apply tuple...
calc group by columns...
calc dummies...
apply list...
To dense...
Save df...
Saving df...
Saved successfully
Now solving <http://dbpedia.org/ontology/course>
1956000 entries processed
1957000 entries processed
1958000 entries processed
Make new df...
Clean df...
Cleaning df...
Convert dummies...
To sparse...
apply tuple...
calc group by columns...
calc dummies...
apply list...
To dense...
Save df...
Saving df...
Saved successfully
Now solving <http://dbpedia.org/ontology/crews>
Make new df...
Clean df...
Cleaning df...
Convert dummies...
To sparse...
apply tuple...
calc group by columns...
calc dummies...
apply list...
To dense...
Save df...
Saving df...
Saved successfully
Now solving <http://dbpedia.org/ontology/currentRank>
Make new df...
Clean df...
Cleaning df...
Convert dummies...
To sparse

2133000 entries processed
2134000 entries processed
2135000 entries processed
2136000 entries processed
2137000 entries processed
2138000 entries processed
2139000 entries processed
2140000 entries processed
2141000 entries processed
2142000 entries processed
2143000 entries processed
2144000 entries processed
2145000 entries processed
2146000 entries processed
2147000 entries processed
2148000 entries processed
2149000 entries processed
2150000 entries processed
2151000 entries processed
2152000 entries processed
2153000 entries processed
2154000 entries processed
2155000 entries processed
2156000 entries processed
2157000 entries processed
2158000 entries processed
2159000 entries processed
2160000 entries processed
2161000 entries processed
2162000 entries processed
2163000 entries processed
2164000 entries processed
2165000 entries processed
2166000 entries processed
2167000 entries processed
2168000 entries processed
2169000 entries processed
2170000 entries processed
2171000 entr

apply list...
To dense...
Save df...
Saving df...
Saved successfully
Now solving <http://dbpedia.org/ontology/dissolutionYear>
2284000 entries processed
2285000 entries processed
2286000 entries processed
Make new df...
Clean df...
Cleaning df...
Convert dummies...
To sparse...
apply tuple...
calc group by columns...
calc dummies...
apply list...
To dense...
Save df...
Saving df...
Saved successfully
Now solving <http://dbpedia.org/ontology/dissolved>
Make new df...
Clean df...
Cleaning df...
Convert dummies...
To sparse...
apply tuple...
calc group by columns...
calc dummies...
apply list...
To dense...
Save df...
Saving df...
Saved successfully
Now solving <http://dbpedia.org/ontology/distance>
2287000 entries processed
2288000 entries processed
2289000 entries processed
Make new df...
Clean df...
Cleaning df...
Convert dummies...
To sparse...
apply tuple...
calc group by columns...
calc dummies...
apply list...
To dense...
Save df...
Saving df...
Saved successfully
Now solving <http

2429000 entries processed
2430000 entries processed
2431000 entries processed
2432000 entries processed
2433000 entries processed
2434000 entries processed
2435000 entries processed
2436000 entries processed
2437000 entries processed
2438000 entries processed
2439000 entries processed
2440000 entries processed
2441000 entries processed
2442000 entries processed
2443000 entries processed
2444000 entries processed
2445000 entries processed
2446000 entries processed
2447000 entries processed
2448000 entries processed
2449000 entries processed
2450000 entries processed
2451000 entries processed
2452000 entries processed
2453000 entries processed
2454000 entries processed
2455000 entries processed
2456000 entries processed
2457000 entries processed
2458000 entries processed
2459000 entries processed
2460000 entries processed
2461000 entries processed
2462000 entries processed
2463000 entries processed
2464000 entries processed
2465000 entries processed
2466000 entries processed
2467000 entr

2622000 entries processed
2623000 entries processed
2624000 entries processed
2625000 entries processed
2626000 entries processed
2627000 entries processed
2628000 entries processed
2629000 entries processed
2630000 entries processed
2631000 entries processed
Make new df...
Clean df...
Cleaning df...
Convert dummies...
To sparse...
apply tuple...
calc group by columns...
calc dummies...
apply list...
To dense...
Save df...
Saving df...
Saved successfully
Now solving <http://dbpedia.org/ontology/failedLaunches>
Make new df...
Clean df...
Cleaning df...
Convert dummies...
To sparse...
apply tuple...
calc group by columns...
calc dummies...
apply list...
To dense...
Save df...
Saving df...
Saved successfully
Now solving <http://dbpedia.org/ontology/fastestLap>
2632000 entries processed
Make new df...
Clean df...
Cleaning df...
Convert dummies...
To sparse...
apply tuple...
calc group by columns...
calc dummies...
apply list...
To dense...
Save df...
Saving df...
Saved successfully
Now sol

2722000 entries processed
2723000 entries processed
2724000 entries processed
2725000 entries processed
Make new df...
Clean df...
Cleaning df...
Convert dummies...
To sparse...
apply tuple...
calc group by columns...
calc dummies...
apply list...
To dense...
Save df...
Saving df...
Saved successfully
Now solving <http://dbpedia.org/ontology/foundingYear>
2726000 entries processed
2727000 entries processed
2728000 entries processed
2729000 entries processed
2730000 entries processed
2731000 entries processed
2732000 entries processed
2733000 entries processed
2734000 entries processed
2735000 entries processed
2736000 entries processed
2737000 entries processed
2738000 entries processed
2740000 entries processed
2741000 entries processed
2742000 entries processed
2743000 entries processed
2744000 entries processed
2745000 entries processed
2746000 entries processed
2747000 entries processed
2748000 entries processed
2749000 entries processed
2750000 entries processed
2751000 entries pr

2936000 entries processed
2937000 entries processed
2938000 entries processed
2939000 entries processed
2940000 entries processed
2941000 entries processed
2942000 entries processed
2943000 entries processed
2944000 entries processed
2945000 entries processed
2946000 entries processed
2947000 entries processed
2948000 entries processed
2949000 entries processed
2950000 entries processed
2951000 entries processed
2952000 entries processed
2953000 entries processed
2955000 entries processed
2956000 entries processed
2957000 entries processed
2958000 entries processed
2959000 entries processed
2960000 entries processed
2961000 entries processed
2962000 entries processed
2963000 entries processed
2964000 entries processed
2965000 entries processed
2966000 entries processed
2967000 entries processed
2968000 entries processed
2969000 entries processed
2970000 entries processed
2971000 entries processed
2972000 entries processed
2973000 entries processed
2974000 entries processed
2975000 entr

Make new df...
Clean df...
Cleaning df...
Convert dummies...
To sparse...
apply tuple...
calc group by columns...
calc dummies...
apply list...
To dense...
Save df...
Saving df...
Saved successfully
Now solving <http://dbpedia.org/ontology/lastLaunchDate>
Make new df...
Clean df...
Cleaning df...
Convert dummies...
To sparse...
apply tuple...
calc group by columns...
calc dummies...
apply list...
To dense...
Save df...
Saving df...
Saved successfully
Now solving <http://dbpedia.org/ontology/lastPosition>
Make new df...
Clean df...
Cleaning df...
Convert dummies...
To sparse...
apply tuple...
calc group by columns...
calc dummies...
apply list...
To dense...
Save df...
Saving df...
Saved successfully
Now solving <http://dbpedia.org/ontology/lastPublicationDate>
3091000 entries processed
Make new df...
Clean df...
Cleaning df...
Convert dummies...
To sparse...
apply tuple...
calc group by columns...
calc dummies...
apply list...
To dense...
Save df...
Saving df...
Saved successfully
Now 

3198000 entries processed
Make new df...
Clean df...
Cleaning df...
Convert dummies...
To sparse...
apply tuple...
calc group by columns...
calc dummies...
apply list...
To dense...
Save df...
Saving df...
Saved successfully
Now solving <http://dbpedia.org/ontology/maximumElevation>
3199000 entries processed
3200000 entries processed
3201000 entries processed
3202000 entries processed


IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



5836000 entries processed
5838000 entries processed
5839000 entries processed
5840000 entries processed
5841000 entries processed
5842000 entries processed
5843000 entries processed
5845000 entries processed
5846000 entries processed
5847000 entries processed
5848000 entries processed
5849000 entries processed
5850000 entries processed
5852000 entries processed
5853000 entries processed
5854000 entries processed
5855000 entries processed
5856000 entries processed
5857000 entries processed
5858000 entries processed
5859000 entries processed
5860000 entries processed
5861000 entries processed
5862000 entries processed
5863000 entries processed
5865000 entries processed
5866000 entries processed
5867000 entries processed
5868000 entries processed
5869000 entries processed
5870000 entries processed
5871000 entries processed
5872000 entries processed
5873000 entries processed
5874000 entries processed
5875000 entries processed
5876000 entries processed
5877000 entries processed
5878000 entr

6021000 entries processed
6022000 entries processed
6023000 entries processed
6024000 entries processed
6025000 entries processed
6026000 entries processed
6027000 entries processed
6028000 entries processed
6029000 entries processed
6030000 entries processed
6031000 entries processed
6032000 entries processed
Make new df...
Clean df...
Cleaning df...
Convert dummies...
To sparse...
apply tuple...
calc group by columns...
calc dummies...
apply list...
To dense...
Save df...
Saving df...
Saved successfully
Now solving <http://dbpedia.org/ontology/shoeNumber>
Make new df...
Clean df...
Cleaning df...
Convert dummies...
To sparse...
apply tuple...
calc group by columns...
calc dummies...
apply list...
To dense...
Save df...
Saving df...
Saved successfully
Now solving <http://dbpedia.org/ontology/shoreLength>
6033000 entries processed
6034000 entries processed
Make new df...
Clean df...
Cleaning df...
Convert dummies...
To sparse...
apply tuple...
calc group by columns...
calc dummies...
a

To dense...
Save df...
Saving df...
Saved successfully
Now solving <http://dbpedia.org/ontology/totalLaunches>
6332000 entries processed
Make new df...
Clean df...
Cleaning df...
Convert dummies...
To sparse...
apply tuple...
calc group by columns...
calc dummies...
apply list...
To dense...
Save df...
Saving df...
Saved successfully
Now solving <http://dbpedia.org/ontology/totalPopulation>
6333000 entries processed
6334000 entries processed
6335000 entries processed
6336000 entries processed
Make new df...
Clean df...
Cleaning df...
Convert dummies...
To sparse...
apply tuple...
calc group by columns...
calc dummies...
apply list...
To dense...
Save df...
Saving df...
Saved successfully
Now solving <http://dbpedia.org/ontology/totalTravellers>
Make new df...
Clean df...
Cleaning df...
Convert dummies...
To sparse...
apply tuple...
calc group by columns...
calc dummies...
apply list...
To dense...
Save df...
Saving df...
Saved successfully
Now solving <http://dbpedia.org/ontology/track

Saved successfully
Now solving <http://dbpedia.org/ontology/width>
6455000 entries processed
6456000 entries processed
6457000 entries processed
6458000 entries processed
6460000 entries processed
6462000 entries processed
6463000 entries processed
6464000 entries processed
6465000 entries processed
6466000 entries processed
6467000 entries processed
Make new df...
Clean df...
Cleaning df...
Convert dummies...
To sparse...
apply tuple...
calc group by columns...
calc dummies...
apply list...
To dense...
Save df...
Saving df...
Saved successfully
Now solving <http://dbpedia.org/ontology/wins>
6468000 entries processed
Make new df...
Clean df...
Cleaning df...
Convert dummies...
To sparse...
apply tuple...
calc group by columns...
calc dummies...
apply list...
To dense...
Save df...
Saving df...
Saved successfully
Now solving <http://dbpedia.org/ontology/year>
6473000 entries processed
6474000 entries processed
6475000 entries processed
6476000 entries processed
6477000 entries processed

In [20]:
matched_cnt

Counter({'Matched by range': 35164,
         'Not matched': 3535012,
         'Matched by equality': 281782,
         'Matched by equality (date)': 912003,
         'Matched by 2-Decimal round': 152,
         'Matched by Mio round': 23,
         'Matched by 100K round': 17,
         'Matched by 10Mio round': 11,
         'Matched by Mrd round': 2,
         'Matched by 100Mio round': 3,
         'Matched by Hundret round': 622,
         'Matched by Thousand round': 253,
         'Matched by 10K round': 62,
         'Matched by 4-Decimal round': 1,
         'Matched by 3-Decimal round': 11})