In [1]:
import nbimporter
import create_trainings as ctr
from utils.myutils import load_object, short_name

Importing Jupyter notebook from create_trainings.ipynb


In [None]:
# Tags identifying the kind of literal; they are compared against the third
# field of each extracted literal and handed to the ctr helpers as `constants`.
type_int, type_float, type_date = "Int", "Float", "Date"
constants = (type_int, type_float, type_date)

# Parameters
# For float-typed relations, the float proportion of the relation decides what
# is searched: below only_int_threshold -> Int only, above
# only_float_threshold -> Float only, anything in between -> both.
only_int_threshold = 0.2
only_float_threshold = 0.8

# Number of records per relation after which an intermediate data frame is
# written out and the in-memory row buffer is cleared.
break_point = 500000

In [3]:
abstract_dict = load_object('abstract_dict')
relation_type_dict = load_object('relation_type_dict')
candidate_dict = load_object('candidate_dict')
date_dict = load_object('date_dict')
unit_dict = load_object('unit_dict')
unit_conversion_dict = load_object('unit_conversion_dict')
instance_types_dict = load_object('instance_types_dict')
relation_stat_dict = load_object('relation_stat_dict')
negative_dict = load_object('negative_dict')
types_int = load_object('types_int')
types_float = load_object('types_float')
types_date = load_object('types_date')
float_proportion_dict = load_object('float_proportion_dict')

%store -r df_cols

In [5]:
# Build the candidate data frames: for every relation that has a stored model
# and token set, extract the literals from each candidate's abstract, convert
# them to the type searched for the relation and turn every fitting literal
# into one record. Records are written to 'candidates/<relation short name>';
# whenever break_point records have accumulated, an intermediate frame with a
# running suffix is written first and the row buffer is cleared.
#
# NOTE: this cell calls validate_date(), so the cell defining that function
# has to be executed before this one.
rows = list()
local_signs = ('.', ',', '-')
record_cnt = 0
running_suffix = 0

for relation, candidates in candidate_dict.items():

    print('---' + relation + '---')

    # The relation key without its first and last character (presumably the
    # enclosing '<' and '>' of a URI - TODO confirm).
    relation_key = relation[1:-1]
    rel_short_name = short_name(relation_key)

    # Only relations for which a model object exists are processed.
    try:
        load_object('models/' + rel_short_name)
    except FileNotFoundError:
        print('Not processed')
        continue

    try:
        token_set = load_object('data_info/' + rel_short_name + '_tokens')
    except FileNotFoundError:
        print("Cannot process relation " + rel_short_name + " - Token Set missing!")
        continue

    # Counters are kept per relation.
    record_cnt = 0
    running_suffix = 0

    for candidate in candidates:
        try:
            abstract_literals = ctr.abstract_get_literals(abstract_dict[candidate], candidate, local_signs, constants, True, date_dict)
        except KeyError:
            # No Abstract -> No Result
            # NOTE(review): this also hides a KeyError raised inside
            # abstract_get_literals itself - confirm that is intended.
            continue

        # Decide which literal type(s) are searched for this relation.
        rel_value_type = relation_type_dict[relation_key]
        if rel_value_type in types_int:
            type_searched = type_int
        elif rel_value_type in types_float:
            float_share = float_proportion_dict[relation]
            if float_share < only_int_threshold:
                type_searched = type_int
            elif float_share > only_float_threshold:
                type_searched = type_float
            else:
                type_searched = [type_float, type_int]
        elif rel_value_type in types_date:
            type_searched = type_date
        else:
            print('There was some error that should not occur...')
            continue

        # type_searched is either a single type name or a list of names.
        # Normalise it to a tuple for the "fitting" counters below, so the
        # membership test compares whole type names; `x in "Int"` on a bare
        # string would be a substring test instead.
        if isinstance(type_searched, list):
            fitting_types = tuple(type_searched)
        else:
            fitting_types = (type_searched,)

        # Per-sentence literal lists, filtered by the type tag in field 2.
        int_list_abstract = [[t for t in sent if t[2] == type_int] for sent in abstract_literals]
        float_list_abstract = [[t for t in sent if t[2] == type_float] for sent in abstract_literals]
        number_list_abstract = [[t for t in sent if t[2] == type_int or t[2] == type_float] for sent in abstract_literals]
        date_list_abstract = [[t for t in sent if t[2] == type_date] for sent in abstract_literals]

        # Running positions over the whole abstract: all literals, numeric
        # literals only, and literals of the searched type only.
        flat_index = 0
        flat_index_number = 0
        flat_index_fit = 0

        for j, sentence in enumerate(abstract_literals):

            int_list_sentence = int_list_abstract[j]
            float_list_sentence = float_list_abstract[j]
            number_list_sentence = number_list_abstract[j]
            date_list_sentence = date_list_abstract[j]

            # Positions within the current sentence (searched type / numeric).
            k_fit = 0
            k_number = 0

            for k, abstract_record in enumerate(sentence):

                abstract_number_converted = ctr.handle_units(relation, type_searched, abstract_record, True, True, constants, unit_dict, unit_conversion_dict, negative_dict)

                # For Dates
                if type_searched == type_date:
                    abstract_number_converted = validate_date(abstract_number_converted, relation)

                # None is returned whenever the type in abstract number does not fit the searched type (after possible unit conversions)
                if abstract_number_converted is not None:

                    # Label hardcoded to False - actually it is not defined yet
                    # Fact_number hardcoded to 0 - no fact_number available for new candidates
                    # Matching_type hardcoded to 'N/A' - no matching for candidates
                    rows.append(ctr.create_record(int_list_abstract = int_list_abstract,
                                                  float_list_abstract = float_list_abstract,
                                                  number_list_abstract = number_list_abstract,
                                                  date_list_abstract = date_list_abstract,
                                                  int_list_sentence = int_list_sentence,
                                                  float_list_sentence = float_list_sentence,
                                                  number_list_sentence = number_list_sentence,
                                                  date_list_sentence = date_list_sentence,
                                                  index_abstract = j,
                                                  index_sentence = k,
                                                  index_sentence_fitting = k_fit,
                                                  index_sentence_number = k_number,
                                                  flat_index = flat_index,
                                                  flat_index_fitting = flat_index_fit,
                                                  flat_index_number = flat_index_number,
                                                  tokens_around = abstract_record[3],
                                                  token_directly_following = abstract_record[4],
                                                  found = False,
                                                  entity = candidate,
                                                  relation = relation,
                                                  fact_number = '0',
                                                  abstract_number = abstract_record[0],
                                                  abstract_number_converted = abstract_number_converted,
                                                  matching_type = 'N/A',
                                                  global_offset = abstract_record[5],
                                                  relation_type = type_searched,
                                                  instance_types_dict = instance_types_dict,
                                                  relation_stat_dict = relation_stat_dict,
                                                  track_token_around_counter = False))
                    record_cnt += 1

                # The counters are advanced after the record was created, so a
                # record carries the positions of the literals preceding it.
                flat_index += 1

                if abstract_record[2] == type_int or abstract_record[2] == type_float:
                    k_number += 1
                    flat_index_number += 1

                if abstract_record[2] in fitting_types:
                    k_fit += 1
                    flat_index_fit += 1

        # Checked once per candidate: flush the buffer to an intermediate,
        # suffixed file when enough records have accumulated.
        if record_cnt >= break_point:
            print('Breakpoint reached. Generating intermediate set...')
            df = ctr.make_new_df(rows, df_cols)
            df = ctr.clean_df(df, False, rel_short_name, token_set)
            df = ctr.convert_to_df_with_dummies(df)
            ctr.save_df(df, 'candidates/' + rel_short_name + '_' + str(running_suffix))
            rows = list()
            record_cnt = 0
            running_suffix += 1

    # Write the remaining rows of this relation to the unsuffixed file.
    df = ctr.make_new_df(rows, df_cols)
    df = ctr.clean_df(df, False, rel_short_name, token_set)
    df = ctr.convert_to_df_with_dummies(df)
    ctr.save_df(df, 'candidates/' + rel_short_name)
    rows = list()

Cannot process relation absoluteMagnitude - Token Set missing!
Cannot process relation acceleration - Token Set missing!
Cannot process relation acquirementDate - Token Set missing!
Cannot process relation activeYearsEndDate - Token Set missing!
Cannot process relation activeYearsEndYear - Token Set missing!
Cannot process relation activeYearsStartDate - Token Set missing!
Cannot process relation activeYearsStartYear - Token Set missing!
Cannot process relation added - Token Set missing!
Cannot process relation ageRange - Token Set missing!
Cannot process relation albedo - Token Set missing!
Cannot process relation anniversary - Token Set missing!
Cannot process relation apoapsis - Token Set missing!
Cannot process relation apparentMagnitude - Token Set missing!
Cannot process relation approximateCalories - Token Set missing!
Cleaning df...
To sparse...
apply tuple...
calc group by columns...
calc dummies...
apply list...
To dense...
Saving df...
Saved successfully
Cleaning df...
To sp

Cannot process relation numberOfSuites - Token Set missing!
Cannot process relation numberOfTeams - Token Set missing!
Cannot process relation numberOfTracks - Token Set missing!
Cannot process relation numberOfUndergraduateStudents - Token Set missing!
Cannot process relation numberOfVehicles - Token Set missing!
Cannot process relation numberOfVineyards - Token Set missing!
Cannot process relation numberOfVisitors - Token Set missing!
Cannot process relation numberOfVisitorsAsOf - Token Set missing!
Cannot process relation numberOfVolumes - Token Set missing!
Cannot process relation numberOfVolunteers - Token Set missing!
Cannot process relation numberOfWineries - Token Set missing!
Cannot process relation numberSold - Token Set missing!
Cannot process relation omim - Token Set missing!
Cannot process relation onChromosome - Token Set missing!
Cannot process relation openingDate - Token Set missing!
Cannot process relation openingYear - Token Set missing!
Cannot process relation oper

In [1]:
#Test
def validate_date(input_date, input_rel):
    """Reduce an extracted date to the precision of the relation's date type.

    Args:
        input_date: None, or an indexable pair whose element 0 is the date
            text and element 1 its precision code ('ymd', 'ym' or 'y').
            The text is presumably ISO formatted ('YYYY-MM-DD') - TODO confirm.
        input_rel: relation key; its first and last character are dropped
            before it is looked up in the global relation_type_dict.

    Returns:
        The date text, with trailing characters cut off where the date is
        more precise than the relation type, or None if input_date is None,
        the date is less precise than required, or the relation type is not
        one of xsd:date, xsd:gYear, xsd:gYearMonth.
    """
    if input_date is None:
        return None

    xsd_type = relation_type_dict[input_rel[1:-1]]

    # For each supported type: accepted precision codes, paired with the
    # number of trailing characters to remove from the date text.
    if xsd_type == 'http://www.w3.org/2001/XMLSchema#date':
        accepted = (('ymd', 0),)
    elif xsd_type == 'http://www.w3.org/2001/XMLSchema#gYear':
        accepted = (('ymd', 6), ('ym', 3), ('y', 0))
    elif xsd_type == 'http://www.w3.org/2001/XMLSchema#gYearMonth':
        accepted = (('ymd', 3), ('ym', 0))
    else:
        return None

    for precision, cut in accepted:
        if input_date[1] == precision:
            text = input_date[0]
            return text[:-cut] if cut else text

    return None