In [1]:
import os
import re
import pandas as pd
import numpy as np
from utils.myutils import load_object, long_name

In [2]:
# Parameter: when True, every positively predicted candidate is additionally
# written as a triple to the file opened at the end of this cell.
generate_triples = True

candidate_dict = load_object('candidate_dict')

# The candidate folder is kept twice: as text for building file paths and as
# bytes for os.listdir (whose results are decoded again with os.fsdecode).
directory_name = "data/candidates"
directory = os.fsencode(directory_name)

if generate_triples:
    file_path_triples_new = "dbpedia/new_triples.ttl"
    relation_type_dict = load_object('relation_type_dict')
    types_float = load_object('types_float')
    # NOTE: this handle stays open across cells; the prediction cell is
    # responsible for closing it once all candidate files are processed.
    new_triples_file = open(file_path_triples_new, mode='w', encoding='utf-8')

In [3]:
# Debugging aid: to process a single file only, set e.g. eval_file = 'elevation_2.csv'
# and skip every file_name != eval_file at the top of the loop below.


def _align_features(frame, feature_names):
    """Return a copy of ``frame`` restricted to, and ordered like, ``feature_names``.

    Columns of ``frame`` that are not listed in ``feature_names`` are dropped,
    features missing from ``frame`` are added as all-zero columns, and the
    result is re-ordered to match ``feature_names``. The ordering matters
    because scikit-learn is not aware of feature names and relies purely on
    column position.
    """
    surplus = [col for col in frame.columns if col not in feature_names]
    aligned = frame.drop(surplus, axis=1)

    # Add empty features not contained in the candidates
    for col in feature_names:
        if col not in aligned.columns:
            aligned[col] = 0

    return aligned[feature_names]


try:
    for file in os.listdir(directory):
        file_name = os.fsdecode(file)

        if not file_name.endswith(".csv"):
            continue

        print('---' + file_name + '---')

        file_stem = file_name[:-4]
        # A trailing "_<digit>" is stripped so that e.g. "areaTotal_2.csv" uses the
        # model artefacts stored under "areaTotal" (presumably numbered parts of one
        # relation -- confirm against the candidate generation step).
        relation_name = re.sub(r'_[0-9]$', '', file_stem)
        real_file_name = relation_name + '.csv'

        try:
            model = load_object('models/' + relation_name)
            confidence_threshold = load_object('models/' + relation_name + '_confidence')
            original_features = load_object('models/' + relation_name + '_features')
        except FileNotFoundError:
            print('Data not (fully) available')
            continue

        try:
            dtypes = load_object('data_info/candidates/' + file_stem + '_dtypes')
            if not isinstance(dtypes, dict):
                dtypes = dtypes.to_dict()
        except (FileNotFoundError, AttributeError):
            print('WARNING! No dtype information available!')
            dtypes = None

        # dtypes is None when no dtype information could be loaded; len(None) would raise.
        if dtypes is not None:
            print(len(dtypes))

        data = pd.read_csv(directory_name + "/" + file_name,
                           encoding="utf-8",
                           dtype=dtypes,
                           sep=',')

        print(len(data.columns))

        # Some algorithms have problems with certain characters in feature names
        data.columns = [re.sub(r'[\[\]<>]', 'X', elem) for elem in data.columns]

        # Very important: bring the candidates into the exact feature layout of the model
        fit_data = _align_features(data, original_features)

        print(len(data.columns))

        try:
            data_proba = model.predict_proba(fit_data)
        except ValueError:
            print('Got ValueError. Trying to convert and clean data...')
            old_shape = fit_data.shape[0]
            # Convert float64 to float32 and remove possibly resulting infinity rows
            f64_dtypes = data.dtypes[data.dtypes == np.float64].to_dict()
            data = data.astype(dict.fromkeys(f64_dtypes, np.float32))
            # axis must be passed by keyword (positional form is rejected by pandas 2.x);
            # .copy() so the column assignment below acts on an independent frame.
            data = data[~data.isin([np.inf, -np.inf]).any(axis=1)].copy()

            fit_data = _align_features(data, original_features)

            print('Deleted ' + str(old_shape - fit_data.shape[0]) + ' values.')

            data_proba = model.predict_proba(fit_data)

        # Numpy 2D array: is the confidence above the threshold?
        data_proba_threshold = data_proba >= confidence_threshold
        # Numpy 1D array: is the prediction for True (second column) above the threshold?
        data_prediction = data_proba_threshold[:, 1]

        data['LabelPredicted'] = data_prediction

        print('Added Prediction')

        positives = data[data['LabelPredicted']]
        print(positives.shape)

        # The candidate file is overwritten in place, now including the prediction column.
        data.to_csv(directory_name + "/" + file_name,
                    encoding='utf-8',
                    sep=',',
                    index=False)

        if generate_triples:
            relation_long_name = long_name(relation_name)
            relation_type = '<' + relation_type_dict[relation_long_name] + '>'
            # Loop-invariant: literals of float-typed relations get a ".0" appended
            # when they do not already contain a decimal point.
            needs_decimal_point = relation_type in types_float

            print('Write Triples...')
            triple_cnt = 0
            # Iterate only the two required columns instead of materialising every row.
            for entity, number in zip(positives['InfoEntity'],
                                      positives['InfoAbstractNumberConverted']):
                if needs_decimal_point and '.' not in number:
                    number = number + '.0'
                new_triples_file.write(entity + ' ' + relation_long_name + ' ' + number + '^^' + relation_type + ' .\n')
                triple_cnt += 1
            print(str(triple_cnt) + ' new triples written')
finally:
    # Close (and thereby flush) the triple file even if a candidate file failed,
    # so already written triples are not lost and the handle is not leaked.
    if generate_triples:
        print('Saving final file...')
        new_triples_file.close()
print('Done')

---areaTotal_2.csv---
682
683
683
Added Prediction
(454, 683)
Write Triples...
454 new triples written
---areaTotal_3.csv---
671
671
671
Added Prediction
(71, 672)
Write Triples...
71 new triples written
---areaTotal.csv---
668
668
668
Got ValueError. Trying to convert and clean data...
Deleted 1 values.
Added Prediction
(16, 669)
Write Triples...
16 new triples written
---birthDate.csv---
702
702
702
Added Prediction
(53053, 703)
Write Triples...
53053 new triples written
---birthYear_0.csv---
719
719
719
Added Prediction
(109794, 720)
Write Triples...
109794 new triples written
---birthYear_1.csv---
717
717
717
Added Prediction
(135326, 718)
Write Triples...
135326 new triples written
---birthYear_2.csv---
717
717
717
Added Prediction
(163895, 718)
Write Triples...
163895 new triples written
---birthYear.csv---
712
712
712
Added Prediction
(52875, 713)
Write Triples...
52875 new triples written
---date.csv---
473
473
473
Added Prediction
(114, 474)
Write Triples...
114 new triples wr