This Notebook filters the mappingbased_literals file for all types that are numbers or dates

In [3]:
import re
import collections
from utils.myutils import split_triple, save_object

In [2]:
types_valid = set(['http://www.w3.org/2001/XMLSchema#positiveInteger',
                   'http://www.w3.org/2001/XMLSchema#float',
                   'http://www.w3.org/2001/XMLSchema#integer',
                   'http://www.w3.org/2001/XMLSchema#nonNegativeInteger',
                   'http://www.w3.org/2001/XMLSchema#double',
                   'http://www.w3.org/2001/XMLSchema#gYear',
                   'http://www.w3.org/2001/XMLSchema#date',
                   'http://www.w3.org/2001/XMLSchema#gYearMonth'])
types_invalid = set(['http://www.w3.org/1999/02/22-rdf-syntax-ns#langString',
                     'http://www.w3.org/2001/XMLSchema#string',
                     'http://www.w3.org/2001/XMLSchema#boolean',
                     'http://dbpedia.org/datatype/engineConfiguration',
                     'http://dbpedia.org/datatype/valvetrain',
                     'http://dbpedia.org/datatype/fuelType',
                     'http://www.w3.org/2001/XMLSchema#anyURI'])

In [3]:
file_path_triples = "dbpedia/mappingbased_literals_en.ttl"
file_path_triples_filtered = "dbpedia/mappingbased_literals_en_filtered.ttl"
file_path_owl = "dbpedia/dbpedia_2016-10.owl"

triples_file = open(file_path_triples, 'r', encoding='utf-8')
triples_file_filtered = open(file_path_triples_filtered, 'w', encoding='utf-8')
owl_file = open(file_path_owl, 'r', encoding='utf-8')

relations_valid = set([])
relations_invalid = set([])

relation_type_dict = dict()

relations_cnt = collections.Counter()
types_cnt = collections.Counter()
relations_valid_cnt = collections.Counter()
types_valid_cnt = collections.Counter()

In [4]:
#Manual Assignment of some non-DBPedia relations - All others will get String assigned!
relations_valid.add('http://www.w3.org/2003/01/geo/wgs84_pos#lat')
relation_type_dict['http://www.w3.org/2003/01/geo/wgs84_pos#lat'] = 'http://www.w3.org/2001/XMLSchema#float'
relations_valid.add('http://www.w3.org/2003/01/geo/wgs84_pos#long')
relation_type_dict['http://www.w3.org/2003/01/geo/wgs84_pos#long'] = 'http://www.w3.org/2001/XMLSchema#float'

In [5]:
regex_xml_tag = re.compile('<rdfs:range rdf:resource=[^>]*/>')
regex_resource = re.compile('rdf:resource=".*"/>')

#Look up the type of an unknown relations that is neither in relations_valid nor in relations_invalid
def check_new_relation(relation):
    #print(relation)
    search_term = 'rdf:about="' + relation + '"'
    owl_file.seek(0)
    for line in owl_file:
        if search_term in line:
            #Read next line
            range_line = owl_file.readline()
            #print(str(relation) + " found in line " + line)
            #print("Assuming range in line " + range_line)
            
            try:
                rdfs_range_tag = regex_xml_tag.search(range_line)[0]
            except TypeError:
                range_line = owl_file.readline()
                try:
                    rdfs_range_tag = regex_xml_tag.search(range_line)[0]
                    print("WARNING! Could not find in-line type for relation " + relation)
                    print("Found type in the next line. Check whether this is correct!")
                except TypeError:
                    print("ERROR! Could not find a type for relation " + relation)
                    continue
            
            try:
                regex_xml_tag.search(range_line)[1]
            except IndexError:
                relation_range = regex_resource.search(rdfs_range_tag)[0][14:-3]
                
                relation_type_dict[relation] = relation_range
                
                if relation_range in types_valid:
                    #print("Relation " + relation + " has type " + relation_range + " and is therfore valid.")
                    relations_valid.add(relation)
                elif relation_range in types_invalid:
                    #print("Relation " + relation + " has type " + relation_range + " and is therfore invalid.")
                    relations_invalid.add(relation)
                else:
                    print("WARNING! Relation " + relation + " has unknown type " + relation_range)
                
                return
            print("ERROR! Multiple Findings! Second Finding: " + regex_xml_tag.search(range_line)[1])
    
    print("WARNING! Could not find a type for relation " + relation + "! Assuming String from now on!")
    relations_invalid.add(relation)
    relation_type_dict[relation] = 'http://www.w3.org/2001/XMLSchema#string'

In [6]:
i = 0
# Loop over all existing Triples
for line in triples_file:
    i += 1
    
    #Skip comment lines and empty lines
    if line[:1] == '#' or len(line) == 0:
        continue
    
    #Split triple correctly into 3-array
    try:
        triple = split_triple(line)
    except ValueError:
        print("ValueError: Split not possible: " + line)
        continue
    
    if triple[1][1:-1] not in relations_valid and triple[1][1:-1] not in relations_invalid:
        check_new_relation(triple[1][1:-1])
    
    relations_cnt[triple[1][1:-1]] += 1
    types_cnt[relation_type_dict[triple[1][1:-1]]] += 1
    
    if triple[1][1:-1] in relations_invalid:
        #print(triple[1] + " is invalid. Ignored.")
        continue
        
    if triple[1][1:-1] in relations_valid:
        #print(triple[1] + " is valid.")
        triples_file_filtered.write(line)
        relations_valid_cnt[triple[1][1:-1]] += 1
        types_valid_cnt[relation_type_dict[triple[1][1:-1]]] += 1
    else:
        #print(triple[1] + " is invalid. Ignored.")
        print("ERROR! Tuple not valid or invalid!")
    
    if i % 1000 == 0:
        print(str(i) + " entries processed")
        
    #if i > 10000:
    #    break

triples_file.close()
triples_file_filtered.close()
owl_file.close()

1000 entries processed
2000 entries processed
4000 entries processed
5000 entries processed
8000 entries processed
12000 entries processed
18000 entries processed
20000 entries processed
23000 entries processed
25000 entries processed
27000 entries processed
31000 entries processed
33000 entries processed
34000 entries processed
35000 entries processed
36000 entries processed
42000 entries processed
46000 entries processed
47000 entries processed
49000 entries processed
50000 entries processed
51000 entries processed
53000 entries processed
54000 entries processed
58000 entries processed
59000 entries processed
62000 entries processed
66000 entries processed
67000 entries processed
71000 entries processed
75000 entries processed
78000 entries processed
81000 entries processed
84000 entries processed
85000 entries processed
90000 entries processed
98000 entries processed
101000 entries processed
102000 entries processed
103000 entries processed
104000 entries processed
105000 entries pr

525000 entries processed
526000 entries processed
527000 entries processed
530000 entries processed
531000 entries processed
532000 entries processed
534000 entries processed
537000 entries processed
539000 entries processed
540000 entries processed
541000 entries processed
542000 entries processed
543000 entries processed
544000 entries processed
545000 entries processed
549000 entries processed
550000 entries processed
551000 entries processed
552000 entries processed
553000 entries processed
554000 entries processed
556000 entries processed
557000 entries processed
558000 entries processed
562000 entries processed
568000 entries processed
570000 entries processed
571000 entries processed
572000 entries processed
573000 entries processed
574000 entries processed
578000 entries processed
579000 entries processed
581000 entries processed
582000 entries processed
583000 entries processed
585000 entries processed
587000 entries processed
588000 entries processed
594000 entries processed


1196000 entries processed
1197000 entries processed
1200000 entries processed
1201000 entries processed
1208000 entries processed
1209000 entries processed
1210000 entries processed
1211000 entries processed
1215000 entries processed
1216000 entries processed
1217000 entries processed
1219000 entries processed
1220000 entries processed
1223000 entries processed
1225000 entries processed
1226000 entries processed
1227000 entries processed
1229000 entries processed
1234000 entries processed
1235000 entries processed
1238000 entries processed
1240000 entries processed
1241000 entries processed
1243000 entries processed
1244000 entries processed
1248000 entries processed
1251000 entries processed
1252000 entries processed
1256000 entries processed
1258000 entries processed
1261000 entries processed
1262000 entries processed
1264000 entries processed
1265000 entries processed
1270000 entries processed
1271000 entries processed
1272000 entries processed
1273000 entries processed
1274000 entr

2391000 entries processed
2393000 entries processed
2395000 entries processed
2396000 entries processed
2397000 entries processed
2400000 entries processed
2402000 entries processed
2403000 entries processed
2404000 entries processed
2405000 entries processed
2409000 entries processed
2410000 entries processed
2411000 entries processed
2412000 entries processed
2414000 entries processed
2417000 entries processed
2418000 entries processed
2419000 entries processed
2420000 entries processed
2421000 entries processed
2425000 entries processed
2426000 entries processed
2427000 entries processed
2429000 entries processed
2430000 entries processed
2431000 entries processed
2434000 entries processed
2436000 entries processed
2439000 entries processed
2440000 entries processed
2442000 entries processed
2447000 entries processed
2448000 entries processed
2449000 entries processed
2452000 entries processed
2457000 entries processed
2459000 entries processed
2460000 entries processed
2461000 entr

3030000 entries processed
3033000 entries processed
3035000 entries processed
3036000 entries processed
3041000 entries processed
3042000 entries processed
3048000 entries processed
3051000 entries processed
3054000 entries processed
3055000 entries processed
3057000 entries processed
3059000 entries processed
3060000 entries processed
3064000 entries processed
3066000 entries processed
3072000 entries processed
3073000 entries processed
3075000 entries processed
3078000 entries processed
3082000 entries processed
3083000 entries processed
3086000 entries processed
3089000 entries processed
3093000 entries processed
3096000 entries processed
3098000 entries processed
3100000 entries processed
3101000 entries processed
3102000 entries processed
3109000 entries processed
3111000 entries processed
3114000 entries processed
3116000 entries processed
3117000 entries processed
3119000 entries processed
3123000 entries processed
3125000 entries processed
3129000 entries processed
3131000 entr

3651000 entries processed
3653000 entries processed
3657000 entries processed
3658000 entries processed
3662000 entries processed
3663000 entries processed
3664000 entries processed
3665000 entries processed
3667000 entries processed
3668000 entries processed
3676000 entries processed
3679000 entries processed
3684000 entries processed
3688000 entries processed
3691000 entries processed
3695000 entries processed
3699000 entries processed
3702000 entries processed
3703000 entries processed
3705000 entries processed
3708000 entries processed
3709000 entries processed
3710000 entries processed
3714000 entries processed
3721000 entries processed
3722000 entries processed
3724000 entries processed
3727000 entries processed
3728000 entries processed
3729000 entries processed
3730000 entries processed
3733000 entries processed
3735000 entries processed
3739000 entries processed
3742000 entries processed
3744000 entries processed
3745000 entries processed
3746000 entries processed
3748000 entr

4292000 entries processed
4293000 entries processed
4294000 entries processed
4296000 entries processed
4297000 entries processed
4298000 entries processed
4300000 entries processed
4301000 entries processed
4305000 entries processed
4308000 entries processed
4310000 entries processed
4311000 entries processed
4312000 entries processed
4314000 entries processed
4315000 entries processed
4318000 entries processed
4319000 entries processed
4321000 entries processed
4322000 entries processed
4326000 entries processed
4327000 entries processed
4329000 entries processed
4331000 entries processed
4332000 entries processed
4333000 entries processed
4334000 entries processed
4335000 entries processed
4336000 entries processed
4339000 entries processed
4340000 entries processed
4342000 entries processed
4347000 entries processed
4348000 entries processed
4351000 entries processed
4352000 entries processed
4353000 entries processed
4354000 entries processed
4355000 entries processed
4358000 entr

4868000 entries processed
4869000 entries processed
4870000 entries processed
4877000 entries processed
4881000 entries processed
4882000 entries processed
4884000 entries processed
4886000 entries processed
4887000 entries processed
4889000 entries processed
4894000 entries processed
4895000 entries processed
4898000 entries processed
4899000 entries processed
4901000 entries processed
4903000 entries processed
4906000 entries processed
4907000 entries processed
4910000 entries processed
4911000 entries processed
4912000 entries processed
4914000 entries processed
4915000 entries processed
4919000 entries processed
4923000 entries processed
4929000 entries processed
4930000 entries processed
4931000 entries processed
4935000 entries processed
4936000 entries processed
4937000 entries processed
4938000 entries processed
4939000 entries processed
4941000 entries processed
4944000 entries processed
4946000 entries processed
4947000 entries processed
4948000 entries processed
4949000 entr

5518000 entries processed
5519000 entries processed
5521000 entries processed
5522000 entries processed
5523000 entries processed
5524000 entries processed
5525000 entries processed
5526000 entries processed
5529000 entries processed
5531000 entries processed
5532000 entries processed
5533000 entries processed
5534000 entries processed
5535000 entries processed
5537000 entries processed
5539000 entries processed
5540000 entries processed
5545000 entries processed
5547000 entries processed
5549000 entries processed
5551000 entries processed
5555000 entries processed
5556000 entries processed
5557000 entries processed
5559000 entries processed
5561000 entries processed
5562000 entries processed
5563000 entries processed
5564000 entries processed
5566000 entries processed
5569000 entries processed
5571000 entries processed
5572000 entries processed
5577000 entries processed
5580000 entries processed
5582000 entries processed
5583000 entries processed
5585000 entries processed
5586000 entr

6092000 entries processed
6093000 entries processed
6095000 entries processed
6097000 entries processed
6099000 entries processed
6100000 entries processed
6101000 entries processed
6102000 entries processed
6104000 entries processed
6105000 entries processed
6107000 entries processed
6108000 entries processed
6109000 entries processed
6110000 entries processed
6111000 entries processed
6113000 entries processed
6115000 entries processed
6118000 entries processed
6125000 entries processed
6126000 entries processed
6127000 entries processed
6128000 entries processed
6129000 entries processed
6131000 entries processed
6132000 entries processed
6133000 entries processed
6134000 entries processed
6135000 entries processed
6137000 entries processed
6138000 entries processed
6139000 entries processed
6140000 entries processed
6142000 entries processed
6146000 entries processed
6147000 entries processed
6148000 entries processed
6149000 entries processed
6151000 entries processed
6153000 entr

6635000 entries processed
6636000 entries processed
6637000 entries processed
6639000 entries processed
6642000 entries processed
6643000 entries processed
6645000 entries processed
6646000 entries processed
6648000 entries processed
6650000 entries processed
6651000 entries processed
6652000 entries processed
6653000 entries processed
6654000 entries processed
6655000 entries processed
6657000 entries processed
6658000 entries processed
6659000 entries processed
6661000 entries processed
6662000 entries processed
6666000 entries processed
6667000 entries processed
6670000 entries processed
6671000 entries processed
6672000 entries processed
6673000 entries processed
6674000 entries processed
6675000 entries processed
6682000 entries processed
6683000 entries processed
6684000 entries processed
6688000 entries processed
6689000 entries processed
6690000 entries processed
6692000 entries processed
6693000 entries processed
6697000 entries processed
6698000 entries processed
6700000 entr

7229000 entries processed
7230000 entries processed
7231000 entries processed
7234000 entries processed
7235000 entries processed
7236000 entries processed
7238000 entries processed
7240000 entries processed
7242000 entries processed
7244000 entries processed
7245000 entries processed
7247000 entries processed
7250000 entries processed
7251000 entries processed
7255000 entries processed
7256000 entries processed
7257000 entries processed
7261000 entries processed
7262000 entries processed
7263000 entries processed
7264000 entries processed
7265000 entries processed
7267000 entries processed
7269000 entries processed
7270000 entries processed
7272000 entries processed
7274000 entries processed
7276000 entries processed
7277000 entries processed
7278000 entries processed
7282000 entries processed
7283000 entries processed
7284000 entries processed
7288000 entries processed
7289000 entries processed
7290000 entries processed
7291000 entries processed
7292000 entries processed
7294000 entr

7811000 entries processed
7814000 entries processed
7815000 entries processed
7818000 entries processed
7819000 entries processed
7820000 entries processed
7821000 entries processed
7822000 entries processed
7823000 entries processed
7824000 entries processed
7825000 entries processed
7826000 entries processed
7829000 entries processed
7830000 entries processed
7832000 entries processed
7833000 entries processed
7834000 entries processed
7837000 entries processed
7840000 entries processed
7844000 entries processed
7847000 entries processed
7848000 entries processed
7849000 entries processed
7852000 entries processed
7853000 entries processed
7856000 entries processed
7857000 entries processed
7858000 entries processed
7860000 entries processed
7861000 entries processed
7862000 entries processed
7863000 entries processed
7866000 entries processed
7867000 entries processed
7870000 entries processed
7878000 entries processed
7879000 entries processed
7882000 entries processed
7883000 entr

8376000 entries processed
8377000 entries processed
8381000 entries processed
8384000 entries processed
8385000 entries processed
8386000 entries processed
8387000 entries processed
8388000 entries processed
8389000 entries processed
8391000 entries processed
8392000 entries processed
8393000 entries processed
8395000 entries processed
8396000 entries processed
8397000 entries processed
8398000 entries processed
8399000 entries processed
8401000 entries processed
8402000 entries processed
8404000 entries processed
8407000 entries processed
8410000 entries processed
8412000 entries processed
8413000 entries processed
8417000 entries processed
8421000 entries processed
8424000 entries processed
8425000 entries processed
8426000 entries processed
8427000 entries processed
8428000 entries processed
8429000 entries processed
8430000 entries processed
8431000 entries processed
8432000 entries processed
8434000 entries processed
8436000 entries processed
8439000 entries processed
8440000 entr

9554000 entries processed
9555000 entries processed
9556000 entries processed
9559000 entries processed
9561000 entries processed
9563000 entries processed
9564000 entries processed
9565000 entries processed
9566000 entries processed
9567000 entries processed
9570000 entries processed
9573000 entries processed
9574000 entries processed
9575000 entries processed
9577000 entries processed
9578000 entries processed
9579000 entries processed
9582000 entries processed
9584000 entries processed
9585000 entries processed
9587000 entries processed
9590000 entries processed
9591000 entries processed
9597000 entries processed
9600000 entries processed
9601000 entries processed
9604000 entries processed
9606000 entries processed
9609000 entries processed
9612000 entries processed
9613000 entries processed
9615000 entries processed
9616000 entries processed
9617000 entries processed
9618000 entries processed
9620000 entries processed
9621000 entries processed
9623000 entries processed
9627000 entr

10149000 entries processed
10153000 entries processed
10158000 entries processed
10159000 entries processed
10162000 entries processed
10163000 entries processed
10165000 entries processed
10168000 entries processed
10169000 entries processed
10170000 entries processed
10174000 entries processed
10176000 entries processed
10177000 entries processed
10179000 entries processed
10182000 entries processed
10184000 entries processed
10185000 entries processed
10186000 entries processed
10191000 entries processed
10193000 entries processed
10194000 entries processed
10195000 entries processed
10196000 entries processed
10200000 entries processed
10201000 entries processed
10203000 entries processed
10205000 entries processed
10206000 entries processed
10208000 entries processed
10209000 entries processed
10210000 entries processed
10211000 entries processed
10213000 entries processed
10214000 entries processed
10215000 entries processed
10216000 entries processed
10217000 entries processed
1

10708000 entries processed
10710000 entries processed
10711000 entries processed
10713000 entries processed
10714000 entries processed
10716000 entries processed
10717000 entries processed
10718000 entries processed
10720000 entries processed
10722000 entries processed
10723000 entries processed
10724000 entries processed
10727000 entries processed
10728000 entries processed
10729000 entries processed
10730000 entries processed
10731000 entries processed
10736000 entries processed
10740000 entries processed
10741000 entries processed
10742000 entries processed
10743000 entries processed
10747000 entries processed
10749000 entries processed
10750000 entries processed
10751000 entries processed
10752000 entries processed
10755000 entries processed
10757000 entries processed
10760000 entries processed
10761000 entries processed
10762000 entries processed
10767000 entries processed
10768000 entries processed
10769000 entries processed
10770000 entries processed
10772000 entries processed
1

11290000 entries processed
11291000 entries processed
11292000 entries processed
11293000 entries processed
11294000 entries processed
11295000 entries processed
11296000 entries processed
11298000 entries processed
11301000 entries processed
11305000 entries processed
11308000 entries processed
11309000 entries processed
11310000 entries processed
11311000 entries processed
11312000 entries processed
11313000 entries processed
11316000 entries processed
11317000 entries processed
11319000 entries processed
11324000 entries processed
11326000 entries processed
11330000 entries processed
11331000 entries processed
11337000 entries processed
11340000 entries processed
11341000 entries processed
11343000 entries processed
11344000 entries processed
11347000 entries processed
11348000 entries processed
11353000 entries processed
11355000 entries processed
11356000 entries processed
11359000 entries processed
11360000 entries processed
11361000 entries processed
11362000 entries processed
1

11948000 entries processed
11949000 entries processed
11950000 entries processed
11951000 entries processed
11957000 entries processed
11963000 entries processed
11966000 entries processed
11970000 entries processed
11973000 entries processed
11978000 entries processed
11979000 entries processed
11981000 entries processed
11984000 entries processed
11986000 entries processed
11987000 entries processed
11990000 entries processed
11991000 entries processed
11994000 entries processed
11995000 entries processed
11996000 entries processed
11997000 entries processed
11999000 entries processed
12003000 entries processed
12004000 entries processed
12005000 entries processed
12006000 entries processed
12007000 entries processed
12008000 entries processed
12009000 entries processed
12010000 entries processed
12012000 entries processed
12013000 entries processed
12015000 entries processed
12016000 entries processed
12020000 entries processed
12025000 entries processed
12030000 entries processed
1

12614000 entries processed
12616000 entries processed
12617000 entries processed
12620000 entries processed
12621000 entries processed
12622000 entries processed
12623000 entries processed
12624000 entries processed
12625000 entries processed
12626000 entries processed
12627000 entries processed
12629000 entries processed
12630000 entries processed
12631000 entries processed
12632000 entries processed
12633000 entries processed
12634000 entries processed
12641000 entries processed
12643000 entries processed
12644000 entries processed
12645000 entries processed
12646000 entries processed
12659000 entries processed
12660000 entries processed
12661000 entries processed
12662000 entries processed
12663000 entries processed
12664000 entries processed
12665000 entries processed
12666000 entries processed
12668000 entries processed
12670000 entries processed
12672000 entries processed
12673000 entries processed
12674000 entries processed
12676000 entries processed
12678000 entries processed
1

13236000 entries processed
13237000 entries processed
13238000 entries processed
13244000 entries processed
13249000 entries processed
13250000 entries processed
13251000 entries processed
13252000 entries processed
13253000 entries processed
13254000 entries processed
13255000 entries processed
13258000 entries processed
13259000 entries processed
13265000 entries processed
13266000 entries processed
13267000 entries processed
13269000 entries processed
13270000 entries processed
13271000 entries processed
13272000 entries processed
13273000 entries processed
13274000 entries processed
13275000 entries processed
13276000 entries processed
13280000 entries processed
13281000 entries processed
13282000 entries processed
13283000 entries processed
13284000 entries processed
13287000 entries processed
13288000 entries processed
13290000 entries processed
13291000 entries processed
13292000 entries processed
13296000 entries processed
13297000 entries processed
13300000 entries processed
1

13833000 entries processed
13836000 entries processed
13838000 entries processed
13839000 entries processed
13841000 entries processed
13846000 entries processed
13847000 entries processed
13848000 entries processed
13849000 entries processed
13850000 entries processed
13852000 entries processed
13854000 entries processed
13855000 entries processed
13857000 entries processed
13858000 entries processed
13860000 entries processed
13861000 entries processed
13864000 entries processed
13866000 entries processed
13871000 entries processed
13872000 entries processed
13878000 entries processed
13879000 entries processed
13880000 entries processed
13881000 entries processed
13882000 entries processed
13884000 entries processed
13885000 entries processed
13886000 entries processed
13888000 entries processed
13892000 entries processed
13894000 entries processed
13896000 entries processed
13898000 entries processed
13900000 entries processed
13902000 entries processed
13906000 entries processed
1

In [7]:
relation_type_dict

{'http://dbpedia.org/ontology/aSide': 'http://www.w3.org/2001/XMLSchema#string',
 'http://dbpedia.org/ontology/abbreviation': 'http://www.w3.org/2001/XMLSchema#string',
 'http://dbpedia.org/ontology/absoluteMagnitude': 'http://www.w3.org/2001/XMLSchema#double',
 'http://dbpedia.org/ontology/acceleration': 'http://www.w3.org/2001/XMLSchema#double',
 'http://dbpedia.org/ontology/access': 'http://www.w3.org/2001/XMLSchema#string',
 'http://dbpedia.org/ontology/acquirementDate': 'http://www.w3.org/2001/XMLSchema#date',
 'http://dbpedia.org/ontology/activeYears': 'http://www.w3.org/2001/XMLSchema#string',
 'http://dbpedia.org/ontology/activeYearsEndDate': 'http://www.w3.org/2001/XMLSchema#date',
 'http://dbpedia.org/ontology/activeYearsEndYear': 'http://www.w3.org/2001/XMLSchema#gYear',
 'http://dbpedia.org/ontology/activeYearsStartDate': 'http://www.w3.org/2001/XMLSchema#date',
 'http://dbpedia.org/ontology/activeYearsStartYear': 'http://www.w3.org/2001/XMLSchema#gYear',
 'http://dbpedia.o

In [8]:
relations_valid

{'http://dbpedia.org/ontology/absoluteMagnitude',
 'http://dbpedia.org/ontology/acceleration',
 'http://dbpedia.org/ontology/acquirementDate',
 'http://dbpedia.org/ontology/activeYearsEndDate',
 'http://dbpedia.org/ontology/activeYearsEndYear',
 'http://dbpedia.org/ontology/activeYearsStartDate',
 'http://dbpedia.org/ontology/activeYearsStartYear',
 'http://dbpedia.org/ontology/added',
 'http://dbpedia.org/ontology/ageRange',
 'http://dbpedia.org/ontology/albedo',
 'http://dbpedia.org/ontology/anniversary',
 'http://dbpedia.org/ontology/apoapsis',
 'http://dbpedia.org/ontology/apparentMagnitude',
 'http://dbpedia.org/ontology/approximateCalories',
 'http://dbpedia.org/ontology/area',
 'http://dbpedia.org/ontology/areaLand',
 'http://dbpedia.org/ontology/areaMetro',
 'http://dbpedia.org/ontology/areaOfCatchment',
 'http://dbpedia.org/ontology/areaRural',
 'http://dbpedia.org/ontology/areaTotal',
 'http://dbpedia.org/ontology/areaUrban',
 'http://dbpedia.org/ontology/areaWater',
 'http:/

In [9]:
relations_invalid

{'http://dbpedia.org/ontology/aSide',
 'http://dbpedia.org/ontology/abbreviation',
 'http://dbpedia.org/ontology/access',
 'http://dbpedia.org/ontology/activeYears',
 'http://dbpedia.org/ontology/address',
 'http://dbpedia.org/ontology/agencyStationCode',
 'http://dbpedia.org/ontology/alias',
 'http://dbpedia.org/ontology/allegiance',
 'http://dbpedia.org/ontology/alternativeName',
 'http://dbpedia.org/ontology/analogChannel',
 'http://dbpedia.org/ontology/areaCode',
 'http://dbpedia.org/ontology/areaOfCatchmentQuote',
 'http://dbpedia.org/ontology/areaQuote',
 'http://dbpedia.org/ontology/arm',
 'http://dbpedia.org/ontology/atcPrefix',
 'http://dbpedia.org/ontology/atcSuffix',
 'http://dbpedia.org/ontology/atcSupplemental',
 'http://dbpedia.org/ontology/automobileModel',
 'http://dbpedia.org/ontology/averageDepthQuote',
 'http://dbpedia.org/ontology/bSide',
 'http://dbpedia.org/ontology/background',
 'http://dbpedia.org/ontology/band',
 'http://dbpedia.org/ontology/battingSide',
 'htt

In [10]:
relations_cnt.most_common()

[('http://xmlns.com/foaf/0.1/name', 3368646),
 ('http://dbpedia.org/ontology/years', 927710),
 ('http://dbpedia.org/ontology/numberOfGoals', 650801),
 ('http://dbpedia.org/ontology/numberOfMatches', 645122),
 ('http://dbpedia.org/ontology/birthDate', 630419),
 ('http://dbpedia.org/ontology/utcOffset', 445717),
 ('http://dbpedia.org/ontology/populationTotal', 327277),
 ('http://dbpedia.org/ontology/runtime', 274334),
 ('http://dbpedia.org/ontology/activeYearsStartYear', 254812),
 ('http://dbpedia.org/ontology/synonym', 253511),
 ('http://dbpedia.org/ontology/elevation', 247824),
 ('http://dbpedia.org/ontology/title', 247497),
 ('http://dbpedia.org/ontology/squadNumber', 245563),
 ('http://dbpedia.org/ontology/releaseDate', 212290),
 ('http://dbpedia.org/ontology/height', 202143),
 ('http://dbpedia.org/ontology/deathDate', 197044),
 ('http://dbpedia.org/ontology/postalCode', 188395),
 ('http://dbpedia.org/ontology/areaTotal', 179581),
 ('http://dbpedia.org/ontology/office', 177682),
 ('h

In [11]:
types_cnt.most_common()

[('http://www.w3.org/2001/XMLSchema#string', 6255235),
 ('http://www.w3.org/2001/XMLSchema#gYear', 1969905),
 ('http://www.w3.org/2001/XMLSchema#double', 1754385),
 ('http://www.w3.org/2001/XMLSchema#date', 1530980),
 ('http://www.w3.org/2001/XMLSchema#integer', 1328113),
 ('http://www.w3.org/2001/XMLSchema#nonNegativeInteger', 844353),
 ('http://www.w3.org/1999/02/22-rdf-syntax-ns#langString', 616430),
 ('http://www.w3.org/2001/XMLSchema#positiveInteger', 45134),
 ('http://www.w3.org/2001/XMLSchema#float', 22107),
 ('http://www.w3.org/2001/XMLSchema#boolean', 10459),
 ('http://dbpedia.org/datatype/engineConfiguration', 9911),
 ('http://www.w3.org/2001/XMLSchema#gYearMonth', 547),
 ('http://dbpedia.org/datatype/valvetrain', 491),
 ('http://dbpedia.org/datatype/fuelType', 464),
 ('http://www.w3.org/2001/XMLSchema#anyURI', 23)]

In [None]:
relations_valid_cnt.most_common()

In [None]:
types_valid_cnt.most_common()

In [12]:
save_object(relation_type_dict, "relation_type_dict")

In [13]:
save_object(relations_valid, "relations_valid")

In [14]:
save_object(relations_invalid, "relations_invalid")

In [15]:
save_object(types_valid, "types_valid")

In [16]:
save_object(types_invalid, "types_invalid")

In [17]:
save_object(relations_cnt, "relations_cnt")

In [18]:
save_object(types_cnt, "types_cnt")

In [None]:
save_object(relations_valid_cnt, "relations_valid_cnt")

In [None]:
save_object(types_valid_cnt, "types_valid_cnt")

In [4]:
types_int = set(['http://www.w3.org/2001/XMLSchema#positiveInteger',
                 'http://www.w3.org/2001/XMLSchema#integer',
                 'http://www.w3.org/2001/XMLSchema#nonNegativeInteger'])
save_object(types_int, "types_int")

In [5]:
types_float = set(['http://www.w3.org/2001/XMLSchema#float',
                   'http://www.w3.org/2001/XMLSchema#double'])
save_object(types_float, "types_float")

In [6]:
types_date = set(['http://www.w3.org/2001/XMLSchema#gYear',
                  'http://www.w3.org/2001/XMLSchema#date',
                  'http://www.w3.org/2001/XMLSchema#gYearMonth'])
save_object(types_date, "types_date")