In [1]:
import nltk
import pycrfsuite
import pandas as pd
import string
from time import time
from tokenize import tokenize
from SPARQLWrapper import SPARQLWrapper, JSON

SENTENCESDATA = '/home/dkaczmarek/kodi/semvii/zti_oke/src/oke_sentences.txt'
SENTENCESSAMPLES = '/home/dkaczmarek/kodi/semvii/zti_oke/data/oke_samples.csv'

SPARQL = SPARQLWrapper("http://dbpedia.org/sparql")
SPARQL.setReturnFormat(JSON)

In [2]:
grammar = r"""NP: {<DT>?<JJ>*<NN>}
                  {<NNP>+}
           """

cp = nltk.RegexpParser(grammar)

with open(SENTENCESDATA, 'r') as file:
    pos_tagged = []
    noun_phrases = []
    for sentence_raw in file:
        sentence_tagged = nltk.pos_tag(nltk.word_tokenize(sentence_raw))
        pos_tagged += [sentence_tagged]
        
print(cp.parse(pos_tagged[0]))

(S
  (NP Hyundai/NNP Motor/NNP)
  sold/VBD
  4.86/CD
  million/CD
  vehicles/NNS
  compared/VBN
  with/IN
  its/PRP$
  (NP target/NN)
  of/IN
  5.01/CD
  million/CD
  (NP last/JJ year/NN)
  ./.
  (NP Kia/NNP)
  Motors/NNPS
  sold/VBD
  3.02/CD
  million/CD
  vehicles/NNS
  ,/,
  (NP shy/NN)
  of/IN
  its/PRP$
  (NP goal/NN)
  of/IN
  3.12/CD
  million/CD
  ./.
  (NP The/DT 78-year-old/JJ chief/NN)
  said/VBD
  the/DT
  automakers/NNS
  will/MD
  launch/VB
  more/JJR
  than/IN
  10/CD
  new/JJ
  models/NNS
  (NP every/DT year/NN)
  ,/,
  including/VBG
  a/DT
  new/JJ
  (NP SUV/NNP)
  for/IN
  advanced/JJ
  markets/NNS
  and/CC
  a/DT
  (NP Genesis/NNP G70/NNP)
  (NP sedan/NN)
  (NP this/DT year/NN)
  ./.)


In [3]:
def get_type_from_resource(resource_name):
    formated_name = replace_whitespaces(resource_name)
    type_query = return_type_sparql_query(formated_name)
    SPARQL.setQuery(type_query)
    SPARQL.setReturnFormat(JSON)
    results = SPARQL.query().convert()
    return results

def replace_whitespaces(name):
    return name.replace(" ", "_")

def return_type_sparql_query(name):
    return ("prefix db: <http://dbpedia.org/resource/> "
            "select distinct ?type where { "
            "db:{0} <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> ?type . "
            "filter (strstarts(str(?type), \"http://dbpedia.org/ontology/\")) }").replace("{0}", name)
    

In [4]:
results = get_type_from_resource("New York")
slice_len = len('http://dbpedia.org/ontology/')
for t in results['results']['bindings']:
    print(t['type']['value'][slice_len:])

Place
Location
AdministrativeRegion
PopulatedPlace
Region


In [5]:
def get_subclasses(super_class):
    subclasses_query = return_subclasses_sparql_query(super_class)
    SPARQL.setQuery(subclasses_query)
    results = SPARQL.query().convert()
    return results
    
def return_subclasses_sparql_query(super_class):
    return ("prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> "
            "select distinct ?subClass where { "
            "?subClass rdfs:subClassOf <http://dbpedia.org/ontology/{0}> . }").replace("{0}", super_class)
            

In [6]:
results = get_subclasses("Activity")
slice_len = len('http://dbpedia.org/ontology/')
for t in results['results']['bindings']:
    print(t['subClass']['value'][slice_len:])

Game
Sales
Sport


In [7]:
list_of_superclass = ['Activity', 'Agent', 'Award', 'Disease', 'EthnicGroup', 'Event', 'Language', 'MeanOfTransportation', 'PersonFunction', 'Place', 'Species', 'Work']
dict_of_superclasses = {}
for superclass in list_of_superclass:
    dict_of_superclasses[superclass] = []
    results = get_subclasses(superclass)
    slice_len = len('http://dbpedia.org/ontology/')
    for t in results['results']['bindings']:
        dict_of_superclasses[superclass] += [t['subClass']['value'][slice_len:]]
        

In [8]:
dict_of_superclasses

{'Activity': ['Game', 'Sales', 'Sport'],
 'Agent': ['Person', 'Deity', 'Employer', 'Family', 'Organisation'],
 'Award': ['Decoration', 'NobelPrize'],
 'Disease': [],
 'EthnicGroup': [],
 'Event': ['Competition', 'LifeCycleEvent', 'NaturalEvent', 'SocietalEvent'],
 'Language': ['ProgrammingLanguage'],
 'MeanOfTransportation': ['Aircraft',
  'Automobile',
  'Locomotive',
  'MilitaryVehicle',
  'Motorcycle',
  'On-SiteTransportation',
  'Rocket',
  'Ship',
  'SpaceShuttle',
  'SpaceStation',
  'Spacecraft',
  'Train',
  'TrainCarriage',
  'Tram'],
 'PersonFunction': ['PoliticalFunction', 'Profession'],
 'Place': ['ArchitecturalStructure',
  'CelestialBody',
  'Cemetery',
  'ConcentrationCamp',
  'CountrySeat',
  'Garden',
  'HistoricPlace',
  'Mine',
  'Monument',
  'NaturalPlace',
  'Park',
  'PopulatedPlace',
  'ProtectedArea',
  'SiteOfSpecialScientificInterest',
  'WineRegion',
  'WorldHeritageSite'],
 'Species': ['Archaea', 'Bacteria', 'Eukaryote'],
 'Work': ['Software',
  'Artwork',

In [26]:
def sentence_to_samples(sentence, sentence_id):
    temp_df = pd.DataFrame(columns = df_scheme_columns)
    
    punct_to_none = str.maketrans("", "", string.punctuation)
    filtered_sentence = sentence.translate(punct_to_none)
    
    tokens = nltk.word_tokenize(filtered_sentence)
    tokens_with_pos_tags = nltk.pos_tag(tokens)
    filtered_tokens = [x for x in tokens_with_pos_tags if x[1] != 'CD' 
                       and x[1] != 'JJ']
    
    df = ''
    pos_in_sentence = 0
    for token in filtered_tokens:
        results = get_type_from_resource(token[0])
        
        slice_len = len('http://dbpedia.org/ontology/')
        classes = [t['type']['value'][slice_len:] for t in results['results']['bindings']]
        df += '{0}, {1}, {2}, {3}, {4}\n'.format(token[0], token[1], str(classes), str(sentence_id), str(pos_in_sentence))
        
        pos_in_sentence += 1
    
    return df


sentences_counter = 1

header = 'word, pos_tag, class, sentence_id, pos_in_sentence\n'
all_df = []

with open(SENTENCESSAMPLES, 'w+') as samples_file:
    samples_file.write(header)
    with open(SENTENCESDATA, 'r') as file:
        for line in file:
            now = time()
            l = sentence_to_samples(line, sentences_counter)
            sentences_counter += 1
            print('sentence', sentences_counter-1, 'prepared, average:', (time() - now)/(sentences_counter - 1), 's')
            
            samples_file.write(l)

print(all_df)

sentence 1 prepared, average: 3.7167458534240723 s
sentence 2 prepared, average: 2.277917981147766 s
sentence 3 prepared, average: 3.803216060002645 s
sentence 4 prepared, average: 1.6824213862419128 s
sentence 5 prepared, average: 1.7917675495147705 s
sentence 6 prepared, average: 1.118127743403117 s
sentence 7 prepared, average: 0.9951025077274868 s


KeyboardInterrupt: 