Import Libraries
-------------

In [3]:
import json, urllib
import urllib.request
import glob, os
import time
import numpy as np
import pandas as pd
from collections import defaultdict
from IPython.display import display, HTML

from tools import plwn
from tools import corpus2
from tools import corpus2mwe as mwe
from tools.mwe.mwe_converter import MWEConverter

from basicutils.simple import convert_to_coarse_pos
from corpus2 import AnnotatedSentence_wrap_sentence as annotate_sentence
from corpus_ccl import cclutils as ccl
from corpus_ccl import corpus_object_utils as cou

ModuleNotFoundError: No module named 'corpus_ccl'

Variables
----------------

In [None]:
# Question number handed to save_question_to_file() further down.
question_id = 123
# Let pandas show up to 1000 characters per cell when displaying frames.
pd.options.display.max_colwidth = 1000
# Intended switch between the JSON test subset and the full CSV.
# NOTE(review): a function with this same name is defined later in the
# notebook; that `def` rebinds the name, so this boolean is lost afterwards.
load_data_to_test = False

# Input files read by the two loader functions.
dataset_file = "Data/czywiesz-eva-I-250.csv"
json_file = 'questions_to_test.json'

Load all questions
------------

In [None]:
def load_all_data():
    """Read the complete question file and return it as a DataFrame.

    The file named by ``dataset_file`` is parsed as a header-less,
    ';'-separated table. Columns 2 and 3 are discarded, column 0 becomes
    ``Question_ID``, column 1 becomes ``Question`` and the row index is copied
    into a new ``Dataset_ID`` column. The first five rows are displayed
    before the frame is returned.
    """
    raw = pd.read_csv(dataset_file, sep=";", header=None)
    trimmed = raw.drop(columns=[2, 3])
    dataset = trimmed.rename(columns={0: 'Question_ID', 1: 'Question'})
    dataset['Dataset_ID'] = dataset.index

    display(dataset.head(5))

    return dataset

Load questions to test
-----------

In [None]:
def load_data_to_test():
    """Load the question subset stored in ``json_file``.

    The file is decoded with ``json.load`` and the decoded value is handed to
    ``pd.read_json`` using the 'split' orientation. The first five rows are
    displayed before the frame is returned.
    """
    with open(json_file) as handle:
        payload = json.load(handle)

    dataset = pd.read_json(payload, orient='split')
    display(dataset.head(5))
    return dataset

Set tagset as NKJP
------------

In [None]:
# Tagset object for 'nkjp'; the cells below use it to pick preferred lexemes
# and to turn token tags into symbol strings.
tagset = ccl.get_tagset('nkjp')

Functions
===========

In [None]:
def save_question_to_file(number, filename='question.csv', data=None):
    """Write the text of one question to a single-cell CSV file.

    Args:
        number: positional (``iloc``) row of the question to export.
        filename: destination path; written as UTF-8 without header or index.
        data: frame with a ``Question`` column. Defaults to the notebook-level
            ``dataset`` when omitted.
    """
    source = dataset if data is None else data
    # Previously the row was hard-coded to 1, so `number` had no effect.
    question = source.iloc[number]['Question']
    pd.DataFrame(data={'Question': [question]}).to_csv(
        filename, index=False, header=False, encoding='utf-8')
    
def pretty_print(df):
    """Display ``df`` as an HTML table with line breaks inside cells.

    Every backslash-n character pair found in the generated HTML is replaced
    by a ``<br>`` tag before the markup is handed to ``display``.
    """
    markup = df.to_html()
    markup = markup.replace("\\n", "<br>")
    return display(HTML(markup))

NLP Rest API 2 Functions
--------------

In [None]:
#!/usr/bin/env python
#-*- coding: utf-8 -*-

# Base address of the NLP REST service; upload(), tool() and tagging() append
# their endpoint paths to it.
url="http://ws.clarin-pl.eu/nlprest2/base" 

def upload(file):
    """Upload a local file to the NLP REST service and return the reply text.

    The file content is POSTed unchanged to ``<url>/upload/`` with the
    'binary/octet-stream' content type. The reply body is decoded as UTF-8;
    ``tagging`` embeds it in the ``file(...)`` step of its pipeline string.

    Args:
        file: path of the file to send.

    Returns:
        str: the body returned by the upload endpoint.
    """
    # Read bytes: urllib.request only accepts a bytes payload. The previous
    # code called `urllib2`, which this notebook never imports.
    with open(file, "rb") as myfile:
        doc = myfile.read()
    request = urllib.request.Request(
        url + '/upload/', doc, {'Content-Type': 'binary/octet-stream'})
    with urllib.request.urlopen(request) as response:
        return response.read().decode('utf-8')

def tool(lpmn, user):
    """Start a processing task on the NLP REST service and wait for it to end.

    The task is created by POSTing ``{"lpmn": ..., "user": ...}`` as JSON to
    ``<url>/startTask/``. Its status is then polled every 0.1 s at
    ``<url>/getStatus/<task id>`` for as long as the service reports "QUEUE"
    or "PROCESSING".

    Args:
        lpmn: pipeline description string understood by the service.
        user: user identifier sent along with the task.

    Returns:
        The "value" field of the final status document, or ``None`` when the
        final status is "ERROR" (the error value is printed in that case).
    """
    # urllib.request needs bytes for the request body and returns bytes, so
    # encode the JSON payload and decode the task id before building URLs.
    payload = json.dumps({'lpmn': lpmn, 'user': user}).encode('utf-8')
    request = urllib.request.Request(
        url + '/startTask/', payload, {'Content-Type': 'application/json'})
    with urllib.request.urlopen(request) as response:
        taskid = response.read().decode('utf-8')

    while True:
        time.sleep(0.1)
        with urllib.request.urlopen(url + '/getStatus/' + taskid) as response:
            data = json.load(response)
        if data["status"] not in ("QUEUE", "PROCESSING"):
            break

    if data["status"] == "ERROR":
        print("Error " + data["value"])
        return None
    return data["value"]

def tagging(filename='question.csv', user='adam.dlubak@gmail.com'):
    """Run a file through the remote tagging pipeline and save the result.

    The file is uploaded, processed with the pipeline
    any2txt -> wcrft2 -> liner2 -> wsd, and the first result file is
    downloaded and written next to the input with a ``.ccl`` extension.

    Args:
        filename: path of the file to process.
        user: user identifier passed to the service (defaults to the address
            that was previously hard-coded in the call).

    Raises:
        RuntimeError: if the remote task finishes with an error, i.e. when
            ``tool`` returns ``None``.
    """
    file_id = upload(filename)

    # Combine the uploaded file's identifier with the processing pipeline.
    lpmn = 'file('+file_id+')|any2txt|wcrft2({"morfeusz2":false})|liner2({\"model\":\"all\"})|wsd({"use_mwe":true})'
    result = tool(lpmn, user)
    if result is None:
        # Fail with a clear message instead of a TypeError from indexing None.
        raise RuntimeError("NLP REST task failed for " + filename)

    download_path = result[0]["fileID"]
    with urllib.request.urlopen(url + '/download' + download_path) as response:
        content = response.read()
    # The download is bytes, so the output file must be opened in binary mode.
    with open(os.path.splitext(filename)[0] + '.ccl', "wb") as outfile:
        outfile.write(content)

MWEConverter Functions
--------------------

In [None]:
def convert(self, ccl_file, out_mwe_file, annots_used=False):
    """Read ``ccl_file`` through the MWE reader and write it to ``out_mwe_file``.

    When ``self.reader`` is still unset, a ``CclMWEReader`` is created for the
    file with ``self.tagset`` and a fixed resource path, and its annotation
    use is set from ``annots_used``. An existing reader is only pointed at the
    new file; ``annots_used`` is not applied again in that case.
    """
    if self.reader:
        self.reader.set_files(ccl_file)
    else:
        resource_path = '/usr/local/share/corpus2mwe/thes-v3.xml'
        self.reader = mwe.CclMWEReader(ccl_file, self.tagset, resource_path)
        self.reader.use_annotations(annots_used)
    ccl.write_ccl(self.reader.read(), out_mwe_file)

Data Extraction Functions
-----------------

In [None]:
def getCase(pos):
    """Return the grammatical case of a noun tag, or "" for any other tag.

    Args:
        pos: comma-separated tag symbols whose first element is the part of
            speech, e.g. "subst,sg,nom,m1".

    Returns:
        str: for a "subst" tag, the first symbol that is one of the case
        values nom/gen/dat/acc/inst/loc/voc. If none is present the second
        symbol is returned (the previous positional behaviour), or "" when
        the tag has no attributes at all. Non-"subst" tags give "".
    """
    case_symbols = ("nom", "gen", "dat", "acc", "inst", "loc", "voc")
    symbols = pos.split(",")
    if symbols[0] != "subst":
        return ""
    # Select by value, not position: later cells compare the result with
    # "nom"/"gen", which a number symbol such as "sg" in slot 1 never matches.
    for symbol in symbols[1:]:
        if symbol in case_symbols:
            return symbol
    # A bare "subst" used to raise IndexError here.
    return symbols[1] if len(symbols) > 1 else ""

In [None]:
def _find_token(sentence, token):
    """Return the position of ``token`` among ``sentence.tokens()``.

    Tokens are compared with ``is_same``; the first match wins.

    Raises:
        ValueError: if no token of the sentence matches.
    """
    matches = (
        position
        for position, candidate in enumerate(sentence.tokens())
        if candidate.is_same(token)
    )
    found = next(matches, None)
    if found is None:
        raise ValueError("Token does not belong to sentence.")
    return found

def get_annotations(sentence, token):
    """
    Look up the segment of a token in every annotation channel of a sentence.

    A sentence without ``all_channels`` is first wrapped with
    ``annotate_sentence``.

    Args:
        sentence (Corpus2.sentence)
        token (Corpus2.token): must be one of the sentence's tokens

    Returns:
        Dict[str, int]: channel name mapped to the segment found at the
        token's position

    Raises:
        ValueError: if the token is not part of the sentence.
    """
    try:
        sentence.all_channels()
    except AttributeError:
        sentence = annotate_sentence(sentence)

    position = _find_token(sentence, token)
    # Only the channel names are kept from all_channels(); using the returned
    # mapping directly causes invalid references, so every channel is fetched
    # again by name.
    channel_names = list(sentence.all_channels())
    annotations = {}
    for channel_name in channel_names:
        channel = sentence.get_channel(channel_name)
        annotations[channel_name] = channel.get_segment_at(position)
    return annotations

Load Data
----------

In [None]:
# The boolean `load_data_to_test` from the configuration cell cannot be read
# here: the later `def load_data_to_test()` rebinds that name to a function,
# and a function is always truthy, so `if load_data_to_test:` always chose the
# JSON subset. A separately named flag carries the choice instead; its default
# mirrors the configured value (False).
use_test_subset = False  # set to True to load the questions from `json_file`

if use_test_subset:
    dataset = load_data_to_test()
else:
    dataset = load_all_data()

Prepare data structures
---------

In [None]:
# Empty results table. NOTE: the "Level 1" cell further down rebuilds `results`
# with two additional columns ("Case" and "Role").
results = pd.DataFrame(columns = ["Orth", "Base", "Ctag", "Description"])

In [None]:
# Lookup table pairing each Polish question word with a description of what
# such a question asks about. The "Level 2" cell matches lemmas against the
# first column and copies the second column into the results.
# Fixed: "Częstotliwość" was misspelled as "Częstotoliwość" in both rows.
_question_types = [
    ("kto", "Określenie podmiotu / osoby"),
    ("co", "Określenie rzeczy / zwierzęcia / stanu / pojęć"),
    ("który", "Określenie jednego spośród wielu"),
    ("jaki", "Określenie cechy elementu"),
    ("kiedy", "Czas"),
    ("gdzie", "Miejsce"),
    ("jak", "Sposób"),
    ("jak często", "Częstotliwość"),
    ("jak rzadko", "Częstotliwość"),
    ("którędy", "Droga ruchu"),
    ("skąd", "Początek ruchu"),
    ("dokąd", "Cel ruchu"),
    ("ile", "Liczność"),
    ("czyje", "Własność"),
    ("czemu", "Powód"),
    ("czy", "Tak/Nie"),
    ("czyj", "Przynależność"),
    ("dlaczego", "Powód"),
]
table_df = pd.DataFrame(_question_types, columns=['1. Question', '2. Description'])
display(table_df)

Working Part
=========

Tagger process
-------------

In [None]:
# Write the question text to the function's default output file (question.csv).
save_question_to_file(question_id)

In [None]:
# Send question.csv through the remote pipeline; tagging() stores the
# downloaded result under the same base name with a .ccl extension.
tagging('question.csv')
# Run the MWE converter over the tagged file, then load its output as `doc`,
# which the "Level 1" and "Level 4" cells iterate over.
converter = MWEConverter(tagset='nkjp')
converter.convert('question.ccl', 'question-result.ccl')
doc = ccl.read_ccl('question-result.ccl')

Level 1 - Extract information about ctag, case and tags
---------------

In [None]:
# Attribute mask requested with an empty attribute name; it only feeds the
# `t1_pos_*` values computed at the end of the token loop.
full_pos_mask = corpus2.get_attribute_mask(tagset, '')

wn = plwn.load_default()
# 'nam' segment number -> set of token texts seen in that segment.
nes = defaultdict(set)

# One row is appended per token, in document order.
results = pd.DataFrame(columns = ["Orth", "Base", "Ctag", "Case", "Role", "Description"])


def _join_span(tokens):
    """Join token texts with spaces, then remove the space before '.', ',' and ':'."""
    text = ' '.join(tokens)
    for mark in (".", ",", ":"):
        text = text.replace(" " + mark, mark)
    return text


# State of the 'nam' span currently being collected.
double_token = []
double_token_description = []
previous_annotation_is_correct = 0
# State of the 'timex' span currently being collected.
time_double_token = []
time_double_token_description = []
previous_time_annotation_is_correct = 0

for par in doc.paragraphs():
    for sent in par.sentences():
        for token in sent.tokens():
            annotations = get_annotations(sent, token)
            lexeme = token.get_preferred_lexeme(tagset).lemma()
            orth = token.orth()

            if 'nam' in annotations:
                if annotations['nam'] > 0:
                    # Token inside a 'nam' span: collect it and add a row that
                    # only carries the orth; Base/Description are filled in on
                    # the span's last row once the span has ended.
                    previous_annotation_is_correct = 1
                    nes[annotations['nam']].add(token.orth_utf8())
                    double_token.append(token.orth_utf8())
                    # Longest channel name among the channels whose segment is 1.
                    double_token_description = max([k for k, v in annotations.items() if v == 1], key=len)
                    results.loc[len(results)] = [orth, "", "", "", "", ""]
                    continue
                elif previous_annotation_is_correct > 0:
                    # The span ended on the previous token: store the joined
                    # text on the last row. A single .loc[row, column] call is
                    # used because the chained `.loc[row]["col"] = ...` form
                    # may assign to a temporary copy and lose the value.
                    last_row = len(results) - 1
                    results.loc[last_row, "Base"] = _join_span(double_token)
                    results.loc[last_row, "Description"] = double_token_description

                    double_token = []
                    double_token_description = []
                    previous_annotation_is_correct = 0

            if 'timex' in annotations:
                if annotations['timex'] > 0:
                    # Same collection scheme as for 'nam', on the 'timex' channel.
                    previous_time_annotation_is_correct = 1
                    time_double_token.append(token.orth_utf8())
                    time_double_token_description = max([k for k, v in annotations.items() if v == 1], key=len)
                    results.loc[len(results)] = [orth, "", "", "", "", ""]
                    continue
                elif previous_time_annotation_is_correct > 0:
                    last_row = len(results) - 1
                    results.loc[last_row, "Base"] = _join_span(time_double_token)
                    results.loc[last_row, "Description"] = time_double_token_description

                    time_double_token = []
                    time_double_token_description = []
                    previous_time_annotation_is_correct = 0

            # `== 1` rather than `is 1`: identity comparison against an int
            # literal depends on interpreter caching and triggers a
            # SyntaxWarning on Python 3.8+.
            if 'mwe' in annotations and annotations['mwe'] == 1:
                # Use the multi-word base from the token metadata; blank the
                # lemma when the metadata or the attribute is missing.
                if token.has_metadata():
                    md = token.get_metadata()
                    if not md.has_attribute('mwe_base'):
                        lexeme = ""
                    else:
                        lexeme = md.get_attribute('mwe_base')
                else:
                    lexeme = ""

            if token.has_metadata():  # the token carries <prop> metadata
                md = token.get_metadata()
                if md.has_attribute('sense:ukb:syns_id'):
                    sense = md.get_attribute('sense:ukb:syns_id')
                    # NOTE(review): the lookup result is not used here.
                    wn.synset_by_id(sense)

            tag = token.get_preferred_lexeme(tagset).tag()
            pos = tagset.tag_to_symbol_string(tag)

            results.loc[len(results)] = [orth,
                                         lexeme,
                                         convert_to_coarse_pos(pos.split(",")[0]),
                                         getCase(pos),
                                         "",
                                         ""]
            # NOTE(review): these two values are not read anywhere in this cell.
            t1_pos_mask = tag.get_masked(full_pos_mask)
            t1_pos_str = tagset.tag_to_symbol_string(t1_pos_mask)

# NOTE(review): a span that is still open when the last token has been
# processed is never written to Base/Description.
pretty_print(results)

Level 2 - Extract information about question type
---------------

In [None]:
# Tag the question type. A leading "jak" followed by "często"/"rzadko" is
# labelled as a frequency question on the first row. Otherwise each of the
# first three lemmas is looked up among the question words of `table_df`, and
# a hit marks the row as an interrogative pronoun with its description.
role_col = results.columns.get_loc('Role')
description_col = results.columns.get_loc('Description')

if str(results['Base'][0]) in ("jak", "Jak") and str(results['Base'][1]) in ("często", "rzadko"):
    results.iloc[0, role_col] = "Pytanie o Częstotliwość" + "\n"
else:
    # Question word -> description; a repeated word keeps its last description.
    descriptions_by_word = {
        str(word): text
        for word, text in zip(table_df['1. Question'], table_df['2. Description'])
    }
    for idx, base in enumerate(results['Base'].head(3)):
        key = str(base)
        if key in descriptions_by_word:
            results.iloc[idx, role_col] = "Zaimek Pytający"
            results.iloc[idx, description_col] = "Pytanie o " + descriptions_by_word[key] + "\n"

pretty_print(results)

Level 3 - Determine the subject and the predicate
---------------

In [None]:
# Merge an auxiliary verb ("zostać"/"być") with the next verb that follows it:
# the auxiliary's Base is blanked and the later verb's Base becomes
# "<auxiliary> <verb>".
saved_base = ""
base_col = results.columns.get_loc('Base')

for idx, (base, ctag) in enumerate(zip(results["Base"], results["Ctag"])):
    if ctag != "verb":
        continue

    lemma = str(base)
    if lemma in ("zostać", "być"):
        # Remember the auxiliary until another verb shows up.
        saved_base, saved_idx = lemma, idx
    elif saved_base != "":
        results.iloc[saved_idx, base_col] = ""
        results.iloc[idx, base_col] = saved_base + " " + lemma
        saved_base = ""
        saved_idx = ""

In [None]:
role_col = results.columns.get_loc('Role')

# The first verb row with a non-empty Base is the predicate ("Orzeczenie").
for idx, (base, ctag) in enumerate(zip(results["Base"], results["Ctag"])):
    if base != "" and ctag == "verb":
        results.iloc[idx, role_col] = "Orzeczenie"
        break

# Every row that has a Base, a nominative or genitive Case and no Role yet is
# provisionally marked "P".
for idx, (base, case, role) in enumerate(zip(results["Base"], results["Case"], results["Role"])):
    if base != "" and case in ("nom", "gen") and role == "":
        results.iloc[idx, role_col] = "P"

pretty_print(results)

In [None]:
def _promote_first_subject(frame):
    """Rename the first "P" role to "Podmiot" and blank every later "P".

    Returns 1 when a "P" row was found, otherwise 0.
    """
    role_col = frame.columns.get_loc("Role")
    found = 0
    for idx, role in enumerate(frame["Role"]):
        if role != "P":
            continue
        frame.iloc[idx, role_col] = "Podmiot" if found == 0 else ""
        found = 1
    return found


status = _promote_first_subject(results)

# No provisional subject was found: fall back to rows whose Description
# contains "nam" and mark them "P".
if status == 0:
    for idx, desc in enumerate(results["Description"]):
        if "nam" in desc:
            results.iloc[idx, results.columns.get_loc('Role')] = "P"

# Second pass; it only changes something when the fallback above added "P" rows.
status = _promote_first_subject(results)

pretty_print(results)

Level 4 - Determine meaning of words
---------------

In [None]:
# Append word-sense descriptions to the Description column of nouns, verbs and
# adverbs that carry a 'sense:ukb:syns_rank' attribute.
#
# `idx` is the row of `results` that belongs to the current token. `results`
# holds one row per token across all paragraphs and sentences, so the counter
# is initialised once, before the loops. Resetting it at the start of every
# sentence pointed at the wrong rows from the second sentence onwards.
idx = 0
ctag_col = results.columns.get_loc("Ctag")
description_col = results.columns.get_loc("Description")

for par in doc.paragraphs():
    for sent in par.sentences():
        for token in sent.tokens():
            if token.has_metadata():
                md = token.get_metadata()
                ctag = results.iloc[idx, ctag_col]
                if md.has_attribute('sense:ukb:syns_rank') and ctag in ("noun", "verb", "adv"):
                    # The attribute is a space-separated list of "<synset id>/<score>" items.
                    sense_attribute = md.get_attribute('sense:ukb:syns_rank').split(" ")
                    synset_ids = [item.split("/")[0] for item in sense_attribute]
                    synset_probabilities = [round(float(item.split("/")[1]), 2) for item in sense_attribute]

                    for (synset_id, synset_probability) in zip(synset_ids, synset_probabilities):
                        # Look the synset up once instead of once per field.
                        unit = wn.synset_by_id(synset_id).to_dict()['units'][0]
                        description = unit['definition']
                        if description is None or len(description) < 5:
                            # Missing or very short definition: show the domain instead.
                            description = unit['domain']
                        # str() keeps the concatenation working if the domain is missing too.
                        synset_desc = str(synset_probability) + " | " + "Wariant: " + str(unit['variant']) + " | " + str(description)
                        results.iloc[idx, description_col] = results.iloc[idx, description_col] + synset_desc + "\n"
            idx += 1

pretty_print(results)