# Chaining search



## Sphinx documentation: https://pythonhosted.org/an_example_pypi_project/sphinx.html
## TODO: include handy Python functions in the examples,
## such as .sort_values(ascending=False, by=['raw_freq']), list, etc.


## Library functions: Search
 

In [None]:
from nltk.tag.perceptron import PerceptronTagger

def filter_condition(df, column, method, regex_or_set):
    '''
    Help function building a boolean condition to filter a Pandas DataFrame, according to a set of parameters
    Args:
        df: Pandas DataFrame to filter on
        column: name of the column on which we filter
        method: "contains_regex" or "isin_set"
        regex_or_set: regular expression (if method=="contains_regex") or set of values (if method=="isin_set")
    Returns:
        a boolean Pandas Series (one value per row of df) which can be used as df[condition]
    Raises:
        ValueError: if method is not one of the two supported values

    >>> condition = filter_condition(df_corpus, "word 0", "contains_regex", "^boek")
    >>> display_df( df_corpus[condition] )
    '''

    if method == "contains_regex":
        return df[column].str.contains(regex_or_set)
    if method == "isin_set":
        return df[column].isin(regex_or_set)
    # Name the values that are actually accepted, so the caller can fix the call right away
    raise ValueError('method should be one of "contains_regex" or "isin_set", got: ' + repr(method))
    
    

def concat_df(df_arr, keys_arr=None):
    '''
    This function stacks several dataframes on top of each other (row-wise concatenation)
    Args:
        df_arr: array of Pandas DataFrames
        keys_arr: (optional) array of keys, one per DataFrame; the keys become an extra outer
                  index level, so the records of the original DataFrames can still be told apart
    Returns:
        a single Pandas DataFrame

    >>> new_df = concat_df( [dataframe1, dataframe2, dataframe3], ['chn corpus', 'nederlab', 'opensonar'] )
    >>> display_df(new_df)
    '''
    # ref: https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html

    # Without keys, a plain concatenation is all we need
    if keys_arr is None:
        return pd.concat(df_arr)

    return pd.concat(df_arr, keys=keys_arr)



def join_df(df_arr, join_type=None):
    '''
    This function puts several dataframes side by side (= concatenation along axis 1)
    Args:
        df_arr: array of Pandas DataFrames
        join_type: (optional) {inner, outer}; when omitted, the default of pandas.concat applies
    Returns:
        a single Pandas DataFrame

    >>> new_df = join_df( [dataframe1, dataframe2] )
    >>> display_df(new_df)
    '''
    # ref: https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html

    # Only pass 'join' along when the caller asked for a specific join type
    concat_options = {"axis": 1}
    if join_type is not None:
        concat_options["join"] = join_type

    return pd.concat(df_arr, **concat_options)

    
    

def get_tagger(df_corpus):
    '''
    This function instantiates a tagger and trains it with the annotations of some corpus data
    Args:
        df_corpus: Pandas DataFrame with annotated corpus data; it must contain
                   'word N' and 'pos N' columns for every token position N
    Returns:
        a trained PerceptronTagger instance

    >>> tagger = get_tagger(df_corpus)  # df_corpus contains a Pandas DataFrame with lots of corpus data
    >>> sentence = 'Here is some beautiful sentence'
    >>> tagged_sentence = tagger.tag( sentence.split() )
    >>> print(tagged_sentence)

    '''

    # Each row of the corpus DataFrame is one sentence with a fixed number of tokens, and each
    # token comes with a fixed number of layers (lemma, wordform, part-of-speech, ...).
    # So: [number of columns] = [number of tokens] x [number of layers]

    # Column names look like 'lemma 1', 'lemma 2', ..., 'pos 1', 'pos 2', ...
    # Dropping everything after the first space leaves the bare layer names,
    # and counting the distinct ones gives the number of layers.
    layer_names = {column_name.split(' ')[0] for column_name in df_corpus.columns.values}
    number_of_layers = len(layer_names)

    # The number of tokens per sentence follows from the relation above
    nr_of_words_per_sentence = int( df_corpus.shape[1] / number_of_layers )

    # Build the training data in the format the tagger expects:
    # [ [('today','NN'),('is','VBZ'),('good','JJ'),('day','NN')], [...] ]
    training_sentences = []
    for _, row in df_corpus.iterrows():
        tagged_tokens = []
        is_complete = True
        for position in range(nr_of_words_per_sentence):
            word = row['word ' + str(position)]
            pos_tag = row['pos ' + str(position)]
            tagged_tokens.append( (word, pos_tag) )
            # A sentence with a missing word or tag is of no use for training
            if word is None or pos_tag is None:
                is_complete = False
        if is_complete:
            training_sentences.append(tagged_tokens)

    # Instantiate and train the tagger now
    tagger = PerceptronTagger(load=False)
    tagger.train(training_sentences)

    return tagger

In [2]:
import requests
from collections import defaultdict
import pandas as pd
import xml.etree.ElementTree as ET
import json
import urllib
#import wx   # for interaction popups          TODO -> convert to JS or similar
import itertools # for frequency list function and from_iterable
import numpy     # idem
from IPython.display import FileLink, FileLinks
# Maps each supported corpus name to the URL of its server.
# The keys are used to validate corpus names (search_corpus, get_available_metadata) and
# to fill the corpus dropdown of the UI; the URL itself is requested by _corpus_metadata_blacklab.
# 'nederlab' has no URL: get_available_metadata handles it as a special case.
# NOTE(review): several entries look like internal hosts/IP addresses - confirm they are
# reachable from wherever this notebook is run.
AVAILABLE_CORPORA = {'chn':'http://svprmc05.inl.nl/blacklab-server/chn',
                     'opensonar':'http://172.16.10.93:8080/blacklab-server/opensonar',
                     'zeebrieven':'http://svprmc20.ivdnt.org/blacklab-server/zeebrieven',
                     'gysseling':'http://svprmc20.ivdnt.org/blacklab-server/gysseling',
                     'nederlab':''}
# Presumably the number of records requested per corpus response page - TODO confirm
RECORDS_PER_PAGE = 1000

# Fields parsed by default from corpus xml by _parse_xml
# Extra fields can be given to _parse_xml by users
# Token-level layers which are always captured
DEFAULT_FIELDS_TOKEN = ["word", "lemma", "universal_dependency"]
# Document-level metadata fields which are always captured (none by default)
DEFAULT_FIELDS_DOC = []

# Get rid of ellipsis in display (otherwise relevant data might not be shown)
# Beware: this is a global pandas setting, changed as soon as this code is executed
pd.set_option('display.max_colwidth',1000)



# Search methods

def search_corpus_allwords(corpus, pos=None):
    '''
    This function gets all words of a corpus. If needed, the output can be restricted to words with a given part-of-speech
    Args:
        corpus: corpus name
        pos: part-of-speech (optional)
    Returns:
        a Pandas DataFrame containing corpus data

    >>> df_corpus = search_corpus_allwords("gysseling")
    >>> display_df(df_corpus)
    '''

    # Match any word; add a part-of-speech constraint only when one was given
    if pos is None:
        query = r'[word=".*"]'
    else:
        query = r'[word=".*" & pos="' + pos + r'"]'

    return search_corpus(query, corpus)

def search_corpus_alllemmata(corpus, pos=None):
    '''
    This function gets all lemmata of a corpus. If needed, the output can be restricted to lemmata with a given part-of-speech
    Args:
        corpus: corpus name
        pos: part-of-speech (optional); when omitted, lemmata of any part-of-speech are returned
    Returns:
        a Pandas DataFrame containing corpus data

    >>> df_corpus = search_corpus_alllemmata("chn")
    >>> display_df(df_corpus)
    '''

    # Match any lemma; add a part-of-speech constraint only when one was given
    query = r'[lemma=".*"]'
    if pos is not None:
        query = r'[lemma=".*" & pos="'+pos+r'"]'
    return search_corpus(query, corpus)

def search_corpus(query, corpus, start_position=1, detailed_context=False, extra_fields_doc=None, extra_fields_token=None):
    '''
    This function searches a corpus given a query and a corpus name.
    All result pages are fetched one after another and combined into one DataFrame.
    Args:
        query: a corpus query, eg. previously generated by corpus_query_lemma() or such
        corpus: a corpus name (one of the keys of AVAILABLE_CORPORA)
        start_position: (optional) position of the first record to fetch (1 = start at the first result)
        detailed_context: (optional) {True, False (default)}; True to get the layers of all tokens
                          of each result, False to get the layers of the hits only
        extra_fields_doc: (optional) list of document metadata fields to parse, on top of DEFAULT_FIELDS_DOC
        extra_fields_token: (optional) list of token layers to parse, on top of DEFAULT_FIELDS_TOKEN
    Returns:
        a Pandas DataFrame containing corpus data
    Raises:
        ValueError: if the corpus is unknown, or if anything goes wrong while requesting or parsing

    >>> df_corpus = search_corpus(r'[pos="ADJ"][word="huis"]', "chn")
    >>> display_df(df_corpus)
    '''

    # Local import: a bare 'import urllib' does not guarantee that the 'parse' submodule is loaded
    from urllib.parse import quote

    # Validate before showing any wait indicator, so no indicator is left behind on screen
    if corpus not in AVAILABLE_CORPORA:
        raise ValueError("Unknown corpus: " + str(corpus))

    # Work on fresh lists, so no default value is ever shared between calls
    extra_fields_doc = [] if extra_fields_doc is None else list(extra_fields_doc)
    extra_fields_token = [] if extra_fields_token is None else list(extra_fields_token)

    pages = []
    position = start_position
    try:
        while True:
            # show wait indicator
            show_wait_indicator('Searching '+corpus+ ' at page '+str(position))
            try:
                # Do request to federated content search corpora, so we get same output format for every corpus.
                # startRecord tells the endpoint where this page starts: without it,
                # every request would return the first page again.
                url = ("http://portal.clarin.inl.nl/fcscorpora/clariah-fcs-endpoints/sru?operation=searchRetrieve&queryType=fcs"
                       + "&maximumRecords=" + str(RECORDS_PER_PAGE)
                       + "&startRecord=" + str(position)
                       + "&x-fcs-context=" + corpus
                       + "&query=" + quote(query))
                response_text = requests.get(url).text
                df_page, next_page = _parse_xml(response_text, detailed_context, extra_fields_doc, extra_fields_token)
            finally:
                remove_wait_indicator()

            # show message out of xml, if some error has occured (prevents empty output)
            _show_error_if_any(response_text)

            pages.append(df_page)

            # next_page is 0 when there is no page left; also stop if the endpoint
            # does not move forward, so we can never loop forever
            if next_page <= position:
                break
            position = next_page
    except Exception as e:
        raise ValueError("An error occured when searching corpus " + corpus + ": "+ str(e)) from e

    # DataFrame.append no longer exists in recent pandas versions; concat does the same job
    return pd.concat(pages, ignore_index=True)
     

    
    
def search_corpus_multiple(queries, corpus):
    '''
    This function sends several queries to the search_corpus function, one after another
    Args:
        queries: array of corpus queries, eg. previously generated by corpus_query_lemma() or such
        corpus: a corpus name
    Returns:
        a dictionary of Pandas DataFrames, associating each query (key) to the resulting corpus data (value)
    '''
    return {one_query: search_corpus(one_query, corpus) for one_query in queries}
   
    

def search_lexicon_alllemmata(lexicon, pos=None):
    '''
    This function gets all lemmata of a lexicon. If needed, the output can be restricted to lemmata with a given part-of-speech
    Args:
        lexicon: a lexicon name
        pos: part-of-speech (optional)
    Returns:
        a Pandas DataFrame containing lexicon data

    >>> df_lexicon = search_lexicon_alllemmata("molex", "NOUN")
    >>> display_df(df_lexicon)
    '''
    # Build the 'all lemmata' query for this lexicon and run it right away
    return search_lexicon(lexicon_query_alllemmata(lexicon, pos), lexicon)



def search_lexicon(query, lexicon):
    '''
    This function searches a lexicon given a query and a lexicon name
    Args:
        query: a lexicon query, typically previously generated by lexicon_query() or such
        lexicon: a lexicon name
    Returns:
        a Pandas DataFrame with lexicon data: one row per result, one column per query variable,
        each cell holding the value as a string ('' when a result has no value for that variable)
    Raises:
        ValueError: if anything goes wrong while requesting or parsing the lexicon data

    '''
    # show wait indicator, so the user knows what's happening
    show_wait_indicator('Searching '+lexicon)

    # default endpoint, except when diamant is invoked
    endpoint = "http://172.16.4.56:8890/sparql"
    if (lexicon=="diamant"):
        endpoint = "http://svprre02:8080/fuseki/tdb/sparql"

    try:
        # Accept header is needed for virtuoso, it isn't otherwise!
        response = requests.post(endpoint, data={"query":query}, headers = {"Accept":"application/sparql-results+json"})
        # Fail with a clear HTTP error, instead of a confusing JSON parse error on an error page
        response.raise_for_status()

        bindings = json.loads(response.text)["results"]["bindings"]

        # Columns = all variables occurring in the results, in order of first appearance
        columns = list(dict.fromkeys(variable for binding in bindings for variable in binding))

        # Each cell of a binding is a dictionary: we only need its "value".
        # A variable missing in some result gets '', otherwise we'd end up with ill-formed data.
        rows = []
        for binding in bindings:
            cells = (binding.get(variable) for variable in columns)
            rows.append(['' if cell is None else cell["value"] for cell in cells])

        return pd.DataFrame(rows, columns=columns)
    except Exception as e:
        raise ValueError("An error occured when searching lexicon " + lexicon + ": "+ str(e)) from e
    finally:
        # remove wait indicator, whether the search succeeded or not
        remove_wait_indicator()
        

# Processing methods

def column_difference(df_column1, df_column2):
    '''
    This function computes differences and similarities between two columns of Pandas DataFrames
    Args:
        df_column1: a Pandas DataFrame, filtered by one column
        df_column2: a Pandas DataFrame, filtered by one column
    Returns:
        diff_left: set of words only in df_column1
        diff_right: set of words only in df_column2
        intersec: set of words both in df_column1 and df_column2

    >>> diff_left, diff_right, intersec = column_difference(df_corpus1["word 1"], df_corpus2["word 1"])
    >>> display( 'These words are only in DataFrame #1 : ' + ", ".join(diff_left) )
    >>> display( 'These words are only in DataFrame #2 : ' + ", ".join(diff_right) )
    >>> display( 'These words are common to both DataFrame : ' + ", ".join(intersec) )
    '''

    # Turn both columns into sets, so duplicates vanish and set arithmetic does the work
    left_values = set(df_column1)
    right_values = set(df_column2)

    only_left = left_values - right_values
    only_right = right_values - left_values
    in_both = left_values & right_values

    return only_left, only_right, in_both

def diamant_get_synonyms(df):
    '''
    This function gets lemmata and/or definitions out of a Pandas DataFrame with Diamant data.
    Which of both a row contributes depends on the value in its 'inputMode' column.

    Args:
        df: a Pandas DataFrame containing Diamant data
    Returns:
        a set containing lemmata (rows with inputMode 'defText') and
        definition texts (rows with inputMode 'lemma')

    >>> query = lexicon_query(word=search_word, pos= '', lexicon=lexicon)
    >>> df_lexicon = search_lexicon(query, lexicon)
    >>> syns = diamant_get_synonyms(df_lexicon)
    >>> display( 'Synoniemen voor ' + search_word + ': ' + ", ".join(syns) )
    '''

    input_mode = df["inputMode"]

    # Rows of type 'defText' deliver a lemma, rows of type 'lemma' deliver a definition text
    written_reps = df.loc[input_mode == "defText", "n_ontolex_writtenRep"]
    definition_texts = df.loc[input_mode == "lemma", "n_syndef_definitionText"]

    return set(written_reps).union(definition_texts)


def _parse_xml(text, detailed_context=False, extra_fields_doc=None, extra_fields_token=None):
    '''
    This function converts the XML output of a corpus search into a Pandas DataFrame for further processing.
    The outcome does not depend on the order in which the data views (metadata, hits, token layers)
    occur within a result.

    Args:
        text: the XML response of a corpus search, as a string
        detailed_context: (optional) True to parse the layers of all tokens, False to limit detailed parsing to hits
        extra_fields_doc: (optional) list of document metadata fields to parse, on top of DEFAULT_FIELDS_DOC
        extra_fields_token: (optional) list of token layers to parse, on top of DEFAULT_FIELDS_TOKEN
    Returns:
        df: a Pandas DataFrame representing the parse results. When detailed_context is False, the
        token and metadata columns are enclosed by a 'left context' and a 'right context' column.
        next_pos: the next result page to be parsed (since the results might be spread among several XML response pages),
        or 0 if there is no page left to be parsed
    '''

    ns_resource = "{http://clarin.eu/fcs/resource}"
    ns_hits = "{http://clarin.eu/fcs/dataview/hits}"
    ns_adv = "{http://clarin.eu/fcs/dataview/advanced}"

    # TODO: should we secure against untrusted XML?
    root = ET.fromstring(text)

    hits_only = detailed_context is False
    fields_token = DEFAULT_FIELDS_TOKEN + ([] if extra_fields_token is None else list(extra_fields_token))
    fields_doc = DEFAULT_FIELDS_DOC + ([] if extra_fields_doc is None else list(extra_fields_doc))

    # In 'hits only' mode we only want the highlighted spans, otherwise all spans
    span_path = ".//" + ns_adv + "Span"
    if hits_only:
        span_path = span_path + "[@highlight='h1']"

    records = []
    cols = []
    n_tokens = 0
    computed_nt = False

    for entry in root.iter(ns_resource + "ResourceFragment"):

        # ----- [part 1] -----
        # gather everything this result has to offer, whatever the order of its data views

        doc_metadata = {}
        left_context = ''
        right_context = ''
        skip_entry = False
        # one item per token layers data view: (layers dictionary, size of first captured layer)
        token_views = []

        for dataView in entry.findall(ns_resource + "DataView"):
            view_type = dataView.get("type")

            # Parse document metadata
            if view_type == "application/x-clariah-fcs-simple-metadata+xml":
                for keyval in dataView.findall("keyval"):
                    key = keyval.get("key")
                    if key in fields_doc:
                        doc_metadata[key] = keyval.get("value")

            # Hits with their left and right context (of no use when all tokens are parsed in detail)
            elif view_type == "application/x-clarin-fcs-hits+xml":
                if not hits_only:
                    continue
                result = dataView.find(ns_hits + "Result")
                hits = list(result) if result is not None else []
                if len(hits)==0:
                    if result is not None:
                        print([w for w in result.itertext()])
                    print("no hit in kwic, skip")
                    # without hit there is nothing to report: drop the whole result
                    skip_entry = True
                    continue
                left_context = result.text if result.text is not None else ''
                last_hit = hits[-1]
                right_context = last_hit.tail if last_hit.tail is not None else ''

            # Layers (lemma, word, ...) of each hit or of each token
            elif view_type == "application/x-clarin-fcs-adv+xml":
                hit_layer = defaultdict(list)
                first_layer_size = None
                for layer in dataView.findall(".//" + ns_adv + "Layer"):
                    layer_id = layer.get("id").split("/")[-1]
                    # Only capture this layer, if it is in the list of designated fields (default+extra by user)
                    if layer_id not in fields_token:
                        continue
                    for one_span in layer.findall(span_path):
                        hit_layer[layer_id].append(one_span.text)
                    if first_layer_size is None:
                        first_layer_size = len(hit_layer.get(layer_id, ()))
                token_views.append( (hit_layer, first_layer_size) )

        if skip_entry:
            continue

        # ----- [part 2] -----
        # turn what we gathered into one flat record per token layers data view

        for hit_layer, first_layer_size in token_views:
            # Compute number of tokens only once: all records must have the same columns
            if not computed_nt and first_layer_size is not None:
                n_tokens = first_layer_size
                computed_nt = True
            data, cols = _combine_layers(hit_layer, n_tokens, doc_metadata_req=fields_doc, doc_metadata_recv=doc_metadata)
            if hits_only:
                records.append([left_context] + data + [right_context])
            else:
                records.append(data)

    if hits_only:
        columns = ["left context"] + cols + ["right context"]
    else:
        columns = cols

    next_pos = 0
    next_record_position = root.find("{http://docs.oasis-open.org/ns/search-ws/sruResponse}nextRecordPosition")
    if (next_record_position is not None):
        next_pos = int(next_record_position.text)
    return pd.DataFrame(records, columns = columns), next_pos

def _combine_layers(hit_layer, n_tokens, doc_metadata_req, doc_metadata_recv):
    '''
    Combine the layers, in alphabetical order of the layer names, to a flat list, with separate column per layer per word in hit, and document metadata added as last columns
    
    Args:
        hit_layer: dictionary with list of items per layer
        n_tokens: number of tokens for which token-level annotations exist.
                    Is equal to total number of tokens in sentence if _parse_xml is called with detailed_context=True.
                    Is equal to number of tokens in hit if _parse_xml is called with detailed_context=False.
        doc_metadata_req: list of document metadata fields which have been requested
        doc_metadata_recv: dictionary with document metadata that is actually present in hits:
                        can contain less fields than doc_fields_requested
    Returns:
        data: flat list with combined token layers, sorted alphabetically, and document metadata
    '''
    # Sort layer keys to ensure same order of data in every row and column titles
    layers_keys = sorted(hit_layer.keys())
    # Original structure is list of tokens per layer id
    # Arrange items first on token, then on layer_id
    layers_token_flat = [hit_layer[layer_id][n] for n in range(n_tokens) for layer_id in layers_keys]
    # Flatten list of document metadata fields
    # Use all requested fields, some of which may not be available in this hit
    doc_flat = [doc_metadata_recv[field] if field in doc_metadata_recv else "" for field in doc_metadata_req]
    # Combine token and document data
    data = layers_token_flat + doc_flat
    
    ### Columns
    # Create list of columns, in same order
    tokens_columns = [layer_id+ " "+str(n) for n in range(n_tokens) for layer_id in layers_keys]
    # Add all requested document metadata fields as columns
    columns = tokens_columns + doc_metadata_req
    return data, columns

def _show_error_if_any(text):
    '''
    This function reads error messages in the XML output of a lexicon or corpus search 
    and it finds any, it is printed on screen
    
    Args:
        text: the XML response of a lexicon/corpus search, as a string
    Returns:
        N/A
    '''
    root = ET.fromstring(text)
    msgs = []
    for diagnostic in root.iter("{http://docs.oasis-open.org/ns/search-ws/diagnostic}diagnostic"):
        for msg in diagnostic.findall("{http://docs.oasis-open.org/ns/search-ws/diagnostic}message"):
            msg_text = msg.text if msg.text is not None else ''
            msgs.append(msg_text)
    if len(msgs) > 0:
        print("; ".join(msgs))

# View methods


def view_multiple_results(results, labels):
    '''
    This function shows the content of the Pandas DataFrames stored in a dictionary, each one
    preceded by its label (eg. a corpus or lexicon name). Empty DataFrames are left out.
    It is typically called after search_corpus_multiple(), since this function returns such a dictionary.

    Args:
        results: a dictionary of Pandas DataFrames
        labels: list of labels corresponding to the Pandas DataFrames in results (same order, same length)
    Returns:
        N/A

    >>> result_dict = search_corpus_multiple(queries, corpus)
    >>> view_multiple_results(result_dict, labels=list(syns))
    '''
    assert len(labels)==len(results)

    for position, df in enumerate(results.values()):
        # nothing to show for a search without results
        if df.empty:
            continue
        display(HTML('Resultaten voor <b>' + labels[position] + "</b>:"))
        display(df)
            
            
            
def get_frequency_list(lexicon, pos, corpus):
    '''
    This function builds a lemmata frequency list of a corpus,
    given a lexicon (for obvious reasons limited to some part-of-speech).

    Args:
        lexicon: a lexicon name
        pos: a part-of-speech to limit the search to
        corpus: the corpus to be searched
    Returns:
        a Pandas DataFrame with the lemmata as index, raw frequencies ('raw_freq' column)
        and rankings ('rank' column). Lemmata which are not found in the corpus have frequency 0.

    >>> df_frequency_list = get_frequency_list(some_lexicon, "NOUN", corpus_to_search)
    >>> display(df_frequency_list)
    '''

    print('Beware: building a frequency list can take a long time')

    # LEXICON: get a lemmata list to work with
    df_lexicon = search_lexicon_alllemmata(lexicon, pos)
    lexicon_lemmata = sorted( {w.lower() for w in df_lexicon["writtenForm"]} )

    # instantiate a dataframe for storing lemmata and frequencies
    # Every lemma starts at 0: a lemma we never find must not end up with an empty cell,
    # as empty cells can't be ranked and converted to integers afterwards
    df_frequency_list = pd.DataFrame(0, index=pd.Index(lexicon_lemmata, name='lemmata'), columns=['raw_freq'])

    # CORPUS: loop through lemmata list, query the corpus with that lemma, and count the results

    # It's a good idea to work with more than one lemma at once!
    nr_of_lemmata_to_query_atonce = 100

    # loop over lemmata list
    for i in range(0, len(lexicon_lemmata), nr_of_lemmata_to_query_atonce):
        # slice to small lists of lemmata to query at once
        small_lemmata_list = lexicon_lemmata[i : i+nr_of_lemmata_to_query_atonce]

        # join lemmata to send them in a query all at once
        # beware: single quotes need escaping
        lemmata_list = "|".join(small_lemmata_list).replace("'", "\\\\'")
        df_corpus = search_corpus(r'[lemma="' + lemmata_list + r'"]', corpus)

        if len(df_corpus) == 0:
            continue

        # store frequencies
        # count all lemmata of the results in one pass, instead of filtering the results once per lemma
        lemma_counts = df_corpus['lemma 0'].value_counts()
        for one_lemma in small_lemmata_list:
            df_frequency_list.at[one_lemma, 'raw_freq'] = int(lemma_counts.get(one_lemma, 0))

    # final step: compute rank
    # this is needed to be able to compare different frequency lists
    # with each other (which we could achieve by computing a rank diff)
    # Beware: lemmata with equal frequencies share the average of their ranks, cut off to an integer
    df_frequency_list['rank'] = df_frequency_list['raw_freq'].rank(ascending = False).astype(int)

    return df_frequency_list


def get_missing_wordforms(lexicon, pos, corpus):
    '''
    This function gathers all paradigms of a lexicon with a given part-of-speech
    and searches an annotated corpus for wordforms missing in those paradigms

    Args:
        lexicon: a lexicon name
        pos: a part-of-speech to limit the search to
        corpus: the corpus to be searched
    Returns:
        a Pandas DataFrame associating lemmata to their paradigms ('known_wordforms' column) and
        missing wordforms found in the corpus ('unknown_wordforms' column).
        Only lemmata with at least one missing wordform are part of the output.

    >>> df = get_missing_wordforms("molex", "VERB", "opensonar")
    >>> df.to_csv( "missing_wordforms.csv", index=False)
    '''

    print('Beware: finding missing wordforms in a lexicon can take a long time')

    # LEXICON: get a sorted list of unique, lowercased lemmata to work with
    df_lexicon = search_lexicon_alllemmata(lexicon, pos)
    all_lemmata = numpy.array( sorted( {w.lower() for w in df_lexicon["writtenForm"]} ) )

    # the results go into a dataframe with one (still empty) row per lemma
    df_enriched_lexicon = pd.DataFrame(index=all_lemmata, columns=['lemma', 'pos', 'known_wordforms', 'unknown_wordforms'])
    df_enriched_lexicon.index.name = 'lemmata'

    # CORPUS: query the corpus with the lemmata and compare its wordforms with those of the lexicon

    # Querying lemma by lemma would take ages, so we send them in batches
    batch_size = 100

    for offset in range(0, len(all_lemmata), batch_size):
        batch = set( all_lemmata[offset : offset+batch_size] )

        # one query for the whole batch
        # beware: single quotes need escaping
        joined_lemmata = "|".join(batch).replace("'", "\\\\'")
        df_corpus = search_corpus(r'[lemma="' + joined_lemmata + r'"]', corpus)

        # no results at all for this batch: on to the next one
        if len(df_corpus) == 0:
            continue

        for one_lemma in batch:

            # look up the known wordforms in the lexicon
            df_known_wordforms = search_lexicon(lexicon_query(one_lemma, pos, lexicon), lexicon)
            if len(df_known_wordforms) == 0:
                continue
            known_wordforms = set( df_known_wordforms['wordform'].str.lower() )

            # find the wordforms of this lemma in the corpus
            lemma_rows = df_corpus[df_corpus['lemma 0'] == one_lemma]
            corpus_wordforms = set( lemma_rows['word 0'].str.lower() )

            # keep the corpus wordforms the lexicon doesn't know about
            unknown_wordforms = corpus_wordforms - known_wordforms
            if len(unknown_wordforms) == 0:
                continue

            # store the results
            df_enriched_lexicon.at[one_lemma, 'lemma'] = one_lemma
            df_enriched_lexicon.at[one_lemma, 'pos'] = pos
            df_enriched_lexicon.at[one_lemma, 'known_wordforms'] = known_wordforms
            df_enriched_lexicon.at[one_lemma, 'unknown_wordforms'] = unknown_wordforms

    # return non-empty results, i.e. the cases in which we found some unknown wordforms
    has_unknown_wordforms = df_enriched_lexicon['unknown_wordforms'].notnull()
    return df_enriched_lexicon[has_unknown_wordforms]
        
    
def get_rank_diff(df1, df2):
    '''
    This function compares the rankings of words common to two dataframes, and compute a rank_diff, in such
    a way that one can see which words are very frequent in one set and rare in the other.

    Args:
        df1: a Pandas DataFrame with lemmata as index and a 'raw_freq' column
        df2: a Pandas DataFrame with lemmata as index and a 'raw_freq' column
    Returns:
        a Pandas DataFrame with lemmata (index), ranks of both input dataframes ('rank_1' and 'rank_2' columns)
        and the rank_diff ('rank_diff' column). The lemmata are in the same order as in df1.

    >>> df_frequency_list1 = get_frequency_list(base_lexicon, "NOUN", corpus_to_search1)
    >>> df_frequency_list2 = get_frequency_list(base_lexicon, "NOUN", corpus_to_search2)
    >>> df_rankdiffs = get_rank_diff(df_frequency_list1, df_frequency_list2)
    '''

    # Find lemmata shared by both dataframes: computing ranks diffs is only possible
    # when dealing with lemmata which are in both frames.
    # Keeping the order of df1 gives the same output every time (a set has no fixed order).
    common_lemmata = df1.index[ df1.index.isin(df2.index) ].unique()

    # Recompute ranks in both dataframes, because in each frame the original ranks were
    # computed with a lemmata list which might be larger than the lemmata list common
    # to both dataframes.
    # The ranks are kept in separate series, so the input dataframes (or slices of them) are never modified.
    rank_1 = df1.loc[ common_lemmata, 'raw_freq' ].rank(ascending = False).astype(int)
    rank_2 = df2.loc[ common_lemmata, 'raw_freq' ].rank(ascending = False).astype(int)

    # Build the dataframe holding lemmata and rank diffs
    df_rankdiffs = pd.DataFrame({'rank_1': rank_1, 'rank_2': rank_2}, index=common_lemmata)
    df_rankdiffs.index.name = 'lemmata'
    df_rankdiffs['rank_diff'] = ( df_rankdiffs['rank_1'] - df_rankdiffs['rank_2'] ).abs()

    return df_rankdiffs


# TODO: Method misses token fields which are extracted from POS tag by FCS (eg. inflection)
def _parse_blacklab_metadata(text):
    '''
    This method parses metadata fields from a Blacklab metadata response
    Args:
        text: the XML response of a lexicon/corpus search, as a string
    Returns:
        A dictionary of with lists of document and token metadata
    '''
    
    # TODO: should we secure against untrusted XML?
    root = ET.fromstring(text)
    doc_fields = [md.get("name") for md in root.iter("metadataField")]
    token_fields = [prop.get("name") for prop in root.iter("property")]
    return {"document": doc_fields, "token": token_fields}
    

def _corpus_metadata_blacklab(corpus_name):
    '''
    Return all possible metadata fields for a BlackLab-based corpus, by requesting
    the URL of the corpus (as found in AVAILABLE_CORPORA) and parsing the response

    Args:
        corpus_name: Name of the corpus
    Returns:
        A dictionary with lists of document and token metadata
    '''
    response = requests.get(AVAILABLE_CORPORA[corpus_name])
    return _parse_blacklab_metadata(response.text)

def get_available_metadata(resource_type, resource_name):
    '''
    Return all possible metadata fields for a lexicon or corpus

    Args:
        resource_type: One of 'lexicon' or 'corpus'
        resource_name: Name of the lexicon or corpus
    Returns:
        For a corpus: a dictionary with lists of document and token metadata fields
        (an empty list for Nederlab, for which no metadata is available yet).
        For a lexicon: the metadata fields derived from a sample query for that lexicon.
    Raises:
        ValueError: if resource_type is neither 'corpus' nor 'lexicon', or if the corpus is unknown
    '''
    if resource_type=="lexicon":
        # Create sample query for this lexicon
        q = lexicon_query(word="", pos="", lexicon=resource_name)
        # NOTE(review): '_etadata_from_lexicon_query' looks like a typo for
        # '_metadata_from_lexicon_query'; its definition is not visible here - confirm and fix
        return _etadata_from_lexicon_query(q)

    if resource_type=="corpus":
        if resource_name=="nederlab":
            print("Corpus metadata not yet available for Nederlab")
            return []
        if resource_name in AVAILABLE_CORPORA:
            return _corpus_metadata_blacklab(resource_name)
        # The corpus names must be joined to a string first: a string and dictionary keys can't be concatenated
        raise ValueError("Unknown corpus: " + str(resource_name) + ". Should be one of " + ", ".join(AVAILABLE_CORPORA.keys()))

    raise ValueError("resource_type should be 'corpus' or 'lexicon'.")

## Library functions: UI

In [3]:
import sys
import ipywidgets as widgets
from IPython.display import display
#import tkinter as tk
#from tkinter import filedialog
from pathlib import Path
from IPython.display import Javascript
from IPython.core.display import display, HTML
import matplotlib.pyplot as plt

# CQL query that is pre-filled in the query field of the corpus search UI
DEFAULT_QUERY = r'[lemma="boek" & pos="verb"]' #r'[lemma="boeken" pos="verb"]'
# Corpus that is pre-selected in the corpus dropdown of the corpus search UI
DEFAULT_CORPUS = "chn"


def show_wait_indicator(message=None):
    '''
    Write a temporary '...<message>...' status text to standard output.
    
    The text is followed by a carriage return and the ANSI sequence ESC[F
    (cursor to the start of the previous line), so later output can overwrite it.
    
    Args:
        message: text to show; when empty or None, 'Busy now' is shown
    Returns:
        N/A
    '''
    status_text = message or 'Busy now'
    sys.stdout.write('...' + status_text + '...' + "\r")
    sys.stdout.write("\033[F")

def remove_wait_indicator():
    '''
    Overwrite the status text written by show_wait_indicator() with a run of blanks.
    
    The blanks are followed by a carriage return and the ANSI sequence ESC[F
    (cursor to the start of the previous line).
    
    Args:
        N/A
    Returns:
        N/A
    '''
    blanks = '                                                                    '
    sys.stdout.write(blanks + "\r")
    sys.stdout.write("\033[F")
    
    
    

def create_corpus_ui():
    '''
    Build and display the widgets for corpus search: a CQL query field and a corpus dropdown.
    
    Args:
        N/A
    Returns:
        A tuple (query field, corpus dropdown), so the entered values
        are accessible from the global namespace of the Notebook
    '''
    
    # Text field for the CQL query, pre-filled with the default query
    query_widget = widgets.Text(description="<b>CQL query:</b>", value=DEFAULT_QUERY)
    # Dropdown listing the available corpora, with the default corpus pre-selected
    corpus_widget = widgets.Dropdown(
        options=AVAILABLE_CORPORA.keys(),
        value=DEFAULT_CORPUS,
        description='<b>Corpus:</b>',
    )
    
    # No search button is created: the query is run from a separate Notebook cell
    
    # Stack both widgets vertically and show them
    display(widgets.VBox([query_widget, corpus_widget]))
    
    return query_widget, corpus_widget

def create_lexicon_ui():
    '''
    Build and display the widgets for lexicon search: a search word field and a lexicon dropdown.
    
    Args:
        N/A
    Returns:
        A tuple (search word field, lexicon dropdown), so the entered values
        can be read after the user filled them in
    '''
    
    default_word = 'boek'
    default_lexicon = "diamant"

    # Text field for the word to look up
    word_widget = widgets.Text(description="<b>Word:</b>", value=default_word)
    # Dropdown with the lexica that can be chosen
    lexicon_widget = widgets.Dropdown(
        options=['anw', 'celex', 'diamant', 'duelme', 'molex'],
        value=default_lexicon,
        description='<b>Lexicon:</b>',
    )
    
    # No search button is created: the query is run from a separate Notebook cell
    
    # Stack both widgets vertically and show them
    display(widgets.VBox([word_widget, lexicon_widget]))
    return word_widget, lexicon_widget


def create_save_dataframe_ui(df):
    '''
    Build and display a small GUI (caption, file name field, save button) for saving
    the results of some lexicon or corpus query to a .csv file.
    One can use load_dataframe(filepath) to reload the results later on.
    
    Args:
        df: a Pandas DataFrame
    Returns:
        N/A
    '''
    
    default_name = 'mijn_resultaten.csv'
    caption = widgets.Label(value='Sla uw resultaten op:')
    name_field = widgets.Text(value=default_name)
    
    # The tooltip of the button doubles as carrier of the file name:
    # the click handler reads the name from there
    save_button = widgets.Button(
        description='Bestand opslaan',
        disabled=False,
        button_style='warning',
        tooltip=default_name,
        icon=''
    )
    # Attach the DataFrame to the button, so the click handler can reach it
    save_button.df = df
    # Link the file name field to the button tooltip, so a newly typed name is passed on to the button
    widgets.jslink((name_field, 'value'), (save_button, 'tooltip'))
    # Register the click handler
    save_button.on_click(_save_dataframe)
    
    display(widgets.HBox([caption, name_field, save_button]))
    
def _save_dataframe(button):
    fileName = button.tooltip
    # The result files can be saved locally or on the server:
    # If result files are to be offered as downloads, set to True; otherwise set to False    
    fileDownloadable = False
    # specify paths here, if needed:
    filePath_onServer = ''  # could be /path/to
    filePath_default = ''
    # compute full path given chosen mode
    fullFileName = (filePath_onServer if fileDownloadable else filePath_default ) + fileName
        
    try:
        button.df.to_csv( fullFileName, index=False)
        # confirm it all went well
        print(fileName + " saved")    
        button.button_style = 'success'
        button.icon = 'check'
        # trick: https://stackoverflow.com/questions/31893930/download-csv-from-an-ipython-notebook
        if (fileDownloadable):
            downloadableFiles = FileLinks(filePath_onServer)
            display(downloadableFiles)
    except Exception as e:
        button.button_style = 'danger'
        raise ValueError("An error occured when saving " + fileName + ": "+ str(e))    

    
    
def load_dataframe(filepath):
    '''
    This function (re)loads some previously saved Pandas DataFrame
    
    Args:
        filepath: path to the saved Pandas DataFrame (.csv)
    Returns: 
        a Pandas DataFrame representing the content of the file
    Raises:
        ValueError: if the file cannot be read; the original exception is attached as its cause
    
    >>> df_corpus = load_dataframe('mijn_resultaten.csv')
    >>> display_df(df_corpus, title="Results:")
    '''
    # Only the read itself is guarded. A `return` inside `finally` would discard
    # the ValueError raised below, so the result is returned after the try block.
    try:
        df = pd.read_csv(filepath)
    except Exception as e:
        raise ValueError("An error occured when loading " + str(filepath) + ": "+ str(e)) from e
    
    print(str(filepath) + " loaded successfully")
    return df


def display_df(df, columns=None, title=None, mode='table'):
    '''
    Show a Pandas DataFrame as a table or as a chart, followed by a GUI to save the shown data.
    
    In 'chart' mode a horizontal bar chart of the dataframe is drawn.
    One axis is the index of the dataframe, and the other axis holds the values
    of the selected column(s).
    
    Args:
        df: DataFrame to be displayed
        columns: columns to display, or None to display all columns
        title: Title displayed
        mode: Way of displaying, one of 'table' (default) or 'chart';
              any value other than 'chart' is treated as 'table'
    Returns:
        N/A
    '''
    # Restrict to the requested columns, if any
    df_display = df if columns is None else df[columns]
    
    if mode == 'chart':
        # chart mode
        plt.figure()
        chart_axes = df_display.plot.barh()
        chart_axes.set_title(title)
    else:
        # table mode (default)
        if title is not None:
            display(HTML("<b>%s</b>" % title))
        display(df_display)
    
    # finally, offer a GUI to save the displayed data
    create_save_dataframe_ui(df_display)


## Library functions: Queries

In [4]:
import re

def containsRegex(word):
    '''
    This function checks whether some string contains a regular expression or not.
    
    A string counts as a regular expression when, anywhere in the string, it contains
    one of the characters ^ $ + * or a non-empty group "(...)" or character class "[...]".
    
    Args:
        word: a string to check for regular expressions
    Returns:
        A boolean
    '''
    # re.search (not re.match) is used so that the constructs are found at any
    # position, e.g. in "boek.*" or "b[oe]k", not only at the start of the string
    return bool(re.search(r'[\^$+*]|\(.+?\)|\[.+?\]', word))
                     
def lexicon_query(word, pos, lexicon):
    '''
    This function builds a query for getting the paradigm etc. of a given lemma out of a given lexicon.
    The resulting query string is to be used as a parameter of search_lexicon() 
    
    If the word contains no regular expression (see containsRegex()), the query matches the
    word exactly; otherwise the word is used as a regular expression filter.
    Filtering by part-of-speech is only applied for the 'molex' lexicon; for the other
    lexica a notice is printed when a part-of-speech is given.
    
    Args:
        word: a lemma/wordform to build the query with
        pos: a part-of-speech to build the query with
        lexicon: a lexicon to build the query for, one of 'anw', 'diamant', 'molex', 'duelme', 'celex'
    Returns:
        a query string to be used as a parameter of search_lexicon() 
    Raises:
        ValueError: if the lexicon is not one of the supported lexica
    '''
    
    # NOTE(review): word and pos are inserted into the query text as-is; a double quote or
    # backslash in them will produce a malformed query. Escaping is left to the caller for now.
    
    if (lexicon=="anw"):
        # part-of-speech filter not supported for this lexicon
        if (pos is not None and pos != ''):
            print('Filtering by part-of-speech is not (yet) supported in the \''+lexicon+'\' lexicon')
        # exact or fuzzy search
        exactsearch = (not containsRegex(word))
        subpart = """FILTER ( regex(?lemma, \""""+word+"""\") || regex(?definition, \""""+word+"""\") ) . """
        if exactsearch:
            subpart =  """
                { { ?lemId rdfs:label ?lemma .  
                values ?lemma { \""""+word+"""\"@nl \""""+word+"""\" } }                 
                UNION
                { ?definitionId lemon:value ?definition .
                values ?definition { \""""+word+"""\"@nl \""""+word+"""\" } } } .
                """               
        query = """PREFIX ontolex: <http://www.w3.org/ns/lemon/ontolex#>
                  PREFIX anw: <http://rdf.ivdnt.org/lexica/anw>
                  PREFIX anwsch: <http://rdf.ivdnt.org/schema/anw/>
                  PREFIX lemon: <http://lemon-model.net/lemon#>
                  
                  SELECT ?lemId ?lemma ?writtenForm ?definition concat('', ?definitionComplement) as ?definitionComplement
                  FROM <http://rdf.ivdnt.org/lexica/anw>
                  WHERE {
                      ?lemId rdfs:label ?lemma .
                      ?lemId ontolex:sense ?senseId .
                      ?senseId lemon:definition ?definitionId .
                      ?definitionId lemon:value ?definition .
                      OPTIONAL { ?definitionId anwsch:definitionComplement ?definitionComplement .}
                      OPTIONAL { ?lemId ontolex:canonicalForm ?lemCFId . 
                          ?lemCFId ontolex:writtenRepresentation ?writtenForm . }
                      """+subpart+"""
                      }"""
    elif (lexicon=="diamant"):
        # part-of-speech filter not supported for this lexicon
        if (pos is not None and pos != ''):
            print('Filtering by part-of-speech is not (yet) supported in the \''+lexicon+'\' lexicon')
        # exact or fuzzy search
        exactsearch = (not containsRegex(word))
        # subpart1 restricts the first UNION branch (search by written form),
        # subpart2 restricts the second UNION branch (search by definition text)
        subpart1 = """?n_form ontolex:writtenRep ?n_ontolex_writtenRep . 
            FILTER regex(?n_ontolex_writtenRep, \""""+word+"""\") . """
        # NOTE(review): the fuzzy filter below tests ?n_ontolex_writtenRep, whereas the exact
        # variant of subpart2 restricts ?n_syndef_definitionText - possibly a copy/paste slip; confirm
        subpart2 = """?n_syndef diamant:definitionText ?n_syndef_definitionText .  
            FILTER regex(?n_ontolex_writtenRep, \""""+word+"""\") . """
        if exactsearch:
            subpart1 =  """
                { ?n_form ontolex:writtenRep ?n_ontolex_writtenRep . 
                values ?n_ontolex_writtenRep { \""""+word+"""\"@nl \""""+word+"""\" } } 
                """                
            subpart2 = """
                { ?n_syndef diamant:definitionText ?n_syndef_definitionText . 
                values ?n_syndef_definitionText { \""""+word+"""\"@nl \""""+word+"""\" } } 
                """
        query = """
        PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
        prefix prov: <http://www.w3.org/ns/prov#>
        prefix diamant: <http://rdf.ivdnt.org/schema/diamant#>
        prefix lexinfo: <http://www.lexinfo.net/ontology/2.0/lexinfo#>
        prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        prefix lemon: <http://lemon-model.net/lemon#>
        prefix ontolex: <http://www.w3.org/ns/lemon/ontolex#>
        prefix ud: <http://universaldependencies.org/u/pos/>
        prefix skos: <http://www.w3.org/2004/02/skos/core#>
        prefix dcterms: <http://purl.org/dc/terms/>
        prefix dc: <http://purl.org/dc/terms/>

        select ?n_entry ?n_form ?n_ontolex_writtenRep ?n_syndef ?n_sensedef ?n_sensedef_definitionText ?n_syndef_definitionText ?n_sense ?inputMode ?wy_f_show ?wy_t_show
        where
        {
        graph ?g
        {
        {
            """ + subpart1 + """
            { ?n_entry a ontolex:LexicalEntry} .
            { ?n_form a ontolex:Form} .
            { ?n_sense a ontolex:LexicalSense} .
            { ?n_syndef a diamant:SynonymDefinition} .
            { ?n_sensedef a lemon:SenseDefinition} .
            { ?n_syndef diamant:definitionText ?n_syndef_definitionText } .
            { ?n_sensedef diamant:definitionText ?n_sensedef_definitionText } .
            { ?n_entry ontolex:canonicalForm ?n_form } .
            { ?n_entry ontolex:sense ?n_sense } .
            { ?n_sense lemon:definition ?n_syndef } .
            { ?n_sense lemon:definition ?n_sensedef } .
              ?n_sense diamant:attestation ?n_attest_show .
              ?n_sense diamant:attestation ?n_attest_filter .
              ?n_attest_show diamant:text ?n_q_show .
              ?n_attest_filter diamant:text ?n_q_filter .
              ?n_attest_show a diamant:Attestation .
              ?n_attest_filter a diamant:Attestation .
              ?n_q_filter a diamant:Quotation .
              ?n_q_show a diamant:Quotation .
              ?n_q_filter diamant:witnessYearFrom ?wy_f_filter .
              ?n_q_filter diamant:witnessYearTo ?wy_t_filter .
              ?n_q_show diamant:witnessYearFrom ?wy_f_show .
              ?n_q_show diamant:witnessYearTo ?wy_t_show .
              FILTER (xsd:integer(?wy_f_show) >= 1200)
              FILTER (xsd:integer(?wy_t_show) >= 1200)
              FILTER (xsd:integer(?wy_f_show) <= 2018)
              FILTER (xsd:integer(?wy_t_show) <= 2018)
            { bind("lemma" as ?inputMode) } .
            } UNION
          {
            """ + subpart2 + """
            { ?n_sense a ontolex:LexicalSense} .
            { ?n_syndef a diamant:SynonymDefinition} .
            { ?n_sensedef a lemon:SenseDefinition} .
            { ?n_form a ontolex:Form} .
            { ?n_form ontolex:writtenRep ?n_ontolex_writtenRep } .  { ?n_entry a ontolex:LexicalEntry} .
            { ?n_entry ontolex:sense ?n_sense } .
            { ?n_sense lemon:definition ?n_syndef } .
            { ?n_sense lemon:definition ?n_sensedef } .
            { ?n_sensedef diamant:definitionText ?n_sensedef_definitionText } .
            { ?n_entry ontolex:canonicalForm ?n_form } .
            ?n_sense diamant:attestation ?n_attest_show .
            ?n_sense diamant:attestation ?n_attest_filter .
            ?n_attest_filter diamant:text ?n_q_filter .
            ?n_attest_show diamant:text ?n_q_show .
            ?n_q_filter diamant:witnessYearFrom ?wy_f_filter .
            ?n_q_filter diamant:witnessYearTo ?wy_t_filter .
            ?n_q_show diamant:witnessYearFrom ?wy_f_show .
            ?n_q_show diamant:witnessYearTo ?wy_t_show .
            ?n_attest_show a diamant:Attestation .
            ?n_attest_filter a diamant:Attestation .
            ?n_q_filter a diamant:Quotation .
            ?n_q_show a diamant:Quotation .
            FILTER (xsd:integer(?wy_f_show) >= 1200)
            FILTER (xsd:integer(?wy_t_show) >= 1200)
            FILTER (xsd:integer(?wy_f_show) <= 2018)
            FILTER (xsd:integer(?wy_t_show) <= 2018)
          { bind("defText" as ?inputMode) } .
            }
        }
        }"""
    elif (lexicon=="molex"):
        # exact or fuzzy search
        exactsearch = (not containsRegex(word))
        # All three query parts stay empty unless a word and/or a part-of-speech is given
        subpart1 = """"""
        subpart2 = """"""
        subpartPos = """"""
        if (word != ''):
            if exactsearch:
                subpart1 =  """
                    { ?lemCFId ontolex:writtenRep ?lemma . 
                    values ?lemma { \""""+word+"""\"@nl \""""+word+"""\" } } 
                    UNION
                    { ?wordformId ontolex:writtenRep ?wordform . 
                    values ?wordform { \""""+word+"""\"@nl \""""+word+"""\" } } .
                    """        
            else:
                subpart2 = """FILTER ( regex(?lemma, \""""+word+"""\") || regex(?wordform, \""""+word+"""\") ) . """
        if (pos is not None and pos != ''):
            # the part-of-speech must match the end of the ?lemPos value
            subpartPos = """FILTER ( regex(?lemPos, \""""+pos+"""$\") ) ."""
        query = """
            PREFIX ontolex: <http://www.w3.org/ns/lemon/ontolex#>
            PREFIX UD: <http://universaldependencies.org/u/>
            PREFIX diamant: <http://rdf.ivdnt.org/schema/diamant#>
            
            SELECT ?lemEntryId ?lemma ?lemPos ?wordformId ?wordform ?hyphenation ?wordformPos ?Gender ?Number
            FROM <http://rdf.ivdnt.org/lexica/molex>
            WHERE
            {
            ?lemEntryId ontolex:canonicalForm ?lemCFId .
            ?lemCFId ontolex:writtenRep ?lemma .
            """+subpart1+"""
            OPTIONAL {?lemEntryId UD:Gender ?Gender .}
            OPTIONAL {?lemEntryId UD:VerbForm ?verbform .}
            ?lemEntryId UD:pos ?lemPos .
            """+subpartPos+"""
            ?lemEntryId ontolex:lexicalForm ?wordformId .
            ?wordformId UD:pos ?wordformPos .
            OPTIONAL {?wordformId UD:Number ?Number .}
            OPTIONAL {?wordformId ontolex:writtenRep ?wordform .}
            OPTIONAL {?wordformId diamant:hyphenation ?hyphenation .}
            """+subpart2+"""
            }
        """
    elif (lexicon=="duelme"):
        # part-of-speech filter not supported for this lexicon
        if (pos is not None and pos != ''):
            print('Filtering by part-of-speech is not (yet) supported in the \''+lexicon+'\' lexicon')
        # exact or fuzzy search
        exactsearch = (not containsRegex(word))
        subpart = """FILTER ( regex(?lemma, \""""+word+"""\") || regex(?wordform, \""""+word+"""\") ) ."""
        if exactsearch:
            subpart =  """
                { ?y lmf:hasLemma ?dl .  
                values ?dl { \""""+word+"""\"@nl \""""+word+"""\" } }                 
                """        
        query = """
            PREFIX duelme: <http://rdf.ivdnt.org/lexica/duelme>
            PREFIX intskos: <http://ivdnt.org/schema/lexica#>
            PREFIX lmf: <http://www.lexinfo.net/lmf>
            PREFIX ontolex: <http://www.w3.org/ns/lemon/ontolex#>
            PREFIX UD: <http://rdf.ivdnt.org/vocabs/UniversalDependencies2#>
            
            SELECT ?exampleSentence ?lemma ?gender ?number
            WHERE  {
                  ?d intskos:ExampleSentence ?exampleSentence .
                  ?d lmf:ListOfComponents [lmf:Component ?y] .
                  ?y lmf:hasLemma ?lemma . 
                  OPTIONAL {?y UD:Gender ?gender}
                  OPTIONAL {?y UD:Number ?number}
            """+subpart+"""
            }
        """
    elif (lexicon=="celex"):
        # part-of-speech filter not supported for this lexicon
        if (pos is not None and pos != ''):
            print('Filtering by part-of-speech is not (yet) supported in the \''+lexicon+'\' lexicon')
        # exact or fuzzy search
        exactsearch = (not containsRegex(word))
        subpart = """FILTER ( regex(?lemma, \""""+word+"""\") ) . """
        if exactsearch:
            subpart =  """
                { ?lemmaId ontolex:canonicalForm [ontolex:writtenRep ?lemma] .  
                values ?lemma { \""""+word+"""\"@nl \""""+word+"""\" } }                 
                """        
        query = """
            PREFIX ontolex: <http://www.w3.org/ns/lemon/ontolex#>
            PREFIX celex: <http://rdf.ivdnt.org/lexica/celex>
            PREFIX UD: <http://rdf.ivdnt.org/vocabs/UniversalDependencies2#>
            PREFIX decomp: <http://www.w3.org/ns/lemon/decomp#>
            PREFIX gold: <http://purl.org/linguistics/gold#>
            
            SELECT DISTINCT ?lemmaId ?lemma ?wordformId ?wordform ?number ?gender concat('', ?subLemmata) AS ?subLemmata
            WHERE  {
                ?lemmaId ontolex:canonicalForm [ontolex:writtenRep ?lemma] .
                """+subpart+"""
                BIND( ?lemmaId AS ?lemmaIdIRI ).
                ?lemmaId ontolex:lexicalForm ?wordformId .
                ?wordformId ontolex:writtenRep ?wordform .
                OPTIONAL {?wordformId UD:Number ?number} .
                OPTIONAL {
                    ?lemmaId UD:Gender ?g . 
                        bind( 
                            if(?g = UD:Fem_Gender, 
                            UD:Com_Gender, 
                                if(?g = UD:Masc_Gender,
                                    UD:Com_Gender,
                                    UD:Neut_Gender
                                )
                            )
                            AS ?gender
                        )
                }
                OPTIONAL {
                    SELECT ?lemmaIdIRI (group_concat(DISTINCT concat(?partNr,":",?subLemma);separator=" + ") as ?subLemmata)
                    WHERE {
                        SELECT ?lemmaIdIRI ?celexComp ?aWordformId ?subLemma ?partNr
                        WHERE {
                                {
                                ?lemmaIdIRI ontolex:lexicalForm ?aWordformId . 
                                ?lemmaIdIRI decomp:constituent ?celexComp .
                                OPTIONAL { ?celexComp gold:stem [ontolex:writtenRep ?subLemma] . }
                                OPTIONAL { ?celexComp decomp:correspondsTo [ ontolex:canonicalForm [ontolex:writtenRep ?subLemma]] . }
                                }
                                {
                                    {
                                        {?lemmaIdIRI <http://www.w3.org/1999/02/22-rdf-syntax-ns#_1> ?celexComp .}
                                        UNION
                                        {?lemmaIdIRI <http://www.w3.org/1999/02/22-rdf-syntax-ns#_2> ?celexComp .}
                                        UNION
                                        {?lemmaIdIRI <http://www.w3.org/1999/02/22-rdf-syntax-ns#_3> ?celexComp .}
                                        UNION
                                        {?lemmaIdIRI <http://www.w3.org/1999/02/22-rdf-syntax-ns#_4> ?celexComp .}
                                        UNION
                                        {?lemmaIdIRI <http://www.w3.org/1999/02/22-rdf-syntax-ns#_5> ?celexComp .}
                                        UNION
                                        {?lemmaIdIRI <http://www.w3.org/1999/02/22-rdf-syntax-ns#_6> ?celexComp .}                                        
                                    }
                                ?lemmaIdIRI ?rdfsynt ?celexComp .
                                BIND(IF(STRSTARTS(str(?rdfsynt), "http://www.w3.org/1999/02/22-rdf-syntax-ns#"), replace(STRAFTER(str(?rdfsynt), "#"), "_", ""), "999") AS ?partNr) .
                                MINUS {
                                    ?lemmaIdIRI <http://www.w3.org/1999/02/22-rdf-syntax-ns#0> ?celexComp .
                                    }
                                }
                            FILTER (?partNr != "999") .
                            }
                            ORDER BY ?partNr
                            }
                        GROUP BY ?aWordformId ?lemmaIdIRI
                    }
            }
        """
    else:
        # Without this branch an unknown lexicon would fail on the return below,
        # because no query was ever assigned
        raise ValueError("Lexicon " + str(lexicon) + " not supported for lexicon queries.")
        
    return query



def corpus_query_lemma(lemma):
    '''
    Build a corpus query that matches the occurrences of a given lemma.
    Args:
        lemma: a lemma to look for 
    Returns:
        a corpus query string of the form [lemma="..."]
        
    >>> lemma_query = corpus_query_lemma("lopen")
    >>> df_corpus = search_corpus(lemma_query, "chn")
    >>> display(df_corpus)
    '''
    query_parts = (r'[lemma="', lemma, r'"]')
    return ''.join(query_parts)


def corpus_query_wordform(word):
    '''
    Build a corpus query that matches the occurrences of a given wordform.
    Args:
        word: a wordform to look for 
    Returns:
        a corpus query string of the form [word="..."]
        
    >>> wordform_query = corpus_query_wordform("liep")
    >>> df_corpus = search_corpus(wordform_query, "chn")
    >>> display(df_corpus)
    '''
    query_parts = (r'[word="', word, r'"]')
    return ''.join(query_parts)

def lexicon_query_alllemmata(lexicon, pos=None):
    '''
    This function builds a query for getting all lemmata of a lexicon, if needed restricted to a given part-of-speech.
    The resulting query string is to be used as a parameter of search_lexicon().
    
    Filtering by part-of-speech is only applied for the 'molex' lexicon; for the other
    lexica a notice is printed when a part-of-speech is given.
    
    Args:
        lexicon: a lexicon name, one of 'anw', 'celex', 'diamant', 'duelme', 'molex'
        pos: (optional) a part-of-speech; None or '' means no restriction
    Returns:
        a lexicon query string
    Raises:
        ValueError: if the lexicon is not one of the supported lexica
    '''
    
    # Whether the caller asked for a part-of-speech restriction at all
    pos_requested = (pos is not None and pos != '')
    
    if (lexicon=="anw"):
        # part-of-speech filter not supported for this lexicon
        if pos_requested:
            print('Filtering by part-of-speech is not (yet) supported in the \''+lexicon+'\' lexicon')
        query = """PREFIX ontolex: <http://www.w3.org/ns/lemon/ontolex#>
                  PREFIX anw: <http://rdf.ivdnt.org/lexica/anw>                  
                  SELECT DISTINCT ?writtenForm
                  FROM <http://rdf.ivdnt.org/lexica/anw>
                  WHERE {
                      ?lemId rdfs:label ?lemma .
                      ?lemId ontolex:canonicalForm ?lemCFId . 
                      ?lemCFId ontolex:writtenRepresentation ?writtenForm .
                      }
                      ORDER BY ?writtenForm"""
    elif (lexicon=="celex"):
        # part-of-speech filter not supported for this lexicon
        if pos_requested:
            print('Filtering by part-of-speech is not (yet) supported in the \''+lexicon+'\' lexicon')
        query = """
            PREFIX ontolex: <http://www.w3.org/ns/lemon/ontolex#>
            
            SELECT DISTINCT ?lemma AS ?writtenForm
            WHERE  {
                ?lemmaId ontolex:canonicalForm [ontolex:writtenRep ?lemma] .                
                }
            ORDER BY ?lemma"""
    elif (lexicon=="diamant"):
        # part-of-speech filter not supported for this lexicon
        if pos_requested:
            print('Filtering by part-of-speech is not (yet) supported in the \''+lexicon+'\' lexicon')
        # the result of this query is capped by its LIMIT clause
        query = """
        PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
        prefix prov: <http://www.w3.org/ns/prov#>
        prefix diamant: <http://rdf.ivdnt.org/schema/diamant#>
        prefix lexinfo: <http://www.lexinfo.net/ontology/2.0/lexinfo#>
        prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        prefix lemon: <http://lemon-model.net/lemon#>
        prefix ontolex: <http://www.w3.org/ns/lemon/ontolex#>
        prefix ud: <http://universaldependencies.org/u/pos/>
        prefix skos: <http://www.w3.org/2004/02/skos/core#>
        prefix dcterms: <http://purl.org/dc/terms/>
        prefix dc: <http://purl.org/dc/terms/>

        select DISTINCT ?n_ontolex_writtenRep AS ?writtenForm
        where
        {
        graph ?g
        {
        {
            { ?n_form ontolex:writtenRep ?n_ontolex_writtenRep} .
            { ?n_form a ontolex:Form} .
        }
        }
        }
        ORDER BY ?n_ontolex_writtenRep
        LIMIT 10000
        """
    elif (lexicon=="duelme"):
        # part-of-speech filter not supported for this lexicon
        if pos_requested:
            print('Filtering by part-of-speech is not (yet) supported in the \''+lexicon+'\' lexicon')
        query = """
            PREFIX lmf: <http://www.lexinfo.net/lmf>            
            SELECT DISTINCT ?lemma AS ?writtenForm
            WHERE  {
                  ?y lmf:hasLemma ?lemma . 
            }
            ORDER BY ?lemma"""
    elif (lexicon=="molex"):
        # part-of-speech filter: stays empty when no part-of-speech is requested
        pos_condition = """"""
        if pos_requested:
            pos_condition = """
            {?lemEntryId UD:pos ?lemPos .
            FILTER regex(?lemPos, '"""+pos+"""') } .
            """
        query = """
                PREFIX ontolex: <http://www.w3.org/ns/lemon/ontolex#>
                PREFIX UD: <http://universaldependencies.org/u/>
                SELECT DISTINCT ?lemma AS ?writtenForm
                FROM <http://rdf.ivdnt.org/lexica/molex>
                WHERE
                {
                ?lemEntryId ontolex:canonicalForm ?lemCFId .
                ?lemCFId ontolex:writtenRep ?lemma .  
                """+pos_condition+"""
                }
                 ORDER BY ?lemma"""
    else:
        raise ValueError("Lexicon " + lexicon + " not supported for querying all words.")
        
    #print(query)
    return query

def _metadata_from_lexicon_query(lex_query):
    '''
    Extract metadata fields from a lexicon query string
    
    Args:
        lex_query: A query string issued to a lexicon, can be constructed using lexicon_query()
    Returns:
        A list of metadata fields
    '''
    # Get part after select, eg: "?x ?y ?concat('',z) as ?a"
    select_match = re.search(r'select\s+(?:distinct)*\s*(.*)\s*(?:where|from)', lex_query, flags=re.IGNORECASE)
    if select_match:
        select_string = select_match.group(1)
        #Delete concat() part and following AS, because it can contain a space we do not want to split on
        string_wh_concat = re.sub(r'concat\(.*\) AS', '', select_string, flags=re.IGNORECASE)
        split_string = string_wh_concat.split()
        for i,elem in enumerate(split_string):
            if elem.lower()=="AS":
                # Remove AS and element before AS
                split_string.pop(i)
                split_string.pop(i-1)
                # Assume only one AS, so we escape loop
                break
        columns = [c.lstrip("?") for c in split_string]
    else:
        raise ValueError("No columns find in lexicon query.")
    return columns


## Corpus search

* Run the cell below to show the UI, and fill in your search query

In [5]:
#from chaininglib import ui

# Build and display the corpus search UI; keep references to both widgets,
# so their contents can be read once the user has filled them in
corpusQueryField, corpusField = create_corpus_ui()

VBox(children=(Text(value='[lemma="boek" & pos="verb"]', description='<b>CQL query:</b>'), Dropdown(descriptio…

 * Click the cell below and press Run to perform the given query

In [None]:
#from chaininglib import search
# Read the query and the corpus name from the UI widgets
query= corpusQueryField.value
corpus = corpusField.value
# Run the query on the chosen corpus
df_corpus = search_corpus(query, corpus)
# Alternative: reload previously saved results instead of searching again
#df_corpus = load_dataframe('mijn_resultaten.csv')
display_df(df_corpus, title="Results:")



[F...Searching zeebrieven at page 1001...                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 

## Lexicon search

* Run the cell below to show the UI, and fill in your search query in the UI

In [None]:
#from chaininglib import ui
# Build and display the lexicon search UI; keep references to both widgets,
# so their contents can be read once the user has filled them in
searchWordField, lexiconField = create_lexicon_ui()

 * Click the cell below and press Run to perform the given query

In [None]:
#from chaininglib import queries, search

# Read the search word and the lexicon name from the UI widgets
search_word = searchWordField.value
lexicon = lexiconField.value
# USER: can replace this by own custom query
query = lexicon_query(word=search_word, pos= '', lexicon=lexicon)

# Run the query on the chosen lexicon and show the result
df_lexicon = search_lexicon(query, lexicon)
display(df_lexicon)
# Alternative: show the result with an explicit list of all columns
#df_columns_list = list(df_lexicon.columns.values)
#df_lexicon_in_columns = df_lexicon[df_columns_list]
#display(df_lexicon_in_columns)

## Case study 1 (parallel): Frequency of *puur*+verb and *zuiver*+verb compared
* The cell below searches for *puur*+verb and for *zuiver*+verb in the CHN corpus
* It then compares the results of both searches

In [None]:
#from chaininglib import search
# display and HTML are imported from IPython.display, the public location of both names
from IPython.display import display, HTML

# Word 1: puur
word1= "puur"
df_corpus1 = search_corpus(r'[word="' + word1 + r'"][pos="verb"]',corpus="chn")
display(HTML('<b>' + word1 + '</b>'))
display(df_corpus1)

# Word 2: zuiver
word2 = "zuiver"
df_corpus2 = search_corpus(r'[word="' + word2 + r'"][pos="verb"]',"chn")
display(HTML('<b>' + word2 + '</b>'))
display(df_corpus2)

# Compute difference between the verbs found after word 1 and after word 2
diff_left, diff_right, intersec = column_difference(df_corpus1["word 1"], df_corpus2["word 1"])
# Elements of 1 that are not in 2
display(HTML('Werkwoorden voor <b>' + word1 + '</b> niet in <b>' + word2 + '</b>: ' + ", ".join(diff_left)))
# Elements of 2 that are not in 1 (the label names word 2 first, matching the content of diff_right)
display(HTML('Werkwoorden voor <b>' + word2 + '</b> niet in <b>' + word1 + '</b>: ' + ", ".join(diff_right)))
# Elements both in 1 and 2
display(HTML('Werkwoorden zowel voor <b>' + word1 + '</b> als voor <b>' + word2 + '</b>: ' + ", ".join(intersec)))

## Case study 2 (sequential): Retrieve synonyms from DiaMaNT, look up in Gysseling
* Below cell searches for term "boek" in DiaMaNT, and looks up all variants in Gysseling

In [None]:
# Case study 2: take a search word, fetch its synonyms from a lexicon, then
# query a corpus once per synonym and show the results side by side.
search_word = "boek"
lexicon = "diamant"
corpus= "gysseling"

# First, lookup synonyms in DiaMaNT
query = lexicon_query(word=search_word, pos= '', lexicon=lexicon)
df_lexicon = search_lexicon(query, lexicon)
syns = diamant_get_synonyms(df_lexicon) 
# syns is used as a set-like collection here (it supports .add()).
syns.add(search_word) # Also add search word itself
# NOTE: display and HTML are not imported in this cell; it relies on an
# earlier cell (Case study 1) having imported them.
display(HTML('Synoniemen voor <b>' + search_word + '</b>: ' + ", ".join(syns)))

# Search for all synonyms in corpus
## Create queries: search by lemma
syns_queries = [corpus_query_lemma(syn) for syn in syns]
## Search for all synonyms in corpus
result_dict = search_corpus_multiple(syns_queries, corpus)
# The labels come from a second iteration over syns. syns must not be modified
# between building syns_queries and this call, so that both iterations yield
# the same order (presumably labels are matched to queries by position;
# confirm against view_multiple_results).
view_multiple_results(result_dict, labels=list(syns))



## Case study (parallel) 3: Find corpus words not in lexicon; list most frequent ones.
* Only parallel if you can ask the lexicon a list of all words.
* Currently only working: ask DiaMaNT list of words (limited at 10000)

In [None]:
# Lexicon side: request the full word list of the lexicon.
lexicon = "anw"
df_lexicon = search_lexicon_alllemmata(lexicon)
## TODO: Why do double words appear?
display(df_lexicon)

# Lower-case every written form, drop duplicates, and sort alphabetically.
# Note that the result is a sorted list, despite the variable name.
lexicon_set = sorted({form.lower() for form in df_lexicon["writtenForm"]})
display(lexicon_set)

# Corpus side: fetch the search_corpus_allwords result for Gysseling and show
# it together with its number of rows.
df_corpus = search_corpus_allwords("gysseling", None)
display(df_corpus)
len(df_corpus)



## Case study (sequential) 4: Find occurrences of attributive adjectives not ending with -e, even though they are preceded by a definite article

In [None]:
corpus_to_search = "opensonar"
lexicon_to_search = "molex"

# CORPUS: article + adjective + noun sequences in which the adjective does not
# end with -e (the pattern is restricted to adjectives starting with "g").
print('Stap 1:')
df_corpus = search_corpus(
    r'[lemma="de|het"][word="^g(.+)[^e]$" & pos="ADJ"][pos="NOUN"]',
    corpus=corpus_to_search,
)
display(df_corpus)

# LEXICON: adjectives matching the same pattern (starting with "g", not
# ending with -e).
query = lexicon_query('^g(.+)[^e]$', 'ADJ', lexicon_to_search)
df_lexicon = search_lexicon(query, lexicon_to_search)
display(df_lexicon)

# LEXICON: keep only the rows whose wordform does end with -e.
print('Filtering lexicon results')
final_e_condition = filter_condition(
    df_lexicon,
    column="wordform",
    method="contains_regex",
    regex_or_set='e$',
)
df_lexicon_form_e = df_lexicon[final_e_condition]
display(df_lexicon_form_e)

# RESULT: corpus hits whose "word 1" value is one of the lemmata that have an
# -e wordform in the lexicon.
print('Wanted list:')
e_forms = list(df_lexicon_form_e["lemma"])
no_final_e_condition = filter_condition(
    df_corpus,
    column="word 1",
    method="isin_set",
    regex_or_set=e_forms,
)
result_df = df_corpus[no_final_e_condition]
display(result_df)

## Case study (sequential) 5: (morphosyntactic lexicon and possibly unannotated corpus) Look up inflected forms and spelling variants for a given lemma in a corpus

In [None]:
lexicon_to_search = "molex"
corpus_to_search = "chn"

# TODO: do the same for several lemmata, with grouped results

lemma_to_look_for = "denken"

# LEXICON: look up the lemma in the morphosyntactic lexicon and show the result.
query = lexicon_query(lemma_to_look_for, None, lexicon_to_search)
df_lexicon = search_lexicon(query, lexicon_to_search)
display(df_lexicon)

# Collect the wordform column as a plain list.
inflected_wordforms = list(df_lexicon.wordform)

# CORPUS: look up the inflected forms in a (possibly unannotated) corpus.
# If the corpus is not annotated, we can only search on the word forms.
# If it is lemmatized, we also constrain the lemma to retrieve correct data.
annotated_corpus = True

# All inflected forms joined into one "a|b|c" alternation, shared by both variants.
wordform_alternation = r"|".join(inflected_wordforms)
if annotated_corpus:
    query = r'[lemma="' + lemma_to_look_for + r'" & word="' + wordform_alternation + r'"]'
else:
    query = r'[word="' + wordform_alternation + r'"]'

df_corpus = search_corpus(query, corpus=corpus_to_search)
display(df_corpus)

## Case study 6: Build frequency table of some corpus, based on lemma list of a given lexicon

In [None]:
base_lexicon = "molex"
corpus_to_search1 = "opensonar"
corpus_to_search2 = "chn"

# Frequency list for corpus 1: table of the 25 highest raw frequencies,
# then a chart of raw_freq for the 25 lowest rank values.
df_frequency_list1 = get_frequency_list(base_lexicon, "NOUN", corpus_to_search1)
top_by_freq1 = df_frequency_list1.sort_values(by=['raw_freq'], ascending=False).head(25)
display(top_by_freq1)
top_by_rank1 = df_frequency_list1.sort_values(by=['rank'], ascending=True).head(25)
display_df(top_by_rank1, columns='raw_freq', title='chart df1', mode='chart')

# Frequency list for corpus 2: same table and chart.
df_frequency_list2 = get_frequency_list(base_lexicon, "NOUN", corpus_to_search2)
top_by_freq2 = df_frequency_list2.sort_values(by=['raw_freq'], ascending=False).head(25)
display(top_by_freq2)
top_by_rank2 = df_frequency_list2.sort_values(by=['rank'], ascending=True).head(25)
display_df(top_by_rank2, columns='raw_freq', title='chart df2', mode='chart')


# TODO: show the lemmata that are missing from list 1 or list 2

# Rank differences of the lemmata between the two frequency lists.
df_rankdiffs = get_rank_diff(df_frequency_list1, df_frequency_list2)

# Table: the 25 rows with the smallest rank_diff.
smallest_diffs_table = df_rankdiffs.sort_values(by=['rank_diff']).head(25)
display(smallest_diffs_table)

# Charts: the 25 largest and the 25 smallest rank_diff values.
largest_diffs = df_rankdiffs.sort_values(by=['rank_diff'], ascending=False).head(25)
display_df(largest_diffs, columns='rank_diff', title='chart large diff', mode='chart')
smallest_diffs = df_rankdiffs.sort_values(by=['rank_diff'], ascending=True).head(25)
display_df(smallest_diffs, columns='rank_diff', title='chart small diff', mode='chart')

## Case study 7: search in a corpus for wordforms of a lemma, which are not included in this lemma's paradigm in a lexicon

In [None]:

# Lexicon that provides the known paradigms, and the corpus to compare against.
base_lexicon="molex"
corpus_to_search="opensonar"

# Compute the missing wordforms for part of speech "VERB".
df = get_missing_wordforms(base_lexicon, "VERB", corpus_to_search)

# Save the result as CSV, without the DataFrame index column.
df.to_csv( "missing_wordforms.csv", index=False)
# To reuse a saved result instead of recomputing it:
#df = load_dataframe("missing_wordforms.csv")

display(df)


## Case study 8: Train a tagger with data from an annotated corpus, and do something cool

In [None]:
# Case study 8: look up a word in a lexicon, collect its occurrences from two
# annotated corpora, train a tagger on the combined hits and tag a new sentence.
base_lexicon="molex"
corpus_to_search1="opensonar"
corpus_to_search2="chn"

# we have a given word, let's say: "loop"
some_word = "loop"

# get the paradigm of the lemma our word is a part of
query = lexicon_query(some_word, pos=None, lexicon=base_lexicon)
df_paradigm = search_lexicon(query, base_lexicon)
display(df_paradigm)

# gather occurrences of our word, with detailed context, out of two annotated corpora
# NOTE(review): the original comment said "here: DET + ADJ + 'loop'", but the
# query is built from the word form alone via corpus_query_wordform; confirm
# which pattern is intended.
corpus_query = corpus_query_wordform(some_word)
df_corpus1 = search_corpus(corpus_query, corpus=corpus_to_search1, detailed_context=True)
display(df_corpus1)
df_corpus2 = search_corpus(corpus_query, corpus=corpus_to_search2, detailed_context=True)
display(df_corpus2)


# combine both result sets; the corpus names are passed as keys so the rows
# of each original DataFrame stay distinguishable
df_all = concat_df([df_corpus1, df_corpus2], [corpus_to_search1, corpus_to_search2])
display(df_all)

# get a tagger trained with our corpus data
tagger = get_tagger(df_all)

# Use the trained tagger to tag unknown sentences
# The input must be a list of tokens, like: tagger.tag(['today','is','a','beautiful','day'])

sentence = 'Mijn buurman kijkt door de loop van zijn geweer'
# naive whitespace tokenization of the sentence
tagged_sentence = tagger.tag( sentence.split() )

print(tagged_sentence)


# TODO: now we can lemmatize each occurrence of our lemma in the new sentences

## Case study 9: Search in corpus and filter on metadata
First, we request all available metadata fields of the corpus. Then, we issue a search query, and request all metadata fields for the result. Finally, we filter on metadata values.

In [None]:
corpus_name = "zeebrieven"
query = r'[lemma="boek"]'

# Ask the corpus which metadata fields it offers, then run the query and
# request all document-level metadata fields with the hits.
fields = get_available_metadata("corpus", corpus_name)
df_corpus = search_corpus(query, corpus_name, extra_fields_doc=fields["document"])

# All results
display_df(df_corpus, title="All results:")

# Filter on year: keep hits whose "witnessYear_from" (cast to int32) is > 1700
year_from = df_corpus["witnessYear_from"].astype('int32')
df_filter_year = df_corpus[year_from > 1700]
display_df(df_filter_year, title="After 1700:")

# Example (disabled): filter on sender birth place Leiden
# condition = filter_condition(df_corpus, column="afz_geb_plaats", method="contains_regex", regex_or_set="Leiden")
# df_filter_place = df_corpus[condition]
# display_df(df_filter_place, title="Sender born in Leiden:")

# Number of hits per value of the "adr_loc_plaats" metadata field.
# NOTE(review): the original comment called this "birth place", but the field
# name differs from "afz_geb_plaats" used above; confirm what it holds.
hits_per_place = df_corpus.groupby("adr_loc_plaats").size()
display_df(hits_per_place)

## Case study 10: Visualizing h-dropping

In [None]:
# Case study 10: compare word forms that lack the initial "h" of their lemma
# with word forms that keep it.
corpus_to_search="chn"

# Request the available metadata fields so the document-level ones can be
# returned together with each hit.
fields = get_available_metadata("corpus", corpus_to_search)
#print(fields)

# df_corpus1: lemma starts with h + a/e/o, word form starts with the bare vowel (h dropped)
# df_corpus2: lemma starts with h + a/e/o, word form also starts with h + a/e/o
df_corpus1 = search_corpus(r'[lemma="h[aeo].*" & word="[aeo].*"]', corpus_to_search, extra_fields_doc=fields["document"])
df_corpus2 = search_corpus(r'[lemma="h[aeo].*" & word="h[aeo].*"]', corpus_to_search, extra_fields_doc=fields["document"])

display_df( df_corpus1)
display_df( df_corpus2)

# Chart both result sets grouped by the "Region" and "Date" columns
# (presumably document metadata fields; confirm they exist for this corpus).
# NOTE(review): a GroupBy object is passed to display_df here, whereas Case
# study 9 passes the result of .groupby(...).size(); confirm that display_df
# handles GroupBy input in chart mode.
display_df( df_corpus1.groupby(["Region", "Date"]), title="h-dropping", mode='chart')
display_df( df_corpus2.groupby(["Region", "Date"]), title="normal", mode='chart')