# application/utils.py

import pickle
import time
import regex as re
import pathlib
import bokeh.plotting
import bokeh.models
import bokeh.layouts
import pandas as pd
import threading
import lxml
import queue
import socket
import random
import string


# Comma-separated bokeh tool names. Used as the default ``tools`` argument of
# ``barchart``, which passes it on to ``bokeh.plotting.figure``.
TOOLS = "hover, pan, reset, wheel_zoom, zoom_in, zoom_out"
# JavaScript template, used as the default ``script`` argument of ``barchart``.
# The ``%s`` placeholder is filled via ``script % options`` (that call is
# currently commented out in ``barchart``). The script strips punctuation and
# whitespace from the text field value, compares it with every (equally
# stripped) option and toggles ``.visible`` on the object obtained via ``eval``.
# NOTE(review): both branches call ``eval(textfield)`` rather than eval-ing the
# current option, so a non-matching option hides the object that a matching
# one just showed -- presumably the else branch should target the option;
# confirm before re-enabling the callback.
# NOTE(review): ``var textfield = textfield.value...`` re-declares the name it
# reads from, and the non-raw Python string contains the escapes ``\'`` and
# ``\s`` -- verify both are intended.
JAVASCRIPT = """
             var exclude = /[!#$&\'"()*+,-\s./:;<=>?@^_`{|}~]/g;
             var textfield = textfield.value.replace(exclude, "");
             var options = %s;

             for (var i in options) {
                 if (textfield == options[i].replace(exclude, "")) {
                     eval(textfield).visible = true;
                 }
                 else {
                     eval(textfield).visible = false;
                 }
             }
             """

def compress(data, filepath):
    """
    Pickles ``data`` and writes the result to ``filepath``.

    The file is opened in binary write mode, so an already existing file at
    that location is overwritten. Nothing is returned.
    """
    with open(filepath, mode='wb') as handle:
        pickle.Pickler(handle).dump(data)


def decompress(filepath):
    """
    Reads ``filepath`` in binary mode and returns the unpickled object.

    Unpickling can execute arbitrary code, so only files from a trusted
    source (e.g. written by ``compress``) should be passed in.
    """
    with open(filepath, mode='rb') as handle:
        restored = pickle.Unpickler(handle).load()
    return restored


def load_data(tempdir):
    """
    Collects the generated artifacts stored in ``tempdir``.

    Three files are read from ``tempdir``: ``data.pickle`` (via
    ``decompress``), ``parameter.csv`` and ``topics.csv``. Both CSV tables are
    rendered to HTML and stored -- each wrapped in a one-element list -- under
    the keys ``'parameter'`` and ``'topics'`` of the unpickled object, which
    is then returned.
    """
    base = pathlib.Path(tempdir)

    data = decompress(str(base / 'data.pickle'))

    parameter_table = pd.read_csv(str(base / 'parameter.csv'),
                                  index_col=0,
                                  encoding='utf-8')
    # Blank out the header of the (single) value column.
    parameter_table.columns = ['']
    topics_table = pd.read_csv(str(base / 'topics.csv'),
                               index_col=0,
                               encoding='utf-8')

    parameter_html = parameter_table.to_html(classes='parameter', border=0)
    data['parameter'] = [parameter_html]
    topics_html = topics_table.to_html(classes='topics')
    data['topics'] = [topics_html]
    return data


def remove_markup(content):
    """
    Removes markup from text. If lxml fails, a simple regex is used.

    Args:
        content: Source handed to ``lxml.etree.parse``. If parsing (or any
            later lxml step) fails, ``content`` is iterated instead and every
            item is treated as one line of text.

    Returns:
        The text without markup as a single string. In the lxml path the text
        nodes are joined with newlines and whitespace is condensed; in the
        regex fallback the cleaned lines are concatenated without separator.
    """
    try:
        # A plain ``import lxml`` does not load the ``etree`` submodule, so it
        # is imported explicitly here; a missing/broken lxml simply triggers
        # the regex fallback below.
        import lxml.etree

        parser = lxml.etree.XMLParser(recover=True)
        tree = lxml.etree.parse(content, parser=parser)
        # Drop these elements including their content, but keep the text that
        # follows them (``with_tail=False``).
        # NOTE(review): the tag names carry no namespace, so namespaced (e.g.
        # TEI) elements are presumably not matched -- confirm with real input.
        lxml.etree.strip_elements(tree, 'speaker', 'note', 'stage', 'head',
                                  with_tail=False)
        text = '\n'.join(tree.xpath('//text()'))
        text = re.sub('  ', '', text)
        text = re.sub('    ', '', text)
        text = re.sub('\n{1,6}', '\n', text)
        text = re.sub('\n \n', '\n', text)
        text = re.sub('\t\n', '', text)
        return text
    except Exception:
        # Best-effort fallback: any failure of the lxml path ends up here.
        # ``Exception`` (instead of a bare ``except``) lets KeyboardInterrupt
        # and SystemExit propagate.
        cleaned = []
        for line in content:
            line = re.sub('<.*?>', '', line)
            line = re.sub('(<.[^(><.)]+>)|<.?>', '', line)
            line = re.sub(r'\n', '', line)
            line = re.sub('[ ]{2,}', ' ', line)
            line = re.sub('<?(.*?)?>', '', line)
            cleaned.append(line)
        return ''.join(cleaned)


def boxplot(stats):
    """
    Creates a boxplot for corpus statistics.

    ``stats`` is grouped by its ``group`` column; for every group the
    quartiles of ``score`` are drawn as a two-coloured box, with stems
    reaching 1.5 times the interquartile range beyond the box (clipped to the
    smallest/largest value of the group). Returns the bokeh figure.

    NOTE(review): the two x labels are hard-coded and presumably have to line
    up with the (sorted) group names of ``stats`` -- confirm with the caller.
    """
    categories = ['Document size (clean)', 'Document size (raw)']

    grouped = stats.groupby('group')
    lower_quartile = grouped.quantile(q=0.25)
    median = grouped.quantile(q=0.5)
    upper_quartile = grouped.quantile(q=0.75)
    spread = upper_quartile - lower_quartile
    upper_fence = upper_quartile + 1.5 * spread
    lower_fence = lower_quartile - 1.5 * spread

    def beyond_fences(member):
        label = member.name
        too_high = member.score > upper_fence.loc[label]['score']
        too_low = member.score < lower_fence.loc[label]['score']
        return member[too_high | too_low]['score']

    # The outliers are computed but not drawn.
    outlier_scores = grouped.apply(beyond_fences).dropna()

    fig = bokeh.plotting.figure(x_range=categories, title='', tools='',
                                logo=None, background_fill_color='#EFE8E2',
                                sizing_mode='fixed',
                                plot_width=500, plot_height=350)

    # Clip the fences to the observed extremes of every group.
    smallest = grouped.quantile(q=0.00)
    largest = grouped.quantile(q=1.00)
    upper_fence.score = [min(peak, fence)
                         for peak, fence in zip(list(largest.loc[:, 'score']),
                                                upper_fence.score)]
    lower_fence.score = [max(bottom, fence)
                         for bottom, fence in zip(list(smallest.loc[:, 'score']),
                                                  lower_fence.score)]

    # Stems
    fig.segment(categories, upper_fence.score, categories, upper_quartile.score,
                line_color='black')
    fig.segment(categories, lower_fence.score, categories, lower_quartile.score,
                line_color='black')

    # Boxes (upper and lower half around the median)
    fig.vbar(categories, 0.7, median.score, upper_quartile.score,
             fill_color='#E08E79', line_color='black')
    fig.vbar(categories, 0.7, lower_quartile.score, median.score,
             fill_color='#3B8686', line_color='black')

    # Stem end caps
    fig.rect(categories, lower_fence.score, 0.2, 0.01, line_color='black')
    fig.rect(categories, upper_fence.score, 0.2, 0.01, line_color='black')

    fig.yaxis.axis_label = 'Tokens'
    fig.xgrid.grid_line_color = None
    fig.ygrid.grid_line_color = 'white'
    fig.grid.grid_line_width = 2
    fig.xaxis.major_label_text_font_size = '11pt'
    fig.yaxis.major_label_text_font_size = '9pt'
    return fig


def barchart(document_topics, height, topics=None, script=JAVASCRIPT, tools=TOOLS):
    """
    Creates an interactive barchart for document-topics proportions.

    For every row of ``document_topics`` one horizontal bar renderer is added
    to the figure; only the renderer of the first row starts out visible.

    Args:
        document_topics: Table whose column labels become the categories on
            the y axis and whose rows hold the proportions to plot.
        height: Plot height, passed to bokeh as ``plot_height``.
        topics: Only checked for ``None`` to pick the word ('topic' or
            'document') used in the input prompt.
        script: JavaScript template for the selection callback. Currently
            unused, because the callback wiring below is disabled.
        tools: Tool names passed to ``bokeh.plotting.figure``.

    Returns:
        A bokeh row layout that contains the figure.
    """
    y_range = document_topics.columns.tolist()
    fig = bokeh.plotting.figure(y_range=y_range, plot_height=height, tools=tools,
                                toolbar_location='right', sizing_mode='scale_width',
                                logo=None)

    # Renderers by punctuation-free option name; consumed by the (disabled)
    # CustomJS callback below.
    plots = {}
    options = document_topics.index.tolist()

    for i, option in enumerate(options):
        x_axis = document_topics.loc[option].tolist()
        source = bokeh.models.ColumnDataSource(dict(Describer=y_range, Proportion=x_axis))
        # Exactly one renderer per option, so that the visibility flag below
        # controls the bars that actually show this option's data.
        bar = fig.hbar(y='Describer', right='Proportion', source=source,
                       height=0.5, color='#053967')
        bar.visible = i == 0
        plots[exclude_punctuations(option)] = bar

    # TODO(review): figure styling is disabled; re-enable once verified.
    # fig.xgrid.grid_line_color = None
    # fig.x_range.start = 0
    # fig.select_one(bokeh.models.HoverTool).tooltips = [('Proportion', '@Proportion')]
    # fig.xaxis.axis_label = 'Proportion'
    # fig.xaxis.major_label_text_font_size = '9pt'
    # fig.yaxis.major_label_text_font_size = '9pt'

    # Prompt for the (disabled) autocomplete input below.
    what = 'topic' if topics is not None else 'document'
    title = 'Type a {} + press enter'.format(what)

    #callback = bokeh.models.CustomJS(args=plots, code=script % options)
    #textfield = bokeh.models.widgets.AutocompleteInput(completions=options,
    #                                                   placeholder=title,
    #                                                   callback=callback)
    #callback.args['textfield'] = textfield
    return bokeh.layouts.row([fig], sizing_mode='scale_width')


def read_logfile(logfile, delay=3):
    """
    Reads a logfile and returns the current number of iterations.

    Only the last line of the logfile is inspected.

    Args:
        logfile: Path of the UTF-8 encoded logfile.
        delay: Seconds to sleep before the file is read. Defaults to 3.

    Returns:
        The first integer in the last line, as a string, if that line contains
        'likelihood'; ``0`` if it contains one of 'n_documents', 'vocab_size',
        'n_words', 'n_topics' or 'n_iter'; ``None`` if the line matches
        nothing or the file is empty.

    Raises:
        IndexError: If a 'likelihood' line contains no integer.
    """
    time.sleep(delay)
    pattern = re.compile(r'-?\d+')

    # Walk through the file and keep only the final line; an empty file
    # leaves the empty default in place instead of raising.
    last_line = ''
    with open(logfile, 'r', encoding='utf-8') as file:
        for last_line in file:
            pass
    # Strip the line break only; a final line without one is kept as is.
    line = last_line.rstrip('\n')

    if 'likelihood' in line:
        return pattern.findall(line)[0]

    parameter_keys = ('n_documents', 'vocab_size', 'n_words', 'n_topics', 'n_iter')
    if any(key in line for key in parameter_keys):
        return 0
    return None


def enthread(target, args):
    """
    Runs ``target(*args)`` in a newly started thread.

    Returns a ``queue.Queue`` into which the return value of ``target`` is
    put once the call has finished.
    """
    results = queue.Queue()
    worker = threading.Thread(target=lambda: results.put(target(*args)))
    worker.start()
    return results


def is_connected(host='8.8.8.8', port=53, timeout=3):
    """
    Checks if your machine is connected to the internet.

    Tries to open a TCP (IPv4) connection to ``host``:``port``.

    Args:
        host: Address to connect to.
        port: TCP port to connect to.
        timeout: Timeout in seconds for the connection attempt. It is set on
            the probing socket only; the process-wide default socket timeout
            is left untouched.

    Returns:
        ``True`` if the connection could be established, ``False`` otherwise.
    """
    try:
        # The context manager closes the socket again on success and failure.
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as connection:
            connection.settimeout(timeout)
            connection.connect((host, port))
        return True
    except Exception:
        # Best effort: every failure counts as "not connected", while
        # KeyboardInterrupt and SystemExit still propagate.
        return False


def exclude_punctuations(s):
    """
    Returns ``s`` without the characters listed in ``string.punctuation``.
    """
    unwanted = frozenset(string.punctuation)
    kept = []
    for char in s:
        if char in unwanted:
            continue
        kept.append(char)
    return ''.join(kept)