# Python API for conversational data visualization

In [1]:
from os import listdir, rename
from numpy import *
from scipy import signal, interpolate
from bokeh.plotting import figure, show
from bokeh.models import Legend, ColumnDataSource, HoverTool
from bokeh.models.widgets import Button
from bokeh.layouts import gridplot, column
from bokeh.io import output_notebook, push_notebook
from abc import ABC, abstractmethod
from ipywidgets import interact, HBox
from re import match
from ipywidgets import widgets
from collections import defaultdict
import warnings
# Silence every warning for the rest of the session (this also hides NumPy/Bokeh deprecation notices).
warnings.simplefilter('ignore')
# Tell Bokeh to render the figures passed to show() inside the notebook.
output_notebook()

In [2]:
class VisualizationData:
    """Load a corpus made of one metadata table plus one data file per metadata row.

    After construction:
      * ``self.metadata`` is the 1-D structured array read from the first file of
        ``directory`` whose name matches ``*metadata*.csv``;
      * ``self.data`` is a list with one dict per selected metadata row. Each dict
        holds every metadata column of that row plus a ``'data'`` entry containing
        the structured array read from the row's data file.
    """

    # Constructor
    def __init__(self, directory, conversations=(), speakers=(), corpus_format="SW", **format_details):
        """Read the metadata file, then the data file of every row that passes the filters.

        Parameters:
            directory: folder containing the metadata file and the data files.
            conversations: accepted ``id_conv`` values; empty means "no filter".
            speakers: accepted ``id_speaker`` values; empty means "no filter".
            corpus_format: ``"CID"`` selects the CID preset, anything else the SW preset.
            **format_details: overrides for individual preset entries
                (``data_columns``, ``metadata_delimiter``, ``file_name``, ...).

        Raises:
            FileNotFoundError: no ``*metadata*.csv`` file in ``directory``, or a data
                file referenced by the metadata is missing.
        """

        # Initialize formats
        if corpus_format == "CID":
            fmt = { 'data_columns'       : ['corpus', 'id_speaker', 'fill', 'time',
                                            'time_stop', 'values', 'fill'],
                    'metadata_columns'   : ['id_conv','id_speaker','data_type','corpus'],
                    'data_delimiter'     : ",",
                    'metadata_delimiter' : "\t",
                    'data_head_lines'    : 0,
                    'metadata_head_lines': 1,
                    'file_name'          : ['id_speaker'],
                    'file_name_delimiter': " " }
        else: # SW
            fmt = { 'data_columns'       : ['id_line','values', 'time'],
                    'metadata_columns'   : ['id','id_conv','id_caller','id_speaker','id_topic',
                                            'sex','age','geography','level_study','corpus','data_type'],
                    'data_delimiter'     : "\t",
                    'metadata_delimiter' : "\t",
                    'data_head_lines'    : 1,
                    'metadata_head_lines': 1,
                    'file_name'          : ['corpus','id_conv','data_type','id_caller'],
                    'file_name_delimiter': "_" }
        # Keyword arguments that name a preset entry replace that entry.
        for info in fmt:
            if info in format_details:
                fmt[info] = format_details[info]

        # Read the metadata (first matching file wins)
        metadata_name = None
        for candidate in listdir(directory):
            if match(r".*metadata.*\.csv$", candidate):
                metadata_name = candidate
                break
        if metadata_name is None:
            raise FileNotFoundError("no '*metadata*.csv' file found in " + str(directory))
        with open(directory + "/" + metadata_name, "r") as metadata_file:
            # genfromtxt returns a 0-d array for a single row; atleast_1d keeps the
            # result iterable whatever the number of rows.
            self.metadata = atleast_1d(
                genfromtxt(metadata_file, skip_header=fmt['metadata_head_lines'],
                           encoding=None, delimiter=fmt['metadata_delimiter'],
                           names=fmt['metadata_columns'], dtype=None))

        self.data = []

        # Read files which match with the filters
        for line in self.metadata:
            if len(conversations) != 0 and line['id_conv'] not in conversations:
                continue
            if len(speakers) != 0 and line['id_speaker'] not in speakers:
                continue
            to_add = {'data': {}}
            for info_key in fmt['metadata_columns']:
                to_add[info_key] = line[info_key]
            # The data file name is the listed metadata values joined by the
            # delimiter; str() lets numeric ids take part in the name.
            name_parts = [str(line[column]) for column in fmt['file_name']]
            file_name = fmt['file_name_delimiter'].join(name_parts) + ".csv"
            with open(directory + "/" + file_name) as data_file:
                # atleast_1d: keep one-row data files indexable with [-1].
                to_add['data'] = atleast_1d(
                    genfromtxt(data_file, encoding=None,
                               skip_header=fmt['data_head_lines'],
                               delimiter=fmt['data_delimiter'],
                               names=fmt['data_columns'], dtype=None))
            self.data.append(to_add)

        print(str(len(self.data))+" fichiers de données ont été lus")


In [84]:
class Display():
    """Static entry points that build and show the Bokeh figures of this API.

    ``average`` shares its results with ``comparison`` through the module-level
    ``output`` dict: ``average`` creates it on first use and its "Add to output"
    button stores the currently displayed average under a user-chosen name.
    """

    @staticmethod # Tell whether one data entry passes every filter
    def _matches_filters(data, filters):
        """Return True when ``data`` satisfies ``filters``.

        ``filters`` maps a metadata column to either a ``(low, high)`` pair
        (inclusive range) or a single value; the value ``" "`` means "no
        constraint". ``None`` or an empty mapping accepts everything.
        """
        if not filters:
            return True
        for filter_name, filter_value in filters.items():
            if isinstance(filter_value, (tuple, list)):
                if data[filter_name] < filter_value[0] or data[filter_name] > filter_value[1]:
                    return False
            elif filter_value != " " and data[filter_name] != filter_value:
                return False
        return True

    @staticmethod # Build one widget per metadata column, plus the smoothness slider
    def _filter_widgets(vdata):
        """Return the ``{name: widget}`` mapping handed to ``interact`` by ``average``.

        Integer columns get a range slider spanning the column's min and max;
        every other column gets a dropdown of its distinct values, preceded by
        the blank ``' '`` entry that stands for "no filter".
        """
        controls = {
            'smoothness': widgets.IntSlider(description='smoothness', min=3, max=100,
                                            value=30, continuous_update=False),
        }
        for key in vdata.metadata.dtype.names:
            column = vdata.metadata[key]
            # dtype.kind covers every integer width (int32, int64, unsigned, ...).
            if column.dtype.kind in 'iu':
                minimum = int(column.min())
                maximum = int(column.max())
                controls[key] = widgets.IntRangeSlider(description=key, min=minimum, max=maximum,
                                                       value=[minimum, maximum],
                                                       continuous_update=False)
            else:
                # (label, value) pairs: the label is text, the value keeps its type
                # so it still compares equal to the metadata it came from.
                options = [(' ', ' ')]
                for option in sorted(set(column.tolist())):
                    options.append((str(option), option))
                controls[key] = widgets.Dropdown(options=options, value=' ', description=key)
        return controls

    @staticmethod # Display the average speechrate evolution of all vdata data
    def average(vdata, smoothing_window=None, filters=None, interactive=False, output_visibility=False):
        """Show every (filtered) series of ``vdata`` with their mean and standard deviation.

        Parameters:
            vdata: object exposing ``data`` (list of dicts) and ``metadata``
                (structured array), as built by ``VisualizationData``.
            smoothing_window: window length forwarded to the smoothing step.
            filters: mapping accepted by ``_matches_filters``; ``None`` keeps all data.
            interactive: when True, show filter widgets instead of a single figure;
                each widget change redraws the figure with the chosen filters.
            output_visibility: when True, add the "Add to output" / "Clear output"
                buttons under the figure.
        """
        global output
        if 'output' not in globals():
            output = {}

        if interactive:
            # set filter widgets
            def update(smoothness=50, **args):
                lens = 0
                for d in vdata.data:
                    lens += len(d['data']['time'])
                # Window length derived from the mean series length and the slider value.
                smoothing_window = int((lens/len(vdata.data))*smoothness/90)
                Display.average(vdata, smoothing_window=smoothing_window, filters=args,
                                output_visibility=True)
            interact(update, **Display._filter_widgets(vdata))
            return

        # The title comes from the first entry so that a single data file is enough.
        plot = AveragePlot(vdata.data[0]['data_type'], smoothing_window, 120,
                           vdata.metadata.dtype.names)

        # add correct data on plot
        for data in vdata.data:
            if Display._matches_filters(data, filters):
                plot.add_data(data)

        show(plot.get_plot())

        # set output buttons
        if output_visibility:
            name_output = widgets.Text(placeholder='Curent filtered data name')
            add_output = widgets.Button(description='Add to output')
            clear_output = widgets.Button(description='Clear output')
            def add(button):
                global output
                # get_plot() above already computed the statistics; they are
                # missing only when no data matched the filters.
                if not hasattr(plot, 'average'):
                    print("Aucune donnée ne correspond aux filtres")
                    return
                output[name_output.value] = {'avg':plot.average, 'patch_x':plot.patch_x,
                                             'patch_y':plot.patch_y}
                print("output contient " + str(len(output)) + " élements")
            def clear(button):
                global output
                output = {}
                print("output est vide")
            add_output.on_click(add)
            clear_output.on_click(clear)
            # NOTE(review): `display` is not imported in this file; presumably the
            # function IPython provides in notebook sessions - confirm.
            display(HBox([name_output, add_output, clear_output]))

    @staticmethod # Display speechrate evolution plot for each vdata conversations
    def conversation(vdata, smoothing_window=None, points_number=None, linked=False, interactive=False,
                     color_palette = ["red", "blue", "green", "purple", "yellow"]):
        """Show one figure per conversation, with one set of curves per speaker.

        Parameters:
            vdata: object exposing ``data``, as built by ``VisualizationData``.
            smoothing_window, points_number: forwarded to each ``ConversationPlot``.
            linked: when True, every figure shares the axes ranges of the first one.
            interactive: when True, show sliders for ``smoothing_window`` and
                ``points_number`` and redraw on every change.
            color_palette: colors cycled through for the speakers of a conversation.
        """
        if interactive:
            def update(smoothing_window=30, points_number=100):
                # Forward linked and color_palette so the redraw keeps the caller's settings.
                Display.conversation(vdata, smoothing_window=smoothing_window,
                                     points_number=points_number, linked=linked,
                                     color_palette=color_palette)
            interact(update, smoothing_window=(5,100), points_number=(3, 150))
            return

        # Build conversation by assembling the data
        conversations = {}
        for d in vdata.data:
            speakers = conversations.setdefault(d['id_conv'], {})
            if 'id_caller' in d:
                speakers[d['id_caller']] = d
            elif 'id_speaker' in d:
                speakers[d['id_speaker']] = d

        # Create and add each conversation plots to the grid layout
        grid = []
        for conv_id, conv_data in conversations.items():
            conv = ConversationPlot(conv_id, smoothing_window, points_number, color_palette)
            for speaker, data in conv_data.items():
                conv.add_data(speaker, data['data_type'], data['data'])
            grid.append(conv.get_plot())

        # Synchronize plots if required
        if linked:
            for other in grid[1:]:
                other.x_range = grid[0].x_range
                other.y_range = grid[0].y_range

        # Display the plot
        show(gridplot(grid, ncols=1))

    @staticmethod # Display filters output comparison
    def comparison(elements, color_palette = ["red", "blue", "green", "purple", "yellow"]):
        """Overlay the averages stored by ``average``'s "Add to output" button.

        ``elements`` maps a name to a dict with the keys ``'avg'``, ``'patch_x'``
        and ``'patch_y'``; each entry is drawn as a line plus a translucent band.
        """
        plot = figure(width=950,height=300)
        legend_items = []
        for index, (name, data) in enumerate(elements.items()):
            color = color_palette[index % len(color_palette)]
            g1 = plot.line(linspace(0, 100, data['avg'].size), data['avg'], line_width=5, color=color)
            # The band outline runs along patch_x and back again.
            g2 = plot.patch(hstack((data['patch_x'], data['patch_x'][::-1])), data['patch_y'],
                            fill_alpha=0.1, fill_color=color, line_color=color)
            legend_items.append((name + " average",[g1]))
            legend_items.append((name + " standard deviation",[g2]))
        plot.add_layout(Legend(items=legend_items), 'right')
        plot.legend.click_policy="hide"
        show(plot)

    @staticmethod # Display two smoothed series and their difference
    def aggregation(vdata, smoothing_window = 30, color_palette = ["green", "blue", "red"]):
        """Plot the two series of ``vdata`` on a 0-100 % axis together with their difference.

        Prints a message and returns without plotting unless ``vdata.data`` holds
        exactly two entries. ``color_palette`` gives the colors of the first
        series, the second series and the difference, in that order.
        """
        if len(vdata.data) != 2:
            print("2 données sont demandées, pas " + str(len(vdata.data)))
            return
        plot = figure(width=950,height=300)
        len_avg = (len(vdata.data[0]['data']['time']) + len(vdata.data[1]['data']['time'])) / 2
        smoothness = int(len_avg / 2 * smoothing_window / 90)
        smoothed_data = []
        for d in vdata.data:
            time = asarray(d['data']['time'])
            # Rescale time to a percentage of the conversation length.
            x = time * 100 / time[-1]
            smoothed_data.append(Plot.smooth(x, d['data']['values'], smoothness, 120))
        g1 = plot.line(smoothed_data[0]['x'], smoothed_data[0]['y'], alpha= 0.7, color=color_palette[0])
        g2 = plot.line(smoothed_data[1]['x'], smoothed_data[1]['y'], alpha= 0.7, color=color_palette[1])
        # Both series are sampled on 120 points, so they can be subtracted pointwise.
        ga = plot.line(linspace(0,100,120), smoothed_data[0]['y'] - smoothed_data[1]['y'],
                       color = color_palette[2], line_width = 5)
        legend_items = [("speaker " + str(vdata.data[0]['id_speaker']), [g1]),
                        ("speaker " + str(vdata.data[1]['id_speaker']), [g2]),
                        ("difference", [ga])]
        plot.add_layout(Legend(items=legend_items), 'right')
        plot.legend.click_policy="hide"
        show(plot)


In [4]:
class Plot:
    """Base class of the plots: stores the smoothing settings and provides ``smooth``."""

    # Constructor
    def __init__(self, smoothing_window, points_number):
        self.smoothing_window = smoothing_window
        self.points_number = points_number

    # Smoothing function
    @staticmethod
    def smooth(x, y, window_len, points_number):
        """Smooth ``y`` with a Hann window, then resample it with a cubic spline.

        Parameters:
            x: abscissas of ``y``; only the last value ``x[-1]`` is used, as the
                upper bound of the output axis.
            y: values to smooth.
            window_len: Hann window length; ``None`` means ``int(x[-1] / 15)``.
                Windows shorter than 3 skip the averaging step.
            points_number: number of output points; ``None`` means 200.

        Returns:
            A dict ``{'x': ..., 'y': ...}`` of two arrays of ``points_number``
            values, ``x`` being evenly spaced between 0 and ``x[-1]``.
        """
        if window_len is None:
            window_len = int(x[-1]/15)
        window_len = int(window_len)
        if points_number is None:
            points_number = 200

        # First step, get the values closer together by averaging each value with a Hann window.
        # A Hann window of length 1 is the identity and shorter-than-3 windows have
        # weights summing to zero, so averaging only makes sense from length 3.
        if window_len >= 3:
            window = hanning(window_len)
            # Mirror both ends of y so the window also fits at the edges.
            wider_data = r_[y[window_len-1:0:-1],y,y[-2:-window_len-1:-1]]
            new_y = convolve(window/window.sum(),wider_data,mode='valid')
        else:
            new_y = asarray(y, dtype=float)

        # Second step, create new points to smooth the curve.
        # The spline is fitted and evaluated on the same [0, x[-1]] interval.
        smooth_function = interpolate.CubicSpline(linspace(0, x[-1], new_y.size), new_y)
        smooth_x = linspace(0, x[-1], points_number)
        smooth_y = smooth_function(smooth_x)

        return {'x':smooth_x, 'y':smooth_y}

In [5]:
class AveragePlot (Plot):
    """Figure showing several series on a 0-100 % axis with their mean and standard deviation."""

    # Constructor
    def __init__(self, title, smoothing_window, points_number, tmp):
        """Create the empty figure.

        Parameters:
            title: figure title, also used as the y axis label.
            smoothing_window, points_number: smoothing settings applied to each series.
            tmp: names of the fields listed in the hover tooltip.
        """
        Plot.__init__(self, smoothing_window, points_number)
        informations = [(info, "@" + info) for info in tmp]
        hover = HoverTool(tooltips=informations)
        self.plot = figure(width=950,height=300, title=title,
                           tools=["box_zoom", "pan", hover, "wheel_zoom", "reset", "save"],
                           x_axis_label="conversation progress (%)", y_axis_label=title)
        self.src = defaultdict(list)
        # Number of series drawn by the last get_plot() call (None: never drawn).
        self._drawn_series = None

    # Add x in percent and smoothed y to source data
    def add_data(self, data):
        """Smooth one data entry and append it, with its metadata, to the source columns."""
        time = asarray(data['data']['time'])
        x = time * 100 / time[-1]
        smoothed_values = Plot.smooth(x, data['data']['values'], self.smoothing_window, self.points_number)
        self.src['x'].append(smoothed_values['x'])
        self.src['y'].append(smoothed_values['y'])
        # Every other key of the entry becomes a column for the tooltip.
        for info_key in data:
            if info_key != "data":
                self.src[info_key].append(data[info_key])

    # Return plot after drawing lines on it
    def get_plot (self):
        """Draw the series, their mean and the standard deviation band, and return the figure.

        Sets ``self.average``, ``self.patch_x`` and ``self.patch_y`` when at least
        one series was added. Calling it again without new data returns the
        figure as is instead of stacking a second set of glyphs and legend.
        """
        series_count = len(self.src['y']) if 'y' in self.src else 0
        if getattr(self, '_drawn_series', None) == series_count:
            return self.plot
        self._drawn_series = series_count

        if series_count == 0:
            self.plot.text(x=100, y=100, text=["Aucune donnée ne correspond aux filtres"])
            return self.plot
        self.average = mean(self.src['y'], axis=0)
        self.patch_x = linspace(0, 100, self.average.size)
        deviation = std(self.src['y'], axis=0)
        # Lower edge left to right, then upper edge right to left: one closed outline.
        self.patch_y = hstack((self.average - deviation, (self.average + deviation)[::-1]))
        source = ColumnDataSource(self.src)
        g1 = self.plot.multi_line(source=source, xs='x', ys='y', line_width=1, color="grey", alpha = 0.4)
        g2 = self.plot.line(linspace(0, 100, self.average.size), self.average, line_width=4, color="blue")
        g3 = self.plot.patch(hstack((self.patch_x, self.patch_x[::-1])), self.patch_y,
                             fill_alpha=0.2, fill_color="blue", line_color="blue")
        legend = Legend(items=[("All data",[g1]), ("Average",[g2]), ("Standard deviation",[g3])],
                        location=(20,20))
        self.plot.add_layout(legend, 'right')
        self.plot.legend.click_policy="hide"
        return self.plot

In [6]:
class ConversationPlot (Plot):
    """Figure of one conversation: raw points, smoothed curve and mean line for each speaker."""

    # Constructor
    def __init__(self, id_conv, smoothing_window, points_number, color_palette):
        """Create the empty figure titled after ``id_conv``.

        ``color_palette`` is cycled through, one color per added speaker.
        """
        Plot.__init__(self, smoothing_window, points_number)
        self.plot = figure(width=950, height=250, title="Conversation "+str(id_conv),
                           x_axis_label="time (s)")
        self.color_palette = color_palette
        # The first two legend entries are shared by all speakers.
        self.legend_items = [("Raw data",[]),("Averages",[])]
        self.color_number = 0

    # Draw lines on the plot with data
    def add_data (self, speaker, data_type, data):
        """Draw the curves of one speaker.

        Parameters:
            speaker: speaker identifier shown in the legend (any type, converted to text).
            data_type: accepted for interface compatibility; not used here.
            data: structure indexable by ``'time'`` and ``'values'``.
        """
        color = self.color_palette[self.color_number%len(self.color_palette)]
        average = mean(data['values'])
        smoothed_data = Plot.smooth(data['time'], data['values'],
                                    self.smoothing_window, self.points_number)
        g1 = self.plot.line(data['time'], data['values'],
                            alpha=0.2, line_dash="10 4", line_width=1, color=color)
        g2 = self.plot.cross(data['time'], data['values'],
                             line_width=1, alpha=0.3, size=10, color=color)
        g3 = self.plot.line(smoothed_data['x'], smoothed_data['y'],
                            line_width=3, color=color)
        # Horizontal line at the mean value, spanning the whole conversation.
        g4 = self.plot.line([0,data['time'][-1]], [average, average],
                            line_width=1, line_dash="20 3", color=color)
        self.legend_items[0][1].extend([g1, g2])
        self.legend_items[1][1].append(g4)
        # str(): speaker ids may be numbers, which cannot be concatenated to text.
        self.legend_items.append(("Smooth Speaker "+str(speaker), [g3]))
        self.color_number += 1

    # Return the plot after adding its legend
    def get_plot (self):
        """Attach the legend to the right of the figure and return the figure."""
        legend = Legend(items=self.legend_items)
        self.plot.add_layout(legend, 'right')
        self.plot.legend.click_policy="hide"
        return self.plot

## API demonstration

In [15]:
# Load two subsets of the corpus stored in ../Desktop/X11 (SW format):
# vdata2 keeps six conversations, vdata only conversation 2051.
vdata2 = VisualizationData("../Desktop/X11", corpus_format="SW", conversations=[2222,3456,2051,3034,2444,2333])
vdata = VisualizationData("../Desktop/X11", corpus_format="SW", conversations=[2051])

12 fichiers de données ont été lus
2 fichiers de données ont été lus


In [85]:
# Plot the two series of conversation 2051 and their difference (needs exactly two data entries).
Display.aggregation(vdata,smoothing_window=50)

In [38]:
# Interactive average plot: the widgets filter the data and set the smoothness;
# the "Add to output" button stores the displayed average in the global `output`.
Display.average(vdata2, interactive=True)

In [11]:
# Compare the averages previously stored in `output` by the Display.average buttons.
Display.comparison(output, color_palette=['#dd0066', 'blue'])

In [10]:
# One figure per conversation; linked=True makes all figures share the first one's axes ranges.
Display.conversation(vdata, smoothing_window=20, linked=True) 