# Python API for conversational data visualization

In [1]:
from os import listdir
from numpy import *
from scipy import signal, interpolate
from bokeh.plotting import figure, show
from bokeh.models import Legend
from bokeh.layouts import gridplot
from bokeh.io import output_notebook
from abc import ABC, abstractmethod
from re import match
from pickle import load
output_notebook()

In [2]:
class VisualizationData:
    """Load conversation data files and their metadata from a directory.

    Attributes:
        metadata: 1-d structured array read from ``SW_metadata.csv`` (first
            column skipped), one record per metadata row.
        data: list of dicts, one per data file kept by the filters, with the
            keys ``"corpus"``, ``"conversation"``, ``"speaker"`` and one key
            named after the file's data type holding the loaded array.
    """

    # Constructor
    def __init__(self, directory, corpus=(), conversations=(), speakers=(), data_type=(),
                 id_conv="id_conv", id_caller="id_caller", id_speaker="id_speaker",
                 file_name_format=("corpus","id_conv","data_type","fill","id_caller"), file_name_delimiter="_"):
        """Read the metadata, then every matching data file of ``directory``.

        Args:
            directory: folder holding ``SW_metadata.csv`` and the data files.
            corpus: corpus names to keep (compared as strings); empty = all.
            conversations: conversation ids (ints) to keep; empty = all.
            speakers: speaker ids (as found in the metadata) to keep;
                empty = all.
            data_type: data type names to keep; empty = all.
            id_conv, id_caller, id_speaker: metadata column names holding the
                conversation id, the caller id and the speaker id.
            file_name_format: meaning of each delimiter-separated part of a
                data file name (without its ``.csv`` extension).
            file_name_delimiter: separator between the parts of a file name.
        """

        # Read the metadata file and put the data into an array named "metadata"
        with open(directory+"/SW_metadata.csv", "r") as metadata_file:
            column_names = metadata_file.readline().rstrip("\r\n").split("\t")
            metadata = genfromtxt(metadata_file, delimiter="\t", names=column_names,
                                  dtype=None, usecols=range(1,len(column_names)))
        # genfromtxt gives a 0-d array for a single row, which cannot be iterated
        self.metadata = atleast_1d(metadata)

        self.data = []
        cpt = 0

        # The file name pattern only allows letters for the corpus part, so the
        # corpus filter has to be compared as text (int() could never succeed)
        corpus_names = {str(name) for name in corpus}

        # Search all correct data files in directory
        for file_name in listdir(directory):
            if not match(r"[a-zA-Z]+_[0-9]+_[a-zA-Z0-9]+_.+\.csv$", file_name):
                continue

            # Get informations from the file name
            file_info_values = file_name.replace(".csv", "").split(file_name_delimiter)
            file_info = {}
            for i, value in enumerate(file_info_values):
                file_info[file_name_format[i]] = value

            # Find in metadata the speaker id of this data file; stays None
            # when no row matches, instead of reusing the previous file's speaker
            speaker = None
            for line in self.metadata:
                caller = line[id_caller]
                # Depending on the NumPy version text columns are bytes or str
                if isinstance(caller, bytes):
                    caller = caller.decode('UTF-8')
                if (str(line[id_conv]) == file_info["id_conv"]
                        and str(caller) == file_info["id_caller"]):
                    speaker = line[id_speaker]
                    break

            # Add the data from the files which match with the filters
            if ((len(corpus) == 0 or file_info["corpus"] in corpus_names)
                    and (len(conversations) == 0 or int(file_info["id_conv"]) in conversations)
                    and (len(data_type) == 0 or file_info["data_type"] in data_type)
                    and (len(speakers) == 0 or (speaker is not None and speaker in speakers))):
                to_add = {"corpus":file_info["corpus"],
                          "conversation":file_info["id_conv"],
                          "speaker":file_info["id_caller"]}
                with open(directory+"/"+file_name, "rb") as data_file:
                    to_add[file_info["data_type"]] = loadtxt(data_file, delimiter="\t", skiprows=1)
                self.data.append(to_add)
                cpt += 1

        print(str(cpt)+" fichiers de données ont été lus")

In [3]:
class Display():
    """Entry points that build the plots of a VisualizationData object and show them.

    Only speech rate data (the ``'speech'`` entry of each loaded item) is
    plotted for now; items without that entry are ignored.
    """

    @staticmethod # Display the average speechrate evolution of all vdata data
    def average(vdata, smoothing_window=None, points_number=None):
        """Show one plot with every smoothed speech rate curve and their average.

        Args:
            vdata: object whose ``data`` attribute is a list of dicts.
            smoothing_window: passed on to AveragePlot.
            points_number: passed on to AveragePlot.
        """
        plot = AveragePlot(smoothing_window, points_number)
        for d in vdata.data:
            # Items loaded for another data type have no 'speech' entry
            if 'speech' in d:
                plot.add_data(d)
        show(plot.get_plot())


    @staticmethod # Display speechrate evolution plot for each vdata conversations
    def conversation(vdata, smoothing_window=None, points_number=None, linked = False,
                     color_palette = ("red", "blue", "green", "purple", "yellow")):
        """Show one speech rate plot per conversation, stacked in a single column.

        Args:
            vdata: object whose ``data`` attribute is a list of dicts with the
                keys ``'conversation'``, ``'speaker'`` and ``'speech'``.
            smoothing_window: passed on to ConversationPlot.
            points_number: passed on to ConversationPlot.
            linked: when true, every plot shares the ranges of the first one.
            color_palette: colours handed to each ConversationPlot.
        """
        # Build discussions by assembling the data
        conversations = {}
        for d in vdata.data:
            if 'speech' not in d: # ! temporary only speech rate
                continue
            conversations.setdefault(d['conversation'], {})[d['speaker']] = d['speech']

        # Create and add each conversation plots to the grid layout
        grid = []
        for conv_id, conv_data in conversations.items():
            conv = ConversationPlot(conv_id, smoothing_window, points_number, color_palette)
            for speaker, data in conv_data.items():
                conv.add_data(speaker, data)
            grid.append(conv.get_plot())

        # Synchronize plots if required
        if linked:
            for other in grid[1:]:
                other.x_range = grid[0].x_range
                other.y_range = grid[0].y_range

        show(gridplot(grid, ncols=1))

In [4]:
class Plot:
    """Base class of the plots: keeps the smoothing settings and provides the smoothing routine."""

    # Constructor
    def __init__(self, smoothing_window, points_number):
        """Store the window length and the number of points used when smoothing."""
        self.smoothing_window = smoothing_window
        self.points_number = points_number

    # Smoothing function
    @staticmethod
    def smooth(x, y, window_len, points_number):
        """Smooth ``y`` and resample it on ``points_number`` evenly spaced abscissas.

        Args:
            x: abscissas; only the last value ``x[-1]`` is used, as the end of
                the [0, x[-1]] interval over which the curve is spread.
            y: values to smooth (1-d sequence).
            window_len: length of the Hann window; ``None`` means
                ``int(x[-1]/15)``.
            points_number: number of output points; ``None`` means 200.

        Returns:
            dict with the arrays ``'x'`` and ``'y'`` of the smoothed curve.

        Raises:
            ValueError: raised by the spline when fewer than two values remain.
        """
        x_end = x[-1]
        if window_len is None:
            window_len = int(x_end/15)
        if points_number is None:
            points_number = 200

        values = asarray(y, dtype=float)

        # Keep at least two averaged values, the minimum the spline accepts
        window_len = int(window_len)
        if window_len > values.size - 1:
            window_len = values.size - 1

        # First step, get the values closer together by averaging each values with a window of hann :
        # below 3 points the Hann window is empty or sums to zero, so the
        # values are left untouched instead of producing an error or NaN
        if window_len >= 3:
            window = hanning(window_len)
            new_y = convolve(window/window.sum(), values, mode='valid')
        else:
            new_y = values

        # Second step, create new points to smooth the curve :
        # the spline is defined up to x_end itself (not its integer part) so
        # that the last output points are interpolated, not extrapolated
        smooth_function = interpolate.CubicSpline(linspace(0, x_end, new_y.size), new_y)
        smooth_x = linspace(0, x_end, points_number)
        smooth_y = smooth_function(smooth_x)

        return {'x':smooth_x, 'y':smooth_y}

In [5]:
class AveragePlot (Plot):  # temporary only speech rate
    """Plot gathering every smoothed speech rate curve together with their average.

    The abscissa is the progress through the conversation, in percent.
    """

    # Constructor
    def __init__(self, smoothing_window, points_number):
        super().__init__(smoothing_window, points_number)
        # grouped_data[0] collects the x arrays, grouped_data[1] the y arrays
        self.grouped_data = [[],[]]
        self.plot = figure(width=800,height=250,
                           x_axis_label="conversation progress (%)", y_axis_label="speech rate")

    # Add x in percent and smoothed y to grouped data
    def add_data(self, data):
        """Smooth the 'speech' array of ``data`` and store the resulting curve.

        Column 2 of the array is rescaled to percent of its last value and
        used as abscissa; column 1 holds the values that get smoothed.
        """
        speech = data['speech']
        positions = speech[:,2]
        percent_x = [position * 100 / positions[-1] for position in positions]
        curve = Plot.smooth(percent_x, speech[:,1], self.smoothing_window, self.points_number)
        all_x, all_y = self.grouped_data
        all_x.append(curve['x'])
        all_y.append(curve['y'])

    # Return plot after drawing grouped data and average lines on it
    def get_plot (self):
        """Draw every stored curve plus their point-wise mean, then return the figure."""
        all_x, all_y = self.grouped_data
        # NOTE(review): recent Bokeh releases expect `legend_label=` rather
        # than `legend=` — confirm against the Bokeh version in use
        self.plot.multi_line(all_x, all_y, line_width=2,
                             color="grey", alpha = 0.6, legend = "all data")
        mean_curve = mean(all_y, axis=0)
        mean_x = linspace(0, 100, mean_curve.size)
        self.plot.line(mean_x, mean_curve, legend="Average", line_width=4, color="blue")
        self.plot.legend.click_policy="hide"
        return self.plot

In [6]:
class ConversationPlot (Plot): # temporary only speech rate
    """Plot of one conversation: raw, smoothed and average speech rate of each speaker."""

    # Constructor
    def __init__(self, id_conv, smoothing_window, points_number, color_palette):
        """Create the empty figure of conversation ``id_conv``.

        Args:
            id_conv: conversation identifier, shown in the plot title.
            smoothing_window: window length handed to Plot.smooth.
            points_number: number of points handed to Plot.smooth.
            color_palette: non-empty sequence of colours, one per speaker,
                reused from the start when there are more speakers than colours.
        """
        Plot.__init__(self, smoothing_window, points_number)
        self.plot = figure(width=950, height=250, title="Conversation "+str(id_conv),
                           x_axis_label="time (s)", y_axis_label="speech rate")
        self.color_palette = color_palette
        # The two first legend entries are shared by all speakers
        self.legend_items = [("Raw data",[]),("Averages",[])]
        self.color_number = 0

    # Draw lines on the plot with data 
    def add_data (self, speaker, data):
        """Draw the curves of one speaker.

        Args:
            speaker: speaker identifier, appended to its legend entry.
            data: 2-d array; column 2 is used as abscissa and column 1 as
                the plotted values.
        """
        # Wrap on the real palette length: a fixed modulo would index past
        # the end of a shorter palette
        color=self.color_palette[self.color_number % len(self.color_palette)]
        average = mean(data[:,1])
        smoothed_data = Plot.smooth(data[:,2], data[:,1], self.smoothing_window, self.points_number)
        g1 = self.plot.line(data[:,2], data[:,1], alpha=0.3, line_dash="10 4", line_width=1, color=color)
        g2 = self.plot.cross(data[:,2], data[:,1], line_width=1, alpha=0.4, size=10, color=color)
        g3 = self.plot.line(smoothed_data['x'], smoothed_data['y'], line_width=3, color=color)
        g4 = self.plot.line([0,data[-1,2]], [average, average], line_width=1,  line_dash="20 3", color=color)
        self.legend_items[0][1].append(g1)
        self.legend_items[0][1].append(g2)
        self.legend_items[1][1].append(g4)
        self.legend_items.append(("Smooth Speaker"+str(speaker), [g3]))
        self.color_number += 1
    
    # Return the plot after added its legend
    def get_plot (self):
        """Attach the legend to the right of the figure and return the figure."""
        legend = Legend(items=self.legend_items, location=(20,20))
        self.plot.add_layout(legend, 'right')
        self.plot.legend.click_policy="hide"
        return self.plot

## API demonstration

In [7]:
# Load the data files of conversations 3456 and 2222 found in the given directory
vdata = VisualizationData("../Desktop/X11", conversations=[3456,2222])

  if sys.path[0] == '':


4 fichiers de données ont été lus


In [8]:
# One plot per loaded conversation, with a two-colour palette and a smoothing window of 13
Display.conversation(vdata, color_palette=["green", "#220099"], smoothing_window=13)

In [9]:
# Single plot of all loaded curves and their average, with a smoothing window of 9
Display.average(vdata, smoothing_window=9)