# Python API for conversational data visualization

In [1]:
from os import listdir
from numpy import *
from scipy import signal, interpolate
from bokeh.plotting import figure, show
from bokeh.models import Legend
from bokeh.layouts import gridplot
from bokeh.io import output_notebook
from abc import ABC, abstractmethod
from re import match
from pickle import load
# Configure bokeh so that subsequent show() calls render inside the notebook.
output_notebook()

In [2]:
class VisualizationData:
    """Load the CSV data files of a directory into a list of per-file records.

    After construction:

    * ``self.metadata`` holds the array parsed from the last ``*metadata*.csv``
      file found in the directory, or ``None`` when there is no such file.
    * ``self.data`` is a list of dicts, one per accepted data file, with the
      keys ``'data'`` (a dict mapping the data type, or the corpus when no data
      type is known, to the parsed array), ``'corpus'``, ``'conversation'``
      and, when known, ``'caller'`` and ``'speaker'``.
    """

    # Constructor
    def __init__(self, directory, corpus=None, conversations=None, data_type=None, speakers=None,
                 format="SW", data_format=None, data_delimiter=None, metadata_format=None,
                 file_name_format=None, file_name_delimiter=None, conv_composition=None, head_lines=None):
        """Read the metadata and every matching data file found in *directory*.

        Parameters
        ----------
        directory : str
            Directory scanned for ``.csv`` files.
        corpus, conversations, data_type, speakers : sequence, optional
            Filters. ``None`` or an empty sequence disables the filter.
            ``corpus`` and ``conversations`` are compared with the ``int()`` of
            the value found for the file; the two others are compared as is.
        format : str
            ``"CID"`` selects the CID defaults, any other value the SW defaults.
        data_format, data_delimiter, metadata_format, file_name_format,
        file_name_delimiter, conv_composition, head_lines : optional
            Override the corresponding default of the selected format.
        """
        # None stands for "no filter"; a fresh list avoids sharing a mutable default.
        corpus = [] if corpus is None else corpus
        conversations = [] if conversations is None else conversations
        data_type = [] if data_type is None else data_type
        speakers = [] if speakers is None else speakers

        # Initialize formats
        if format == "CID":
            if data_format is None:
                data_format = ['corpus', 'id_speaker', 'fill', 'time', 'time_stop', 'values', 'fill']
            if data_delimiter is None:
                data_delimiter = ","
            if file_name_format is None:
                file_name_format = ['id_speaker']
            if file_name_delimiter is None:
                file_name_delimiter = " "
            if conv_composition is None:
                conv_composition = [('AB','CM'),('AG','YM'),('EB','SR'),('NH','LL'),
                                    ('AC','MB'),('BX','MG'),('LJ','AP'),('ML','IM')]
            if head_lines is None:
                head_lines = 0
        else: #SW
            if data_format is None:
                data_format = ['id_line','values', 'time']
            if data_delimiter is None:
                data_delimiter = "\t"
            if metadata_format is None:
                metadata_format = ['id','id_conv','id_caller','id_speaker','id_topic',
                                   'sex','age','geography','level_study']
            if file_name_format is None:
                file_name_format = ['corpus','id_conv','data_type','fill','id_caller']
            if file_name_delimiter is None:
                file_name_delimiter = "_"
            if head_lines is None:
                head_lines = 1

        # Read the metadata if it exists (the attribute always exists, None otherwise)
        self.metadata = None
        for file_name in listdir(directory):
            if match(r".*metadata.*\.csv$", file_name):
                with open(directory+"/"+file_name, "r") as metadata_file:
                    self.metadata = genfromtxt(metadata_file, delimiter=data_delimiter,
                                               names=metadata_format, dtype=None)
                print(self.metadata)

        self.data = []
        cpt = 0

        # Search all correct data files in directory
        for file_name in listdir(directory):
            if not match(r".*\.csv$", file_name) or match(r".*metadata.*", file_name):
                continue
            path = directory+"/"+file_name

            # Get informations from name, file or metadata.
            file_info = self._read_file_info(path, file_name, file_name_format,
                                             file_name_delimiter, data_format, data_delimiter)

            # Add the data from the files which match with the filters
            if not ((len(corpus) == 0 or int(file_info['corpus']) in corpus)
                    and (len(conversations) == 0 or int(file_info['id_conv']) in conversations)
                    and (len(data_type) == 0 or file_info['data_type'] in data_type)
                    and (len(speakers) == 0 or file_info['id_speaker'] in speakers)):
                continue

            to_add = {'data':{},'corpus':file_info['corpus']}
            if file_info['id_conv'] is not None:
                to_add['conversation'] = file_info['id_conv']
            elif conv_composition is not None:
                for conv in conv_composition:
                    if (conv[0] == file_info['id_caller'] or conv[1] == file_info['id_caller']
                    or conv[0] == file_info['id_speaker'] or conv[1] == file_info['id_speaker']):
                        to_add['conversation'] = str(conv)
                # No known pair contains this caller/speaker: still give the record
                # a conversation key so that consumers can group on it.
                if 'conversation' not in to_add:
                    to_add['conversation'] = str(file_info)
            else:
                to_add['conversation'] = str(file_info)
            if file_info['id_caller'] is not None:
                to_add['caller'] = file_info['id_caller']
            if file_info['id_speaker'] is not None:
                to_add['speaker'] = file_info['id_speaker']
            with open(path, "r") as data_file:
                data_to_add = genfromtxt(data_file, skip_header=head_lines,
                                         delimiter=data_delimiter, names=data_format, dtype=None)
            if file_info['data_type'] is not None:
                to_add['data'][file_info['data_type']] = data_to_add
            elif file_info['corpus'] is not None:
                to_add['data'][file_info['corpus']] = data_to_add
            else:
                # Neither the file name format nor the data format provides a key
                # to store the data under; the scan stops here.
                print("ni data_type, ni corpus")
                break
            self.data.append(to_add)
            cpt += 1

        print(str(cpt)+" fichiers de données ont été lus")

    @staticmethod
    def _read_file_info(path, file_name, file_name_format, file_name_delimiter,
                        data_format, data_delimiter):
        """Collect corpus/conversation/type/speaker/caller values for one data file.

        Each key is taken from the file name when ``file_name_format`` lists it,
        otherwise from the second line of the file when ``data_format`` lists it,
        otherwise it stays ``None``.
        """
        file_info = {'corpus':None,'id_conv':None,'data_type':None,
                     'id_speaker':None,'id_caller':None}
        file_name_info = file_name.replace(".csv","").split(file_name_delimiter)
        second_line = None  # read lazily, at most once per file
        for info_key in file_info:
            if info_key in file_name_format:
                file_info[info_key] = file_name_info[file_name_format.index(info_key)]
            elif info_key in data_format:
                if second_line is None:
                    with open(path) as data_file:
                        data_file.readline()  # first line is skipped (possible header)
                        # Drop the line ending so the last field is not polluted by it.
                        second_line = data_file.readline().rstrip("\r\n").split(data_delimiter)
                file_info[info_key] = second_line[data_format.index(info_key)]
        return file_info


In [3]:
class Display():
    """Static entry points that build the plots for a VisualizationData and show them."""

    @staticmethod # Display the average speechrate evolution of all vdata data
    def average(vdata, smoothing_window=None, points_number=None):
        """Show one AveragePlot fed with every data array of ``vdata.data``.

        The plot title is the first data key of the first record.
        """
        first_record_data = vdata.data[0]['data']
        title = list(first_record_data.keys())[0]
        plot = AveragePlot(title, smoothing_window, points_number)
        for record in vdata.data:
            for series in record['data'].values():
                plot.add_data(series)
        show(plot.get_plot())

    @staticmethod # Display speechrate evolution plot for each vdata conversations
    def conversation(vdata, smoothing_window=None, points_number=None, linked=False,
                     color_palette = ["red", "blue", "green", "purple", "yellow"]):
        """Show one ConversationPlot per conversation, stacked in a single column.

        Records are grouped by their 'conversation' value, then by 'caller'
        (or 'speaker' when the record has no caller). With ``linked`` set to
        True every plot shares the x and y ranges of the first one.
        """
        # Build conversation by assembling the data
        conversations = {}
        for record in vdata.data:
            participants = conversations.setdefault(record['conversation'], {})
            if 'caller' in record:
                participants[record['caller']] = record['data']
            elif 'speaker' in record:
                participants[record['speaker']] = record['data']

        # Create and add each conversation plots to the grid layout
        grid = []
        for conv_id, participants in conversations.items():
            conv_plot = ConversationPlot(conv_id, smoothing_window, points_number, color_palette)
            for speaker, series_by_type in participants.items():
                for data_type, series in series_by_type.items():
                    conv_plot.add_data(speaker, data_type, series)
            grid.append(conv_plot.get_plot())

        # Synchronize plots if required
        if linked == True:
            for follower in grid[1:]:
                follower.x_range = grid[0].x_range
                follower.y_range = grid[0].y_range

        show(gridplot(grid, ncols=1))

In [4]:
class Plot:
    """Base class storing the smoothing settings shared by the plot builders."""

    # Constructor
    def __init__(self, smoothing_window, points_number):
        """Keep the Hann window length and the number of points of smoothed curves."""
        self.smoothing_window = smoothing_window
        self.points_number = points_number

    # Smoothing function
    @staticmethod
    def smooth(x, y, window_len, points_number):
        """Smooth ``y`` with a Hann-weighted moving average, then resample it.

        Parameters
        ----------
        x : sequence of numbers
            Abscissas; only the last value is used, as the end of the x axis.
        y : sequence of numbers
            Values to smooth.
        window_len : int or None
            Length of the Hann window. ``None`` means ``int(x[-1] / 15)``.
            The length is clamped to ``[1, len(y) - 1]`` so that the window is
            never empty and the averaged series keeps at least two points.
        points_number : int or None
            Number of points of the returned curve. ``None`` means 200.

        Returns
        -------
        dict
            ``{'x': ..., 'y': ...}`` with ``points_number`` values each.
        """
        if window_len is None:
            window_len = int(x[-1]/15)
        if points_number is None:
            points_number = 200

        wider_data = r_[y]

        # An empty window (short series with the default length) makes convolve
        # fail, and a window longer than the data makes mode='valid' swap its
        # operands; both are avoided by clamping the length.
        window_len = max(1, min(int(window_len), wider_data.size - 1))

        # First step, get the values closer together by averaging each values with a window of hann :
        window = hanning(window_len)
        if window.sum() <= 0:
            # hanning(2) is all zeros: fall back to a plain mean instead of dividing by zero.
            window = ones(window_len)
        new_y=convolve(window/window.sum(),wider_data,mode='valid')

        # Second step, create new points to smooth the curve :
        # NOTE(review): the spline knots end at int(x[-1]) while the curve is
        # evaluated up to x[-1]; kept as is, confirm whether the truncation is intended.
        smooth_function = interpolate.CubicSpline(linspace(0, int(x[-1]),new_y.size), new_y)
        smooth_x = linspace(0, x[-1], points_number)
        smooth_y = smooth_function(smooth_x)

        return {'x':smooth_x, 'y':smooth_y}

In [5]:
class AveragePlot (Plot):  # temporary only speech rate
    """Plot overlaying every smoothed curve and their point-wise average.

    The x axis is the progress of each conversation expressed in percent.
    """

    # Constructor
    def __init__(self, title, smoothing_window, points_number):
        """Create the empty figure and the containers of the curves to draw."""
        Plot.__init__(self, smoothing_window, points_number)
        # grouped_data[0] collects the x arrays, grouped_data[1] the y arrays
        self.grouped_data = [[],[]]
        self.plot = figure(width=800,height=300, title=title,
                           x_axis_label="conversation progress (%)", y_axis_label="speech rate")

    # Add x in percent and smoothed y to grouped data
    def add_data(self, data):
        """Smooth ``data['values']`` against the time rescaled to 0-100 and store the curve."""
        percent_x = [t * 100 / data['time'][-1] for t in data['time']]
        curve = Plot.smooth(percent_x, data['values'], self.smoothing_window, self.points_number)
        all_x, all_y = self.grouped_data
        all_x.append(curve['x'])
        all_y.append(curve['y'])

    # Return plot after drawing grouped data and average lines on it
    def get_plot (self):
        """Draw the stored curves in grey, their mean in blue, and return the figure."""
        all_x, all_y = self.grouped_data
        self.plot.multi_line(all_x, all_y, line_width=1,
                             color="grey", alpha = 0.4, legend = "all data")
        average = mean(all_y, axis=0)
        average_x = linspace(0, 100, average.size)
        self.plot.line(average_x, average, legend="Average", line_width=4,color="blue")
        self.plot.legend.click_policy="hide"
        return self.plot

In [6]:
class ConversationPlot (Plot): # temporary only speech rate
    """Plot of one conversation: raw points, smoothed curve and mean per data series."""

    # Constructor
    def __init__(self, id_conv, smoothing_window, points_number, color_palette):
        """Create the empty figure.

        ``color_palette`` is the sequence of colors assigned in turn to the
        series added with ``add_data``; it is reused from the start once
        exhausted.
        """
        Plot.__init__(self, smoothing_window, points_number)
        self.plot = figure(width=950, height=250, title="Conversation "+str(id_conv),
                           x_axis_label="time (s)")
        self.color_palette = color_palette
        self.legend_items = [("Raw data",[]),("Averages",[])]
        self.color_number = 0

    # Draw lines on the plot with data
    def add_data (self, speaker, data_type, data):
        """Draw one series (``data['time']`` / ``data['values']``) for *speaker*.

        Adds the raw dashed line and crosses, the smoothed curve and a
        horizontal line at the mean value, all in the next palette color, and
        registers the glyphs in the legend entries. ``data_type`` is accepted
        but not used for drawing.
        """
        # Cycle over the actual palette length (a fixed modulus overflowed
        # palettes shorter than it).
        color=self.color_palette[self.color_number % len(self.color_palette)]
        average = mean(data['values'])
        smoothed_data = Plot.smooth(data['time'], data['values'], self.smoothing_window, self.points_number)
        g1 = self.plot.line(data['time'], data['values'], alpha=0.2, line_dash="10 4", line_width=1, color=color)
        g2 = self.plot.cross(data['time'], data['values'], line_width=1, alpha=0.3, size=10, color=color)
        g3 = self.plot.line(smoothed_data['x'], smoothed_data['y'], line_width=3, color=color)
        g4 = self.plot.line([0,data['time'][-1]], [average, average], line_width=1, line_dash="20 3", color=color)
        self.legend_items[0][1].append(g1)
        self.legend_items[0][1].append(g2)
        self.legend_items[1][1].append(g4)
        self.legend_items.append(("Smooth Speaker "+str(speaker), [g3]))
        self.color_number += 1

    # Return the plot after added its legend
    def get_plot (self):
        """Attach the legend to the right of the figure and return the figure."""
        legend = Legend(items=self.legend_items, location=(20,20))
        self.plot.add_layout(legend, 'right')
        self.plot.legend.click_policy="hide"
        return self.plot

## API demonstration

In [7]:
# Load the CID-format files of ../Desktop/fp_CID, keeping only the listed speakers.
vdata = VisualizationData("../Desktop/fp_CID", format="CID", speakers=["AB","CM","AC","MB","AP"])

5 fichiers de données ont été lus




In [8]:
# One plot per conversation with a two-color palette, a smoothing window of 50 and shared axis ranges.
Display.conversation(vdata, color_palette=["green", "#220099"], smoothing_window=50, linked=True)

In [10]:
# Overlay every smoothed series and their average, using a smoothing window of 80.
Display.average(vdata, smoothing_window=80)