In [1]:
from os import listdir
from numpy import *
from scipy import signal, interpolate
from bokeh.plotting import figure, show
from bokeh.layouts import gridplot
from bokeh.io import output_notebook
from abc import ABC, abstractmethod
# Send Bokeh output to the notebook so that show() renders figures inline in the cell output.
output_notebook()

In [2]:
class VisualizationData:
    """Load measurement files and their metadata from a directory.

    File names are split on "_" after removing ".csv". The code reads the parts
    as: [0] corpus, [1] conversation, [2] data type, [-1] speaker, which
    presumably corresponds to "<corpus>_<conversation>_<type>_<speaker>.csv"
    (confirm against the actual data set). A file whose second part is
    "metadata" is the metadata table.
    """

    def __init__(self, directory, corpus=None, conversations=None, speakers=None, data_type=None):
        """Read the metadata, then every data file of `directory` that passes the filters.

        Parameters
        ----------
        directory : str
            Folder holding the metadata file and the data files.
        corpus, conversations : sequence of int, optional
            Keep only files whose corpus / conversation part (converted with
            int()) is in the sequence. Empty or None disables the filter.
        speakers : sequence, optional
            Keep only files whose metadata row has its 4th field (index 3) in
            this sequence. Empty or None disables the filter.
        data_type : sequence of str, optional
            Keep only files whose data-type part is in this sequence.
            Empty or None disables the filter.
        """
        # None replaces the former mutable `[]` defaults; behaviour is unchanged.
        corpus = [] if corpus is None else corpus
        conversations = [] if conversations is None else conversations
        speakers = [] if speakers is None else speakers
        data_type = [] if data_type is None else data_type

        # Structured array of metadata rows, or None when no metadata file exists.
        self.metadata = self.read_metadata(directory)
        # One dict per loaded file: "corpus", "conversation", "speaker" (strings
        # taken from the file name) plus one key named after the data type whose
        # value is the array read from the file.
        self.data = []

        for file_name in listdir(directory):
            file_info = file_name.replace(".csv", "").split("_")
            # Skip the metadata file and anything that cannot be a data file
            # (too few name parts), instead of failing with IndexError.
            if len(file_info) < 3 or file_info[1] == "metadata":
                continue

            info = self.find_info(file_info[1], file_info[-1])
            # Index 3 of the metadata row is what the `speakers` filter is matched
            # against. A file without a metadata row has no caller id, so it can
            # only be kept when no `speakers` filter is given.
            id_caller = None if info is None else info[3]

            if len(corpus) != 0 and int(file_info[0]) not in corpus:
                continue
            if len(conversations) != 0 and int(file_info[1]) not in conversations:
                continue
            if len(speakers) != 0 and (info is None or id_caller not in speakers):
                continue
            if len(data_type) != 0 and file_info[2] not in data_type:
                continue

            to_add = {"corpus": file_info[0], "conversation": file_info[1], "speaker": file_info[-1]}
            # Passing the path lets loadtxt open and close the file itself.
            to_add[file_info[2]] = loadtxt(directory + "/" + file_name,
                                           delimiter="\t", skiprows=1)
            self.data.append(to_add)

    def read_metadata(self, directory):
        """Return the metadata table found in `directory`, or None if there is none.

        The first file whose second "_"-separated name part is "metadata" is
        read as a tab-separated table: its first line gives the column names and
        its first column is dropped (usecols starts at 1).
        """
        for file_name in listdir(directory):
            file_info = file_name.replace(".csv", "").split("_")
            if len(file_info) > 1 and file_info[1] == "metadata":
                with open(directory + "/" + file_name, "r") as metadata_file:
                    columns = metadata_file.readline().split("\t")
                    # atleast_1d keeps a one-row table iterable (genfromtxt
                    # returns a 0-d array in that case).
                    return atleast_1d(genfromtxt(metadata_file, delimiter="\t",
                                                 usecols=range(1, len(columns)),
                                                 dtype=None, names=columns))
        return None

    def find_info(self, conv, caller):
        """Return the metadata row matching a conversation and a caller, else None.

        Parameters
        ----------
        conv : str or int
            Conversation id; converted with int() and compared to "id_conv".
        caller : str
            Compared to the row's "id_caller" (bytes values are decoded as UTF-8).

        Raises
        ------
        ValueError
            If `conv` cannot be converted to an integer.
        """
        if self.metadata is None:
            return None
        conv_id = int(conv)
        for line in self.metadata:
            line_caller = line["id_caller"]
            # Depending on how the table was read the field is bytes or str.
            if isinstance(line_caller, bytes):
                line_caller = line_caller.decode('UTF-8')
            if line["id_conv"] == conv_id and line_caller == caller:
                return line
        return None

In [3]:
class Display():
    """Helpers that build Bokeh figures from a VisualizationData object and show them."""

    @staticmethod
    def average(vdata, smoothing_window=10, points_number=200):
        """Show one figure built by AveragePlot from all entries of `vdata.data`.

        `smoothing_window` and `points_number` are forwarded to AveragePlot.
        """
        show(AveragePlot(vdata.data, smoothing_window, points_number).plot)

    @staticmethod
    def conversation(vdata, smoothing_window=10, points_number=200, linked = False):
        """Show one figure per conversation, stacked in a single column.

        Parameters
        ----------
        vdata : object with a `data` list of dicts
            Each dict must provide 'conversation' and 'speaker'; only dicts that
            also hold a 'speech' array are plotted.
        smoothing_window, points_number : int
            Forwarded to ConversationPlot.
        linked : bool
            When true, every figure shares the x and y ranges of the first one.
        """
        # conversation id -> {speaker -> speech array}
        conversations = {}
        for d in vdata.data:
            # For now this function only handles speech-rate data. Each entry
            # carries a single data type, so other types are skipped rather
            # than failing with KeyError.
            if 'speech' not in d:
                continue
            conversations.setdefault(d['conversation'], {})[d['speaker']] = d['speech']

        grid = [ConversationPlot(conv_id, conv_data, smoothing_window, points_number).plot
                for conv_id, conv_data in conversations.items()]

        if linked:
            # Sharing the range objects of the first figure links the figures' ranges.
            for plot in grid[1:]:
                plot.x_range = grid[0].x_range
                plot.y_range = grid[0].y_range
        show(gridplot(grid, ncols=1))

In [4]:
class Plot:
    """Base class of the plot builders: stores the smoothing settings and provides `smooth`."""

    def __init__(self, smoothing_window, points_number):
        # Length of the Hann window used by smooth().
        self.smoothing_window = smoothing_window
        # Number of points of the resampled (smoothed) curve.
        self.points_number = points_number

    @staticmethod
    def smooth(x, y, window_len, points_number):
        """Smooth `y` with a Hann-window average, then resample it with a cubic spline.

        Parameters
        ----------
        x : sequence of float
            Abscissas of `y`. Only the last value is used: the curve is assumed
            to span [0, x[-1]].
        y : sequence of float
            Values to smooth.
        window_len : int
            Length of the Hann window. Below 3 the window is empty or sums to
            zero, so the averaging step is skipped.
        points_number : int
            Number of points of the returned curve.

        Returns
        -------
        dict
            {'x': points_number abscissas evenly spaced on [0, x[-1]],
             'y': smoothed values at these abscissas}
        """
        if window_len < 3:
            new_y = asarray(y, dtype=float)
        else:
            # First step: average each value with its neighbours, weighted by a
            # Hann window. The signal is mirrored at both ends so the window
            # always has data to cover.
            window = hanning(window_len)
            wider_data = r_[y[window_len-1:0:-1], y, y[-2:-window_len-1:-1]]
            new_y = convolve(window/window.sum(), wider_data, mode='valid')

        # Second step: create new points to smooth the curve. The averaged
        # values are spread evenly over [0, x[-1]]. The upper bound is not
        # truncated to an integer, so the spline covers exactly the interval it
        # is evaluated on (no extrapolation, and no failure when x[-1] < 1).
        smooth_function = interpolate.CubicSpline(linspace(0, x[-1], new_y.size), new_y)
        smooth_x = linspace(0, x[-1], points_number)
        smooth_y = smooth_function(smooth_x)

        return {'x': smooth_x, 'y': smooth_y}

In [5]:
class AveragePlot (Plot):
    """Figure overlaying every smoothed speech curve and their point-wise average.

    Each curve is rescaled to a 0-100 % time axis so curves of different
    durations can be compared and averaged.
    """

    def __init__(self, data, smoothing_window, points_number):
        """Build the figure in `self.plot`.

        Parameters
        ----------
        data : list of dict
            Entries holding a 'speech' 2-D array; column 1 is plotted against
            column 2, which is rescaled by its last value. Entries without a
            'speech' key are ignored.
        smoothing_window, points_number : int
            Forwarded to Plot.smooth.
        """
        Plot.__init__(self, smoothing_window, points_number)

        # NOTE(review): the `legend=` keyword used below was replaced by
        # `legend_label=` in later Bokeh releases; confirm the installed version.
        self.plot = figure(width=800,height=250)
        all_x, all_y = [], []

        for d in data:
            # Each entry carries a single data type; skip the non-speech ones
            # rather than failing with KeyError.
            if 'speech' not in d:
                continue
            speech = d['speech']
            # Time to percent conversion: last value of column 2 maps to 100.
            percent = speech[:,2] * 100 / speech[-1,2]
            smoothed_values = Plot.smooth(percent, speech[:,1], smoothing_window, points_number)
            all_x.append(smoothed_values['x'])
            all_y.append(smoothed_values['y'])
        self.plot.multi_line(all_x, all_y, line_width=2, color="grey",
                        alpha = 0.6, legend = "all data")

        # Without any curve there is nothing to average.
        if all_y:
            average = mean(all_y, axis=0)
            self.plot.line(linspace(0, 100, average.size), average, legend="Average", line_width=3, color="blue")
        self.plot.legend.click_policy="hide"

In [6]:
class ConversationPlot (Plot):
    """Figure showing, for one conversation, each speaker's raw and smoothed curve."""

    def __init__(self, id_conv, conversation, smoothing_window, points_number):
        """Build the figure in `self.plot`.

        Parameters
        ----------
        id_conv : str
            Conversation identifier, used in the figure title.
        conversation : dict
            Maps a speaker label to a 2-D array; column 1 is plotted against column 2.
        smoothing_window, points_number : int
            Forwarded to Plot.smooth.
        """
        Plot.__init__(self, smoothing_window, points_number)

        # NOTE(review): the `legend=` keyword used below was replaced by
        # `legend_label=` in later Bokeh releases; confirm the installed version.
        self.plot = figure(width=800, height=250,
                           title="Speech rate evolution conversation number "+str(id_conv))
        color_palette = ["red", "blue", "green", "purple", "yellow"]

        for color_number, (speaker, data) in enumerate(conversation.items()):
            # Cycle through the palette; the modulus must be the palette length,
            # otherwise the lookup fails from the sixth speaker on.
            color = color_palette[color_number % len(color_palette)]

            smoothed_data = Plot.smooth(data[:,2], data[:,1], smoothing_window, points_number)

            # Raw measurements: thin dashed line.
            self.plot.line(data[:,2], data[:,1], legend="Speaker "+str(speaker),
                      line_width=1, color=color, alpha=0.5, line_dash="10 4")
            # Smoothed curve: thicker solid line of the same colour.
            self.plot.line(smoothed_data['x'], smoothed_data['y'], legend="Smooth Speaker "+str(speaker),
                      line_width=2, color=color)

        self.plot.legend.click_policy="hide"

In [7]:
# Load the files of the "data" folder, keeping only conversations 2001 and 4936.
vdata = VisualizationData("data", conversations=[2001, 4936])



In [8]:
# One figure per loaded conversation, with x/y ranges linked across figures.
Display.conversation(vdata, smoothing_window=30, linked=True)

In [9]:
# All smoothed curves on a 0-100 % time axis plus their average, resampled to 10 points.
Display.average(vdata, smoothing_window=25, points_number=10)