# Data analysis of a Messenger chat

## Initialization

### Importing necessary packages

In [None]:
# misc numerical stuff
import numpy as np
import scipy.stats as nst
import math

# making plots
import matplotlib.pyplot as plt
# mainly used to change the general style of plots
import matplotlib as mpl
import matplotlib.colors as colors
from matplotlib import patches
import matplotlib.path as mpath
Path = mpath.Path

# class to draw curved text (following a circle)
# class taken from this answer: 
# https://stackoverflow.com/questions/19353576/curved-text-rendering-in-matplotlib
from curvedtext import CurvedText

# used for handling the data
import pandas as pd

# used for importing the data
import json

mpl.rcParams["figure.dpi"] = 150

### constants used throughout the notebook

In [None]:
# whether to save the generated plots to disk
SAVE_PLOTS = True
# file extension (and thereby the format) used to save the plots
EXTENSION = ".png"
# the subfolder where the plots will be saved; it is glued directly in
# front of the file name, hence the trailing slash
# THIS FOLDER MUST EXIST
SUBFOLDER = "plots bis/"

# the dpi used to save the plots
DPI = 600
# the number of JSON files to read (messages are grouped per 10 000)
FILES = 4

# date range from which data should be used
# date format used is yyyy-mm-dd hh:mm:ss
# at least the day must be specified; when no time of day
# is given, everything from the begin day up to and
# including the end day is used
BEGIN = "2000-01-01"
END = "2020-12-31"

## Extracting the data

In [None]:
# list to hold the raw JSON text of every file temporarily
data_strings = []

# loop over all files containing data for this chat and extract the data
# (the files are named message_1.json up to message_<FILES>.json)
for i in range(1, FILES+1):
    # the encoding is given explicitly so that reading the file does not
    # depend on the default encoding of the platform; the with-statement
    # closes the file, no manual clean-up is needed
    with open('message_%d.json' % i, 'r', encoding='utf-8') as data_file:
        data_strings.append(data_file.read())

# convert the json strings to dictionaries
data_dicts = [json.loads(string) for string in data_strings]

Extract some useful data from the dictionaries and build a dataframe containing all the messages together with their metadata.

In [None]:
# extract the names of the participants (taken from the first file)
participants = data_dicts[0]['participants']
NAMES = [participant['name'] for participant in participants]
# the temporary list is not needed any more
participants = None

# extract the name of the group chat, this will be used
# in the titles for plots etc.
TITLE = data_dicts[0]["title"]

In [None]:
# the names displayed on charts: the first whitespace-separated
# word of every full name, in the same order as NAMES
first_names = [name.split()[0] for name in NAMES]

In [None]:
# collect which types of messages occur and which keys the message
# dictionaries use; only the messages of the first file are inspected
messages = data_dicts[0]['messages']
types = set()
keys = set()

for message in messages:
    types.add(message['type'])
    for key in message.keys():
        keys.add(key)

### add all message data to a dataframe

In [None]:
# one dataframe per JSON file
frames = []

for data_dict in data_dicts:
    # create the dataframe
    df = pd.DataFrame(data_dict['messages'])
    # convert the millisecond timestamps to datetimes
    df["timestamp_ms"] = pd.to_datetime(df["timestamp_ms"], unit="ms")
    # set the timestamps as index of the dataframe
    df = df.set_index("timestamp_ms")
    # sort the frame chronologically
    df = df.sort_index()
    frames.append(df)
    
# create the frame containing all messages
message_frame = pd.concat(frames, sort=True)
# sort again, so that the combined frame is chronological as a whole
message_frame = message_frame.sort_index()

### Some examples of querying the data + definition of a count function

Examples of how to query data in the dataframe, used as a reference for the sections where the data is analysed.

In [None]:
# create a frame with all messages of type "Subscribe"
message_frame.loc[message_frame["type"] == "Subscribe"]
# create a frame with all messages that do not contain an audio file;
# in this case the value of audio_files will be NaN, test this with isna
# (the trailing semi-colon suppresses the output of the cell)
message_frame.loc[pd.isna(message_frame["audio_files"])];

To use `frame.loc` you need a boolean array with one element for each row in the frame. `loc` returns a new frame containing the rows where the corresponding value in the boolean array is True. Expressions like `message_frame["type"] == "Subscribe"` and `pd.isna(message_frame["audio_files"])` return such boolean arrays.

A custom query that uses data from (multiple) columns, and that cannot easily be expressed in one line or is hard to vectorize, can be built by writing a function and passing it to `apply`, as shown in the following example.

In [None]:
# example: find the audio files with reactions

# write a function that acts on a line of the dataframe:
def audio_and_reaction(line):
    """Tell whether a message has both audio files and reactions.

    Parameters
    ----------
    line : mapping
        One line of the dataframe (or any other mapping) that provides
        the keys "audio_files" and "reactions".

    Returns
    -------
    bool
        True when both values are lists, False otherwise.
    """
    # a line contains a (list of) audio file(s) / reactions when the value
    # is a list; isinstance is the idiomatic test and also accepts
    # subclasses of list
    has_audio = isinstance(line["audio_files"], list)
    has_reactions = isinstance(line["reactions"], list)

    return has_audio and has_reactions

# create the boolean array: the function is called once for every row
lines = message_frame.apply(audio_and_reaction, axis=1)
# locate the relevant rows (remove semi-colon to see the result)
frame = message_frame.loc[lines]
frame;

It is often the goal to count, per person, how many messages have a certain property. The following function takes a condition and a dataframe (e.g. the result of a previous query) and counts the matching lines per person.

In [None]:
def count_per_person(condition, df=message_frame, names=NAMES):
    """Count, per person, the lines of `df` for which `condition` holds.

    Parameters
    ----------
    condition : callable
        Called with one line (row) of `df`; must return a boolean.
    df : pandas.DataFrame
        The messages to examine; needs a "sender_name" column. The default
        is the message_frame that exists when this function is defined.
    names : list of str
        The persons to count for. The default is NAMES.

    Returns
    -------
    dict
        Maps every name in `names` to the number of matching lines sent
        by that person (0 when there are none).
    """
    # get the lines that match the given condition
    lines = df.apply(condition, axis=1)
    # tally the senders of these lines in one pass, instead of filtering
    # the frame again for every single name
    sent = df.loc[lines, "sender_name"].value_counts()
    # value_counts only lists senders that occur; give the others a 0
    return sent.reindex(names, fill_value=0).to_dict()

as an example, count the audio files with reactions sent by each person

In [None]:
count_per_person(audio_and_reaction)

additional example: xp and xpxd in messages

In [None]:
def xp_in_message(line):
    """Tell whether the text of a message contains the word "xp" or "xpxd".

    Parameters
    ----------
    line : object with a `content` attribute
        One line of the dataframe. Content that is not a string (e.g. the
        NaN of a message without text) never matches.

    Returns
    -------
    bool
        True when one of the two words occurs, compared case-insensitively.
    """
    content = line.content
    if not isinstance(content, str):
        return False
    # split() without argument splits on every kind of whitespace, so a word
    # next to a line break or tab is found too; split(" ") missed those
    words = content.lower().split()
    return "xp" in words or "xpxd" in words

count_per_person(xp_in_message)

### List all keys and types

In [None]:
keys

In [None]:
types

## messages sent per person

In [None]:
# the number of messages of type "Generic" sent by each person
count = count_per_person(lambda line : line["type"] == "Generic")

# make a barplot of the number of messages sent by each person
fig, ax = plt.subplots(1,1, figsize=(12,7))

# the counts are looked up by name instead of taken from count.values(),
# so that no value can end up with the wrong person.
# furthermore it is possible that the order of NAMES has been manually
# changed into a more desired order.
ax.bar(first_names, [count[name] for name in NAMES], width=0.65)
plt.xticks(rotation=65)

# title (Dutch): "Messages sent per person in <chat title>"
plt.title("Verstuurde berichten per persoon in %s" %TITLE)

plt.show()

if SAVE_PLOTS:
    fig.savefig(SUBFOLDER + "messages sent" + EXTENSION, dpi=DPI)

In [None]:
# table that collects the statistics per person; the counts are looked up
# by name, so that every row is guaranteed to match its label from NAMES
# whatever the order of the dictionary is
cum_stats = pd.DataFrame([count[name] for name in NAMES])
cum_stats.index = NAMES
# column label (Dutch): "number of messages"
cum_stats.columns = ["aantal berichten"]

# the counts are not needed any more
count = None

## \<type\> sent by each person

count the amount of each type sent by each person

In [None]:
# function to test if a line contains a certain type
def has_type(line, typ):
    """Tell whether the entry `typ` of a line holds actual data.

    Parameters
    ----------
    line : mapping
        One line of the dataframe (or any mapping) that has the key `typ`.
    typ : str
        The key to test, e.g. "photos".

    Returns
    -------
    bool
        False when the value is a float NaN, True for every other value.
    """
    value = line[typ]
    # isinstance also recognises numpy floats (numpy.float64 is a subclass
    # of float), which the comparison `type(value) == float` did not
    return not (isinstance(value, float) and pd.isna(value))

# all the types we are interested in; each name is used as a key
# of the message lines (see has_type)
types = ["photos", "videos", "audio_files", "plan", "gifs", "files", "sticker"]

# dict with for each type a dict with the counts per person
type_per_person = {typ : {} for typ in types}

# count how many times each type was sent by each person
for typ in types:
    # the lambda is only called inside count_per_person, during this very
    # iteration, so it sees the current value of typ
    type_per_person[typ] = count_per_person(lambda line : has_type(line, typ))

**amount of photos sent**

In [None]:
# make a barplot of the number of photos sent by each person
fig, ax = plt.subplots(1,1, figsize=(12,7))

# the counts are looked up by name so that they match the order of first_names
ax.bar(first_names, [type_per_person['photos'][name] for name in NAMES], width=0.65)
plt.xticks(rotation=75)

# title (Dutch): "Photos sent per person in <chat title>"
plt.title("Verstuurde foto's per persoon in %s" %TITLE)

plt.show()

if SAVE_PLOTS:
    fig.savefig(SUBFOLDER + "photos sent total" + EXTENSION, dpi=DPI)

**other non-text messages**

In [None]:
# make a grouped barplot of the number of <type> sent by each person,
# for every type except the first one in types
fig, ax = plt.subplots(1,1, figsize=(12,7))

# the positions of the groups (one per person) and the width of the bars
index = np.arange(len(type_per_person['videos']))
width = 0.1

# plot each dataset, offset from 'index' so that the index lies between bar 3 and bar 4;
# types[0] is skipped, hence the i+1
for i in range(0,len(types)-1):
    typ = types[i+1]
    ax.bar(index-0.25+0.1*i, 
           [type_per_person[typ][name] for name in NAMES],
           width=width, label=typ)

plt.legend()
# lay-out    
ax.set_xticks(index)
ax.set_xticklabels(first_names)
plt.xticks(rotation=75)
ax.grid(True)

# title (Dutch): "Number of <type> sent or made per person in <chat title>"
plt.title("Aantal <type> verstuurd of gemaakt per persoon in %s" %TITLE)

plt.show()

if SAVE_PLOTS:
    fig.savefig(SUBFOLDER + "misc sent total" + EXTENSION, dpi=DPI)

In [None]:
# add one column per type to the statistics table;
# types[0] is skipped, hence the i+1
for i in range(0,len(types)-1):
    typ = types[i+1]
    cum_stats[typ] = [type_per_person[typ][name] for name in NAMES]

In [None]:
cum_stats

## analysis reactions

dictionary containing information to decode the reactions

In [None]:
# dictionary to decode the (wrongly read) unicode values for the emojis to
# the type of emoji they represent
# ("BOOS" is presumably Dutch for angry)
decode = {"ð\x9f\x98\x86":"laugh", "ð\x9f\x98\x8d":"heart", 'ð\x9f\x98®':"wow", 
          'ð\x9f\x98¢':"crying", 'ð\x9f\x98\xa0':"BOOS", 'ð\x9f\x91\x8d':'thumbs up',
         'ð\x9f\x91\x8e':"thumbs down"}
# list with all reactions (used in the analysis) + other
REACTIONS = ["laugh", "heart", 'wow', "crying", "BOOS", "thumbs up", "thumbs down", "other"]
# get the index in REACTIONS of a certain reaction quickly, avoids having
# to repeatedly search in a list
REACTION_INDEX = {REACTIONS[i] : i for i in range(len(REACTIONS))}
# same thing for the NAMES list
NAMES_INDEX = {NAMES[i] : i for i in range(len(NAMES))}

# the color to use in plots for each emoji
emoji_color = {"laugh" : "xkcd:sunflower", "heart" : "xkcd:vermillion", "wow" : "g",
               "crying" : "xkcd:royal blue", "BOOS" : "xkcd:orange", 
               "thumbs up" : "xkcd:cyan", "thumbs down" : "w", "other":"xkcd:grey"}

Boolean array indicating whether each message has reactions.

In [None]:
# True for every line whose "reactions" entry is a list
with_reacs = message_frame.apply(lambda line : type(line["reactions"]) == list, axis=1)

functions to count reactions by a person and reactions on a persons messages

In [None]:
# 3D array to keep reaction data in.
# the value in reacs[i,j,k] represents the amount of reactions
# BY NAMES[i] on messages of NAMES[j] where the reaction type is of type
# REACTIONS[k]
reacs = np.zeros((len(NAMES), len(NAMES), len(REACTIONS)))

# function to fill the array reacs
def count_reacs(line, reacs=reacs):
    """Add the reactions on one message to the counts in `reacs`.

    Parameters
    ----------
    line : mapping
        One line of the dataframe; "sender_name" must be a key of
        NAMES_INDEX and "reactions" an iterable of dictionaries with the
        keys "actor" and "reaction".
    reacs : numpy.ndarray
        The array that is updated in place. The default is the module
        level array `reacs`, so repeated calls accumulate into it.

    Raises
    ------
    KeyError
        When the sender or an actor does not occur in NAMES_INDEX.
    """
    # the sender of the message is the person that is reacted ON
    sender_index = NAMES_INDEX[line["sender_name"]]

    # loop over the reactions and add 1 to the relevant counts
    for reaction in line["reactions"]:
        # the actor is the person the reaction is made BY
        actor_index = NAMES_INDEX[reaction["actor"]]
        # emoticons that cannot be decoded are counted as "other"; only the
        # failed lookup is caught, so that other errors are not hidden
        try:
            emoticon_index = REACTION_INDEX[decode[reaction["reaction"]]]
        except KeyError:
            emoticon_index = REACTION_INDEX["other"]
        # add 1 to the right entry in reacs
        reacs[actor_index, sender_index, emoticon_index] += 1
    # end for over reactions


# loop over all messages with reactions and parse them
for index, row in message_frame.loc[with_reacs].iterrows():
    count_reacs(row)

create a function to easily create a dataframe from a slice of reacs, or by summing over the third dimension

In [None]:
# the keyword index will be ignored when sliced=False
def reacs_frame(reacs, axis=2, sliced=True, index=0):
    """Turn a 2D part of the 3D array `reacs` into a labelled dataframe.

    Parameters
    ----------
    reacs : numpy.ndarray
        3D array with the axes (NAMES, NAMES, REACTIONS).
    axis : int
        The axis that is removed: 0, 1 or 2.
    sliced : bool
        When True the entry `index` along `axis` is selected, otherwise
        the array is summed over `axis`.
    index : int
        Position along `axis` to select; only used when sliced=True.

    Returns
    -------
    pandas.DataFrame
        Rows are labelled with the first remaining axis, columns with the
        second remaining axis.

    Raises
    ------
    ValueError
        When sliced=True and `axis` is not 0, 1 or 2 (previously an error
        message was returned as a string instead of a dataframe).
    """
    # get the right 2D-array with data
    if sliced:
        if axis not in (0, 1, 2):
            raise ValueError("enter a valid axis")
        # take everything along the other two axes
        selector = [slice(None)] * 3
        selector[axis] = index
        data = reacs[tuple(selector)]
    else:
        data = np.sum(reacs, axis=axis)

    # the labels for the axes of the reacs array
    label0 = NAMES
    label1 = NAMES
    label2 = REACTIONS

    # create the dataframe
    # the index is the first axis that is not sliced/summed over
    # the column names are the second axis that is not sliced/summed over
    if axis == 0:
        rows, columns = label1, label2
    elif axis == 1:
        rows, columns = label0, label2
    else:
        rows, columns = label0, label1

    return pd.DataFrame(data=data, index=rows, columns=columns)

In [None]:
reacs_frame(reacs, axis=2, sliced=True, index=REACTION_INDEX["BOOS"])

In [None]:
# sum over axis 1 (the person that is reacted on): data[i,k] is the
# amount of reactions of type REACTIONS[k] made by NAMES[i]
data=np.sum(reacs, (1))
# column label (Dutch): "BOOS reactions by"
pd.DataFrame(data=data[:,REACTION_INDEX["BOOS"]], index=NAMES, columns=["BOOS reacs door"])

## Plotderplots

In [None]:
# create the figure and ax objects
# fig, ax = plt.subplots(1,1)

# calculate the begin and end of each piece of the arc. 
# the length of each arc corresponds
# to the amount of reactions it represents. Between the reactions by and on
# one person, and between different persons, a separator with whitespace
# on both sides is left open (sizes: see the constants below)

# create the empty dictionary where the angles will be stored:
# arcs[person]["out" or "in"][reaction][other person] = (begin, end)
arcs = {name : {"out" : {reac : {name : (0,0) for name in NAMES} for reac in REACTIONS},
                "in" : {reac : {name : (0,0) for name in NAMES} for reac in REACTIONS}}
        for name in NAMES}
# dict containing the angles for the separators between persons and in/out
seps = {name : {"inout" : (0,0), "person" : (0,0)} for name in NAMES}
# dict with the start and end of the complete arc for one type
total_arcs = {name : {"all" : [0,0], "by" : [0,0], "on" : [0,0]} for name in NAMES}


# the angle of the separator between reactions by and on
SPACE_INOUT = 0.2
# the angle of the separator between two people
SPACE_PERSON = 0.3
# the amount of whitespace on each side of a separator (in units of the
# width of the separator)
WHITE_SPACE = 2
# the total angle that can be used for the visualisation: the full circle
# minus, per person, both separators with their whitespace
total_angle = 360 - (WHITE_SPACE*2+1)*(SPACE_INOUT+SPACE_PERSON)*len(NAMES)
# the angle that represents 1 reaction; every reaction is drawn twice
# (once as outgoing and once as incoming), hence the factor 2
reaction_angle = total_angle / (np.sum(reacs)*2)

# small angle of overlap to avoid white lines between segments
delta = reaction_angle/2

# function to cycle the order of NAMES to minimize crossing of lines
def cycle_names(names, own_name):
    """Return `names` in reversed cyclic order, ending with `own_name`.

    The result starts with the name just before `own_name` and walks
    backwards through the list, wrapping around at the beginning.

    Parameters
    ----------
    names : list of str
        The names to reorder.
    own_name : str
        The name that has to come last; must occur in `names`.

    Returns
    -------
    list of str
        A new list with the same elements as `names`.

    Raises
    ------
    ValueError
        When `own_name` does not occur in `names`.
    """
    # the position is looked up in `names` itself instead of in the global
    # NAMES_INDEX, so that the result is also right for a list that is
    # ordered differently from NAMES
    position = names.index(own_name)
    size = len(names)
    # the own name should be last in the list
    return [names[(position - 1 - step) % size] for step in range(size)]

# the starting angle; theta is the running angle (in degrees) and is only
# ever increased, so the segments are laid out one after the other
theta = 1
for name1 in NAMES:
    # the arc with reactions by this person and the total arc of this
    # person both start here
    total_arcs[name1]["by"][0] = theta
    total_arcs[name1]["all"][0] = theta
    # the outgoing lines, reactions BY name1
    for reac in REACTIONS:
        for name2 in cycle_names(NAMES, name1):
            # calculate the angle of this segment
            by = NAMES_INDEX[name1]
            on = NAMES_INDEX[name2]
            re = REACTION_INDEX[reac]
            dtheta = reaction_angle*reacs[by,on,re]
            
            # add the beginning and end angle to the dictionary; the segment
            # is widened by delta on both sides so neighbours overlap slightly
            arcs[name1]["out"][reac][name2] = (theta-delta, theta+dtheta+delta)
            theta += dtheta

        # end for over name2
    # end for over reac
    
    # end the arc for reactions by this person
    total_arcs[name1]["by"][1] = theta
    
    # add the separator between reactions on and by, with WHITE_SPACE
    # separator widths of whitespace before and after it
    theta += WHITE_SPACE*SPACE_INOUT
    seps[name1]["inout"] = (theta, theta+SPACE_INOUT)
    theta += (WHITE_SPACE+1)*SPACE_INOUT
    
    # start the arc for reactions on this person
    total_arcs[name1]["on"][0] = theta
    
    # the incoming lines, reactions ON name1
    for name2 in NAMES:
        # iterate over this list in reverse order for better
        # aesthetic effect (this prevents excessive crossing of lines)
        for i in range(len(REACTIONS)-1, -1,-1):
            reac = REACTIONS[i]
            # calculate the angle of this segment
            by = NAMES_INDEX[name2]
            on = NAMES_INDEX[name1]
            re = REACTION_INDEX[reac]
            dtheta = reaction_angle*reacs[by,on,re]
            
            # add the beginning and end angle to the dictionary
            # (unlike the outgoing segments these are not widened by delta)
            arcs[name1]["in"][reac][name2] = (theta, theta+dtheta)
            theta += dtheta
            
        # end for over reac
    # end for over name2
    
    # end the total arc and the arc for reactions on this person
    total_arcs[name1]["on"][1] = theta
    total_arcs[name1]["all"][1] = theta
    
    # add the separator between two persons, with WHITE_SPACE
    # separator widths of whitespace before and after it
    theta += WHITE_SPACE*SPACE_PERSON
    seps[name1]["person"] = (theta, theta+SPACE_PERSON)
    theta += (WHITE_SPACE+1)*SPACE_PERSON
# end for over name1

In [None]:
# ---------------- Plotting parameters ---------------------

# side length of the (square) plot in inches
SIDE = 15

# different color schemes for this plot (light/dark)
fmt_light = {"face_color" : "w", "sep_color" : "k", "text_color" : "k",
            "legend_background" : "xkcd:light grey"}
fmt_dark = {"face_color" : "k", "sep_color" : "w", "text_color" : "w",
            "legend_background" : "xkcd:dark grey"}
# the color scheme that is actually used
FMT = fmt_dark

# the names written around the circle instead of the real names;
# they are matched to NAMES by position
DISPLAY_NAMES = ["Caesar", "Abraracourcix", "Asterix", "Obelix", "Ambiorix", "Idéfix"]

# the file name (without extension) under which the diagram is saved
# NOTE(review): this overwrites the TITLE holding the name of the chat, so
# plot titles made after this point show this text instead - confirm intended
TITLE = "reaction diagram anonymous light"

# the names to draw arrows from
NAMES1 = NAMES

# Whether the arrows drawn are empty or filled
FILLED_ARROWS = True

# position, size and linewidth of the circle
CENTER = (0,0)
WIDTH = 2
HEIGHT = 2
LINEWIDTH = 10

# how much larger the circle with arcs that differentiate between
# reactions by and reactions on is
FACTOR = 1.03
# colors for the arcs for reactions by and on
COLOR_BY = "xkcd:cyan"
COLOR_ON = "xkcd:orange"

# color and size of the separators
SEP_COLOR = FMT["sep_color"]
SEP_INOUT_WIDTH = 18
SEP_PERSON_WIDTH = 22

# the total width and height of the plotted area (used to set the limits)
EXTENT = 2.5

# width of the Bezier curve for a single reaction; the arrows are
# drawn with this width multiplied by their weight
CURVE_WIDTH = (SIDE/EXTENT*0.0245)/2
ALPHA = 0.65

# the offset of the beginning and end of arrows to compensate
# for the finite width of the reaction ring
offset = 0.036 * (EXTENT/2*LINEWIDTH/2) / SIDE
# radius (the ring has radius 1) at which the arrows begin and end
ARROW_FACTOR = 1-offset

# ----------------- The plotting --------------------
# create the figure and ax objects
fig, ax = plt.subplots(1,1, figsize=(SIDE,SIDE), facecolor=FMT["face_color"])
plt.axis("off")
ax.set_facecolor(FMT["face_color"])
# show a square of side EXTENT around the center; the vertical limits use
# the y-coordinate of the center (they used the x-coordinate before, which
# is only right when both coordinates are equal)
ax.set_xlim(CENTER[0]-EXTENT/2,CENTER[0]+EXTENT/2)
ax.set_ylim(CENTER[1]-EXTENT/2,CENTER[1]+EXTENT/2)
ax.set_aspect("equal")

# whether to save the handles, only done for the very first iteration
# of drawing arcs representing emojis
save_handles = True
# the patches used in the legend, by their label
handles = {}

# draw the circle
for name1 in NAMES:
    # draw the in-out separators
    theta = seps[name1]["inout"]
    arc_sep_inout = patches.Arc(CENTER, WIDTH, HEIGHT, fill=False,
                               theta1=theta[0], theta2=theta[1],
                               color=SEP_COLOR, linewidth=SEP_INOUT_WIDTH)
    ax.add_patch(arc_sep_inout)
    # draw the separators between two persons
    theta = seps[name1]["person"]
    arc_sep_person = patches.Arc(CENTER, WIDTH, HEIGHT, fill=False,
                               theta1=theta[0], theta2=theta[1],
                               color=SEP_COLOR, linewidth=SEP_PERSON_WIDTH)
    ax.add_patch(arc_sep_person)
    
    for name2 in NAMES:
        for reac in REACTIONS:
            # draw the patch for reactions by name1
            theta = arcs[name1]["out"][reac][name2]
            arc_out = patches.Arc(CENTER, WIDTH, HEIGHT, fill=False,
                                 theta1=theta[0], theta2=theta[1],
                                 color=emoji_color[reac], linewidth=LINEWIDTH)
            ax.add_patch(arc_out)
            
            # draw the patch for reactions on name1
            theta = arcs[name1]["in"][reac][name2]
            arc_in = patches.Arc(CENTER, WIDTH, HEIGHT, fill=False,
                                 theta1=theta[0], theta2=theta[1],
                                color=emoji_color[reac], linewidth=LINEWIDTH)
            ax.add_patch(arc_in)
            
            # keep one patch per reaction type for the legend
            if save_handles:
                handles[reac] = arc_in
        # end for over reac
        # after the first pass over REACTIONS every type has a handle
        save_handles = False
    # end for over name2
    
    # draw the arcs signifying whether it is reactions by or reactions on,
    # on a circle that is a factor FACTOR larger
    theta = total_arcs[name1]["on"]
    arc_on = patches.Arc(CENTER, WIDTH*FACTOR, HEIGHT*FACTOR, fill=False,
                        theta1 = theta[0], theta2=theta[1],
                        color= COLOR_ON, linewidth=LINEWIDTH/2)
    theta = total_arcs[name1]["by"]
    arc_by = patches.Arc(CENTER, WIDTH*FACTOR, HEIGHT*FACTOR, fill=False,
                        theta1 = theta[0], theta2=theta[1],
                        color= COLOR_BY, linewidth=LINEWIDTH/2)
    ax.add_patch(arc_on)
    ax.add_patch(arc_by)
    
# end for over name1

# the arcs of the last person serve as legend handles for by/on
handles["Reactions by person"] = arc_by
handles["Reactions on person"] = arc_on

# draw the lines displaying the reactions: one arrow per combination of
# (person reacting, reaction type, person reacted on)
for name1 in NAMES1:
    for reac in REACTIONS:
        for name2 in NAMES:
            # the start arc (outgoing at name1) and end arc (incoming at name2)
            arc0 = arcs[name1]["out"][reac][name2]
            arc1 = arcs[name2]["in"][reac][name1]
            # the start and end angle: the middle of each arc, in radians
            theta0 = (arc0[0]+arc0[1])/2 * np.pi/180
            theta1 = (arc1[0]+arc1[1])/2 * np.pi/180
            # the starting and end point; ARROW_FACTOR is the offset
            # to compensate for the finite width
            # of the circle representing reactions
            START = (np.cos(theta0)*ARROW_FACTOR, np.sin(theta0)*ARROW_FACTOR)
            FINNISH = (np.cos(theta1)*ARROW_FACTOR, np.sin(theta1)*ARROW_FACTOR)
            
            # the weight of the link: the amount of reactions it represents
            i1 = NAMES_INDEX[name1]
            i2 = NAMES_INDEX[name2]
            i3 = REACTION_INDEX[reac]
            weight = reacs[i1,i2,i3]
            
            # define the style for the arrows, all sizes scale with the weight
            arrowstyle = patches.ArrowStyle("Simple",
                        tail_width=CURVE_WIDTH*weight,
                        head_width=CURVE_WIDTH*weight*1.8,
                        head_length=CURVE_WIDTH*weight*1.5)
            # filled arrows get no outline, empty arrows a thin one
            if FILLED_ARROWS:
                lw=0
            else:
                lw=1
            # the Bezier curve connecting the points, with CENTER as
            # control point
            curve = patches.FancyArrowPatch(path=Path([START, CENTER, FINNISH],
                        [Path.MOVETO, Path.CURVE3, Path.CURVE3]),
                        color=emoji_color[reac], fill=FILLED_ARROWS, alpha=ALPHA,
                        arrowstyle=arrowstyle, lw=lw)
            ax.add_patch(curve)
            # tail_width=CURVE_WIDTH*weight
            
        # end for over name2
    # end for over reac
# end for over name1

# position in DISPLAY_NAMES of the label for the current person
i = 0
# draw the labels
# NOTE: DISPLAY_NAMES needs at least as many entries as NAMES
for name in NAMES:  
    # generate the curve to plot the text on: the total arc of this
    # person, traversed from its end angle to its begin angle
    # text direction is the same as the direction of this parametric curve!
    T = np.linspace(total_arcs[name]["all"][1] * np.pi/180,
                    total_arcs[name]["all"][0] * np.pi/180, 1000) 
    # the text is placed on a circle with radius 1.1, outside the ring
    (x, y) = (1.1*np.cos(T), 1.1*np.sin(T))
    text = CurvedText(x=x, y=y, text=DISPLAY_NAMES[i], color=FMT["text_color"], size=15,
                     verticalalignment="center", horizontalalignment="center",
                     axes=ax)
    i += 1
            
legend = plt.legend(handles.values(), handles.keys(), loc="upper left", 
           facecolor=FMT["legend_background"])
# give the legend texts the color of the chosen color scheme
for text in legend.get_texts():
    text.set_color(FMT["text_color"])

plt.title("Reaction diagram messenger chat", color=FMT["text_color"], size=30)
    
plt.show()

# NOTE: unlike the other plots this one is always saved (SAVE_PLOTS is not
# checked) and always as pdf (EXTENSION is not used)
fig.savefig(SUBFOLDER + TITLE + ".pdf", dpi=DPI, facecolor=FMT["face_color"],
           bbox_inches="tight")

## Analysis \<type\> sent through time

Create a dataframe to keep track of the messages sent in each interval

In [None]:
# the time interval used to group lines
interval = pd.Timedelta("1 days")

# whether the plot of messages through time shows the messages of
# one person (PERSON_NAME) or of everyone
ONE_PERSON = False
# the person to plot when ONE_PERSON is True; must be one of NAMES
PERSON_NAME = ""

In [None]:
# this tests if a line should be counted towards
# <type> and for which person it should be counted
def condition(line):
    """Decide whether a line is counted and for whom.

    Parameters
    ----------
    line : object with the attributes `type` and `sender_name`
        One line of the dataframe.

    Returns
    -------
    tuple
        (True, name of the sender) for messages of type "Generic",
        (False, None) for everything else.
    """
    # this just counts generic messages
    is_generic = line.type == "Generic"
    return (True, line.sender_name) if is_generic else (False, None)

# determine the start and end dates for the frame, rounded to whole
# intervals and limited to the range BEGIN - END
start = pd.to_datetime(message_frame.index.values[0])
end = pd.to_datetime(message_frame.index.values[-1])
start = max(pd.to_datetime(BEGIN),
            start.floor(interval))
end = min(pd.to_datetime(END)+interval, 
          end.floor(interval)+interval)

# create the index: the begin of every interval (and the end of the last one)
index = pd.date_range(start, end, freq=interval)
# create the dataframe
count_frame = pd.DataFrame(0, columns=NAMES + ["total"], index=index)

# count the lines that meet condition, per interval and per person.
# interval k runs from index[k] (included) to index[k+1] (excluded), so a
# message that falls exactly on a boundary is counted once instead of twice
if len(index) > 1:
    stamps = message_frame.index
    in_range = (stamps >= index[0]) & (stamps < index[-1])
    for stamp, line in message_frame.loc[in_range].iterrows():
        result = condition(line)
        if result[0]:
            # the begin of the interval this message falls in
            bin_start = index[(stamp - index[0]) // interval]
            # row and column are given in a single .loc call; the chained
            # form .loc[row][column] may update a temporary copy instead
            # of the frame itself
            count_frame.loc[bin_start, "total"] += 1
            count_frame.loc[bin_start, result[1]] += 1

In [None]:
# kernel density estimate for the bar plot
def KDE(X, counts, bandwidth):
    """Evaluate a sum of Gaussians, one per bar, at the points X.

    Parameters
    ----------
    X : numpy.ndarray
        The points at which the curve is evaluated.
    counts : iterable of numbers
        The height of each bar; bar number k is centred at k + 0.5.
    bandwidth : float
        The standard deviation of every Gaussian.

    Returns
    -------
    numpy.ndarray
        Array shaped like X: the sum over all bars of the bar height times
        the normal density around the centre of the bar.
    """
    density = np.zeros_like(X)
    for position, height in enumerate(counts):
        # the centre of bar number `position`
        centre = position + 0.5
        density += height*nst.norm.pdf(X, loc=centre, scale=bandwidth)

    return density

In [None]:
fig, ax = plt.subplots(1,1,figsize=(24,8))

# the counts per interval, for one person or for everyone
if ONE_PERSON:
    y_data = np.array(count_frame[PERSON_NAME].values, np.float64)
else:
    y_data = np.array(count_frame["total"].values, np.float64)

# the number of points in which the smooth curve is evaluated
KDE_points = 2000
# the positions in units of bars (X) and the dates at the same positions (D)
X = np.linspace(0.5,len(y_data)-0.5,KDE_points)
D = pd.date_range(count_frame.index[0], count_frame.index[-1], KDE_points)
# smooth curve with a bandwidth of 3 bars
Y = KDE(X, y_data, 3)
# the baseline for filling the area under the curve
Y0 = np.zeros_like(X)

ax.bar(count_frame.index, y_data, width=1, color="xkcd:royal blue")
ax.plot(D, Y, color='xkcd:vermillion', linewidth=3)
ax.fill_between(D, Y0, Y, color='xkcd:vermillion', alpha=0.3)
ax.grid(True)

# titles (Dutch): "Number of messages sent per day in <chat> by <person>"
# and "... by everyone"
if ONE_PERSON:
    plt.title("Aantal versuurde berichten per dag in %s door %s" %(TITLE, PERSON_NAME))
else:
    plt.title("Aantal versuurde berichten per dag in %s door iedereen" %TITLE)

plt.show()

# NOTE: SUBFOLDER already ends with a slash, so this path contains "//"
if SAVE_PLOTS:
    fig.savefig(SUBFOLDER + "/berichten per dag bar" + EXTENSION, dpi=DPI)

**Compare the activity of 2 people**

In [None]:
# the persons to make the plot for and the colors for each person
# (the list of names is called NAMES; `Names` does not exist)
PERSONS = NAMES[0:4]
# NOTE(review): this name hides the module matplotlib.colors that is
# imported as `colors` at the top of the notebook
colors = ["xkcd:vermillion", "xkcd:royal blue", "xkcd:lime", "xkcd:sunflower"]
# the number of points in which the smooth curves are evaluated
KDE_points = 2000
# whether to draw the bars in addition to the smooth curves
PLOT_BARS = False

# generate the figure and axis
fig, ax = plt.subplots(1,1,figsize=(24,8))

# get the data to plot
y_data = [np.array(count_frame[name].values, np.float64) for name in PERSONS]

# arrays for kernel density estimate
X = np.linspace(0.5,len(y_data[0])-0.5,KDE_points)
D = pd.date_range(count_frame.index[0], count_frame.index[-1], KDE_points)
Y0 = np.zeros_like(X)

# make the barplots
# get for how many people the plot is made:
num_persons = len(PERSONS)
# calculate the width of each bar, the total should be 1
width = 1/num_persons
# calculate the offset of the bars.
offset = -0.5 + width/2
# the x-axis of this plot counts intervals, the dates are only used as labels
index = np.arange(len(count_frame))
# make the bar plots
for i in range(len(PERSONS)):
    if PLOT_BARS:
        # NOTE(review): the offset is subtracted here, which shifts the bars
        # to the right of their index - confirm that this is intended
        ax.bar(index - offset + i*width, y_data[i], width, color=colors[i])
    # calculate the kernel density estimate
    Y = KDE(X, y_data[i], 3)
    # plot the KDE
    ax.plot(X, Y, color=colors[i], linewidth=3, label=PERSONS[i])
    ax.fill_between(X, Y0, Y, color=colors[i], alpha=0.3)

# positions of xticks and corresponding labels
# 15 intervals between the labelled (major) ticks, 45 between the minor ticks
major_indexes = [math.floor(index[-1]*i/15) for i in range(16)]
minor_indexes = [math.floor(index[-1]*i/45) for i in range(46)]
major_ticks = [index[i] for i in major_indexes]
labels = [count_frame.index[i] for i in major_indexes]
minor_ticks = [index[i] for i in minor_indexes]
    
# lay-out
ax.set_xticks(major_ticks)
# `minor` has to be passed by keyword: the second positional argument of
# set_xticks is the list of labels in current versions of matplotlib
ax.set_xticks(minor_ticks, minor=True)
ax.set_xticklabels(labels)
plt.xticks(rotation=75)
plt.legend()
# title (Dutch): "Messages per day, per person in <chat title>"
plt.title("Berichten per dag, per persoon in %s" %TITLE)
ax.grid(True)


if SAVE_PLOTS:
    fig.savefig(SUBFOLDER + "/berichten per dag per persoon" + EXTENSION, dpi=DPI)