In [3]:
import re
import os
import pandas as pd
import matplotlib.pyplot as plt
   #For the time graph
import matplotlib.dates as mdates
import datetime as dt
from numpy import cumsum

pd.set_option('display.max_rows', None)


def startsWithDateTime(s):
   pattern = '^(\d+/\d+/\d+, \d+:\d+\d+ [A-Z]*) -'
   result = re.match(pattern, s)
   if result:
      return True
   return False

    
def startsWithAuthor(s):
   patterns = [
        'Louisa \(HSK\):',
        '🧀 🧀 🧀:',
        'Kira Arlt \(HSK\):',
        'Tiziana \(Couchsurf\):',
        'Mr. S:',
        'G-dizzle:',
        'Good Ol\' Kyle:',
        '([\w]+):',                        # First Name
        '([\w]+[\s]+[\w]+):',              # First Name + Last Name
        '([\w]+[\s]+[\w]+[\s]+[\w]+):',    # First Name + Middle Name + Last Name
        '([+]\d{2} \d{5} \d{5}):',         # Mobile Number (India)
        '([+]\d{2} \d{3} \d{3} \d{4}):',   # Mobile Number (US)
        '([+]\d{2} \d{4} \d{7})'           # Mobile Number (Europe)
   ] 
   pattern = '^' + '|'.join(patterns)
   result = re.match(pattern, s)
   if result:
      return True
   return False
   

my_name = 'Miles Keating'
PATH = '/home/miles/pydir/socialLogs' 
directory = os.listdir(PATH)
   
def getDataPoint(line):
   # line = 18/06/17, 22:47 - Loki: Why do you have 2 numbers, Banner?
    
   splitLine = line.split(' - ') # splitLine = ['18/06/17, 22:47', 'Loki: Why do you have 2 numbers, Banner?']
    
   dateTime = splitLine[0] # dateTime = '18/06/17, 22:47'
    
   date, time = dateTime.split(', ') # date = '18/06/17'; time = '22:47'
    
   message = ' '.join(splitLine[1:]) # message = 'Loki: Why do you have 2 numbers, Banner?'
    
   if startsWithAuthor(message): # True
      splitMessage = message.split(': ') # splitMessage = ['Loki', 'Why do you have 2 numbers, Banner?']
      author = splitMessage[0] # author = 'Loki'
      message = ' '.join(splitMessage[1:]) # message = 'Why do you have 2 numbers, Banner?'
   else:
      author = None
   return date, time, author, message
    
parsedData = [] # List to keep track of data so it can be used by a Pandas dataframe

for file in directory:
    with open(PATH +'/'+ file, encoding="utf-8") as fp:
        fp.readline() # Skipping first line of the file (usually contains information about end-to-end encryption)

        messageBuffer = [] # Buffer to capture intermediate output for multi-line messages
        date, time, author = None, None, None # Intermediate variables to keep track of the current message being processed
        conversation = file[19:len(file)-4]
        while True:
          line = fp.readline() 
          if not line: # Stop reading further if end of file has been reached
             break    
          line = line.strip() # Guarding against erroneous leading and trailing whitespaces
          if startsWithDateTime(line): # If a line starts with a Date Time pattern, then this indicates the beginning of a new message
             if len(messageBuffer) > 0: # Check if the message buffer contains characters from previous iterations
                parsedData.append([date, time, author, ' '.join(messageBuffer), conversation]) # Save the tokens from the previous message in parsedData
             messageBuffer.clear() # Clear the message buffer so that it can be used for the next message
             date, time, author, message = getDataPoint(line) # Identify and extract tokens from the line
             messageBuffer.append(message) # Append message to buffer
          else:
             messageBuffer.append(line) # If a line doesn't start with a Date Time pattern, then it is part of a multi-line message. So, just append to buffer
         

df = pd.DataFrame(parsedData, columns=['Date', 'Time', 'Author', 'Message', 'Conversation'])

authors = df['Author'].unique()
conversations = df['Conversation'].unique()

df = df.drop(df[df['Author'].isnull()].index)
df = df.drop(df[df['Message'] == '<Media omitted>'].index)
df['Word Count'] = df['Message'].apply(lambda s : len(s.split(' ')))

df['Datetime'] = df['Date'] +' '+ df['Time']
df['Datetime'] = pd.to_datetime(df['Datetime'])
df['Date'] = pd.to_datetime(df['Date'])
df['Time'] = pd.to_datetime(df['Time']).dt.time
cols = df.columns.tolist()
cols = cols[-1:]+cols[:-1]
df = df[cols]
df = df.sort_values(['Datetime'])

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~Adding new columns
copy = df.copy()
copy['Running Message Count'] = 1
copy['Running Word Count'] = 1
copy['Running WPM'] =1

copy['Sent Messages'] = 1
copy['Received Messages'] = 1
copy.loc[copy.Author == my_name, 'Received Messages'] = 0
copy.loc[copy.Author != my_name, 'Sent Messages'] = 0


copy['Sent Words'] = copy['Word Count']
copy['Received Words'] = copy['Word Count']
copy.loc[copy.Author == my_name, 'Received Words'] = 0
copy.loc[copy.Author != my_name, 'Sent Words'] = 0


copy['Sent WPM'] =1
copy['Received WPM'] =1
copy['Sent vs Received Messages']=1
copy['Sent vs Received Words']=1
copy['Global Messages']= copy['Running Message Count'].cumsum()

temp2 = pd.DataFrame()



for key in conversations:
    temp1 = copy[copy.Conversation == key].copy()
    temp1['Running Message Count'] = temp1['Running Message Count'].cumsum()
    
    temp1['Running Word Count'] = temp1['Word Count'].cumsum()
    
    temp1['Running WPM'] = round(temp1['Running Word Count'] / temp1['Running Message Count'], 2)
    
    temp1['Received Messages'] = temp1['Received Messages'].cumsum()
    temp1['Sent Messages'] = temp1['Sent Messages'].cumsum()
    
    temp1['Received Words'] = temp1['Received Words'].cumsum()
    temp1['Sent Words'] = temp1['Sent Words'].cumsum()
    
    temp1['Received WPM'] = temp1['Received Words'] / temp1['Received Messages']
    temp1['Sent WPM'] = temp1['Sent Words'] / temp1['Sent Messages']
    
    temp1['Sent vs Received Messages'] = temp1['Sent Messages']/temp1['Received Messages']
    temp1['Sent vs Received Words'] = temp1['Sent Words']/temp1['Received Words']
    temp2 = temp2.append(temp1)
    
copy = temp2
df=copy


pm_df = df[df.Conversation.isin(authors)]
group_df = df.drop(df[df.Conversation.isin(authors)].index)


    
'''

metric = pd.DataFrame(columns=['Datetime'])
metric['Datetime'] = pd.date_range(copy.sort_values(['Datetime']).Datetime.iloc[0], 
                                  copy.sort_values(['Datetime']).Datetime.iloc[-1], freq='min')
metric = copy.merge(metric,left_on=['Datetime'], right_on=['Datetime'], how='right')
metric = metric.sort_values(['Datetime'])
    '''
print('done')

'''
THINGS TO FIX
    ~~~~~SEPERATE INSTANTANEOUS SENT / RECEIVED WORDS VS. RUNNING VS. GLOBAL
    
'''



done


'\nTHINGS TO FIX\n    ~~~~~SEPERATE INSTANTANEOUS SENT / RECEIVED WORDS VS. RUNNING VS. GLOBAL\n    \n'

In [190]:
option = radio_button_group
option

TypeError: 'RadioButtonGroup' object does not support indexing

In [8]:
import pandas as pd
import numpy as np

from bokeh.io import show, output_file, curdoc
from bokeh.plotting import figure

from bokeh.models.widgets import RadioButtonGroup, Button
from bokeh.models import CustomJS
from bokeh.palettes import turbo

from bokeh.layouts import column

output_file('FirstSim.html')



global pm_df


def make_plot(option):
    p = figure(plot_width=1500, plot_height=1200, x_axis_type='datetime')
    
    author_list = list(pm_df.Author.unique())
    author_list.remove('Miles Keating')
    colors = turbo(len(author_list))

    for i, author in enumerate(author_list):
        subset = pm_df[pm_df.Author == author]
        r= p.line(subset.Datetime, subset[option], line_width=2, color=colors[i], alpha=0.8, legend_label=author)
        r.visible = False
    
    return p
    
def style(p):
    p.title.text = 'Dumb Graph'
    
    p.legend.location = 'top_left'
    p.legend.click_policy='hide'
    return p

def update(attr, old, new):
    
    print(new)
    

radio_button_group = RadioButtonGroup(labels=['Received Messages', 'Sent Messages'], active=0) #Active states which label is active by default


option = radio_button_group.labels[radio_button_group.active]
    
p = style(make_plot(option))


radio_button_group.on_change('active', update)
show(column(radio_button_group, p))

You are generating standalone HTML/JS output, but trying to use real Python
callbacks (i.e. with on_change or on_event). This combination cannot work.

Only JavaScript callbacks may be used with standalone output. For more
information on JavaScript callbacks with Bokeh, see:

    https://docs.bokeh.org/en/latest/docs/user_guide/interaction/callbacks.html

Alternatively, to use real Python callbacks, a Bokeh server application may
be used. For more information on building and running Bokeh applications, see:

    https://docs.bokeh.org/en/latest/docs/user_guide/server.html



In [239]:
p = style(make_plot(option))


In [235]:
button = Button(label='Sent Messages', button_type='success')
show(button)

AttributeError: 'tuple' object has no attribute 'references'

In [203]:
option.labels[option.active]

'Received Messages'

In [1]:
from bokeh.io import output_file, show
from bokeh.models import CheckboxButtonGroup

output_file("checkbox_button_group.html")

checkbox_button_group = CheckboxButtonGroup(
        labels=["Option 1", "Option 2", "Option 3"], active=[0, 1])

show(checkbox_button_group)

In [206]:
# STILL BUGGY TRY KEY LENA ULRICH AND CHECK DATETIME DIFFERENCEES
pm_dict = {}

for key in pm_df.Conversation.unique():
    pm_dict.update({key : pm_df[['Datetime', 'Author', 'Conversation']][pm_df.Conversation == key]})
    temp_df = pm_dict[key].loc[pm_dict[key].Author != pm_dict[key].Author.shift(1)].copy()
    temp_df['DeltaT Minutes'] = temp_df.Datetime - temp_df.Datetime.shift(1)
    pm_dict[key]['DeltaT Minutes']= '0'# days 00:00:00'
    pm_dict[key] = temp_df.append(pm_dict[key]) # I assumed this would be the fastest sorting method
    pm_dict[key].sort_index(inplace=True)
    pm_dict[key]=pm_dict[key].loc[~pm_dict[key].index.duplicated(keep='first')]
    



In [178]:
#CONVERTING DATETIMES TO MINUTES
days = (hist_df.Datetime.iloc[1]-hist_df.Datetime.iloc[0]).days * 24 * 60
seconds = (hist_df.Datetime.iloc[1]-hist_df.Datetime.iloc[0])
days + seconds
my_chat.head()

Unnamed: 0,Datetime,Author
65285,2017-07-07 12:28:00,Miles Keating
65290,2017-07-11 08:01:00,Kenny
65292,2017-07-11 08:01:00,Miles Keating
65293,2017-07-11 08:01:00,Kenny
65294,2017-07-11 08:02:00,Miles Keating
