In [1]:
import re
import os
import pandas as pd
import matplotlib.pyplot as plt
   #For the time graph
import matplotlib.dates as mdates
import datetime as dt
from numpy import cumsum

pd.set_option('display.max_rows', None)


def startsWithDateTime(s):
   pattern = '^(\d+/\d+/\d+, \d+:\d+\d+ [A-Z]*) -'
   result = re.match(pattern, s)
   if result:
      return True
   return False

    
def startsWithAuthor(s):
   patterns = [
        'Louisa \(HSK\):',
        '🧀 🧀 🧀:',
        'Kira Arlt \(HSK\):',
        'Tiziana \(Couchsurf\):',
        'Mr. S:',
        'G-dizzle:',
        'Good Ol\' Kyle:',
        '([\w]+):',                        # First Name
        '([\w]+[\s]+[\w]+):',              # First Name + Last Name
        '([\w]+[\s]+[\w]+[\s]+[\w]+):',    # First Name + Middle Name + Last Name
        '([+]\d{2} \d{5} \d{5}):',         # Mobile Number (India)
        '([+]\d{2} \d{3} \d{3} \d{4}):',   # Mobile Number (US)
        '([+]\d{2} \d{4} \d{7})'           # Mobile Number (Europe)
   ] 
   pattern = '^' + '|'.join(patterns)
   result = re.match(pattern, s)
   if result:
      return True
   return False
   

my_name = 'Miles Keating'
PATH = '/home/miles/pydir/socialLogs' 
directory = os.listdir(PATH)
   
def getDataPoint(line):
   # line = 18/06/17, 22:47 - Loki: Why do you have 2 numbers, Banner?
    
   splitLine = line.split(' - ') # splitLine = ['18/06/17, 22:47', 'Loki: Why do you have 2 numbers, Banner?']
    
   dateTime = splitLine[0] # dateTime = '18/06/17, 22:47'
    
   date, time = dateTime.split(', ') # date = '18/06/17'; time = '22:47'
    
   message = ' '.join(splitLine[1:]) # message = 'Loki: Why do you have 2 numbers, Banner?'
    
   if startsWithAuthor(message): # True
      splitMessage = message.split(': ') # splitMessage = ['Loki', 'Why do you have 2 numbers, Banner?']
      author = splitMessage[0] # author = 'Loki'
      message = ' '.join(splitMessage[1:]) # message = 'Why do you have 2 numbers, Banner?'
   else:
      author = None
   return date, time, author, message
    
parsedData = [] # List to keep track of data so it can be used by a Pandas dataframe

for file in directory:
    with open(PATH +'/'+ file, encoding="utf-8") as fp:
        fp.readline() # Skipping first line of the file (usually contains information about end-to-end encryption)

        messageBuffer = [] # Buffer to capture intermediate output for multi-line messages
        date, time, author = None, None, None # Intermediate variables to keep track of the current message being processed
        conversation = file[19:len(file)-4]
        while True:
          line = fp.readline() 
          if not line: # Stop reading further if end of file has been reached
             break    
          line = line.strip() # Guarding against erroneous leading and trailing whitespaces
          if startsWithDateTime(line): # If a line starts with a Date Time pattern, then this indicates the beginning of a new message
             if len(messageBuffer) > 0: # Check if the message buffer contains characters from previous iterations
                parsedData.append([date, time, author, ' '.join(messageBuffer), conversation]) # Save the tokens from the previous message in parsedData
             messageBuffer.clear() # Clear the message buffer so that it can be used for the next message
             date, time, author, message = getDataPoint(line) # Identify and extract tokens from the line
             messageBuffer.append(message) # Append message to buffer
          else:
             messageBuffer.append(line) # If a line doesn't start with a Date Time pattern, then it is part of a multi-line message. So, just append to buffer
         

df = pd.DataFrame(parsedData, columns=['Date', 'Time', 'Author', 'Message', 'Conversation'])

authors = df['Author'].unique()
conversations = df['Conversation'].unique()

df = df.drop(df[df['Author'].isnull()].index)
df['Word Count'] = df['Message'].apply(lambda s : len(s.split(' ')))

df['Datetime'] = df['Date'] +' '+ df['Time']
df['Datetime'] = pd.to_datetime(df['Datetime'])

df['Sent Messages'] = 1
df['Sent Words'] = df['Word Count']
df['WPM'] =0

new_df = pd.DataFrame()

for group in conversations:
    subset_group = df[df.Conversation == group].copy()
    authors = list(subset_group.Author.unique())
    
    for key in authors:
        subset = subset_group[subset_group.Author == key].copy()
        
        subset['Sent Messages'] = subset['Sent Messages'].cumsum()
        subset['Sent Words'] = subset['Sent Words'].cumsum()
        
        new_df = new_df.append(subset)

df = new_df.copy()
df['WPM'] = df['Sent Words'] / df['Sent Messages']

print('done')


done


In [9]:
debug =df[df.Conversation == 'Gießen Home']
debug = debug.sort_values(['Datetime','Sent Messages'], ascending=[True,True])

In [11]:
debug.tail(50)

Unnamed: 0,Date,Time,Author,Message,Conversation,Word Count,Datetime,Sent Messages,Sent Words,WPM
6977,2/25/20,6:14 AM,Borat,never!,Gießen Home,1,2020-02-25 06:14:00,492,2227,4.526423
6978,2/25/20,7:49 AM,Mr. S,Hahahahaha legend,Gießen Home,2,2020-02-25 07:49:00,263,1130,4.296578
6979,2/25/20,9:49 PM,Kyle Porter,<Media omitted>,Gießen Home,2,2020-02-25 21:49:00,24,101,4.208333
6980,3/3/20,9:51 AM,Miles Keating,<Media omitted>,Gießen Home,2,2020-03-03 09:51:00,184,618,3.358696
6981,3/3/20,9:51 AM,Miles Keating,<Media omitted>,Gießen Home,2,2020-03-03 09:51:00,185,620,3.351351
6982,3/3/20,9:58 AM,Toby,@4915214176529 @14255020834 come upstairs,Gießen Home,5,2020-03-03 09:58:00,161,816,5.068323
6983,3/3/20,10:00 AM,Borat,😂😂😂😂😂😂❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤,Gießen Home,1,2020-03-03 10:00:00,493,2228,4.51927
6984,3/3/20,1:01 PM,Clara Mi Amor,😂😘😘😘,Gießen Home,1,2020-03-03 13:01:00,108,380,3.518519
6985,3/3/20,5:35 PM,Kyle Porter,Amazing 😍,Gießen Home,2,2020-03-03 17:35:00,25,103,4.12
6986,3/3/20,9:16 PM,LittleItaly,"Unfriending you, Miles 😒",Gießen Home,4,2020-03-03 21:16:00,102,514,5.039216


In [24]:
'''
This will add the columns required for the intra-conversation tab

The columns we want are:
All authors sent messages (by author)
All authors sent Words
All authors cumulative Words per Messages

Emoji usage
Possible analyze media sent




'''




df['Sent Messages'] = 1
df['Sent Words'] = df['Word Count']

new_df = pd.DataFrame()

for group in conversations:
    subset_group = df[df.Conversation == group].copy()
    authors = list(subset_group.Author.unique())
    
    for key in authors:
        subset = subset_group[subset_group.Author == key].copy()
        
        subset['Sent Messages'] = subset['Sent Messages'].cumsum()
        subset['Sent Words'] = subset['Sent Words'].cumsum()
        
        new_df = new_df.append(subset)

df = new_df
df['WPM'] = df['Sent Words'] / df['Sent Messages']
        
df = df.sort_values(['Datetime'])



In [None]:
# Working with Moving Averages ( need full data so it gets its own loop)

#Will have to make moving averages specific to each author within a conversation and build a unique dataframe
#custom for each author. Like everything else, this can be pre-built and saved so that data reference is easy.
#Actually the bokeh plots might also be able to be pre-built and then saved, so that on start up there are no
#calculations what-so-ever and thereby making the program super fast.

SMA_df



In [None]:

category = 'Running Word Count'
vals = pm_df.loc[pm_df.Author == 'Bilo']

vals = vals[['Datetime', category]]
vals.set_index('Datetime',inplace=True)

temp = vals.copy()

vals= vals.resample('7D').first()
vals.fillna(value=0,inplace=True)

vals = vals.rolling(window=4).mean()

vals['difference'] = vals[~vals['Running Word Count'].isnull()].diff()


vals.fillna(value=0,inplace=True)

vals = vals.rolling(window=4).mean()

vals.fillna(value=0,inplace=True)

from scipy.signal import savgol_filter
vals.fillna(value=0,inplace=True)

vals[category] = savgol_filter(vals[category], 11, 3)
vals.fillna(value=0,inplace=True)

In [27]:
df = df.sort_values(['Datetime'])

df.head(100)

Unnamed: 0,Datetime,Date,Time,Author,Message,Conversation,Word Count,Sent Messages,Sent Words,WPM
1129,2018-09-28 03:37:00,2018-09-28,03:37:00,Miles Keating,"Hallo Julia, bist du gestern sicher heim gegan...",Neighbor,8,1,8,8.0
1130,2018-09-28 14:28:00,2018-09-28,14:28:00,Neighbor,"Ja, bin ich. Du auch?",Neighbor,5,1,5,5.0
1131,2018-09-28 14:31:00,2018-09-28,14:31:00,Miles Keating,Ja 😊,Neighbor,2,2,10,5.0
1132,2018-09-28 14:49:00,2018-09-28,14:49:00,Neighbor,👍🏽👍🏽,Neighbor,1,2,6,3.0
1133,2018-11-01 12:00:00,2018-11-01,12:00:00,Miles Keating,"Hallo Julia, machen wir Kino Abend am Sonntag?",Neighbor,8,3,18,6.0
1134,2018-11-01 13:45:00,2018-11-01,13:45:00,Neighbor,Am Sonntag bin ich nicht da 😔 Nächste Woche So...,Neighbor,12,3,18,6.0
1135,2018-11-01 14:01:00,2018-11-01,14:01:00,Miles Keating,Nächste Woche kann ich und Ben leider nicht,Neighbor,8,4,26,6.5
1136,2018-11-01 14:01:00,2018-11-01,14:01:00,Miles Keating,Vielleicht könne wir ein andere Tag finden,Neighbor,7,5,33,6.6
1137,2018-11-02 05:25:00,2018-11-02,05:25:00,Miles Keating,Hast du Zeit am Montag nächste Woche?,Neighbor,7,6,40,6.666667
1138,2018-11-02 10:24:00,2018-11-02,10:24:00,Neighbor,Da kann ich leider nicht 😟 Vielleicht an einem...,Neighbor,12,4,30,7.5


In [None]:

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~Adding new columns
copy = df.copy()
copy['Running Message Count'] = 1
copy['Running Word Count'] = 1
copy['Running WPM'] =1

copy['Sent Messages'] = 1
copy['Received Messages'] = 1
copy.loc[copy.Author == my_name, 'Received Messages'] = 0
copy.loc[copy.Author != my_name, 'Sent Messages'] = 0


copy['Sent Words'] = copy['Word Count']
copy['Received Words'] = copy['Word Count']
copy.loc[copy.Author == my_name, 'Received Words'] = 0
copy.loc[copy.Author != my_name, 'Sent Words'] = 0


copy['Sent WPM'] =1
copy['Received WPM'] =1
copy['Sent vs Received Messages']=1
copy['Sent vs Received Words']=1
copy['Global Messages']= copy['Running Message Count'].cumsum()

copy['SMA Word Count'] = 0
copy['SMA Sent vs Received Messages'] = 0
copy['SMA Sent vs Received Words'] = 0


temp2 = pd.DataFrame()



for key in conversations:
    temp1 = copy[copy.Conversation == key].copy()
    temp1['Running Message Count'] = temp1['Running Message Count'].cumsum()
    
    temp1['Running Word Count'] = temp1['Word Count'].cumsum()
    
    temp1['Running WPM'] = round(temp1['Running Word Count'] / temp1['Running Message Count'], 2)
    
    temp1['Received Messages'] = temp1['Received Messages'].cumsum()
    temp1['Sent Messages'] = temp1['Sent Messages'].cumsum()
    
    temp1['Received Words'] = temp1['Received Words'].cumsum()
    temp1['Sent Words'] = temp1['Sent Words'].cumsum()
    
    temp1['Received WPM'] = temp1['Received Words'] / temp1['Received Messages']
    temp1['Sent WPM'] = temp1['Sent Words'] / temp1['Sent Messages']
    
    temp1['Sent vs Received Messages'] = temp1['Sent Messages']/temp1['Received Messages']
    temp1['Sent vs Received Words'] = temp1['Sent Words']/temp1['Received Words']
    
    temp1['SMA Word Count'] = temp1['Word Count'].rolling(window=20).mean()
    temp1['SMA Sent vs Received Messages'] = temp1['Sent vs Received Messages'].rolling(window=20).mean()
    temp1['SMA Sent vs Received Words'] = temp1['Sent vs Received Words'].rolling(window=20).mean()
    
    temp2 = temp2.append(temp1)
    
copy = temp2
df=copy


pm_df = df[df.Conversation.isin(authors)]
group_df = df.drop(df[df.Conversation.isin(authors)].index)


    
'''

metric = pd.DataFrame(columns=['Datetime'])
metric['Datetime'] = pd.date_range(copy.sort_values(['Datetime']).Datetime.iloc[0], 
                                  copy.sort_values(['Datetime']).Datetime.iloc[-1], freq='min')
metric = copy.merge(metric,left_on=['Datetime'], right_on=['Datetime'], how='right')
metric = metric.sort_values(['Datetime'])
    '''
print('done')

'''
THINGS TO FIX
    ~~~~~SEPERATE INSTANTANEOUS SENT / RECEIVED WORDS VS. RUNNING VS. GLOBAL
    
'''



In [6]:
pm_df.head(50)

Unnamed: 0,Datetime,Date,Time,Author,Message,Conversation,Word Count,Running Message Count,Running Word Count,Running WPM,...,Sent Words,Received Words,Sent WPM,Received WPM,Sent vs Received Messages,Sent vs Received Words,Global Messages,SMA Word Count,SMA Sent vs Received Messages,SMA Sent vs Received Words
0,2018-10-17 00:09:00,2018-10-17,00:09:00,Bilo,https://youtu.be/UmKMGlulUIg,Bilo,1,1,1,1.0,...,0,1,,1.0,0.0,0.0,2038,,,
1,2018-10-17 00:09:00,2018-10-17,00:09:00,Bilo,Lavinia,Bilo,1,2,2,1.0,...,0,2,,1.0,0.0,0.0,2039,,,
3,2018-10-17 02:27:00,2018-10-17,02:27:00,Miles Keating,He lyrics are so random😁,Bilo,5,3,7,2.33,...,5,2,5.0,1.0,0.5,2.5,2044,,,
2,2018-10-17 02:27:00,2018-10-17,02:27:00,Miles Keating,"Haha ""The ultimate goal is to be like water""😅",Bilo,9,4,16,4.0,...,14,2,7.0,1.0,1.0,7.0,2045,,,
4,2018-10-17 02:57:00,2018-10-17,02:57:00,Bilo,His third eye is so weird 😍,Bilo,7,5,23,4.6,...,14,9,7.0,3.0,0.666667,1.555556,2048,,,
5,2018-10-17 03:06:00,2018-10-17,03:06:00,Miles Keating,😁😁,Bilo,1,6,24,4.0,...,15,9,5.0,3.0,1.0,1.666667,2050,,,
6,2018-10-25 05:20:00,2018-10-25,05:20:00,Miles Keating,"Hi Lavinia, I locked myself out of our flat 😬😬...",Bilo,21,7,45,6.43,...,36,9,9.0,3.0,1.333333,4.0,2650,,,
7,2018-10-25 05:32:00,2018-10-25,05:32:00,Bilo,Ahahhahauauau,Bilo,1,8,46,5.75,...,36,10,9.0,2.5,1.0,3.6,2651,,,
8,2018-10-25 05:33:00,2018-10-25,05:33:00,Bilo,I hope to finish lessons at 15:30,Bilo,11,9,57,6.33,...,36,21,9.0,4.2,0.8,1.714286,2652,,,
9,2018-10-25 05:54:00,2018-10-25,05:54:00,Miles Keating,😠,Bilo,1,10,58,5.8,...,37,21,7.4,4.2,1.0,1.761905,2653,,,


In [206]:
# STILL BUGGY TRY KEY LENA ULRICH AND CHECK DATETIME DIFFERENCEES
pm_dict = {}

for key in pm_df.Conversation.unique():
    pm_dict.update({key : pm_df[['Datetime', 'Author', 'Conversation']][pm_df.Conversation == key]})
    temp_df = pm_dict[key].loc[pm_dict[key].Author != pm_dict[key].Author.shift(1)].copy()
    temp_df['DeltaT Minutes'] = temp_df.Datetime - temp_df.Datetime.shift(1)
    pm_dict[key]['DeltaT Minutes']= '0'# days 00:00:00'
    pm_dict[key] = temp_df.append(pm_dict[key]) # I assumed this would be the fastest sorting method
    pm_dict[key].sort_index(inplace=True)
    pm_dict[key]=pm_dict[key].loc[~pm_dict[key].index.duplicated(keep='first')]
    



In [1]:
#CONVERTING DATETIMES TO MINUTES
days = (hist_df.Datetime.iloc[1]-hist_df.Datetime.iloc[0]).days * 24 * 60
seconds = (hist_df.Datetime.iloc[1]-hist_df.Datetime.iloc[0])
days + seconds
print(seconds)

NameError: name 'hist_df' is not defined