|
| 1 | +from bs4 import BeautifulSoup as bsoup |
| 2 | +import pandas as pd |
| 3 | +import numpy as np |
| 4 | +import humanfriendly |
| 5 | + |
# Load the exported message data from the CSV file (header=0: the first row
# holds the column names).
df = pd.read_csv('../bodytext.csv', header=0)

# Drop sent mail: keep only rows whose FromEmail differs from the placeholder
# address. The result is copied so the columns added further down are written
# to an independent frame rather than to a slice of `df`.
emails = df.query('FromEmail != "[my email address]"').copy()
| 11 | + |
def wordCount(row):
    """Return the word count of the ``Body`` field of one email row.

    Rows whose ``Format`` field equals ``'Html'`` are counted with
    ``htmlWordCount``; every other row is counted with ``textWordCount``.
    """
    counter = htmlWordCount if row['Format'] == 'Html' else textWordCount
    return counter(row['Body'])
| 18 | + |
def textWordCount(text):
    """Return the number of whitespace-separated words in ``text``.

    Any value that is not a ``str`` (for example ``None`` or a float NaN)
    is counted as 0 words.
    """
    if isinstance(text, str):
        # split() without a separator splits on runs of whitespace and
        # never yields empty strings.
        return len(text.split())
    return 0
| 24 | + |
def htmlWordCount(text):
    """Return the number of words in the text content of an HTML body.

    The markup is parsed with BeautifulSoup's ``html.parser`` backend.
    ``style``, ``script``, ``head`` and ``title`` elements are removed from
    the tree so their contents are not counted, and the remaining text is
    counted with ``textWordCount``.

    Any value that is not a ``str`` is counted as 0 words.
    """
    if not isinstance(text, str):
        return 0

    soup = bsoup(text, 'html.parser')

    # Remove non-content elements before extracting the text. A plain loop is
    # used because extract() is called purely for its side effect on the tree.
    for tag in soup(['style', 'script', 'head', 'title']):
        tag.extract()

    # Join text fragments with single spaces, stripping each fragment, so
    # adjacent elements do not fuse into one word.
    visible = soup.get_text(" ", strip=True)

    return textWordCount(visible)
| 41 | + |
# Reading speed, in words per minute, used for the estimate below.
averageWordsPerMinute = 350

# Per-message word count (one wordCount call per row) and the reading time
# in minutes derived from it.
emails['WordCount'] = emails.apply(wordCount, axis=1)
emails['MinutesToRead'] = emails['WordCount'] / averageWordsPerMinute

# Total reading time across all messages, in minutes.
totalMinutes = emails['MinutesToRead'].sum()

# Human-readable timespan; the total is converted from minutes to seconds
# before being handed to format_timespan.
timeToRead = humanfriendly.format_timespan(totalMinutes * 60)
0 commit comments