Skip to content

Commit 674ae25

Browse files
Merge pull request keshavsingh4522#146 from Zeeshan6781/patch-2
Create countwords.py
2 parents c9e97bf + 284c134 commit 674ae25

File tree

1 file changed

+52
-0
lines changed

1 file changed

+52
-0
lines changed

Diff for: countwords.py

+52
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,52 @@
1+
from bs4 import BeautifulSoup as bsoup
2+
import pandas as pd
3+
import numpy as np
4+
import humanfriendly
5+
6+
# Load the exported message bodies; row 0 of the CSV supplies the column names
df = pd.read_csv('../bodytext.csv', header=0)

# Keep only rows whose FromEmail differs from my own address (i.e. drop sent
# mail), and copy so that columns added later do not touch df
emails = (
    df
    .query('FromEmail != "[my email address]"')
    .copy()
)
11+
12+
def wordCount(row):
    """Return the word count for one message row.

    Reads row['Format'] and row['Body']. A Format equal to 'Html' sends the
    body through htmlWordCount; any other Format goes through textWordCount.
    """
    is_html = row['Format'] == 'Html'
    counter = htmlWordCount if is_html else textWordCount
    return counter(row['Body'])
18+
19+
def textWordCount(text):
    """Return the number of whitespace-separated words in text.

    Any value that is not a str is treated as containing no words and
    yields 0.
    """
    if isinstance(text, str):
        words = text.split()
        return len(words)

    return 0
24+
25+
def htmlWordCount(text):
    """Count the words in the text content of an HTML message body.

    The markup is parsed with BeautifulSoup's 'html.parser'; the style,
    script, head and title elements are removed; the remaining text is
    joined with single spaces and counted by textWordCount.

    Args:
        text: HTML source of the message body.

    Returns:
        The number of whitespace-separated words left after the removed
        elements are gone, or 0 when text is not a str.
    """
    if not isinstance(text, str):
        return 0

    soup = bsoup(text, 'html.parser')

    # Detach non-content elements before pulling the text out, so that CSS,
    # scripts and header metadata are not counted as words.
    for element in soup(['style', 'script', 'head', 'title']):
        element.extract()

    stripped = soup.get_text(" ", strip=True)

    return textWordCount(stripped)
41+
42+
# Reading speed used to convert word counts into reading time
averageWordsPerMinute = 350

# Word count of every message body, then the minutes needed to read each one
emails['WordCount'] = emails.apply(wordCount, axis=1)
emails['MinutesToRead'] = emails['WordCount'].div(averageWordsPerMinute)

# Total reading time across all messages, in minutes
totalMinutes = emails['MinutesToRead'].sum()

# format_timespan expects seconds, so scale the minutes up before formatting
# the total as a human-readable timespan
timeToRead = humanfriendly.format_timespan(60 * totalMinutes)

0 commit comments

Comments
 (0)