In [1]:
from IPython.display import display, HTML

from matplotlib import pyplot

In [2]:
from tokenise import tokenise

In [3]:
def tsv_lines(text):
    """Parse tab-separated text into a list of rows, discarding the first line.

    Args:
        text: The full contents of a TSV file as a single string.

    Returns:
        A list with one entry per line after the first (which is treated as a
        header row and dropped); each entry is the list of tab-separated fields
        on that line. Returns an empty list when ``text`` has at most one line.
    """
    data_rows = text.splitlines()[1:]
    return [row.split("\t") for row in data_rows]

In [4]:
def html_table_row(items):
    """Build one HTML table row from an iterable of cell values.

    Args:
        items: Iterable of values; each is formatted into its own ``<td>`` cell
            via f-string interpolation.

    Returns:
        A string of the form ``<tr><td>...</td>...</tr>``. An empty iterable
        yields ``<tr></tr>``.

    Note:
        Values are inserted as-is; no HTML escaping is performed.
    """
    cells = "".join(f"<td>{cell}</td>" for cell in items)
    return "<tr>" + cells + "</tr>"

In [5]:
# Directory (relative path) that holds episodes.tsv and the per-episode TSV
# files opened by the cells below.
tsvDirectory = "transcripts/improved_tsv"

In [6]:
# Read the episode index; tsv_lines drops its first (header) line.
with open(f"{tsvDirectory}/episodes.tsv", "r", encoding="utf-8") as episodesTsvFile:
    episodesTsv = tsv_lines(episodesTsvFile.read())

# speakerTokenCounts: one {"SPEAKER", "COUNT"} record per distinct speaker.
# speakerTokenCountsLookup: speaker name -> index of that speaker's record
# in speakerTokenCounts.
speakerTokenCounts = []
speakerTokenCountsLookup = {}

for episode in episodesTsv:
    # The third column of each episodes.tsv row is used as the episode's file name.
    fileName = episode[2]
    with open(f"{tsvDirectory}/{fileName}", "r", encoding="utf-8") as episodeFile:
        lines = tsv_lines(episodeFile.read())

    for line in lines:
        # First column is the speaker, second column is what they said.
        speaker = line[0]
        text = line[1]
        # Tokenise once per line and add the result to the speaker's running total.
        tokenCount = len(tokenise(text))
        if speaker not in speakerTokenCountsLookup:
            speakerTokenCountsLookup[speaker] = len(speakerTokenCounts)
            speakerTokenCounts.append({
                "SPEAKER": speaker,
                "COUNT": 0
            })
        speakerTokenCounts[speakerTokenCountsLookup[speaker]]["COUNT"] += tokenCount

# Order speakers from the highest token count to the lowest.
speakerTokenCounts.sort(key=lambda x: -x["COUNT"])

# Sorting moves the records, so the indices stored during accumulation no longer
# match; rebuild the lookup so it keeps pointing at the right record.
speakerTokenCountsLookup = {
    entry["SPEAKER"]: index for index, entry in enumerate(speakerTokenCounts)
}

In [7]:
# Render the first fifteen entries of speakerTokenCounts (already sorted by
# descending count) as an HTML table of speaker and token count.
# Slicing instead of indexing with range(15) avoids an IndexError when there
# are fewer than fifteen speakers.
output = "<b>Top fifteen</b><br /><table><tr><th>Speaker</th><th>Count</th></tr>"
for entry in speakerTokenCounts[:15]:
    output += html_table_row([entry["SPEAKER"], entry["COUNT"]])
output += "</table>"
display(HTML(output))

Speaker,Count
Fry,39709
Bender,37789
Leela,32520
Professor Farnsworth,24461
Zoidberg,9381
Hermes,8117
Zapp Brannigan,7400
Amy,7213
Kif,2657
Mom,2554


In [8]:
# Sum the per-speaker token counts. A generator is used so that the global
# name `speaker` (a string set by the aggregation cell) is not rebound to a dict.
totalTokens = sum(entry["COUNT"] for entry in speakerTokenCounts)
print(f"Total tokens: {totalTokens}")

Total tokens: 265897


In [9]:
# Render the first fifteen entries of speakerTokenCounts as an HTML table of
# each speaker's share of totalTokens, rounded to three decimal places.
# The second column header is "Percentage" because the column holds shares,
# not raw counts. Slicing avoids an IndexError with fewer than fifteen speakers.
output = "<b>Top fifteen (as percentages)</b><br /><table><tr><th>Speaker</th><th>Percentage</th></tr>"
for entry in speakerTokenCounts[:15]:
    # Guard against a zero total so an all-empty corpus does not raise ZeroDivisionError.
    percentage = entry["COUNT"] / totalTokens * 100 if totalTokens else 0.0
    output += html_table_row([entry["SPEAKER"], str(round(percentage, 3)) + " %"])
output += "</table>"
display(HTML(output))

Speaker,Count
Fry,14.934 %
Bender,14.212 %
Leela,12.23 %
Professor Farnsworth,9.199 %
Zoidberg,3.528 %
Hermes,3.053 %
Zapp Brannigan,2.783 %
Amy,2.713 %
Kif,0.999 %
Mom,0.961 %
