In [1]:
from html import escape
from shutil import copy2

In [2]:
from IPython.display import display, HTML

In [3]:
# Folder holding the original (uncleaned) transcript TSV files.
directory = "transcripts/initial_tsv"
episodesPath = f"{directory}/episodes.tsv"

with open(episodesPath, "r", encoding="utf-8") as episodesFile:
    episodeRows = episodesFile.read().splitlines()

# The first line is skipped (treated as the header); every remaining row is
# split into its tab-separated fields.
episodesTsv = [row.split("\t") for row in episodeRows[1:]]

print(episodesTsv[0:3])

[['Space Pilot 3000', '1', 'Space_Pilot_3000.tsv'], ['The Series Has Landed', '1', 'The_Series_Has_Landed.tsv'], ['I, Roommate', '1', 'I__Roommate.tsv']]


In [4]:
# Count how many rows each speaker has across all episode transcripts.
# `speakers` holds one {"SPEAKER", "COUNT"} record per distinct name and
# `speakersLookup` maps a name to that record's position at insertion time
# (i.e. before the final sort).
speakers, speakersLookup = [], {}

for episode in episodesTsv:
    transcriptPath = f"{directory}/{episode[2]}"
    with open(transcriptPath, "r", encoding="utf-8") as inFile:
        rows = inFile.read().splitlines()[1:]  # first line is skipped as the header

    for row in rows:
        name = row.split("\t")[0]
        position = speakersLookup.get(name)
        if position is None:
            # First time this name is seen: register it with a count of one.
            speakersLookup[name] = len(speakers)
            speakers.append({"SPEAKER": name, "COUNT": 1})
            continue
        speakers[position]["COUNT"] += 1

# Most frequent speakers first; the sort is stable, so ties keep first-seen order.
speakers.sort(key=lambda entry: entry["COUNT"], reverse=True)

In [5]:
# Render the speaker counts as an HTML table.
# Speaker names are read verbatim from the transcript files, so they are
# HTML-escaped before being embedded: a name containing "<", ">" or "&" would
# otherwise corrupt the table markup.
tableRows = [
    f"<tr><td>{escape(speaker['SPEAKER'])}</td><td>{speaker['COUNT']}</td></tr>"
    for speaker in speakers
]
# Join once instead of growing the string row by row.
html = "<table><tr><th>SPEAKER</th><th>COUNT</th></tr>" + "".join(tableRows) + "</table>"
display(HTML(html))

SPEAKER,COUNT
Fry,3779
Bender,3395
Leela,3009
Farnsworth,1447
Zoidberg,900
Amy,864
Hermes,772
Zapp,482
Professor Farnsworth,280
Kif,249


In [6]:
# Speakers whose name contains a round or curly bracket; these names would
# probably be better with the bracketed part removed.
speakersWithBrackets = [
    speaker
    for speaker in speakers
    if any(bracket in speaker["SPEAKER"] for bracket in "(){}")
]

# Names are read verbatim from the transcript files, so they are HTML-escaped
# before being embedded in the table markup.
tableRows = [
    f"<tr><td>{escape(speaker['SPEAKER'])}</td><td>{speaker['COUNT']}</td></tr>"
    for speaker in speakersWithBrackets
]
html = "<table><tr><th>SPEAKER</th><th>COUNT</th></tr>" + "".join(tableRows) + "</table>"
display(HTML(html))

SPEAKER,COUNT
Farnsworth (v.o.),7
Hermes (VO),6
Zapp (v.o.),6
Fry (Telepathically),5
Susan (v.o.),5
Fry (confused),3
Amy (sobbing),3
Mom (v.o.),3
Bender (Laughing),2
Amy (Crying),2


In [7]:
# Speaker-name normalisation table. Names such as Farnsworth have been written
# in many different ways in the source transcripts. Each inner list groups the
# spellings of one speaker: the first entry is the name every entry in that
# list is mapped to, and the clean-up cell further down compares names
# case-insensitively.
# Not every case difference needs to be listed, but where a name has no other
# textual variation, a case variant seen in the source is included so the name
# still gets one canonical form.
nameMaps = [
    ["Professor Farnsworth", "Farnsworth", "Farnswoth", "Prof. Farnsworth", "Farnsworth (cont'd)", "Farnsworth v.o.", "Farnsowrth", "Professor Farnsoworth"],
    ["Hedonism Bot", "Hedonismbot"],
    ["Kwanzaa Bot", "Kwanzaa-Bot", "Kwanzabot"],
    ["Yivo", "YIVO"],
    ["Richard Nixon's Head", "Nixon", "Nixon's head"],
    ["LaBarbara", "Labarbara"],
    ["ALL", "All"],
    ["Old man", "Old Man"],
    ["Female voice", "Female Voice"],
    ["Suicide booth", "Suicide Booth"],
    ["Hydroponic farmer", "Hydroponic Farmer"],
    ["Underwater house salesman", "Underwater House Salesman"],
    ["Bender doll", "Bender Doll"],
    ["Brown-haired man", "Brown-haired Man"],
    ["Zapp Brannigan", "Zapp"]
]

In [8]:
# Spoken lines that contain brackets also carry unspoken text; print every
# such line so the variations can be inspected.
for episode in episodesTsv:
    transcriptPath = f"{directory}/{episode[2]}"
    with open(transcriptPath, "r", encoding="utf-8") as inFile:
        rows = inFile.read().splitlines()[1:]  # first line is skipped as the header

    for row in rows:
        fields = row.split("\t")
        if len(fields) < 2:
            # No second column on this row, so there is no line text to check.
            continue
        spokenText = fields[1]
        if any(bracket in spokenText for bracket in "(){}"):
            print(spokenText)

Whoa, slow down! This place just doesn't feel like home. It just isn't ... cosy.  Ah. I can barely move!  (from inside) It's perfect!
Mm-hm.  I like your style. I find it very... (whispering) erotic.
Aww! (whispering to Fry) I'll fire up the grill!
I can't take it anymore! They've been at it for hours!  (shouting) Give it a rest, you two!
Invalid selection. (normal voice) Yo, what are you talking about?
Fry, please--  (from outside) My ponytail's caught in the door.
No, it isn't! Shut your filthy trap!  (talking) Thank you, Walt. If anyone ever got a hold of anchovy DNA, they could chop out the oil-making gene, stick it in a bunch of Third-World kids and bam! Cheap effective robot oil. Enough to put dear, old Mom out of businness.
Get his PIN number, you idiots!  (talking) Now I'm off to some charity BS for knocked-up teenage sluts.
Oh, that's ridiculous! (whispering) I'll take two pounds!
Lie down on table. I take lungs now, gills come next week.  (shouting) Nurse!
(reading) Who slurp

Now, I've simulated that collision using Shrapnovision(?).
Ah! I have everything I every wanted. Money, wealth, riches. Yet something's missing. A hot princess with which to get(?) grizzly.
It's getting cold and smelly. My odor-eaters(?) are going critical.
Aw, jeez with this, hey?(?)
Oh, the Colonial Desktop(?).  Well, there's nuthin' we Brits enjoy more dan a good document'ry. Turn on the televiser, Dr. Zoidsmythe.


#### Steps for improvement per line:

1. Remove brackets from speakers (ensure matching)
2. Correct name variations from speakers (includes case differences)
3. Remove colons from speakers (I think only one occurrence of this exists)
4. Remove brackets from line text (ensure matching)

After each step, trim (`strip`) any leading and trailing whitespace.

In [9]:
def remove_bracketed_text(text):
    """Strip every parenthesised span, including nested ones, from ``text``.

    Only round brackets are treated as delimiters. A ``)`` met while no
    bracket is open is kept as ordinary text.

    Returns:
        A ``(cleaned_text, open_count)`` tuple. ``open_count`` is the number
        of ``(`` still unclosed at the end of ``text``; ``0`` means every
        opening bracket found a closing partner.
    """
    kept = []
    depth = 0
    for ch in text:
        if ch == "(":
            depth += 1
            continue
        if ch == ")" and depth > 0:
            depth -= 1
            continue
        if depth == 0:
            # Outside any bracket: this character is part of the result.
            kept.append(ch)

    return "".join(kept), depth

In [10]:
# Clean every episode transcript and write the result to
# transcripts/improved_tsv under the same file name.
#
# Per row, in order:
#   1. drop bracketed text from the speaker (only when the brackets balance),
#   2. drop colons from the speaker,
#   3. map known name variations to their canonical name (case-insensitive),
#   4. drop bracketed text from the spoken line (only when the brackets balance).
# Whitespace is stripped after the steps, as the notebook's step list prescribes.
# Rows that end up with an empty speaker or an empty line are discarded.
for episode in episodesTsv:
    with open(f"{directory}/{episode[2]}", "r", encoding="utf-8") as inFile:
        # The first line is skipped as the header; the rest are split on tabs.
        rows = [row.split("\t") for row in inFile.read().splitlines()[1:]]

    improvedRows = ["SPEAKER\tLINE"]

    for row in rows:
        # Rows without both a speaker and a line column cannot be cleaned.
        if len(row) < 2:
            continue

        speaker = row[0]
        line = row[1]

        if "(" in speaker:
            speakerBracketsRemoved, bracketLevel = remove_bracketed_text(speaker)
            # Unbalanced brackets: keep the speaker as written rather than
            # risk cutting away part of the name.
            if bracketLevel == 0:
                speaker = speakerBracketsRemoved

        speaker = speaker.replace(":", "").strip()

        for mapping in nameMaps:
            # mapping[0] is the canonical name for every spelling in the list.
            if any(speaker.lower() == variation.lower() for variation in mapping):
                speaker = mapping[0]

        if "(" in line:
            lineBracketsRemoved, bracketLevel = remove_bracketed_text(line)
            if bracketLevel == 0:
                line = lineBracketsRemoved
        line = line.strip()

        if line == "" or speaker == "":
            continue

        improvedRows.append(f"{speaker}\t{line}")

    # Write once per episode, after all rows are processed. Doing this inside
    # the row loop rewrote the whole file for every kept row and produced no
    # file at all for an episode without any kept row.
    # The file is written without a trailing newline.
    with open(f"transcripts/improved_tsv/{episode[2]}", "w", encoding="utf-8") as outFile:
        outFile.write("\n".join(improvedRows))

In [11]:
# Copy the episode index alongside the improved transcripts.
episodesIndexPath = f"{directory}/episodes.tsv"
copy2(episodesIndexPath, "transcripts/improved_tsv")
print("Copied episodes.tsv as well")

Copied episodes.tsv as well


In [12]:
# From here on, read from the folder holding the cleaned transcripts.
directory = "transcripts/improved_tsv"
episodesPath = f"{directory}/episodes.tsv"

with open(episodesPath, "r", encoding="utf-8") as episodesFile:
    episodeRows = episodesFile.read().splitlines()

# The first line is skipped (treated as the header); every remaining row is
# split into its tab-separated fields.
episodesTsv = [row.split("\t") for row in episodeRows[1:]]

print(episodesTsv[0:3])

[['Space Pilot 3000', '1', 'Space_Pilot_3000.tsv'], ['The Series Has Landed', '1', 'The_Series_Has_Landed.tsv'], ['I, Roommate', '1', 'I__Roommate.tsv']]


In [13]:
# Recount the rows per speaker, this time over the transcripts found in the
# current `directory`. `speakers` holds one {"SPEAKER", "COUNT"} record per
# distinct name; `speakersLookup` maps a name to that record's position at
# insertion time (i.e. before the final sort).
speakers, speakersLookup = [], {}

for episode in episodesTsv:
    transcriptPath = f"{directory}/{episode[2]}"
    with open(transcriptPath, "r", encoding="utf-8") as inFile:
        rows = inFile.read().splitlines()[1:]  # first line is skipped as the header

    for row in rows:
        name = row.split("\t")[0]
        position = speakersLookup.get(name)
        if position is None:
            # First time this name is seen: register it with a count of one.
            speakersLookup[name] = len(speakers)
            speakers.append({"SPEAKER": name, "COUNT": 1})
            continue
        speakers[position]["COUNT"] += 1

# Most frequent speakers first; the sort is stable, so ties keep first-seen order.
speakers.sort(key=lambda entry: entry["COUNT"], reverse=True)

In [14]:
# Render the recounted speakers as an HTML table.
# Speaker names originate from the transcript files, so they are HTML-escaped
# before being embedded: a name containing "<", ">" or "&" would otherwise
# corrupt the table markup.
tableRows = [
    f"<tr><td>{escape(speaker['SPEAKER'])}</td><td>{speaker['COUNT']}</td></tr>"
    for speaker in speakers
]
# Join once instead of growing the string row by row.
html = "<table><tr><th>SPEAKER</th><th>COUNT</th></tr>" + "".join(tableRows) + "</table>"
display(HTML(html))

SPEAKER,COUNT
Fry,3805
Bender,3414
Leela,3020
Professor Farnsworth,1748
Zoidberg,903
Amy,876
Hermes,779
Zapp Brannigan,498
Kif,252
Mom,185


In [15]:
# Sanity check: no two speaker names may differ only by letter case.
speakerExistsLookup = {}
duplicatesFound = False
for entry in speakers:
    originalName = entry["SPEAKER"]
    foldedName = originalName.lower()
    if foldedName not in speakerExistsLookup:
        # First occurrence of this lower-cased name: remember it and move on.
        speakerExistsLookup[foldedName] = True
        continue
    print(f"Duplicate found: {originalName}")
    duplicatesFound = True

if not duplicatesFound:
    print("No duplicates found this time.")

No duplicates found this time.
