In [74]:
from tqdm import tqdm
import pandas as pd
from typing import Dict, Tuple
from resources.charlm import (
    train_char_lm,
    print_probs,
    generate_text,
    perplexity,
    smoothed_perplexity,
)

# Question (a)

In [2]:
# Train the character-level language model on subtitles.txt.
# The order is passed explicitly rather than left to train_char_lm's default,
# so it visibly matches the four-character histories and the order argument
# of 4 used by the cells that query this model.
subtitles_file = "resources/subtitles.txt"
model = train_char_lm(subtitles_file, order=4)

In [3]:
# Show the next-character distribution the model holds for a few
# four-character histories.
for label, history in (
    ("Continuations of atio:", "atio"),
    ("Continuations of nivi:", "nivi"),
    ("Continuations of supe:", "supe"),
):
    print(label)
    print_probs(model, history)

Continuations of atio:
[('n', 0.9940436161014506),
 (' ', 0.00220962628494572),
 ('.', 0.0013930252665962147),
 (',', 0.0009607070804111826),
 ('?', 0.0003362474781439139),
 ("'", 0.00024017677010279565),
 ('u', 0.00019214141608223654),
 ('"', 0.0001441060620616774),
 ('s', 0.0001441060620616774),
 ('-', 9.607070804111827e-05),
 ('!', 4.8035354020559135e-05),
 (':', 4.8035354020559135e-05),
 ('m', 4.8035354020559135e-05),
 ('p', 4.8035354020559135e-05),
 ('r', 4.8035354020559135e-05)]
Continuations of nivi:
[('n', 0.8), ('e', 0.1), ('s', 0.1)]
Continuations of supe:
[('r', 0.9992144540455616), ('s', 0.0007855459544383347)]


In [8]:
# Sample a handful of random strings from the trained model
num_strings = 10  # number of samples to draw
num_char = 80  # passed to generate_text as the length of each sample

# NOTE(review): generate_text is presumably stochastic and no seed is set
# here, so a re-run is not expected to reproduce the recorded samples - confirm.
# The generator is consumed lazily, so each sample is printed right after it
# is generated.
samples = (
    generate_text(model, 4, num_char) for _ in range(num_strings)
)
for i, random_str in enumerate(samples):
    print(f"Iteration {i+1}: {random_str}")

Iteration 1: Uh...
May away every drug-dee?
A few mortgage.
Just not a pickets own some even.
Iteration 2: Anythink of the disarmed?
Wentz?
Depechecking, "Titus rice to be a should chairc
Iteration 3: To be a man.
Yeah, they're staff.
He door, the help.
I did you.
l'm gone most on
Iteration 4: I've good.
THE BEG YOU JUST BOLO and spell me were injury anyone with the pie Zi
Iteration 5: We cas impale Roy. I loves shows.
Two den night, you can't separtner.
The fucked
Iteration 6: Tell day be our breat
Don't know what you have go with a bomb was back, after fr
Iteration 7: CAPTURE, BUT YOU DON'T LIKE THE NEW MORNISHING LIKE A GOOD LOVE YOU BELIEVE ME.

Iteration 8: 'Cause now.
Well, I got evening to the kids of their own a party?
Henry she in t
Iteration 9: Maybe someteorism, but I can I went out on it he's the code at the long the Colo
Iteration 10: Come pen.
Racherry.
I maybe ther's gave a pornograph?
Actually trying you show t


# Question (b)

In [3]:
# Score a single sentence with the subtitles model.
# The 4 is passed positionally (presumably the model order - confirm against
# resources.charlm).
test_string = "The boy loves his mother"
test_perplexity = perplexity(test_string, model, 4)
print(f"Perplexity of '{test_string}': {test_perplexity}")

Perplexity of 'The boy loves his mother': 3.909190367374623


In [4]:
# Unsmoothed perplexity for each of the specified strings.
# The recorded output below shows inf for two of them.
strings = [
    "The student loves homework",
    "The yob loves homework",
    "It is raining in London",
    "asdfjkl; qwerty",
]

for s in strings:
    perplexity_result = perplexity(s, model, 4)
    print(f"Perplexity of '{s}': {perplexity_result}")

Perplexity of 'The student loves homework': 4.606972940490916
Perplexity of 'The yob loves homework': inf
Perplexity of 'It is raining in London': 3.7112360009044525
Perplexity of 'asdfjkl; qwerty': inf


# Question (c)

In [4]:
# Smoothed perplexity for the same strings scored in Question (b)
strings = [
    "The student loves homework",
    "The yob loves homework",
    "It is raining in London",
    "asdfjkl; qwerty",
]

for s in strings:
    smoothed_perplexity_result = smoothed_perplexity(s, model, 4)
    print(f"Smoothed perplexity of '{s}': {smoothed_perplexity_result}")

Smoothed perplexity of 'The student loves homework': 4.606972940490916
Smoothed perplexity of 'The yob loves homework': 1e-07
Smoothed perplexity of 'It is raining in London': 3.7112360009044525
Smoothed perplexity of 'asdfjkl; qwerty': 1e-07


# Question (d)

In [10]:
# Fit one character model per (order, language) pair on the six training sets.
# Orders 0, 1 and 3 correspond to the unigram, bigram and 4-gram models.
language_names = ["da", "de", "en", "fr", "it", "nl"]
orders = [0, 1, 3]

# Outer key: model order. Inner key: language code.
trained_language_models = {}
for order in tqdm(orders):
    trained_models = {
        lang: train_char_lm(f"resources/{lang}.train.txt", order=order)
        for lang in language_names
    }
    trained_language_models[order] = trained_models

100%|██████████| 3/3 [00:18<00:00,  6.07s/it]


In [None]:
def predict_language(
    text: str,
    trained_language_models: Dict,
    order: int,
) -> Tuple[str, Dict[str, float]]:
    """
    Pick the language whose model scores the lowest smoothed perplexity on
    the input text.

    Args:
        text: String to classify.
        trained_language_models: Mapping from language code to the model for
            that language (the per-order mapping, not the nested
            order -> language mapping).
        order: Forwarded to smoothed_perplexity as its ``order`` argument.

    Returns:
        A pair ``(best_language, scores)``. ``scores`` maps every language
        code to its smoothed perplexity on ``text``. ``best_language`` is the
        code with the lowest score; on a tie the first code in iteration
        order wins. It is None when no score is strictly below infinity,
        which includes an empty mapping.
    """
    # NOTE(review): the recorded outputs in this notebook show
    # smoothed_perplexity returning exactly 1e-07 for strings whose unsmoothed
    # perplexity is inf. Such a value is lower than every genuine score and so
    # wins the comparison below - verify smoothed_perplexity before trusting
    # the predictions.
    best_language = None
    lowest_score = float("inf")
    scores = {}

    for lang, language_model in trained_language_models.items():
        # Named `score` so the imported `perplexity` function is not shadowed.
        score = smoothed_perplexity(text, language_model, order=order)
        scores[lang] = score

        # Strict comparison: an earlier language keeps the win on ties.
        if score < lowest_score:
            lowest_score = score
            best_language = lang

    return best_language, scores

In [23]:
# Evaluate language identification on the test set for each model order
test_file = "resources/test.txt"
orders = [0, 1, 3]

for order in orders:
    print(
        f"Making predictions for order {order} model..."
    )

    records = []
    with open(test_file) as f:
        # Each line is "<language code>\t<text>"; classify the text
        for idx, line in tqdm(enumerate(f)):
            # maxsplit=1 keeps any further tab characters inside the text
            # instead of failing to unpack into two names.
            true_lang, text = line.strip().split("\t", 1)
            predicted_lang, perplexities = predict_language(
                text,
                trained_language_models[order],
                order=order,
            )
            records.append(
                {
                    "correct_lang": true_lang,
                    "predicted_lang": predicted_lang,
                    "result": int(true_lang == predicted_lang),
                }
            )

            # Show the scores of all models for the first test line only
            if idx == 0:
                print(
                    f"Perplexity scores of all models on first test line: {perplexities}"
                )

    testing_results = pd.DataFrame(records)

    # Per-language accuracy. Grouping by the scalar column name gives plain
    # string group keys on every pandas version; a length-1 list key only
    # yields tuples on newer releases, so indexing the key with [0] is fragile.
    accuracies = (
        testing_results.groupby("correct_lang")["result"]
        .agg(total_lines="size", correct_lines="sum")
        .reset_index()
        .rename(columns={"correct_lang": "language"})
    )
    accuracies["accuracy"] = (
        accuracies["correct_lines"] * 100 / accuracies["total_lines"]
    )

    print(
        "The accuracies for the 6 languages are:"
    )
    print(accuracies)

Making predictions for order 0 model...


8it [00:00, 58.57it/s]

Perplexity scores of all models on first test line: {'da': 28.990266275714845, 'de': 29.177491185275965, 'en': 1e-07, 'fr': 21.23036140889091, 'it': 23.153186591077052, 'nl': 26.319382689734336}


1200it [00:07, 167.60it/s]


The accuracies for the 6 languages are:
  language  total_lines  correct_lines  accuracy
0       da          200             18       9.0
1       de          200             43      21.5
2       en          200            171      85.5
3       fr          200             18       9.0
4       it          200             95      47.5
5       nl          200            159      79.5
Making predictions for order 1 model...


30it [00:00, 291.50it/s]

Perplexity scores of all models on first test line: {'da': 1e-07, 'de': 1e-07, 'en': 1e-07, 'fr': 10.56989170441308, 'it': 1e-07, 'nl': 22.47677726796925}


1200it [00:02, 427.02it/s]


The accuracies for the 6 languages are:
  language  total_lines  correct_lines  accuracy
0       da          200             29      14.5
1       de          200             27      13.5
2       en          200             59      29.5
3       fr          200             11       5.5
4       it          200             64      32.0
5       nl          200             35      17.5
Making predictions for order 3 model...


146it [00:00, 1363.54it/s]

Perplexity scores of all models on first test line: {'da': 1e-07, 'de': 1e-07, 'en': 1e-07, 'fr': 1e-07, 'it': 1e-07, 'nl': 1e-07}


1200it [00:00, 2241.92it/s]

The accuracies for the 6 languages are:
  language  total_lines  correct_lines  accuracy
0       da          200            136      68.0
1       de          200              0       0.0
2       en          200              0       0.0
3       fr          200              0       0.0
4       it          200              0       0.0
5       nl          200              0       0.0





# Question (e)

In [None]:
# Split the tennis training set into one file per speaker gender
train_file = "resources/tennis.train.txt"

female_lines = []
male_lines = []
with open(train_file) as f:
    for line in f:
        # Each line is "<speaker>\t<text>". maxsplit=1 keeps any further tab
        # characters inside the text instead of failing to unpack.
        speaker, text = line.strip().split("\t", 1)

        # Case-normalise the text. Every speaker label other than "F" is
        # treated as male.
        if speaker == "F":
            female_lines.append(text.lower())
        else:
            male_lines.append(text.lower())

# Write the new training sets, one utterance per line, into the working directory
with open("female.tennis.train.txt", "w") as f:
    f.writelines(line + "\n" for line in female_lines)

with open("male.tennis.train.txt", "w") as f:
    f.writelines(line + "\n" for line in male_lines)

In [69]:
# Fit one character model per (order, speaker) pair on the split tennis data.
# Orders 0, 1 and 4 correspond to the unigram, bigram and 5-gram models.
speakers = ["female", "male"]
orders = [0, 1, 4]

# Outer key: model order. Inner key: speaker name.
trained_speaker_models = {}
for order in tqdm(orders):
    trained_models = {
        speaker: train_char_lm(f"{speaker}.tennis.train.txt", order=order)
        for speaker in speakers
    }
    trained_speaker_models[order] = trained_models

100%|██████████| 3/3 [00:18<00:00,  6.28s/it]


In [None]:
def predict_speaker(
    text: str,
    trained_speaker_models: Dict,
    order: int,
) -> str:
    """
    Returns the code ("F" or "M") of the speaker model with the lowest
        smoothed perplexity on the input text.

    "F" is returned only when the winning key is "female"; every other
        outcome, including no model scoring below infinity, yields "M".
    """
    winner = None
    best_score = float("inf")

    # Strict comparison: the first speaker reaching the minimum keeps it
    for name, speaker_model in trained_speaker_models.items():
        score = smoothed_perplexity(text, speaker_model, order=order)
        if score < best_score:
            winner, best_score = name, score

    # Translate the model key into the label used by the data files
    return "F" if winner == "female" else "M"

In [71]:
# Evaluate speaker identification on the tennis test set for each model order
test_file = "resources/tennis.test.txt"
orders = [0, 1, 4]

for order in orders:
    print(
        f"Making predictions for order {order} model..."
    )

    records = []
    with open(test_file) as f:
        # Each line is "<speaker>\t<text>"; classify the lower-cased text
        for line in f:
            # maxsplit=1 keeps any further tab characters inside the text
            # instead of failing to unpack into two names.
            true_speaker, text = line.strip().split("\t", 1)
            predicted_speaker = predict_speaker(
                text.lower(),
                trained_speaker_models[order],
                order=order,
            )
            records.append(
                {
                    "correct_speaker": true_speaker,
                    "predicted_speaker": predicted_speaker,
                    "result": int(true_speaker == predicted_speaker),
                }
            )

    testing_results = pd.DataFrame(records)

    # Per-speaker accuracy. Grouping by the scalar column name gives plain
    # string group keys on every pandas version; a length-1 list key only
    # yields tuples on newer releases, so indexing the key with [0] is fragile.
    accuracies = (
        testing_results.groupby("correct_speaker")["result"]
        .agg(total_lines="size", correct_lines="sum")
        .reset_index()
        .rename(columns={"correct_speaker": "speaker"})
    )
    accuracies["accuracy"] = (
        accuracies["correct_lines"] * 100 / accuracies["total_lines"]
    )

    print(
        "The accuracies for the speaker models are:"
    )
    print(accuracies)

Making predictions for order 0 model...
The accuracies for the speaker models are:
  speaker  total_lines  correct_lines   accuracy
0       F         3696           1876  50.757576
1       M         4518           2541  56.241700
Making predictions for order 1 model...
The accuracies for the speaker models are:
  speaker  total_lines  correct_lines   accuracy
0       F         3696           2271  61.444805
1       M         4518           2616  57.901726
Making predictions for order 4 model...
The accuracies for the speaker models are:
  speaker  total_lines  correct_lines   accuracy
0       F         3696           2645  71.563853
1       M         4518           1466  32.447986
