In [3]:
from typing import List
from collections import Counter
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import re
import string
import random
from nltk import word_tokenize, sent_tokenize

# Question (a)

In [None]:
# Tokenizer function
def primitive_tokenizer(text: str) -> List[str]:
    """
    Separates text into lower-cased tokens by white-space and punctuations.

    Runs of letters/digits form one token, every other non-whitespace
    character is a token on its own, and whitespace of any kind (spaces,
    tabs, newlines) only separates tokens. Empty tokens are never produced,
    so repeated spaces, adjacent punctuation and empty input are handled.

    Args:
        text: The raw text to tokenize.

    Returns:
        The tokens in order of appearance; an empty list for empty or
        whitespace-only input.
    """
    tokens: List[str] = []
    word_chars: List[str] = []

    for char in text.lower():
        if char.isalpha() or char.isdigit():
            word_chars.append(char)
            continue

        # Any non-alphanumeric character ends the word collected so far
        if word_chars:
            tokens.append("".join(word_chars))
            word_chars = []

        # Punctuation is kept as its own token, whitespace is dropped
        if not char.isspace():
            tokens.append(char)

    # Flush the trailing word, if the text did not end on a separator
    if word_chars:
        tokens.append("".join(word_chars))

    return tokens

In [None]:
# Sanity-check the tokenizer on a sample sentence
text = "NAC has developed a National HIV/AIDS/STI/TB Intervention Strategic Plan (2002-2005) that aims to reduce the HIV prevalence rate among Zambians from 19.3% to 11.7% and improve the health status of people living with HIV/AIDS by 2005."
primitive_tokenizer_results = primitive_tokenizer(text)
print(f"Result of primitive tokenizer is: {primitive_tokenizer_results}")

In [None]:
# Tokenize the first num_lines lines of the input data
test_data_file = "tokens.txt"
num_lines = 10

tokenized_result = []
with open(test_data_file) as f:
    # zip() stops at end-of-file, so a file shorter than num_lines is not
    # padded with the empty strings that readline() returns after EOF
    for line_index, line in zip(range(num_lines), f):
        tokenized_result.extend(primitive_tokenizer(line))

print(
    f"Actual test data result: {tokenized_result}"
)

# Question (b)

In [None]:
# Read the whole corpus, then tokenize it line by line
tokens_file = "tokens.txt"
tokenized_result = []
with open(tokens_file) as f:
    lines = f.readlines()

for line in tqdm(lines):
    tokenized_result += primitive_tokenizer(line)

In [None]:
# Corpus-level counts: lines, distinct token types, total tokens
num_lines = len(lines)
num_token_types = len(set(tokenized_result))
num_total_tokens = len(tokenized_result)

# Frequency table of token types, most frequent first
token_freq = Counter(tokenized_result)
token_sorted = pd.DataFrame.from_records(
    list(token_freq.items()), columns=["token", "count"]
).sort_values(by="count", ascending=False)

# Rows at specific frequency ranks (rank r sits at position r - 1)
token_100s = token_sorted.head(100)
token_500 = token_sorted.iloc[499]
token_1000 = token_sorted.iloc[999]
token_5000 = token_sorted.iloc[4999]
token_10000 = token_sorted.iloc[9999]

# Stats on hapax legomena (token types that occur exactly once)
tokens_single_freq = token_sorted[token_sorted["count"].eq(1)]
num_single_freq = len(tokens_single_freq)
percentage_single_freq = num_single_freq * 100 / len(token_sorted)

In [None]:
# Report the corpus statistics
for summary_line in (
    f"Num lines processed: {num_lines}",
    f"Vocabulary size: {num_token_types}",
    f"Collection size: {num_total_tokens}",
    "",
    "Most freq tokens rank 1-100: please see output csv",
):
    print(summary_line)

# The top-100 table is too long to print, so it is written to a csv instead
token_100s.to_csv("Lab1/first100_tokens.csv")

# One blank line, a heading and the table row for each requested rank
for heading, ranked_token in (
    ("Most freq token at rank 500:", token_500),
    ("Most freq token at rank 1000:", token_1000),
    ("Most freq token at rank 5000:", token_5000),
    ("Most freq token at rank 10000:", token_10000),
):
    print("")
    print(heading)
    print(ranked_token)

print(f"Num tokens occuring exactly once: {num_single_freq}")
print(f"Percentage of tokens occuringe exactly once: {percentage_single_freq}")

# Question (c)

In [None]:
# Using output from question (b): token_sorted is ordered by descending
# count but still carries its pre-sort row labels, so the old index must be
# dropped; the 1-based row position is then the frequency rank.
zipf_law_data = token_sorted.reset_index(drop=True)
zipf_law_data["rank"] = zipf_law_data.index + 1

# Plot graph
ax = zipf_law_data.plot(x="rank", y="count", kind="line")
ax.set_title("Rank vs frequency of token types")
ax.set_xlabel("Rank")
ax.set_ylabel("Frequency")
plt.show()

# Question (d)

In [None]:
# Helper function to split sentences

# Closing punctuation + one space + ASCII capital letter/digit denotes a
# sentence end
_END_PUNCTUATIONS = ".!?'\";-"
_SENTENCE_BOUNDARY = re.compile(
    "["
    + re.escape(_END_PUNCTUATIONS)
    + "] ["
    + string.ascii_uppercase
    + string.digits
    + "]"
)


def _sentence_end_offsets(line: str) -> List[int]:
    """Returns the 0-based offset of the last character of each sentence in line."""
    # The line terminator is not part of any sentence; removing it up front
    # keeps the final offset right whether or not the line ends in "\n"
    text = line.rstrip("\n")
    if not text:
        return []

    # Each boundary match starts on the closing punctuation of a sentence
    offsets = [
        match.start()
        for match in _SENTENCE_BOUNDARY.finditer(text)
    ]
    # The last sentence runs to the end of the line
    offsets.append(len(text) - 1)
    return offsets


def detect_sentence_boundaries(
    filepath: str,
    output_filepath: str = "Lab1/ID.txt",
) -> None:
    """
    Detects sentences in every line of a file and writes their boundaries to a text file.

    A sentence boundary is one of the closing punctuation marks . ! ? ' " ; -
    followed by a single space and an ASCII capital letter or digit. For each
    input line one output line is written: the number of sentences found,
    followed by the character offset (0-based, relative to the start of the
    line) of the last character of every sentence, separated by spaces. A
    blank line is reported as "0".

    The number of lines to parse is printed before processing starts.

    Args:
        filepath: Path of the text file to analyse, one record per line.
        output_filepath: Path of the result file, overwritten if it exists.
            Defaults to "Lab1/ID.txt".

    Returns:
        None. The results are only written to output_filepath.
    """
    with open(filepath) as f:
        lines = f.readlines()

    print(
        f"Number of total lines to parse: {len(lines)}"
    )

    results = []
    for line in tqdm(lines):
        offsets = _sentence_end_offsets(line)
        # Sentence count first, then one offset per sentence
        fields = [len(offsets), *offsets]
        results.append(" ".join(str(field) for field in fields))

    # Write offsets to txt output
    with open(output_filepath, "w") as file:
        file.writelines(result + "\n" for result in results)

In [None]:
# Generate results on test file
# NOTE: detect_sentence_boundaries is annotated `-> None` and has no return
# statement, so test_results is always None; the sentence offsets are written
# to "Lab1/ID.txt" by the function itself.
test_file = (
    "Lab1/resources/Lab1-If-you-run-on-this.txt"
)
test_results = detect_sentence_boundaries(
    test_file
)

In [None]:
# Generate results on sentences file
# NOTE: called like this the function writes to "Lab1/ID.txt", the same file
# the test-file run writes to, so this run replaces that earlier output.
sentences_file = "Lab1/resources/sentences.txt"
detect_sentence_boundaries(sentences_file)

In [None]:
# Randomly select 15 distinct lines to test for program accuracy.
# sample() draws without replacement, so no line number is repeated, and the
# dedicated seeded generator makes the selection reproducible on re-runs
# without touching the global random state.
# Line numbers are 1-based; 14981 is presumably the number of lines in
# sentences.txt — TODO confirm.
lines_to_validate = sorted(
    random.Random(42).sample(range(1, 14981 + 1), 15)
)
print(
    f"Please validate the following lines manually: {lines_to_validate}"
)

# Question (e)

In [None]:
# Load tokens/sentences data
tokens_file = "Lab1/resources/tokens.txt"
sentences_file = "Lab1/resources/sentences.txt"

# Context managers close the file handles once the text has been read
with open(tokens_file) as f:
    tokens_raw = f.read()
with open(sentences_file) as f:
    sentences_raw = f.read()

In [None]:
# Tokenize both raw texts into words with nltk's word_tokenize
tokens_word = word_tokenize(tokens_raw)
sentences_word = word_tokenize(sentences_raw)

# Write each token list to its own file, one token per line
for output_filepath, word_list in (
    ("Lab1/tokens_word.txt", tokens_word),
    ("Lab1/sentences_word.txt", sentences_word),
):
    with open(output_filepath, "w") as file:
        for result in word_list:
            file.write(result + "\n")

In [None]:
# Split both raw texts into sentences with nltk's sent_tokenize
sents_token = sent_tokenize(tokens_raw)
sents_sentence = sent_tokenize(sentences_raw)

# Write each sentence list to its own file, one sentence per write
for output_filepath, sentence_list in (
    ("Lab1/tokens_sents.txt", sents_token),
    ("Lab1/sentences_sents.txt", sents_sentence),
):
    with open(output_filepath, "w") as file:
        for result in sentence_list:
            file.write(result + "\n")