<h1>NLP - Tokenization and Sentence Boundary Disambiguation</h1> 

In [None]:
# Imports for tokenization task
import os
import re
from collections import Counter
import pandas as pd
import itertools
from nltk import word_tokenize
from nltk.tokenize import wordpunct_tokenize
from nltk.corpus import treebank
import spacy
from itertools import chain

: 

<h2>Tokenization</h2>

<h3>Function to extract tokenization metrics</h3>

In [None]:
# Get the overlap between the correct tokenization and another one, plus the tokens the other
# tokenization is missing and the tokens it contains that are not in the correct one
def get_tokenization_accuracy(correct_tokenization, other_tokenization):
    """Compare a tokenization against a reference at the type (distinct token) level.

    Args:
        correct_tokenization: iterable of reference tokens.
        other_tokenization: iterable of tokens produced by the tokenizer under test.

    Returns:
        A tuple ``(int_perc, missing_tokens, wrong_tokens)`` where
        ``int_perc`` is ``100 * |A & B| / |A | B|`` rounded to 2 decimals
        (``0.0`` when both inputs are empty), ``missing_tokens`` is the sorted
        list of reference types absent from the other tokenization, and
        ``wrong_tokens`` is the sorted list of types only the other
        tokenization produced.
    """
    cor_set = set(correct_tokenization)
    ot_set = set(other_tokenization)
    union_size = len(cor_set | ot_set)
    if union_size == 0:
        # Nothing to compare; avoid dividing by zero.
        return 0.0, [], []
    int_perc = round(len(cor_set & ot_set) * 100 / union_size, 2)
    # Sorted so that slices such as [:10] are reproducible across kernel restarts
    # (plain set iteration order depends on string hash randomization).
    missing_tokens = sorted(cor_set - ot_set)
    wrong_tokens = sorted(ot_set - cor_set)
    return int_perc, missing_tokens, wrong_tokens

# Get list of the total number of tokens in each tokenization
def get_tokenization_lens(tok_list):
    """Return the length (token count) of every tokenization in ``tok_list``."""
    return list(map(len, tok_list))

# Get list of the number of types (distinct tokens) in each tokenization
def get_type_lens(tok_list):
    """Return, for every tokenization in ``tok_list``, how many distinct tokens it holds."""
    type_counts = []
    for tokens in tok_list:
        type_counts.append(len(set(tokens)))
    return type_counts

# Get the n (default 30) most common tokens of a single tokenization
def get_ind_common_tokens(tok_list, n=30):
    """Return the ``n`` most frequent tokens, most frequent first.

    Args:
        tok_list: iterable of hashable tokens.
        n: how many tokens to return (default 30, as before).

    Returns:
        List of at most ``n`` distinct tokens ordered by descending frequency;
        tokens with equal counts keep their order of first appearance.
    """
    # most_common() already yields distinct tokens in frequency order, so there
    # is no need to expand every occurrence and de-duplicate afterwards.
    return [token for token, _ in Counter(tok_list).most_common()[:n]]

# Get the n (default 30) most common tokens amongst all tokenizations
def get_total_common_tokens(tok_list, n=30):
    """Return the ``n`` most frequent tokens over several tokenizations pooled together.

    Args:
        tok_list: iterable of tokenizations (each an iterable of hashable tokens).
        n: how many tokens to return (default 30, as before).

    Returns:
        List of at most ``n`` distinct tokens ordered by descending pooled
        frequency; ties keep their order of first appearance.
    """
    # Accumulate counts in place instead of repeatedly concatenating lists,
    # which copied the growing list once per tokenization.
    counts = Counter()
    for tokens in tok_list:
        counts.update(tokens)
    return [token for token, _ in counts.most_common()[:n]]

# Get the percentage of types (distinct tokens) that appear only once
def get_unique_token_percentage(ind_list):
    """Return the share of distinct tokens that occur exactly once.

    Args:
        ind_list: iterable of hashable tokens.

    Returns:
        ``100 * (types occurring once) / (number of types)`` rounded to 2
        decimals, or ``0.0`` for an empty input.
    """
    # One counting pass instead of calling list.count() for every token,
    # which was quadratic in the number of tokens.
    counts = Counter(ind_list)
    if not counts:
        return 0.0
    unique_instances = sum(1 for occurrences in counts.values() if occurrences == 1)
    return round(unique_instances * 100 / len(counts), 2)

def batch_unique_token_percentages(list_of_lists):
    """Apply ``get_unique_token_percentage`` to every tokenization and return the results as a list."""
    return list(map(get_unique_token_percentage, list_of_lists))

: 

<br>

<h3>Getting Correct Tokenizations</h3>

In [None]:
# Get correct (ground-truth) tokenizations
# WSJ: flatten the tokenized treebank sentences into a single list of tokens
wjs_correct_tokenization = list(chain.from_iterable(treebank.sents(treebank.fileids())))

# Vima: concatenate the annotated files, drop the '<S>' markers and keep every non-empty line.
# Files are read in sorted name order (os.listdir order is arbitrary) and each handle is closed.
_vima_tok_chunks = []
for _vima_tok_name in sorted(os.listdir('assignment1textfiles/sbd')):
    with open('assignment1textfiles/sbd/' + _vima_tok_name, encoding="utf8") as _vima_tok_file:
        _vima_tok_chunks.append(_vima_tok_file.read())
vima_correct_tokenization = [line for line in ''.join(_vima_tok_chunks).replace('<S>', '').split('\n') if line != '']

: 

<h3>Tokenization when dealing with English</h3>

In [None]:
# Loading Wall Street Journal
# The context manager closes the file handle as soon as the text is read.
# NOTE(review): no explicit encoding is passed, so the platform default applies — confirm the file's encoding.
with open("assignment1textfiles/wsj_untokenized.txt", "r") as wsj_file:
    wsj_raw_text = wsj_file.read()

: 

In [None]:
# nltk word_tokenize: tokenize the raw WSJ text with NLTK's word_tokenize
wsj_nltk_word_tokenize = word_tokenize(wsj_raw_text)

: 

In [None]:
# nltk wordpunct_tokenize: tokenize the raw WSJ text with NLTK's wordpunct_tokenize
wsj_nltk_tokenize_wordpunct_tokenize = wordpunct_tokenize(wsj_raw_text)

: 

In [None]:
# spaCy tokenization: load the "en_core_web_sm" pipeline and keep the text of every token
# found in the whitespace-stripped WSJ text
nlp = spacy.load("en_core_web_sm")
spacy_en_tokens = [
    token.text
    for token in nlp(wsj_raw_text.strip())
]

: 

In [None]:
# custom regex tokenization (case-insensitive). At each position the first matching alternative wins:
#   1. [-'()]                               a single hyphen, apostrophe or parenthesis
#   2. [^a-z0-9 ](?= )                      one character that is not a letter, digit or space,
#                                           directly followed by a space
#   3. (?:[a-z0-9]|[^-'()a-z0-9 ](?! ))+    a run of letters/digits, plus other characters
#                                           (except - ' ( ) and space) that are NOT directly
#                                           followed by a space
regex_en_tokens = re.findall("[-'()]|[^a-z0-9 ](?= )|(?:[a-z0-9]|[^-'()a-z0-9 ](?! ))+", wsj_raw_text, re.IGNORECASE)

: 

<h3>Tokenization when dealing with Greek</h3>

In [None]:
# Loading all texts from Vima
# Files are read in sorted name order (os.listdir order is arbitrary) so the concatenated text is
# reproducible, and each handle is closed right after reading. Newlines become spaces.
_vima_raw_chunks = []
for _vima_raw_name in sorted(os.listdir('assignment1textfiles/raw')):
    with open('assignment1textfiles/raw/' + _vima_raw_name, encoding="utf8") as _vima_raw_file:
        _vima_raw_chunks.append(_vima_raw_file.read())
vima = ''.join(_vima_raw_chunks).replace('\n', ' ')

: 

In [None]:
# nltk word_tokenize: tokenize the concatenated Vima text with NLTK's word_tokenize
vima_nltk_word_tokenize = word_tokenize(vima)

: 

In [None]:
# nltk wordpunct_tokenize: tokenize the concatenated Vima text with NLTK's wordpunct_tokenize
vima_nltk_tokenize_wordpunct_tokenize = wordpunct_tokenize(vima)

: 

In [None]:
# spaCy tokenization: load the "el_core_news_sm" pipeline and keep the text of every token
# found in the Vima text
nlpg = spacy.load("el_core_news_sm")
spacy_gr_tokens = [
    token.text
    for token in nlpg(vima)
]

: 

In [None]:
# custom regex tokenization (case-insensitive), same three alternatives as the English pattern:
#   1. a single hyphen, apostrophe or parenthesis
#   2. one non-word, non-space character directly followed by a space
#   3. a run of word characters, plus other characters (except - ' ( ) and space)
#      that are not directly followed by a space
# The word-character class now contains every accented Greek lowercase vowel
# (ά έ ή ί ό ύ ώ ϊ ϋ ΐ ΰ) and Latin letters. Previously έ, ύ and Latin letters were missing,
# so a word ending in one of them before a space lost its last letter
# (e.g. "πολύ " was split into "πολ" and "ύ").
_GR_WORD_CHARS = "α-ωάέήίόύώϊϋΐΰa-z0-9"
_GR_TOKEN_PATTERN = (
    "[-'()]"
    "|[^" + _GR_WORD_CHARS + " ](?= )"
    "|(?:[" + _GR_WORD_CHARS + "]|[^-'()" + _GR_WORD_CHARS + " ](?! ))+"
)
regex_gr_tokens = re.findall(_GR_TOKEN_PATTERN, vima, re.IGNORECASE)
print('\n')

: 

<h3>Tokenization Comparison</h3>
<h4>Wall Street Journal Tokenization</h4>

In [None]:
# Candidate tokenizations of the WSJ text; index 0 is the ground truth itself
wsj_tokenizations = [wjs_correct_tokenization, wsj_nltk_word_tokenize, wsj_nltk_tokenize_wordpunct_tokenize, spacy_en_tokens, regex_en_tokens]
tokenizer_names = ['Ground Truth', 'Nltk word_tokenize', 'Nltk tokenize.wordpunct_tokenize', 'Spacy Tokenization', 'Custom Regex Tokenization']

# Per-tokenizer metrics, each list aligned with tokenizer_names
accuracy_perc = [get_tokenization_accuracy(wjs_correct_tokenization, tokens)[0] for tokens in wsj_tokenizations]
tokenization_lengths = get_tokenization_lens(wsj_tokenizations)
# This takes a little while
wsj_unique_token_perc = batch_unique_token_percentages(wsj_tokenizations)
type_lengths = get_type_lens(wsj_tokenizations)

# One column per metric, one row per tokenizer
wsj_results = {
    'Tokenization Accuracy': [str(perc) + '%' for perc in accuracy_perc],
    "Token List Length": list(tokenization_lengths),
    "Type List Length": list(type_lengths),
    "Unique Token Percentage": list(wsj_unique_token_perc),
}
print('Wall Street Journal Tokenization Results:\n')
pd.DataFrame(wsj_results, index = tokenizer_names)

: 

In [None]:
# Print the 30 most common tokens of each WSJ tokenization, labelled with its tokenizer name
for name, tokens in zip(tokenizer_names, wsj_tokenizations):
    print(name + ' 30 most common tokens in order are:\n')
    print(get_ind_common_tokens(tokens))
    print('\n')

# Print the 30 most common tokens over all WSJ tokenizations pooled together
print('The 30 most common tokens amongst all tokenizetions are:\n')
print(get_total_common_tokens(list(wsj_tokenizations)))

: 

In [None]:
# Ground-truth tokens that each candidate tokenizer (everything after index 0) never produced
(
    wsj_nltk_word_tokenize_missing,
    wsj_nltk_tokenize_wordpunct_tokenize_missing,
    spacy_en_tokens_missing,
    regex_en_tokens_missing,
) = (
    get_tokenization_accuracy(wjs_correct_tokenization, candidate)[1]
    for candidate in wsj_tokenizations[1:]
)

# Show a sample of 10 ground-truth tokens that word_tokenize() does not produce
print('10 tokens that word_tokenize() fails to identify: \n')
print(wsj_nltk_word_tokenize_missing[:10])
print('\nA common theme is that both nltk and spacy tokenizers struggle when handling spacial charachters such as "*" and "-".')

: 

<h4>Vima Tokenization</h4>

In [None]:
# Candidate tokenizations of the Vima text; index 0 is the ground truth itself
vima_tokenizations = [vima_correct_tokenization, vima_nltk_word_tokenize, vima_nltk_tokenize_wordpunct_tokenize, spacy_gr_tokens, regex_gr_tokens]
tokenizer_names = ['Ground Truth', 'Nltk word_tokenize', 'Nltk tokenize.wordpunct_tokenize', 'Spacy Tokenization', 'Custom Regex Tokenization']
accuracy_perc = [get_tokenization_accuracy(vima_correct_tokenization, i)[0] for i in vima_tokenizations]
# Token counts must come from the Vima tokenizations; this used to pass wsj_tokenizations,
# so the "Token List Length" column showed the Wall Street Journal numbers.
tokenization_lengths = get_tokenization_lens(vima_tokenizations)
# This takes a little while
unique_token_perc = batch_unique_token_percentages(vima_tokenizations)
type_lengths =  get_type_lens(vima_tokenizations)

# One column per metric, one row per tokenizer
vima_results = {'Tokenization Accuracy' : [], "Token List Length" : [], "Type List Length":[], "Unique Token Percentage":[]}
for i in range(len(vima_tokenizations)):
    vima_results['Tokenization Accuracy'].append(str(accuracy_perc[i]) + '%')
    vima_results["Token List Length"].append(tokenization_lengths[i])
    vima_results["Type List Length"].append(type_lengths[i])
    vima_results["Unique Token Percentage"].append(unique_token_perc[i])
print('Vima Tokenization Results:\n')
pd.DataFrame(vima_results, index = tokenizer_names)

: 

In [None]:
# Print the 30 most common tokens of each Vima tokenization, labelled with its tokenizer name
for name, tokens in zip(tokenizer_names, vima_tokenizations):
    print(name + ' 30 most common tokens in order are:\n')
    print(get_ind_common_tokens(tokens))
    print('\n')

# Print the 30 most common tokens over all Vima tokenizations pooled together
print('The 30 most common tokens amongst all tokenizetions are:\n')
print(get_total_common_tokens(list(vima_tokenizations)))

: 

In [None]:
# Ground-truth tokens that each candidate tokenizer (everything after index 0) never produced
(
    vima_nltk_word_tokenize_missing,
    vima_nltk_tokenize_wordpunct_tokenize_missing,
    spacy_gr_tokens_missing,
    regex_gr_tokens_missing,
) = (
    get_tokenization_accuracy(vima_correct_tokenization, candidate)[1]
    for candidate in vima_tokenizations[1:]
)

# Show a sample of 10 ground-truth tokens that the custom regex tokenizer does not produce
print('10 tokens that the regex tokenizer fails to identify: \n')
print(regex_gr_tokens_missing[:10])
print('\nAgain, the common theme is a difficulty when dealing with special characters and symbols')

: 

<br><br><br>

<h2>Sentence Boundary Disambiguation</h2>

<h3>Function to extract Sentence Boundary Disambiguation metrics</h3>

In [None]:
# Helper for the WSJ ground truth
def remove_if_contains(the_list, character):
    """Drop every whitespace-separated word containing ``character`` from each string.

    The list is modified in place: each entry is replaced by its surviving
    words joined with single spaces. The same list object is returned.
    """
    for idx, sentence in enumerate(the_list):
        surviving_words = [word for word in sentence.split() if character not in str(word)]
        the_list[idx] = ' '.join(surviving_words)
    return the_list
# Get precision of a sentence split against the ground truth
def get_presition(gt, other_sbd):
    """Return the precision of ``other_sbd`` with respect to ``gt``.

    Both arguments are treated as sets of sentences: a predicted sentence is a
    true positive only if the exact same string occurs in the ground truth.

    Args:
        gt: iterable of ground-truth sentences.
        other_sbd: iterable of predicted sentences.

    Returns:
        ``tp / (tp + fp)`` rounded to 3 decimals, or ``0.0`` when
        ``other_sbd`` is empty (nothing was predicted).
    """
    ot_set = set(other_sbd)
    if not ot_set:
        # tp + fp == 0: avoid dividing by zero.
        return 0.0
    tp = ot_set.intersection(gt)
    # tp + fp is exactly the number of distinct predicted sentences.
    precision = round(len(tp) / len(ot_set), 3)
    return precision

def get_recall(gt, other_sbd):
    """Return the recall of ``other_sbd`` with respect to ``gt``.

    Both arguments are treated as sets of sentences: a ground-truth sentence
    counts as found only if the exact same string occurs in the prediction.

    Args:
        gt: iterable of ground-truth sentences.
        other_sbd: iterable of predicted sentences.

    Returns:
        ``tp / (tp + fn)`` rounded to 3 decimals, or ``0.0`` when ``gt`` is
        empty.
    """
    cor_set = set(gt)
    if not cor_set:
        # tp + fn == 0: avoid dividing by zero.
        return 0.0
    tp = cor_set.intersection(other_sbd)
    # tp + fn is exactly the number of distinct ground-truth sentences.
    recall = round(len(tp) / len(cor_set), 3)
    return recall

def get_f1(gt, other_sbd):
    """Return the F1 score (harmonic mean of precision and recall) of ``other_sbd`` against ``gt``.

    Precision and recall come from ``get_presition`` and ``get_recall``.
    Returns ``0.0`` when both are zero (no predicted sentence matches the
    ground truth) instead of raising ``ZeroDivisionError``.
    """
    # Compute each metric once; the original evaluated both twice.
    precision = get_presition(gt, other_sbd)
    recall = get_recall(gt, other_sbd)
    if precision + recall == 0:
        return 0.0
    f1_score = 2*(precision*recall/(precision+recall))
    return f1_score


# Get the average string length over a list of sentences
def get_avg_len(sbd_list):
    """Return the mean character length of the strings in ``sbd_list``, rounded to 2 decimals."""
    total_chars = sum(len(sentence) for sentence in sbd_list)
    return round(total_chars / float(len(sbd_list)), 2)

# Get the length of the shortest string in a list of sentences
def get_min_len(sbd_list):
    """Return the character length of the shortest string in ``sbd_list``."""
    return min(len(sentence) for sentence in sbd_list)

# Get the length of the longest string in a list of sentences
def get_max_len(sbd_list):
    """Return the character length of the longest string in ``sbd_list``."""
    return max(len(sentence) for sentence in sbd_list)

# Get the average number of tokens per sentence
def avg_tokens(sbd_list):
    """Return the mean ``word_tokenize`` token count per sentence, rounded to 2 decimals."""
    token_counts = []
    for sentence in sbd_list:
        token_counts.append(len(word_tokenize(sentence)))
    return round(sum(token_counts) / len(token_counts), 2)

# Get the largest number of tokens found in a single sentence
def max_tokens(sbd_list):
    """Return the highest ``word_tokenize`` token count among the sentences in ``sbd_list``."""
    token_counts = [len(word_tokenize(sentence)) for sentence in sbd_list]
    return max(token_counts)

# Get the smallest number of tokens found in a single sentence
def min_tokens(sbd_list):
    """Return the lowest ``word_tokenize`` token count among the sentences in ``sbd_list``."""
    token_counts = [len(word_tokenize(sentence)) for sentence in sbd_list]
    return min(token_counts)

: 

<h3>Getting Correct Sentence Boundaries</h3>

In [None]:
# Get correct Sentence Boundaries
# Vima: concatenate the (stripped) annotated files, put everything on one line, attach '.', ';'
# and ',' to the preceding text, then cut at the '<S>' markers.
# Files are read in sorted name order (os.listdir order is arbitrary) and each handle is closed.
_vima_sbd_chunks = []
for _vima_sbd_name in sorted(os.listdir('assignment1textfiles/sbd')):
    with open('assignment1textfiles/sbd/' + _vima_sbd_name, encoding="utf8") as _vima_sbd_file:
        _vima_sbd_chunks.append(_vima_sbd_file.read().strip())
# Raw string: '\s' in a plain literal is an invalid escape sequence and triggers a warning.
_vima_sbd_text = re.sub(r'\s*([.;,])', r'\1', ''.join(_vima_sbd_chunks).replace('\n', ' '))
# A leading/trailing or doubled '<S>' makes split() yield empty pieces; an empty string is not
# a sentence, so such pieces are dropped rather than kept in the ground truth.
vima_correct_sbd = [piece.strip() for piece in _vima_sbd_text.split('<S>') if piece.strip()]

# WSJ: join each treebank sentence's tokens with spaces and attach '.', ';' and ',' to the preceding text
wsj_correct_sbd = [re.sub(r'\s*([.;,])', r'\1', ' '.join(sentence_tokens)) for sentence_tokens in treebank.sents()]

: 

<h3>Sentence Boundary Disambiguation when dealing with English</h3>

In [None]:
# Additional imports
from nltk import sent_tokenize
from nltk.tokenize import PunktSentenceTokenizer

: 

In [None]:
# nltk sent_tokenize: split the whitespace-stripped WSJ text into sentences
wsj_nltk_sent_tokenize = sent_tokenize(wsj_raw_text.strip())

: 

In [None]:
# nltk PunktSentenceTokenizer: the stripped WSJ text is passed to the constructor and the
# resulting tokenizer is then used to split that same text into sentences
# NOTE(review): text given to the constructor is presumably used as Punkt training data — confirm against the NLTK docs
wsj_nltk_punkt_sent_tokenize = PunktSentenceTokenizer(wsj_raw_text.strip()).sentences_from_text(wsj_raw_text.strip())

: 

In [None]:
# spacy sentence boundary disambiguation
# Iterate over the document's sentences (Doc.sents). Iterating over the Doc itself yields
# tokens, which repeated every sentence once per token and skewed the length/token statistics.
spacy_en_sbd = [sent.text.strip() for sent in nlp(wsj_raw_text).sents]

: 

In [None]:
# custom regex sentence boundary disambiguation
# A sentence runs from an uppercase ASCII letter to the first '.', '!' or '?' that is followed
# by a space. The space is part of the match so that numbers with decimals won't trigger a new
# sentence; it is cut off again from every match.
sbd_en_regex = [
    matched[:-1]
    for matched in re.findall(r"[A-Z].*?[\.!?] ", wsj_raw_text)
]

: 

<h3>Sentence Boundary Disambiguation when dealing with Greek</h3>

In [None]:
# nltk sent_tokenize: split the concatenated Vima text into sentences
# NOTE(review): no language argument is passed, so sent_tokenize uses its default — confirm this is intended for Greek text
vima_nltk_sent_tokenize = sent_tokenize(vima)

: 

In [None]:
# nltk PunktSentenceTokenizer: the Vima text is passed to the constructor and the resulting
# tokenizer is then used to split that same text into sentences
# NOTE(review): text given to the constructor is presumably used as Punkt training data — confirm against the NLTK docs
vima_nltk_punkt_sent_tokenize = PunktSentenceTokenizer(vima).sentences_from_text(vima)

: 

In [None]:
# spacy sentence boundary disambiguation
# Use the Greek pipeline (nlpg, "el_core_news_sm"); the English pipeline `nlp` was applied to
# the Greek text here by mistake.
spacy_gr_sbd = [sent.text for sent in nlpg(vima).sents]

: 

In [None]:
# custom regex sentence boundary disambiguation
# A sentence runs from a character in the range Α-Ω (Greek capitals) to the first '.', '!' or '?'.
# NOTE(review): unlike the English pattern, no trailing space is required after the terminator,
# so a '.' inside a number also ends a match — confirm this is intended.
# NOTE(review): accented capitals (e.g. 'Ά', 'Έ') lie outside Α-Ω and cannot start a match — confirm.
sbd_gr_regex = re.findall(r"[Α-Ω].*?[\.!?]", vima)

: 

<h3>Sentence Boundary Disambiguation Comparison</h3>
<h4>Wall Street Journal SBD</h4>

In [None]:
# Candidate sentence splits of the WSJ text; index 0 is the ground truth itself
wsj_sbds = [wsj_correct_sbd, wsj_nltk_sent_tokenize, wsj_nltk_punkt_sent_tokenize, spacy_en_sbd, sbd_en_regex]
tokenizer_names = ['Ground Truth', 'Nltk sent_tokenize', 'Nltk PunktSentenceTokenizer', 'Spacy SBD', 'Custom Regex SBD']

# Quality metrics against the ground truth, aligned with tokenizer_names
presc = [get_presition(wsj_correct_sbd, sbd) for sbd in wsj_sbds]
recall = [get_recall(wsj_correct_sbd, sbd) for sbd in wsj_sbds]
f1 = [get_f1(wsj_correct_sbd, sbd) for sbd in wsj_sbds]

# Sentence length statistics in characters
min_len = [get_min_len(sbd) for sbd in wsj_sbds]
max_len = [get_max_len(sbd) for sbd in wsj_sbds]
avg_len = [get_avg_len(sbd) for sbd in wsj_sbds]

# Sentence length statistics in tokens
min_tok = [min_tokens(sbd) for sbd in wsj_sbds]
max_tok = [max_tokens(sbd) for sbd in wsj_sbds]
avg_tok = [avg_tokens(sbd) for sbd in wsj_sbds]

# One column per metric, one row per sentence splitter
wsj_results = {
    'Precision': list(presc),
    "Recall": list(recall),
    "F1 Score": list(f1),
    'Min Length': list(min_len),
    'Max Length': list(max_len),
    'Average Length': list(avg_len),
    'Min Tokens': list(min_tok),
    'Max Tokens': list(max_tok),
    'Average Tokens': list(avg_tok),
}

print('Wall Street Journal SBD Results:\n')
pd.DataFrame(wsj_results, index = tokenizer_names)

: 

In [None]:
# Candidate sentence splits of the Vima text; index 0 is the ground truth itself
vima_sbds = [vima_correct_sbd, vima_nltk_sent_tokenize, vima_nltk_punkt_sent_tokenize, spacy_gr_sbd, sbd_gr_regex]
tokenizer_names = ['Ground Truth', 'Nltk sent_tokenize', 'Nltk PunktSentenceTokenizer', 'Spacy SBD', 'Custom Regex SBD']
# Quality metrics against the ground truth, aligned with tokenizer_names
presc = [get_presition(vima_correct_sbd, i) for i in vima_sbds]
recall = [get_recall(vima_correct_sbd, i) for i in vima_sbds]
f1 = [get_f1(vima_correct_sbd, i) for i in vima_sbds]
# Sentence length statistics in characters
min_len = [get_min_len(i) for i in vima_sbds]
max_len = [get_max_len(i) for i in vima_sbds]
avg_len = [get_avg_len(i) for i in vima_sbds]
# Sentence length statistics in tokens
min_tok = [min_tokens(i) for i in vima_sbds]
max_tok = [max_tokens(i) for i in vima_sbds]
avg_tok = [avg_tokens(i) for i in vima_sbds]

vima_results = {'Precision' : [], "Recall":[], "F1 Score":[], 'Min Length':[], 'Max Length' : [], 'Average Length' : [], 'Min Tokens':[], 'Max Tokens':[],'Average Tokens':[],}
# Iterate over the Vima splits; the bound used to be len(wsj_sbds), which only worked
# because both lists happen to hold five entries.
for i in range(len(vima_sbds)):
    vima_results['Precision'].append(presc[i])
    vima_results["Recall"].append(recall[i])
    vima_results["F1 Score"].append(f1[i])
    vima_results["Min Length"].append(min_len[i])
    vima_results["Max Length"].append(max_len[i])
    vima_results["Average Length"].append(avg_len[i])
    vima_results["Min Tokens"].append(min_tok[i])
    vima_results["Max Tokens"].append(max_tok[i])
    vima_results["Average Tokens"].append(avg_tok[i])

print('Vima SBD Results:\n')
pd.DataFrame(vima_results, index = tokenizer_names)

: 

: 