# Exploratory Data Analysis 

- Sentence length distribution: The distribution of sentence lengths helps in choosing an appropriate maximum sequence length for the models.

- NER tag distribution: The distribution of NER tags helps in understanding the class imbalance in the dataset. If some NER tags have significantly fewer examples than others, the models may have difficulty predicting those tags correctly. It may be necessary to balance the classes by upsampling or downsampling the data, or by using class-weighting strategies during training.

- Unique word frequency: The frequency of unique words helps in understanding the vocabulary size of the dataset. If the vocabulary is large, the models may require more parameters to capture the nuances of the language.

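- Correlation between unique words and the distribution of their NER tags: frequent words that appear with several different NER tags are ambiguous and therefore harder for the models to label correctly.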


In [4]:
# PATHS
from pathlib import Path

# Italian splits (files under the `it/` directory of the wikineural corpus).
# All paths are relative, so they resolve against the current working directory.
IT_TRAIN_FILE = Path("NER/assets/wikineural_corpus/it/train.conllu")
IT_EVAL_FILE =   Path("NER/assets/wikineural_corpus/it/val.conllu")  
IT_TEST_FILE =  Path("NER/assets/wikineural_corpus/it/test.conllu")

# English splits (files under the `en/` directory of the same corpus).
EN_TRAIN_FILE = Path("NER/assets/wikineural_corpus/en/train.conllu")
EN_EVAL_FILE =   Path("NER/assets/wikineural_corpus/en/val.conllu")
EN_TEST_FILE =  Path("NER/assets/wikineural_corpus/en/test.conllu")

In [5]:

from NER.src.hmm.evaluation import *


# Load the three Italian splits in train / validation / test order.
# NOTE(review): readFileFromName is presumably provided by the star import
# from NER.src.hmm.evaluation -- confirm.
train_data, val_data, test_data = (
    readFileFromName(split_path)
    for split_path in (IT_TRAIN_FILE, IT_EVAL_FILE, IT_TEST_FILE)
)


def formatSentences(dataset):
    """Split a parsed dataset into parallel word and tag sequences.

    Args:
        dataset: iterable of sentences; each sentence is an iterable of
            tokens that support ``token['form']`` and ``token['lemma']``.

    Returns:
        A ``(sentences, sentences_tags)`` tuple of two lists with one inner
        list per sentence: the ``'form'`` values and the ``'lemma'`` values.

    NOTE(review): the tag sequence is read from the ``'lemma'`` field --
    presumably the corpus files keep the NER label in that column; confirm
    against the reader.
    """
    per_sentence = [
        ([tok['form'] for tok in sent], [tok['lemma'] for tok in sent])
        for sent in dataset
    ]
    word_seqs = [forms for forms, _ in per_sentence]
    tag_seqs = [labels for _, labels in per_sentence]
    return word_seqs, tag_seqs

# Turn every Italian split into parallel (words, tags) lists of sentences.
train_words, train_tags = formatSentences(train_data)
eval_words, eval_tags = formatSentences(val_data)
test_words, test_tags = formatSentences(test_data)

# Whole Italian corpus: train, validation and test sentences back to back.
dataset_words = [*train_words, *eval_words, *test_words]
dataset_tags = [*train_tags, *eval_tags, *test_tags]

In [6]:
# Load the English splits and prepare them the same way as the Italian ones.
train_data_en = readFileFromName(EN_TRAIN_FILE)
val_data_en = readFileFromName(EN_EVAL_FILE)
test_data_en = readFileFromName(EN_TEST_FILE)

# Parallel (words, tags) lists of sentences for every English split.
train_words_en, train_tags_en = formatSentences(train_data_en)
eval_words_en, eval_tags_en = formatSentences(val_data_en)
test_words_en, test_tags_en = formatSentences(test_data_en)

# Whole English corpus: train, validation and test sentences back to back.
dataset_words_en = [*train_words_en, *eval_words_en, *test_words_en]
dataset_tags_en = [*train_tags_en, *eval_tags_en, *test_tags_en]

In [7]:
# Number of sentences in the combined Italian dataset (train + validation + test).
len(dataset_words)

110519

In [8]:
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from collections import Counter


def ner_treemap_nodes(tags):
    """Build the treemap node lists for a collection of NER tag sequences.

    The hierarchy is: root ("NER") -> entity type (e.g. "PER") -> concrete
    tag (e.g. "B-PER", "I-PER"). Tags without a ``B-``/``I-`` prefix (such as
    "O", or anything unexpected) hang directly under the root, so an unknown
    tag never raises.

    Args:
        tags: iterable of tag sequences, one sequence per sentence.

    Returns:
        ``(ids, labels, parents, values)`` -- four parallel lists. ``ids`` are
        namespaced ("root", "type:<entity>", "tag:<tag>") so that a tag can
        never clash with a group label; ``parents`` refers to those ids.
    """
    root_id = "root"
    tag_counts = Counter(tag for tag_seq in tags for tag in tag_seq)

    group_totals = {}  # entity type -> number of tokens, in first-seen order
    leaves = []        # (tag label, parent id, count)
    for tag, count in tag_counts.items():
        tag_label = str(tag)
        prefix, sep, entity = tag_label.partition("-")
        if sep and prefix in ("B", "I") and entity:
            group_totals[entity] = group_totals.get(entity, 0) + count
            leaves.append((tag_label, f"type:{entity}", count))
        else:
            leaves.append((tag_label, root_id, count))

    ids = [root_id]
    labels = ["NER"]
    parents = [""]
    values = [sum(tag_counts.values())]

    for entity, total in group_totals.items():
        ids.append(f"type:{entity}")
        labels.append(entity)
        parents.append(root_id)
        values.append(total)

    for tag_label, parent_id, count in leaves:
        ids.append(f"tag:{tag_label}")
        labels.append(tag_label)
        parents.append(parent_id)
        values.append(count)

    return ids, labels, parents, values


def plot_ner_distribution(tags,label):
    """Show a treemap of how tokens are distributed over the NER tags.

    Args:
        tags: iterable of tag sequences, one sequence per sentence.
        label: dataset name used as the prefix of the figure title.
    """
    ids, names, parents, values = ner_treemap_nodes(tags)

    # branchvalues="total": every parent value already includes its children,
    # which is how ner_treemap_nodes computes the root and group totals.
    fig = go.Figure(go.Treemap(
            ids=ids,
            labels=names,
            parents=parents,
            values=values, branchvalues="total",
            textinfo="label+percent root+percent parent"))

    fig.update_layout(width=800, height=500, margin_t=50, margin_b=50, margin_l=15, margin_r=15,
                      title=f'{label} Dataset: How are tokens distributed across NER tags?')
    fig.show()



In [9]:
# Tag-distribution plot for every split, Italian first and then English;
# each language ends with its three splits combined.
for _split_tags, _split_label in (
    (train_tags, "IT TRAIN"),
    (eval_tags, "IT EVAL"),
    (test_tags, "IT TEST"),
    (dataset_tags, "IT Combined"),
    (train_tags_en, "EN TRAIN"),
    (eval_tags_en, "EN EVAL"),
    (test_tags_en, "EN TEST"),
    (dataset_tags_en, "EN Combined"),
):
    plot_ner_distribution(_split_tags, _split_label)


In [10]:
def sentence_length_histogram(dataset_tokens):
    """Return the distinct sentence lengths (ascending) and the number of sentences of each length."""
    lengths, counts = np.unique(
        np.asarray([len(sent) for sent in dataset_tokens], dtype=int),
        return_counts=True,
    )
    return lengths, counts


def split_length_buckets(lengths, counts, short_max=50, medium_max=150):
    """Partition a length histogram into short / medium / long buckets.

    Returns a list of ``(name, bucket_lengths, bucket_counts)`` tuples.
    """
    lengths = np.asarray(lengths)
    counts = np.asarray(counts)
    masks = [
        (f"length <= {short_max}", lengths <= short_max),
        (f"{short_max} < length <= {medium_max}",
         (lengths > short_max) & (lengths <= medium_max)),
        (f"length > {medium_max}", lengths > medium_max),
    ]
    return [(name, lengths[mask], counts[mask]) for name, mask in masks]


def plot_sentence_length_distribution(dataset_tokens,label):
    """Show how many sentences there are for every sentence length.

    The top row shows the whole distribution; the bottom row shows the same
    data split into three length buckets so the long tail stays readable.

    Args:
        dataset_tokens: iterable of sentences (each a sized sequence of tokens).
        label: dataset name used as the prefix of the figure title.

    Raises:
        ValueError: if ``dataset_tokens`` contains no sentences.
    """
    sentences_lengths, sentences_lengths_freqs = sentence_length_histogram(dataset_tokens)
    if sentences_lengths.size == 0:
        raise ValueError("cannot plot the sentence length distribution of an empty dataset")

    fig = make_subplots(rows=2, cols=3,
                        specs=[[{"colspan": 3}, None, None], [{"colspan": 1}, {}, {}]])

    fig.add_trace(go.Bar(y=sentences_lengths_freqs, x=sentences_lengths,
                         orientation='v', name="Overall"), row=1, col=1)

    # Each bucket only receives its own bars, so the axes can autoscale to the
    # data instead of relying on ranges tuned for one particular split.
    buckets = split_length_buckets(sentences_lengths, sentences_lengths_freqs)
    for col, (bucket_name, bucket_lengths, bucket_freqs) in enumerate(buckets, start=1):
        fig.add_trace(go.Bar(y=bucket_freqs, x=bucket_lengths,
                             orientation='v', name=bucket_name), row=2, col=col)

    idx_mode = int(np.argmax(sentences_lengths_freqs))
    fig.add_annotation(x=int(sentences_lengths[idx_mode]),
                       y=int(sentences_lengths_freqs[idx_mode]),
                       text="Most common<br>sentence length",
                       showarrow=True,
                       arrowhead=2, row=1, col=1)

    # Lengths are sorted ascending, so the last entry is the longest sentence;
    # the arrow points at its actual bar height.
    fig.add_annotation(x=int(sentences_lengths[-1]),
                       y=int(sentences_lengths_freqs[-1]),
                       text="Longest <br>sentence",
                       showarrow=True,
                       arrowhead=2, row=1, col=1)

    fig.update_yaxes(title="# of sentences", row=1, col=1)
    fig.update_xaxes(title="# of tokens in the sentence", row=1, col=1)

    fig.update_layout(title=f'{label} Dataset: How long are the sentences?', width=1200, height=600)
    fig.show()

In [11]:
# Sentence-length plots for every split, Italian first and then English;
# each language ends with its three splits combined.
plot_sentence_length_distribution(train_words,"IT TRAIN")
plot_sentence_length_distribution(eval_words, "IT EVAL")
plot_sentence_length_distribution(test_words, "IT TEST")
plot_sentence_length_distribution(dataset_words, "IT Combined")
plot_sentence_length_distribution(train_words_en,"EN TRAIN")
plot_sentence_length_distribution(eval_words_en, "EN EVAL")
plot_sentence_length_distribution(test_words_en, "EN TEST")
plot_sentence_length_distribution(dataset_words_en, "EN Combined")

In [12]:
import numpy as np
import plotly.graph_objects as go
from collections import Counter

def word_frequency_summary(words, n_bins=49):
    """Count word occurrences and bin the per-word counts on a log scale.

    Args:
        words: iterable of sentences, each an iterable of hashable words.
        n_bins: number of logarithmically spaced histogram bins.

    Returns:
        ``(word_freq, hist, edges)``: the ``Counter`` of word occurrences, the
        number of unique words per bin, and the ``n_bins + 1`` bin edges.

    Raises:
        ValueError: if ``words`` contains no word at all.
    """
    word_freq = Counter(word for sentence in words for word in sentence)
    if not word_freq:
        raise ValueError("cannot summarise the word frequencies of an empty dataset")

    counts = np.fromiter(word_freq.values(), dtype=float)
    # The upper edge lies just above the largest count so that the most
    # frequent words are never dropped from the histogram.
    edges = np.geomspace(1.0, counts.max() + 1.0, n_bins + 1)
    hist, edges = np.histogram(counts, bins=edges)
    return word_freq, hist, edges


def uniqueWordsFreq(words,title):
    """Plot and print how often the unique words of a dataset occur.

    Shows a log-scale histogram of the per-word occurrence counts, marks the
    most and least frequent word and the average frequency, then prints the
    same three figures.

    Args:
        words: iterable of sentences, each an iterable of word strings.
        title: dataset name used as the prefix of the figure title.

    Raises:
        ValueError: if ``words`` contains no word at all.
    """
    word_freq, hist, edges = word_frequency_summary(words)

    # calculate the average frequency
    avg_freq = np.mean(list(word_freq.values()))

    # find the most and least frequent words (first one wins on ties)
    most_freq_word = max(word_freq, key=word_freq.get)
    least_freq_word = min(word_freq, key=word_freq.get)

    # one bar per bin, placed at the geometric centre of the bin so that x and
    # y have the same length and the bar sits mid-bin on the log axis
    bin_centers = np.sqrt(edges[:-1] * edges[1:])
    fig = go.Figure(data=[go.Bar(x=bin_centers, y=hist, marker_color='blue')])

    # add markers for the most and least frequent words
    fig.add_trace(go.Scatter(x=[word_freq[most_freq_word]], y=[0], mode='markers',
                             marker=dict(color=['red'], size=15),
                             name=f'Most Frequent Word:    "{most_freq_word}"'))

    fig.add_trace(go.Scatter(x=[word_freq[least_freq_word]], y=[0], mode='markers',
                             marker=dict(color=['green'], size=15),
                             name=f'Least Frequent  Word: "{least_freq_word}"'))

    # add a vertical line for the average frequency; it is drawn as a trace
    # (data coordinates) because layout shapes on a log axis expect log10 values
    fig.add_trace(go.Scatter(x=[avg_freq, avg_freq], y=[0, float(hist.max()) * 1.1],
                             mode='lines', line=dict(color="black", dash="dash"),
                             name="Average Frequency"))

    # update the layout
    fig.update_layout(xaxis_type="log", title=f'{title}: Distribution of Unique Word Frequencies',
                      xaxis_title="Unique Word Frequency (log scale)", yaxis_title="Count")

    # show the plot
    fig.show()

    # print the most and least frequent words and the average frequency
    print("Most frequent word:", most_freq_word, "(Frequency:", word_freq[most_freq_word], ")")
    print("Least frequent word:", least_freq_word, "(Frequency:", word_freq[least_freq_word], ")")
    print("Average frequency:", avg_freq)


In [13]:
# Word-frequency report for every split, Italian first and then English;
# each language ends with its three splits combined.
for _split_words, _split_title in (
    (train_words, "IT TRAIN"),
    (eval_words, "IT EVAL"),
    (test_words, "IT TEST"),
    (dataset_words, "IT COMBINED"),
    (train_words_en, "EN TRAIN"),
    (eval_words_en, "EN EVAL"),
    (test_words_en, "EN TEST"),
    (dataset_words_en, "EN COMBINED"),
):
    uniqueWordsFreq(_split_words, _split_title)



Most frequent word: , (Frequency: 122678 )
Least frequent word: silver (Frequency: 1 )
Average frequency: 20.471808903365908


Most frequent word: , (Frequency: 16034 )
Least frequent word: agitazione (Frequency: 1 )
Average frequency: 8.005825395575847


Most frequent word: , (Frequency: 17895 )
Least frequent word: Novalis (Frequency: 1 )
Average frequency: 8.22883427520042


Most frequent word: , (Frequency: 156607 )
Least frequent word: silver (Frequency: 1 )
Average frequency: 22.56163355073563


Most frequent word: the (Frequency: 121876 )
Least frequent word: Practice (Frequency: 1 )
Average frequency: 19.70199294073269


Most frequent word: the (Frequency: 15016 )
Least frequent word: Tuvalu (Frequency: 1 )
Average frequency: 8.56043850677576


Most frequent word: the (Frequency: 14832 )
Least frequent word: Coors (Frequency: 1 )
Average frequency: 8.630988918683165


Most frequent word: the (Frequency: 151724 )
Least frequent word: vista (Frequency: 1 )
Average frequency: 21.66215218513079


In [14]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from collections import defaultdict, Counter

def find_ambiguous_words(words, tags, threshold=0.9, min_count=20, limit=10):
    """Rank frequent words whose dominant NER tag covers less than ``threshold`` of their occurrences.

    Args:
        words: iterable of sentences (sequences of words); iterated twice.
        tags: iterable of tag sequences, parallel to ``words``.
        threshold: a word is kept only if the share of its most frequent tag
            is strictly below this value.
        min_count: words occurring fewer than this many times are ignored.
        limit: maximum number of words returned.

    Returns:
        A list of ``(word, dominant_share, tag_counts, total)`` tuples sorted
        by ascending ``dominant_share`` (most ambiguous first), where
        ``tag_counts`` is a ``Counter`` of the tags seen for the word and
        ``total`` is the number of occurrences of the word.
    """
    word_freq = Counter(word for sentence in words for word in sentence)

    # Tag counts per word, filled in a single pass over the aligned sequences.
    word_tag_freq = defaultdict(Counter)
    for sentence, sentence_tags in zip(words, tags):
        for word, tag in zip(sentence, sentence_tags):
            word_tag_freq[word][tag] += 1

    dominant_share = {}
    for word, tag_freq in word_tag_freq.items():
        total = word_freq[word]
        if total < min_count:
            continue
        share = tag_freq.most_common(1)[0][1] / total
        if share < threshold:
            dominant_share[word] = share

    ranked = sorted(dominant_share.items(), key=lambda item: item[1])[:limit]
    return [(word, share, word_tag_freq[word], word_freq[word]) for word, share in ranked]


def uniqueWordsNerTagDistribution(words,tags,title):
    """Plot the NER tag distribution of the most ambiguous frequent words.

    A word counts as problematic when it occurs at least 20 times and its most
    frequent tag accounts for less than 90% of those occurrences. The ten
    words with the lowest dominant-tag share get one bar chart each, laid out
    on a 2 x 5 grid.

    Args:
        words: list of sentences (sequences of word strings).
        tags: list of tag sequences, parallel to ``words``.
        title: dataset name used as the prefix of the figure title.
    """
    n_rows, n_cols = 2, 5
    problematic_words = find_ambiguous_words(
        words, tags, threshold=0.9, min_count=20, limit=n_rows * n_cols)

    # One subplot title per word; pad with blanks when fewer words qualify.
    subplot_titles = [str(word) for word, *_ in problematic_words]
    subplot_titles += [""] * (n_rows * n_cols - len(subplot_titles))
    fig = make_subplots(rows=n_rows, cols=n_cols, subplot_titles=subplot_titles)

    for i, (word, _share, tag_freq, total) in enumerate(problematic_words):
        row, col = i // n_cols + 1, i % n_cols + 1

        # Tags in descending order of frequency.
        sorted_tags = [tag for tag, _ in tag_freq.most_common()]
        tag_freqs = [count for _, count in tag_freq.most_common()]

        fig.add_trace(go.Bar(x=sorted_tags, y=tag_freqs, name=f'Word: {word}'), row=row, col=col)

        # Scale every subplot to the word's total count so the bars read as shares.
        fig.update_yaxes(range=[0, total], row=row, col=col)

    # Figure-wide settings are applied once, after all traces exist.
    fig.update_layout(title=f"{title}: NER Tag Frequency Distribution for problematic words",
                      xaxis_title='NER Tag',
                      yaxis_title='Frequency', width=1200, height=600)

    # Add mouse hover stats
    fig.update_traces(hovertemplate='Tag: %{x}<br>Frequency: %{y}')

    # Rotate the X axis labels of every subplot for better readability
    fig.update_xaxes(tickangle=-45)

    # Show the stacked subplots
    fig.show()


In [15]:
# Ambiguous-word plots for every split, Italian first and then English;
# each language ends with its three splits combined.
for _split_words, _split_tags, _split_title in (
    (train_words, train_tags, "IT TRAIN"),
    (eval_words, eval_tags, "IT EVAL"),
    (test_words, test_tags, "IT TEST"),
    (dataset_words, dataset_tags, "IT COMBINED"),
    (train_words_en, train_tags_en, "EN TRAIN"),
    (eval_words_en, eval_tags_en, "EN EVAL"),
    (test_words_en, test_tags_en, "EN TEST"),
    (dataset_words_en, dataset_tags_en, "EN COMBINED"),
):
    uniqueWordsNerTagDistribution(_split_words, _split_tags, _split_title)
