![DISI](../resources/DISI.jpeg)

### Import

In [90]:
from collections import Counter
from operator import itemgetter

import plotly
from plotly.graph_objs import Scatter, Bar, Layout, Margin

plotly.offline.init_notebook_mode(connected=True)

In [91]:
# Corpus files, given as relative paths (resolved against the working directory).
# The ".data" files are parsed below as "token<TAB>concept" lines; the
# ".feats.txt" files as "token<TAB>POS<TAB>lemma" lines.
TRAIN_FILE = "NLSPARQL.train.data"
TRAIN_FEATS_FILE = "NLSPARQL.train.feats.txt"
TEST_FILE = "NLSPARQL.test.data"
TEST_FEATS_FILE = "NLSPARQL.test.feats.txt"

### Train file analysis

In [92]:
# Parse the train file: every non-empty line is "token<TAB>concept" and an
# empty line closes a sentence. Collects frequency tables for the tokens, for
# the concepts as written, and for the concepts with their chunk prefix removed.
token_count_dictionary = Counter()
concept_including_prefix_count_dictionary = Counter()
concept_without_prefix_count_dictionary = Counter()
sentences_count = 0
train_file_lines = []

# The context manager closes the file even if a malformed line trips the assert.
with open(TRAIN_FILE, "r", encoding="utf8") as train_file:
    for line in train_file:
        line = line.rstrip("\n")
        train_file_lines.append(line)  # blank separator lines are kept too

        if not line:  # end of sentence
            sentences_count += 1
            continue

        line_split = line.split("\t")
        assert len(line_split) == 2, \
            "expected 'token<TAB>concept', got {0!r}".format(line)
        token, concept_including_prefix = line_split

        # Strip the two-character chunk prefix ("B-", "I-" or "E-"). The hyphen
        # is part of the test so that an unprefixed label which merely starts
        # with one of those letters is left intact.
        if concept_including_prefix.startswith(("B-", "I-", "E-")):
            concept_without_prefix = concept_including_prefix[2:]
        else:
            concept_without_prefix = concept_including_prefix

        token_count_dictionary[token] += 1
        concept_including_prefix_count_dictionary[concept_including_prefix] += 1
        concept_without_prefix_count_dictionary[concept_without_prefix] += 1

In [93]:
# Every line read was stored, so blank sentence separators count towards the length.
train_file_length = len(train_file_lines)
length_report = "> length of the train file:\t{0}".format(train_file_length)
print(length_report)

> length of the train file:	24791


In [94]:
# sentences_count was incremented once per empty separator line during parsing.
sentences_report = "> number of sentences:\t{0}".format(sentences_count)
print(sentences_report)

> number of sentences:	3338


In [95]:
# Adding up the per-token frequencies yields the number of token occurrences.
total_number_of_tokens = sum(token_count_dictionary.values())
print("> number of tokens:\t{0}".format(total_number_of_tokens))

> number of tokens:	21453


In [96]:
# One dictionary key per distinct token, so the size is the vocabulary size.
number_of_tokens = len(token_count_dictionary)
unique_tokens_report = "> number of unique tokens:\t{0}".format(number_of_tokens)
print(unique_tokens_report)

> number of unique tokens:	1728


In [97]:
# Distinct concept labels exactly as they appear in the file (chunk prefix kept).
number_of_concepts_including_prefix = len(concept_including_prefix_count_dictionary)
prefixed_concepts_report = "> number of unique concepts including prefix:\t{0}".format(
    number_of_concepts_including_prefix
)
print(prefixed_concepts_report)

> number of unique concepts including prefix:	41


In [98]:
# Distinct concept labels once the chunk prefix has been stripped.
number_of_concepts_without_prefix = len(concept_without_prefix_count_dictionary)
bare_concepts_report = "> number of unique concepts without prefix:\t{0}".format(
    number_of_concepts_without_prefix
)
print(bare_concepts_report)

> number of unique concepts without prefix:	24


In [100]:
# Bar chart of the tokens seen more than 10 times, most frequent first.
sorted_dict = sorted(token_count_dictionary.items(), key=itemgetter(1), reverse=True)
frequent_pairs = [
    (token, token_count) for token, token_count in sorted_dict if token_count > 10
]
X_axis = [pair[0] for pair in frequent_pairs]
Y_axis = [pair[1] for pair in frequent_pairs]

assert len(X_axis) == len(Y_axis)

accent_colour = "#9b59b6"
bar_trace = Bar(
    orientation="v",
    x=X_axis,
    y=Y_axis,
    marker=dict(color=accent_colour),
)
figure_layout = Layout(
    title="<b>Train tokens distribution</b>",
    xaxis=dict(title='<b>Tokens</b>', dtick=3, titlefont=dict(color=accent_colour)),
    yaxis=dict(title='<b>Frequency</b>', titlefont=dict(color=accent_colour)),
    margin=Margin(b=150),
)
plotly.offline.iplot({"data": [bar_trace], "layout": figure_layout})

In [101]:
# Bar chart of every prefixed concept label, most frequent first.
sorted_dict = sorted(concept_including_prefix_count_dictionary.items(), key=itemgetter(1), reverse=True)
kept_pairs = [
    (concept_including_prefix, concept_including_prefix_count)
    for concept_including_prefix, concept_including_prefix_count in sorted_dict
    if concept_including_prefix_count > 0
]
X_axis = [pair[0] for pair in kept_pairs]
Y_axis = [pair[1] for pair in kept_pairs]

assert len(X_axis) == len(Y_axis)

accent_colour = "#9b59b6"
bar_trace = Bar(
    orientation='v',
    x=X_axis,
    y=Y_axis,
    marker=dict(color=accent_colour),
)
figure_layout = Layout(
    title="<b>Train concepts including prefix distribution</b>",
    xaxis=dict(title='<b>Concepts</b>', titlefont=dict(color=accent_colour)),
    yaxis=dict(title='<b>Frequency</b>', dtick=1000, titlefont=dict(color=accent_colour)),
    margin=Margin(b=200),
)
plotly.offline.iplot({"data": [bar_trace], "layout": figure_layout})

In [102]:
# Bar chart of every concept label without its chunk prefix, most frequent first.
sorted_dict = sorted(concept_without_prefix_count_dictionary.items(), key=itemgetter(1), reverse=True)
kept_pairs = [
    (concept_without_prefix, concept_without_prefix_count)
    for concept_without_prefix, concept_without_prefix_count in sorted_dict
    if concept_without_prefix_count > 0
]
X_axis = [pair[0] for pair in kept_pairs]
Y_axis = [pair[1] for pair in kept_pairs]

assert len(X_axis) == len(Y_axis)

accent_colour = "#9b59b6"
bar_trace = Bar(
    orientation='v',
    x=X_axis,
    y=Y_axis,
    marker=dict(color=accent_colour),
)
figure_layout = Layout(
    title="<b>Train concepts without prefix distribution</b>",
    xaxis=dict(title='<b>Concepts</b>', tickangle=90, titlefont=dict(color=accent_colour)),
    yaxis=dict(title='<b>Frequency</b>', dtick=1000, titlefont=dict(color=accent_colour)),
    margin=Margin(b=150),
)
plotly.offline.iplot({"data": [bar_trace], "layout": figure_layout})

### Train with features file analysis

In [103]:
# Parse the train features file: every non-empty line is
# "token<TAB>POS<TAB>lemma" and an empty line closes a sentence.
# NOTE: this rebinds token_count_dictionary, sentences_count and
# train_file_lines, so the cells below report on this file.
token_count_dictionary = Counter()
pos_count_dictionary = Counter()
lemma_count_dictionary = Counter()
sentences_count = 0
train_file_lines = []

# The context manager closes the file even if a malformed line trips the assert.
with open(TRAIN_FEATS_FILE, "r", encoding="utf8") as train_file:
    for line in train_file:
        line = line.rstrip("\n")
        train_file_lines.append(line)  # blank separator lines are kept too

        if not line:  # end of sentence
            sentences_count += 1
            continue

        line_split = line.split("\t")
        assert len(line_split) == 3, \
            "expected 'token<TAB>POS<TAB>lemma', got {0!r}".format(line)
        token, pos, lemma = line_split

        token_count_dictionary[token] += 1
        pos_count_dictionary[pos] += 1
        lemma_count_dictionary[lemma] += 1

In [104]:
# Length of the most recently parsed train file, blank separator lines included.
train_file_length = len(train_file_lines)
length_report = "> length of the train file:\t{0}".format(train_file_length)
print(length_report)

> length of the train file:	24791


In [105]:
# One sentence was counted per empty separator line of the last parsed file.
sentences_report = "> number of sentences:\t{0}".format(sentences_count)
print(sentences_report)

> number of sentences:	3338


In [106]:
# Token occurrences = sum of the individual token frequencies.
total_number_of_tokens = sum(token_count_dictionary.values())
print("> number of tokens:\t{0}".format(total_number_of_tokens))

> number of tokens:	21453


In [107]:
# Vocabulary size: the dictionary holds one entry per distinct token.
number_of_tokens = len(token_count_dictionary)
unique_tokens_report = "> number of unique tokens:\t{0}".format(number_of_tokens)
print(unique_tokens_report)

> number of unique tokens:	1728


In [108]:
# Number of distinct POS tags found in the second column.
number_of_pos = len(pos_count_dictionary)
pos_report = "> number of unique pos:\t{0}".format(number_of_pos)
print(pos_report)

> number of unique pos:	49


In [109]:
# Number of distinct lemmas found in the third column.
number_of_lemma = len(lemma_count_dictionary)
lemma_report = "> number of unique lemmas:\t{0}".format(number_of_lemma)
print(lemma_report)

> number of unique lemmas:	1582


In [110]:
# Bar chart of every POS tag, most frequent first.
sorted_dict = sorted(pos_count_dictionary.items(), key=itemgetter(1), reverse=True)
kept_pairs = [(pos, pos_count) for pos, pos_count in sorted_dict if pos_count > 0]
X_axis = [pair[0] for pair in kept_pairs]
Y_axis = [pair[1] for pair in kept_pairs]

assert len(X_axis) == len(Y_axis)

accent_colour = "#9b59b6"
bar_trace = Bar(
    orientation="v",
    x=X_axis,
    y=Y_axis,
    marker=dict(color=accent_colour),
)
figure_layout = Layout(
    title="<b>Train feats pos distribution</b>",
    xaxis=dict(title='<b>POS-tag</b>', titlefont=dict(color=accent_colour)),
    yaxis=dict(title='<b>Frequency</b>', dtick=500, titlefont=dict(color=accent_colour)),
    margin=Margin(b=150),
)
plotly.offline.iplot({"data": [bar_trace], "layout": figure_layout})

In [113]:
# Bar chart of the lemmas seen more than 10 times, most frequent first.
sorted_dict = sorted(lemma_count_dictionary.items(), key=itemgetter(1), reverse=True)
frequent_pairs = [
    (lemma, lemma_count) for lemma, lemma_count in sorted_dict if lemma_count > 10
]
X_axis = [pair[0] for pair in frequent_pairs]
Y_axis = [pair[1] for pair in frequent_pairs]

assert len(X_axis) == len(Y_axis)

accent_colour = "#9b59b6"
bar_trace = Bar(
    orientation="v",
    x=X_axis,
    y=Y_axis,
    marker=dict(color=accent_colour),
)
figure_layout = Layout(
    title="<b>Train feats lemma distribution</b>",
    xaxis=dict(title='<b>Lemma</b>', dtick=3, titlefont=dict(color=accent_colour)),
    yaxis=dict(title='<b>Frequency</b>', dtick=100, titlefont=dict(color=accent_colour)),
    margin=Margin(b=150),
)
plotly.offline.iplot({"data": [bar_trace], "layout": figure_layout})

### Test file analysis

In [114]:
# Parse the test file: every non-empty line is "token<TAB>concept" and an
# empty line closes a sentence.
# NOTE: this rebinds the count dictionaries and sentences_count used for the
# train files, so the cells below report on the test file.
token_count_dictionary = Counter()
concept_including_prefix_count_dictionary = Counter()
concept_without_prefix_count_dictionary = Counter()
sentences_count = 0
test_file_lines = []

# The context manager closes the file even if a malformed line trips the assert.
with open(TEST_FILE, "r", encoding="utf8") as test_file:
    for line in test_file:
        line = line.rstrip("\n")
        test_file_lines.append(line)  # blank separator lines are kept too

        if not line:  # end of sentence
            sentences_count += 1
            continue

        line_split = line.split("\t")
        assert len(line_split) == 2, \
            "expected 'token<TAB>concept', got {0!r}".format(line)
        token, concept_including_prefix = line_split

        # Strip the two-character chunk prefix ("B-", "I-" or "E-"). The hyphen
        # is part of the test so that an unprefixed label which merely starts
        # with one of those letters is left intact.
        if concept_including_prefix.startswith(("B-", "I-", "E-")):
            concept_without_prefix = concept_including_prefix[2:]
        else:
            concept_without_prefix = concept_including_prefix

        token_count_dictionary[token] += 1
        concept_including_prefix_count_dictionary[concept_including_prefix] += 1
        concept_without_prefix_count_dictionary[concept_without_prefix] += 1

In [115]:
# Every line read was stored, so blank sentence separators count towards the length.
test_file_length = len(test_file_lines)
length_report = "> length of the test file:\t{0}".format(test_file_length)
print(length_report)

> length of the test file:	8201


In [116]:
# sentences_count was incremented once per empty separator line during parsing.
sentences_report = "> number of sentences:\t{0}".format(sentences_count)
print(sentences_report)

> number of sentences:	1084


In [117]:
# Adding up the per-token frequencies yields the number of token occurrences.
total_number_of_tokens = sum(token_count_dictionary.values())
print("> number of tokens:\t{0}".format(total_number_of_tokens))

> number of tokens:	7117


In [118]:
# One dictionary key per distinct token, so the size is the vocabulary size.
number_of_tokens = len(token_count_dictionary)
unique_tokens_report = "> number of unique tokens:\t{0}".format(number_of_tokens)
print(unique_tokens_report)

> number of unique tokens:	1039


In [119]:
# Distinct concept labels exactly as they appear in the file (chunk prefix kept).
number_of_concepts_including_prefix = len(concept_including_prefix_count_dictionary)
prefixed_concepts_report = "> number of unique concepts including prefix:\t{0}".format(
    number_of_concepts_including_prefix
)
print(prefixed_concepts_report)

> number of unique concepts including prefix:	39


In [120]:
# Distinct concept labels once the chunk prefix has been stripped.
number_of_concepts_without_prefix = len(concept_without_prefix_count_dictionary)
bare_concepts_report = "> number of concepts without prefix:\t{0}".format(
    number_of_concepts_without_prefix
)
print(bare_concepts_report)

> number of concepts without prefix:	23


In [123]:
# Bar chart of the tokens seen more than 5 times, most frequent first.
sorted_dict = sorted(token_count_dictionary.items(), key=itemgetter(1), reverse=True)
frequent_pairs = [
    (token, token_count) for token, token_count in sorted_dict if token_count > 5
]
X_axis = [pair[0] for pair in frequent_pairs]
Y_axis = [pair[1] for pair in frequent_pairs]

assert len(X_axis) == len(Y_axis)

accent_colour = "#3498db"
bar_trace = Bar(
    orientation="v",
    x=X_axis,
    y=Y_axis,
    marker=dict(color=accent_colour),
)
figure_layout = Layout(
    title="<b>Test tokens distribution</b>",
    xaxis=dict(title='<b>Tokens</b>', dtick=3, titlefont=dict(color=accent_colour)),
    yaxis=dict(title='<b>Frequency</b>', titlefont=dict(color=accent_colour)),
    margin=Margin(b=150),
)
plotly.offline.iplot({"data": [bar_trace], "layout": figure_layout})

In [78]:
# Bar chart of every prefixed concept label, most frequent first.
sorted_dict = sorted(concept_including_prefix_count_dictionary.items(), key=itemgetter(1), reverse=True)
kept_pairs = [
    (concept_including_prefix, concept_including_prefix_count)
    for concept_including_prefix, concept_including_prefix_count in sorted_dict
    if concept_including_prefix_count > 0
]
X_axis = [pair[0] for pair in kept_pairs]
Y_axis = [pair[1] for pair in kept_pairs]

assert len(X_axis) == len(Y_axis)

accent_colour = "#3498db"
bar_trace = Bar(
    orientation='v',
    x=X_axis,
    y=Y_axis,
    marker=dict(color=accent_colour),
)
figure_layout = Layout(
    title="<b>Test concepts including prefix distribution</b>",
    xaxis=dict(title='<b>Concepts</b>', titlefont=dict(color=accent_colour)),
    yaxis=dict(title='<b>Frequency</b>', dtick=1000, titlefont=dict(color=accent_colour)),
    margin=Margin(b=150),
)
plotly.offline.iplot({"data": [bar_trace], "layout": figure_layout})

In [124]:
# Bar chart of every concept label without its chunk prefix, most frequent first.
X_axis = []
Y_axis = []
sorted_dict = sorted(concept_without_prefix_count_dictionary.items(), key=itemgetter(1), reverse=True)

for concept_without_prefix, concept_without_prefix_count in sorted_dict:
    if concept_without_prefix_count > 0:
        X_axis.append(concept_without_prefix)
        Y_axis.append(concept_without_prefix_count)

assert len(X_axis) == len(Y_axis)

plotly.offline.iplot({
    "data": [Bar(orientation = 'v',
                 x = X_axis,
                 y = Y_axis,
                 marker = dict(color = "#3498db"))],
    # The title names the data set, like every other chart in this notebook;
    # without "Test" this figure could not be told apart from the train one.
    "layout": Layout(title="<b>Test concepts without prefix distribution</b>",
                     xaxis=dict(title='<b>Concepts</b>', tickangle=90, dtick=1, titlefont=dict(color='#3498db')),
                     yaxis=dict(title='<b>Frequency</b>', dtick = 1000, titlefont=dict(color='#3498db')),
                     margin=Margin(b=150)
                    )
})

### Test with features file analysis

In [125]:
# Parse the test features file: every non-empty line is
# "token<TAB>POS<TAB>lemma" and an empty line closes a sentence.
# NOTE: this rebinds the count dictionaries, sentences_count and
# test_file_lines, so the cells below report on this file.
token_count_dictionary = Counter()
pos_count_dictionary = Counter()
lemma_count_dictionary = Counter()
sentences_count = 0
test_file_lines = []

# The context manager closes the file even if a malformed line trips the assert.
with open(TEST_FEATS_FILE, "r", encoding="utf8") as test_file:
    for line in test_file:
        line = line.rstrip("\n")
        test_file_lines.append(line)  # blank separator lines are kept too

        if not line:  # end of sentence
            sentences_count += 1
            continue

        line_split = line.split("\t")
        assert len(line_split) == 3, \
            "expected 'token<TAB>POS<TAB>lemma', got {0!r}".format(line)
        token, pos, lemma = line_split

        token_count_dictionary[token] += 1
        pos_count_dictionary[pos] += 1
        lemma_count_dictionary[lemma] += 1

In [126]:
# Length of the most recently parsed test file, blank separator lines included.
test_file_length = len(test_file_lines)
length_report = "> length of the test file:\t{0}".format(test_file_length)
print(length_report)

> length of the test file:	8201


In [127]:
# One sentence was counted per empty separator line of the last parsed file.
sentences_report = "> number of sentences:\t{0}".format(sentences_count)
print(sentences_report)

> number of sentences:	1084


In [128]:
# Token occurrences = sum of the individual token frequencies.
total_number_of_tokens = sum(token_count_dictionary.values())
print("> number of tokens:\t{0}".format(total_number_of_tokens))

> number of tokens:	7117


In [129]:
# Vocabulary size: the dictionary holds one entry per distinct token.
number_of_tokens = len(token_count_dictionary)
unique_tokens_report = "> number of unique tokens:\t{0}".format(number_of_tokens)
print(unique_tokens_report)

> number of unique tokens:	1039


In [130]:
# Number of distinct POS tags found in the second column.
number_of_pos = len(pos_count_dictionary)
pos_report = "> number of pos:\t{0}".format(number_of_pos)
print(pos_report)

> number of pos:	46


In [131]:
# Number of distinct lemmas found in the third column.
number_of_lemma = len(lemma_count_dictionary)
lemma_report = "> number of unique lemmas:\t{0}".format(number_of_lemma)
print(lemma_report)

> number of unique lemmas:	952


In [132]:
# Bar chart of every POS tag, most frequent first.
sorted_dict = sorted(pos_count_dictionary.items(), key=itemgetter(1), reverse=True)
kept_pairs = [(pos, pos_count) for pos, pos_count in sorted_dict if pos_count > 0]
X_axis = [pair[0] for pair in kept_pairs]
Y_axis = [pair[1] for pair in kept_pairs]

assert len(X_axis) == len(Y_axis)

accent_colour = "#3498db"
bar_trace = Bar(
    orientation='v',
    x=X_axis,
    y=Y_axis,
    marker=dict(color=accent_colour),
)
figure_layout = Layout(
    title="<b>Test feats pos distribution</b>",
    xaxis=dict(title='<b>POS-tag</b>', titlefont=dict(color=accent_colour)),
    yaxis=dict(title='<b>Frequency</b>', dtick=500, titlefont=dict(color=accent_colour)),
    margin=Margin(b=150),
)
plotly.offline.iplot({"data": [bar_trace], "layout": figure_layout})

In [140]:
# Bar chart of the lemmas seen more than 3 times, most frequent first.
sorted_dict = sorted(lemma_count_dictionary.items(), key=itemgetter(1), reverse=True)
frequent_pairs = [
    (lemma, lemma_count) for lemma, lemma_count in sorted_dict if lemma_count > 3
]
X_axis = [pair[0] for pair in frequent_pairs]
Y_axis = [pair[1] for pair in frequent_pairs]

assert len(X_axis) == len(Y_axis)

accent_colour = "#3498db"
bar_trace = Bar(
    orientation="v",
    x=X_axis,
    y=Y_axis,
    marker=dict(color=accent_colour),
)
figure_layout = Layout(
    title="<b>Test feats lemmas distribution</b>",
    xaxis=dict(title='<b>Lemma</b>', dtick=3, titlefont=dict(color=accent_colour)),
    yaxis=dict(title='<b>Frequency</b>', dtick=100, titlefont=dict(color=accent_colour)),
    margin=Margin(b=150),
)
plotly.offline.iplot({"data": [bar_trace], "layout": figure_layout})

### Compute OOV rate

In [89]:
def _read_unique_tokens(path):
    """Return the distinct tokens of a "token<TAB>concept" file, in first-seen order.

    Empty lines (sentence separators) are skipped. Every other line must hold
    exactly two tab-separated fields; only the first one (the token) is used.
    """
    seen_tokens = set()  # set lookup keeps the membership test cheap
    unique_tokens = []
    # The context manager closes the file even if a malformed line trips the assert.
    with open(path, "r", encoding="utf8") as data_file:
        for line in data_file:
            line = line.rstrip("\n")

            if not line:  # end of sentence
                continue

            line_split = line.split("\t")
            assert len(line_split) == 2, \
                "expected 'token<TAB>concept', got {0!r}".format(line)

            token = line_split[0]
            if token not in seen_tokens:
                seen_tokens.add(token)
                unique_tokens.append(token)
    return unique_tokens


train_tokens = _read_unique_tokens(TRAIN_FILE)
test_tokens = _read_unique_tokens(TEST_FILE)

# Out-of-vocabulary tokens: distinct test tokens never seen in the train file.
train_vocabulary = set(train_tokens)
oov_tokens = [token for token in test_tokens if token not in train_vocabulary]

# The rate is computed over distinct tokens (types), not over token occurrences.
print("> OOV rate:\t{0}".format(len(oov_tokens)/len(test_tokens)))

> OOV rate:	0.23676612127045235
