![DISI](./DISI.jpeg)

### Import

In [218]:
import plotly
from plotly.graph_objs import Bar, Layout, Margin
plotly.offline.init_notebook_mode(connected=True)
from operator import itemgetter

In [220]:
TRAIN_FILE = "NLSPARQL.train.data"
TRAIN_FEATS_FILE = "NLSPARQL.train.feats.txt"
TEST_FILE = "NLSPARQL.test.data"
TEST_FEATS_FILE = "NLSPARQL.test.feats.txt"

### Train file analysis

In [221]:
train_file = open(TRAIN_FILE, "r", encoding="utf8")

token_count_dictionary = {}
concept_including_prefix_count_dictionary = {}
concept_without_prefix_count_dictionary = {}
sentences_count = 0
train_file_lines = []

for line in train_file:
    line = line.replace("\n", "")
    train_file_lines.append(line)
    
    if len(line) == 0: #check if end of sentence
        sentences_count += 1
        continue
    
    line_split = line.split("\t")
    assert(len(line_split) == 2)
    
    token = line_split[0]
    concept_including_prefix = line_split[1]
    concept_without_prefix = "X"
    
    if concept_including_prefix.startswith("B") or\
       concept_including_prefix.startswith("I") or\
       concept_including_prefix.startswith("E"):
        concept_without_prefix = concept_including_prefix[2:]
    else:
        concept_without_prefix = concept_including_prefix
    assert(concept_without_prefix != "X")
    
    if token not in token_count_dictionary:
        token_count_dictionary[token] = 1
    else:
        token_count_dictionary[token] += 1
        
    if concept_including_prefix not in concept_including_prefix_count_dictionary:
        concept_including_prefix_count_dictionary[concept_including_prefix] = 1
    else:
        concept_including_prefix_count_dictionary[concept_including_prefix] += 1
    
    if concept_without_prefix not in concept_without_prefix_count_dictionary:
        concept_without_prefix_count_dictionary[concept_without_prefix] = 1
    else:
        concept_without_prefix_count_dictionary[concept_without_prefix] += 1        

train_file.close()    

In [222]:
train_file_length = len(train_file_lines)
print("> length of the train file:\t{0}".format(train_file_length))

> length of the train file:	24791


In [223]:
print("> number of sentences:\t{0}".format(sentences_count))

> number of sentences:	3338


In [224]:
number_of_tokens = len(token_count_dictionary)
print("> number of tokens:\t{0}".format(number_of_tokens))

> number of tokens:	1728


In [225]:
number_of_concepts_including_prefix = len(concept_including_prefix_count_dictionary)
print("> number of concepts including prefix:\t{0}".format(number_of_concepts_including_prefix))

> number of concepts including prefix:	41


In [226]:
number_of_concepts_without_prefix = len(concept_without_prefix_count_dictionary)
print("> number of concepts without prefix:\t{0}".format(number_of_concepts_without_prefix))

> number of concepts without prefix:	24


In [227]:
X_axis = []
Y_axis = []
sorted_dict = sorted(concept_including_prefix_count_dictionary.items(), key=itemgetter(1), reverse=True)

for concept_including_prefix, concept_including_prefix_count in sorted_dict:
        X_axis.append(concept_including_prefix)
        Y_axis.append(concept_including_prefix_count)

assert(len(X_axis) == len(Y_axis)) 

plotly.offline.iplot({
    "data": [Bar(orientation = 'v',
                 x = X_axis,
                 y = Y_axis,
                 marker = dict(color = "#1abc9c"))],
    "layout": Layout(title="concepts including prefix distribution", 
                     xaxis=dict(title='concepts'),
                     yaxis=dict(title='frequency', dtick = 1000),
                     margin=Margin(b=150)
                    )
})

In [228]:
X_axis = []
Y_axis = []
sorted_dict = sorted(concept_without_prefix_count_dictionary.items(), key=itemgetter(1), reverse=True)

for concept_without_prefix, concept_without_prefix_count in sorted_dict:
        X_axis.append(concept_without_prefix)
        Y_axis.append(concept_without_prefix_count)

assert(len(X_axis) == len(Y_axis)) 

plotly.offline.iplot({
    "data": [Bar(orientation = 'v',
                 x = X_axis,
                 y = Y_axis,
                 marker = dict(color = "#1abc9c"))],
    "layout": Layout(title="concepts without prefix distribution", 
                     xaxis=dict(title='concepts', tickangle=90),
                     yaxis=dict(title='frequency', dtick = 1000),
                     margin=Margin(b=150)
                    )
})

### Train with features file analysis

In [229]:
train_file = open(TRAIN_FEATS_FILE, "r", encoding="utf8")

token_count_dictionary = {}
pos_count_dictionary = {}
sentences_count = 0
train_file_lines = []

for line in train_file:
    line = line.replace("\n", "")
    train_file_lines.append(line)
    
    if len(line) == 0: #check if end of sentence
        sentences_count += 1
        continue
    
    line_split = line.split("\t")
    assert(len(line_split) == 3)
    
    token = line_split[0]
    pos = line_split[1]
    
    if token not in token_count_dictionary:
        token_count_dictionary[token] = 1
    else:
        token_count_dictionary[token] += 1
        
    if pos not in pos_count_dictionary:
        pos_count_dictionary[pos] = 1
    else:
        pos_count_dictionary[pos] += 1
    
train_file.close()

In [230]:
train_file_length = len(train_file_lines)
print("> length of the train file:\t{0}".format(train_file_length))

> length of the train file:	24791


In [231]:
print("> number of sentences:\t{0}".format(sentences_count))

> number of sentences:	3338


In [232]:
number_of_tokens = len(token_count_dictionary)
print("> number of tokens:\t{0}".format(number_of_tokens))

> number of tokens:	1728


In [233]:
number_of_pos = len(pos_count_dictionary)
print("> number of pos:\t{0}".format(number_of_pos))

> number of pos:	49


In [234]:
X_axis = []
Y_axis = []
sorted_dict = sorted(pos_count_dictionary.items(), key=itemgetter(1), reverse=True)

for pos, pos_count in sorted_dict:
        X_axis.append(pos)
        Y_axis.append(pos_count)

assert(len(X_axis) == len(Y_axis)) 

plotly.offline.iplot({
    "data": [Bar(orientation = 'v',
                 x = X_axis,
                 y = Y_axis,
                 marker = dict(color = "#1abc9c"))],
    "layout": Layout(title="pos distribution", 
                     xaxis=dict(title='pos'),
                     yaxis=dict(title='frequency', dtick = 1000),
                     margin=Margin(b=150)
                    )
})

### Test file analysis

In [235]:
test_file = open(TEST_FILE, "r", encoding="utf8")

token_count_dictionary = {}
concept_including_prefix_count_dictionary = {}
concept_without_prefix_count_dictionary = {}
sentences_count = 0
test_file_lines = []

for line in test_file:
    line = line.replace("\n", "")
    test_file_lines.append(line)
    
    if len(line) == 0: #check if end of sentence
        sentences_count += 1
        continue
    
    line_split = line.split("\t")
    assert(len(line_split) == 2)
    
    token = line_split[0]
    concept_including_prefix = line_split[1]
    concept_without_prefix = "X"
    
    if concept_including_prefix.startswith("B") or\
       concept_including_prefix.startswith("I") or\
       concept_including_prefix.startswith("E"):
        concept_without_prefix = concept_including_prefix[2:]
    else:
        concept_without_prefix = concept_including_prefix
    assert(concept_without_prefix != "X")
    
    if token not in token_count_dictionary:
        token_count_dictionary[token] = 1
    else:
        token_count_dictionary[token] += 1
        
    if concept_including_prefix not in concept_including_prefix_count_dictionary:
        concept_including_prefix_count_dictionary[concept_including_prefix] = 1
    else:
        concept_including_prefix_count_dictionary[concept_including_prefix] += 1
    
    if concept_without_prefix not in concept_without_prefix_count_dictionary:
        concept_without_prefix_count_dictionary[concept_without_prefix] = 1
    else:
        concept_without_prefix_count_dictionary[concept_without_prefix] += 1        

test_file.close() 

In [236]:
test_file_length = len(test_file_lines)
print("> length of the test file:\t{0}".format(test_file_length))

> length of the test file:	8201


In [237]:
print("> number of sentences:\t{0}".format(sentences_count))

> number of sentences:	1084


In [238]:
number_of_tokens = len(token_count_dictionary)
print("> number of tokens:\t{0}".format(number_of_tokens))

> number of tokens:	1039


In [239]:
number_of_concepts_including_prefix = len(concept_including_prefix_count_dictionary)
print("> number of concepts including prefix:\t{0}".format(number_of_concepts_including_prefix))

> number of concepts including prefix:	39


In [240]:
number_of_concepts_without_prefix = len(concept_without_prefix_count_dictionary)
print("> number of concepts without prefix:\t{0}".format(number_of_concepts_without_prefix))

> number of concepts without prefix:	23


In [241]:
X_axis = []
Y_axis = []
sorted_dict = sorted(concept_including_prefix_count_dictionary.items(), key=itemgetter(1), reverse=True)

for concept_including_prefix, concept_including_prefix_count in sorted_dict:
        X_axis.append(concept_including_prefix)
        Y_axis.append(concept_including_prefix_count)

assert(len(X_axis) == len(Y_axis)) 

plotly.offline.iplot({
    "data": [Bar(orientation = 'v',
                 x = X_axis,
                 y = Y_axis,
                 marker = dict(color = "#3498db"))],
    "layout": Layout(title="concepts including prefix distribution", 
                     xaxis=dict(title='concepts'),
                     yaxis=dict(title='frequency', dtick = 1000),
                     margin=Margin(b=150)
                    )
})

In [242]:
X_axis = []
Y_axis = []
sorted_dict = sorted(concept_without_prefix_count_dictionary.items(), key=itemgetter(1), reverse=True)

for concept_without_prefix, concept_without_prefix_count in sorted_dict:
        X_axis.append(concept_without_prefix)
        Y_axis.append(concept_without_prefix_count)

assert(len(X_axis) == len(Y_axis)) 

plotly.offline.iplot({
    "data": [Bar(orientation = 'v',
                 x = X_axis,
                 y = Y_axis,
                 marker = dict(color = "#3498db"))],
    "layout": Layout(title="concepts without prefix distribution", 
                     xaxis=dict(title='concepts', tickangle=90),
                     yaxis=dict(title='frequency', dtick = 1000),
                     margin=Margin(b=150)
                    )
})

### Test with features file analysis

In [243]:
test_file = open(TEST_FEATS_FILE, "r", encoding="utf8")

token_count_dictionary = {}
pos_count_dictionary = {}
sentences_count = 0
test_file_lines = []

for line in test_file:
    line = line.replace("\n", "")
    test_file_lines.append(line)
    
    if len(line) == 0: #check if end of sentence
        sentences_count += 1
        continue
    
    line_split = line.split("\t")
    assert(len(line_split) == 3)
    
    token = line_split[0]
    pos = line_split[1]
    
    if token not in token_count_dictionary:
        token_count_dictionary[token] = 1
    else:
        token_count_dictionary[token] += 1
        
    if pos not in pos_count_dictionary:
        pos_count_dictionary[pos] = 1
    else:
        pos_count_dictionary[pos] += 1
    
test_file.close()

In [244]:
test_file_length = len(test_file_lines)
print("> length of the test file:\t{0}".format(test_file_length))

> length of the test file:	8201


In [245]:
print("> number of sentences:\t{0}".format(sentences_count))

> number of sentences:	1084


In [246]:
number_of_tokens = len(token_count_dictionary)
print("> number of tokens:\t{0}".format(number_of_tokens))

> number of tokens:	1039


In [247]:
number_of_pos = len(pos_count_dictionary)
print("> number of pos:\t{0}".format(number_of_pos))

> number of pos:	46


In [248]:
X_axis = []
Y_axis = []
sorted_dict = sorted(pos_count_dictionary.items(), key=itemgetter(1), reverse=True)

for pos, pos_count in sorted_dict:
        X_axis.append(pos)
        Y_axis.append(pos_count)

assert(len(X_axis) == len(Y_axis)) 

plotly.offline.iplot({
    "data": [Bar(orientation = 'v',
                 x = X_axis,
                 y = Y_axis,
                 marker = dict(color = "#3498db"))],
    "layout": Layout(title="pos distribution", 
                     xaxis=dict(title='pos'),
                     yaxis=dict(title='frequency', dtick = 500),
                     margin=Margin(b=150)
                    )
})

### Compute OOV rate

In [254]:
train_file = open(TRAIN_FILE, "r", encoding="utf8")

train_tokens = []

for line in train_file:
    line = line.replace("\n", "")
    
    if len(line) == 0: #check if end of sentence
        continue
    
    line_split = line.split("\t")
    assert(len(line_split) == 2)
    
    token = line_split[0]
    
    if token not in train_tokens:
        train_tokens.append(token)
        
train_file.close()

oov_tokens = []

test_file = open(TEST_FILE, "r", encoding="utf8")

for line in test_file:
    line = line.replace("\n", "")
    
    if len(line) == 0: #check if end of sentence
        continue
    
    line_split = line.split("\t")
    assert(len(line_split) == 2)
    
    token = line_split[0]
    
    if token not in train_tokens:
        oov_tokens.append(token)
        
test_file.close()

print("> OOV rate:\t{0}".format(len(oov_tokens)/len(train_tokens)))

> OOV rate:	0.15046296296296297
