![DISI](jupyter_resources/DISI.jpeg)

### Import

In [48]:
import json

### Load JSON configuration file

In [49]:
# Load the run configuration. The context manager closes the file handle even
# when json.load raises (e.g. on malformed JSON); previously it was never closed.
with open("config.json", "r", encoding="utf8") as json_configuration_file:
    config = json.load(json_configuration_file)

In [50]:
# Dataset file names, resolved relative to the working directory.
TRAIN_FILE, TRAIN_FEATS_FILE = "NLSPARQL.train.data", "NLSPARQL.train.feats.txt"
TEST_FILE, TEST_FEATS_FILE = "NLSPARQL.test.data", "NLSPARQL.test.feats.txt"

# Placeholder; reassigned from the settings once they have been validated.
OUTPUT_FOLDER = ""

# Experiment settings taken from the loaded JSON configuration.
IMPROVEMENTS, ADDITIONAL_FEATURES, SMOOTHING, HANDLE_UNK = (
    config["improvements"],
    config["additional_features"],
    config["smoothing"],
    config["handle_unk"],
)

### Check configurations

In [51]:
# IMPROVEMENTS is compared against the strings "true"/"false", not booleans.
assert IMPROVEMENTS in ("true", "false"), (
    "> IMPROVEMENTS value must be either <true> or <false>, provided value is <{0}>"
    .format(IMPROVEMENTS)
)

In [52]:
# Only these three feature modes are accepted.
assert ADDITIONAL_FEATURES in ("none", "postag", "lemma"), (
    "> ADDITIONAL_FEATURES value must be either <none>, <postag> or <lemma>, provided value is <{0}>"
    .format(ADDITIONAL_FEATURES)
)

In [53]:
# Only these three smoothing method names are accepted.
assert SMOOTHING in ("witten_bell", "katz", "kneser_ney"), (
    "> SMOOTHING value must be either <witten_bell>, <katz> or <kneser_ney>, provided value is <{0}>"
    .format(SMOOTHING)
)

In [54]:
# HANDLE_UNK must be "uniform" or "cut_off_" followed by the cut-off value.
# A bare "cut_off_" or a non-numeric suffix does not match the <cut_off_#> form
# named in the error message, so the suffix must be a non-empty run of decimal
# digits.
# NOTE(review): assumes the cut-off is a non-negative integer - confirm against
# the code that parses this value.
cut_off_prefix = "cut_off_"
has_valid_cut_off = (
    HANDLE_UNK.startswith(cut_off_prefix)
    and HANDLE_UNK[len(cut_off_prefix):].isdecimal()
)
assert HANDLE_UNK == "uniform" or has_valid_cut_off, (
    "> HANDLE_UNK value must be either <uniform> or <cut_off_#>, provided value is <{0}>"
    .format(HANDLE_UNK)
)

In [55]:
# Build the output folder name from the four configuration settings.
OUTPUT_FOLDER = "IM_{0}_AF_{1}_SM_{2}_HU_{3}".format(
    IMPROVEMENTS,
    ADDITIONAL_FEATURES,
    SMOOTHING,
    HANDLE_UNK,
)
assert OUTPUT_FOLDER != ""

### Helper functions

### Main

In [56]:
# Display the output folder name derived from the configuration settings.
OUTPUT_FOLDER    

'IM_false_AF_none_SM_witten_bell_HU_uniform'

### Handling additional features

### Train file analysis

In [112]:
# Scan the train file once and collect:
#   - train_file_lines: every line with its newline removed, blank lines included
#   - sentences_count: number of blank lines (each one ends a sentence)
#   - frequency dictionaries for tokens and for concepts, the latter both with
#     and without the leading B/I/E marker
# NOTE(review): a last sentence that is not followed by a blank line is not
# counted - confirm the data files always end with one.
token_count_dictionary = {}
concept_including_prefix_count_dictionary = {}
concept_without_prefix_count_dictionary = {}
sentences_count = 0
train_file_lines = []

# Match the marker together with its "-" separator so that an unprefixed concept
# that merely starts with B, I or E is not truncated.
concept_prefixes = ("B-", "I-", "E-")

# The context manager closes the file even if an assertion below fails.
with open(TRAIN_FILE, "r", encoding="utf8") as train_file:
    for line in train_file:
        line = line.replace("\n", "")
        train_file_lines.append(line)

        if len(line) == 0:  # blank line: end of sentence
            sentences_count += 1
            continue

        line_split = line.split("\t")
        assert len(line_split) == 2, \
            "> malformed line <{0}>: expected 2 tab-separated fields".format(line)

        token, concept_including_prefix = line_split

        if concept_including_prefix.startswith(concept_prefixes):
            concept_without_prefix = concept_including_prefix[2:]
        else:
            concept_without_prefix = concept_including_prefix

        token_count_dictionary[token] = token_count_dictionary.get(token, 0) + 1
        concept_including_prefix_count_dictionary[concept_including_prefix] = \
            concept_including_prefix_count_dictionary.get(concept_including_prefix, 0) + 1
        concept_without_prefix_count_dictionary[concept_without_prefix] = \
            concept_without_prefix_count_dictionary.get(concept_without_prefix, 0) + 1

In [113]:
# Number of lines read from the train file; blank separator lines are included.
train_file_length = len(train_file_lines)
train_length_message = "> length of the train file:\t{0}".format(train_file_length)
print(train_length_message)

> length of the train file:	24791


In [114]:
# Report the number of sentence-ending blank lines counted during the scan.
sentences_message = "> number of sentences:\t{0}".format(sentences_count)
print(sentences_message)

> number of sentences:	3338


In [188]:
# Number of distinct tokens (dictionary keys), not the total token occurrences.
number_of_tokens = len(token_count_dictionary)
tokens_message = "> number of tokens:\t{0}".format(number_of_tokens)
print(tokens_message)

> number of tokens:	1728


In [116]:
# Number of distinct concept labels when the leading marker is kept.
number_of_concepts_including_prefix = len(concept_including_prefix_count_dictionary)
prefixed_concepts_message = "> number of concepts including prefix:\t{0}".format(
    number_of_concepts_including_prefix
)
print(prefixed_concepts_message)

> number of concepts including prefix:	41


In [117]:
# Number of distinct concept labels once the leading marker has been removed.
number_of_concepts_without_prefix = len(concept_without_prefix_count_dictionary)
bare_concepts_message = "> number of concepts without prefix:\t{0}".format(
    number_of_concepts_without_prefix
)
print(bare_concepts_message)

> number of concepts without prefix:	24


In [184]:
# Rank the prefixed concepts from most to least frequent.
sorted_dict = sorted(concept_including_prefix_count_dictionary.items(), key=itemgetter(1), reverse=True)

# Split the (concept, count) pairs into parallel axis lists.
X_axis = [pair[0] for pair in sorted_dict]
Y_axis = [pair[1] for pair in sorted_dict]

assert len(X_axis) == len(Y_axis)

# Vertical bar chart of the frequencies.
bar_trace = Bar(orientation='v', x=X_axis, y=Y_axis, marker=dict(color="#1abc9c"))
figure_layout = Layout(
    title="concepts including prefix distribution",
    xaxis=dict(title='concepts'),
    yaxis=dict(title='frequency', dtick=1000),
    margin=Margin(b=150),
)
plotly.offline.iplot({"data": [bar_trace], "layout": figure_layout})

In [187]:
# Rank the unprefixed concepts from most to least frequent.
sorted_dict = sorted(concept_without_prefix_count_dictionary.items(), key=itemgetter(1), reverse=True)

# Split the (concept, count) pairs into parallel axis lists.
X_axis = [pair[0] for pair in sorted_dict]
Y_axis = [pair[1] for pair in sorted_dict]

assert len(X_axis) == len(Y_axis)

# Vertical bar chart of the frequencies; x tick labels are rotated by 90 degrees.
bar_trace = Bar(orientation='v', x=X_axis, y=Y_axis, marker=dict(color="#1abc9c"))
figure_layout = Layout(
    title="concepts without prefix distribution",
    xaxis=dict(title='concepts', tickangle=90),
    yaxis=dict(title='frequency', dtick=1000),
    margin=Margin(b=150),
)
plotly.offline.iplot({"data": [bar_trace], "layout": figure_layout})

### Train with features file analysis

In [190]:
# Scan the train features file once and collect:
#   - train_file_lines: every line with its newline removed, blank lines included
#   - sentences_count: number of blank lines (each one ends a sentence)
#   - frequency dictionaries for tokens (first field) and POS tags (second field)
token_count_dictionary = {}
pos_count_dictionary = {}
sentences_count = 0
train_file_lines = []

# The context manager closes the file even if the assertion below fails.
with open(TRAIN_FEATS_FILE, "r", encoding="utf8") as train_file:
    for line in train_file:
        line = line.replace("\n", "")
        train_file_lines.append(line)

        if len(line) == 0:  # blank line: end of sentence
            sentences_count += 1
            continue

        line_split = line.split("\t")
        assert len(line_split) == 3, \
            "> malformed line <{0}>: expected 3 tab-separated fields".format(line)

        # The third field is not used here (presumably the lemma - TODO confirm).
        token, pos = line_split[:2]

        token_count_dictionary[token] = token_count_dictionary.get(token, 0) + 1
        pos_count_dictionary[pos] = pos_count_dictionary.get(pos, 0) + 1

In [191]:
# Number of lines read from the train features file; blank lines are included.
train_file_length = len(train_file_lines)
train_length_message = "> length of the train file:\t{0}".format(train_file_length)
print(train_length_message)

> length of the train file:	24791


In [192]:
# Report the number of sentence-ending blank lines counted during the scan.
sentences_message = "> number of sentences:\t{0}".format(sentences_count)
print(sentences_message)

> number of sentences:	3338


In [193]:
# Number of distinct tokens (dictionary keys), not the total token occurrences.
number_of_tokens = len(token_count_dictionary)
tokens_message = "> number of tokens:\t{0}".format(number_of_tokens)
print(tokens_message)

> number of tokens:	1728


In [215]:
# Number of distinct POS tags seen in the second field.
number_of_pos = len(pos_count_dictionary)
pos_message = "> number of pos:\t{0}".format(number_of_pos)
print(pos_message)

> number of pos:	46


In [195]:
# Rank the POS tags from most to least frequent.
sorted_dict = sorted(pos_count_dictionary.items(), key=itemgetter(1), reverse=True)

# Split the (pos, count) pairs into parallel axis lists.
X_axis = [pair[0] for pair in sorted_dict]
Y_axis = [pair[1] for pair in sorted_dict]

assert len(X_axis) == len(Y_axis)

# Vertical bar chart of the frequencies.
bar_trace = Bar(orientation='v', x=X_axis, y=Y_axis, marker=dict(color="#1abc9c"))
figure_layout = Layout(
    title="pos distribution",
    xaxis=dict(title='pos'),
    yaxis=dict(title='frequency', dtick=1000),
    margin=Margin(b=150),
)
plotly.offline.iplot({"data": [bar_trace], "layout": figure_layout})

### Test file analysis

In [197]:
# Scan the test file once and collect:
#   - test_file_lines: every line with its newline removed, blank lines included
#   - sentences_count: number of blank lines (each one ends a sentence)
#   - frequency dictionaries for tokens and for concepts, the latter both with
#     and without the leading B/I/E marker
# NOTE(review): a last sentence that is not followed by a blank line is not
# counted - confirm the data files always end with one.
token_count_dictionary = {}
concept_including_prefix_count_dictionary = {}
concept_without_prefix_count_dictionary = {}
sentences_count = 0
test_file_lines = []

# Match the marker together with its "-" separator so that an unprefixed concept
# that merely starts with B, I or E is not truncated.
concept_prefixes = ("B-", "I-", "E-")

# The context manager closes the file even if an assertion below fails.
with open(TEST_FILE, "r", encoding="utf8") as test_file:
    for line in test_file:
        line = line.replace("\n", "")
        test_file_lines.append(line)

        if len(line) == 0:  # blank line: end of sentence
            sentences_count += 1
            continue

        line_split = line.split("\t")
        assert len(line_split) == 2, \
            "> malformed line <{0}>: expected 2 tab-separated fields".format(line)

        token, concept_including_prefix = line_split

        if concept_including_prefix.startswith(concept_prefixes):
            concept_without_prefix = concept_including_prefix[2:]
        else:
            concept_without_prefix = concept_including_prefix

        token_count_dictionary[token] = token_count_dictionary.get(token, 0) + 1
        concept_including_prefix_count_dictionary[concept_including_prefix] = \
            concept_including_prefix_count_dictionary.get(concept_including_prefix, 0) + 1
        concept_without_prefix_count_dictionary[concept_without_prefix] = \
            concept_without_prefix_count_dictionary.get(concept_without_prefix, 0) + 1

In [198]:
# Number of lines read from the test file; blank separator lines are included.
test_file_length = len(test_file_lines)
test_length_message = "> length of the test file:\t{0}".format(test_file_length)
print(test_length_message)

> length of the test file:	8201


In [199]:
# Report the number of sentence-ending blank lines counted during the scan.
sentences_message = "> number of sentences:\t{0}".format(sentences_count)
print(sentences_message)

> number of sentences:	1084


In [200]:
# Number of distinct tokens (dictionary keys), not the total token occurrences.
number_of_tokens = len(token_count_dictionary)
tokens_message = "> number of tokens:\t{0}".format(number_of_tokens)
print(tokens_message)

> number of tokens:	1039


In [201]:
# Number of distinct concept labels when the leading marker is kept.
number_of_concepts_including_prefix = len(concept_including_prefix_count_dictionary)
prefixed_concepts_message = "> number of concepts including prefix:\t{0}".format(
    number_of_concepts_including_prefix
)
print(prefixed_concepts_message)

> number of concepts including prefix:	39


In [202]:
# Number of distinct concept labels once the leading marker has been removed.
number_of_concepts_without_prefix = len(concept_without_prefix_count_dictionary)
bare_concepts_message = "> number of concepts without prefix:\t{0}".format(
    number_of_concepts_without_prefix
)
print(bare_concepts_message)

> number of concepts without prefix:	23


In [214]:
# Rank the prefixed concepts from most to least frequent.
sorted_dict = sorted(concept_including_prefix_count_dictionary.items(), key=itemgetter(1), reverse=True)

# Split the (concept, count) pairs into parallel axis lists.
X_axis = [pair[0] for pair in sorted_dict]
Y_axis = [pair[1] for pair in sorted_dict]

assert len(X_axis) == len(Y_axis)

# Vertical bar chart of the frequencies.
bar_trace = Bar(orientation='v', x=X_axis, y=Y_axis, marker=dict(color="#3498db"))
figure_layout = Layout(
    title="concepts including prefix distribution",
    xaxis=dict(title='concepts'),
    yaxis=dict(title='frequency', dtick=1000),
    margin=Margin(b=150),
)
plotly.offline.iplot({"data": [bar_trace], "layout": figure_layout})

In [213]:
# Rank the unprefixed concepts from most to least frequent.
sorted_dict = sorted(concept_without_prefix_count_dictionary.items(), key=itemgetter(1), reverse=True)

# Split the (concept, count) pairs into parallel axis lists.
X_axis = [pair[0] for pair in sorted_dict]
Y_axis = [pair[1] for pair in sorted_dict]

assert len(X_axis) == len(Y_axis)

# Vertical bar chart of the frequencies; x tick labels are rotated by 90 degrees.
bar_trace = Bar(orientation='v', x=X_axis, y=Y_axis, marker=dict(color="#3498db"))
figure_layout = Layout(
    title="concepts without prefix distribution",
    xaxis=dict(title='concepts', tickangle=90),
    yaxis=dict(title='frequency', dtick=1000),
    margin=Margin(b=150),
)
plotly.offline.iplot({"data": [bar_trace], "layout": figure_layout})

### Test with features file analysis

In [205]:
# Scan the test features file once and collect:
#   - test_file_lines: every line with its newline removed, blank lines included
#   - sentences_count: number of blank lines (each one ends a sentence)
#   - frequency dictionaries for tokens (first field) and POS tags (second field)
token_count_dictionary = {}
pos_count_dictionary = {}
sentences_count = 0
test_file_lines = []

# The context manager closes the file even if the assertion below fails.
with open(TEST_FEATS_FILE, "r", encoding="utf8") as test_file:
    for line in test_file:
        line = line.replace("\n", "")
        test_file_lines.append(line)

        if len(line) == 0:  # blank line: end of sentence
            sentences_count += 1
            continue

        line_split = line.split("\t")
        assert len(line_split) == 3, \
            "> malformed line <{0}>: expected 3 tab-separated fields".format(line)

        # The third field is not used here (presumably the lemma - TODO confirm).
        token, pos = line_split[:2]

        token_count_dictionary[token] = token_count_dictionary.get(token, 0) + 1
        pos_count_dictionary[pos] = pos_count_dictionary.get(pos, 0) + 1

In [206]:
# Number of lines read from the test features file; blank lines are included.
test_file_length = len(test_file_lines)
test_length_message = "> length of the test file:\t{0}".format(test_file_length)
print(test_length_message)

> length of the test file:	8201


In [207]:
# Report the number of sentence-ending blank lines counted during the scan.
sentences_message = "> number of sentences:\t{0}".format(sentences_count)
print(sentences_message)

> number of sentences:	1084


In [208]:
# Number of distinct tokens (dictionary keys), not the total token occurrences.
number_of_tokens = len(token_count_dictionary)
tokens_message = "> number of tokens:\t{0}".format(number_of_tokens)
print(tokens_message)

> number of tokens:	1039


In [209]:
# Number of distinct POS tags seen in the second field.
number_of_pos = len(pos_count_dictionary)
pos_message = "> number of pos:\t{0}".format(number_of_pos)
print(pos_message)

> number of pos:	46


In [212]:
# Rank the POS tags from most to least frequent.
sorted_dict = sorted(pos_count_dictionary.items(), key=itemgetter(1), reverse=True)

# Split the (pos, count) pairs into parallel axis lists.
X_axis = [pair[0] for pair in sorted_dict]
Y_axis = [pair[1] for pair in sorted_dict]

assert len(X_axis) == len(Y_axis)

# Vertical bar chart of the frequencies.
bar_trace = Bar(orientation='v', x=X_axis, y=Y_axis, marker=dict(color="#3498db"))
figure_layout = Layout(
    title="pos distribution",
    xaxis=dict(title='pos'),
    yaxis=dict(title='frequency', dtick=500),
    margin=Margin(b=150),
)
plotly.offline.iplot({"data": [bar_trace], "layout": figure_layout})