# Cleaning Data

In [1]:
import pandas as pd
import os, json, pickle, string
from collections import Counter
import numpy as np
MainPath = 'C:/Users/Francesco/Desktop/chatbot/'

In [2]:
# Retrieve Babel Domains Triples
if 'babeldomains3.json' not in os.listdir(MainPath + 'chatbot_data/others'):
    # open file from Babel Domains
    path = 'BabelDomains_full/BabelDomains/babeldomains_babelnet.txt'
    with open(path, 'r') as f:
        lstBabelDomains = [x.strip().split('\t') for x in f.readlines()]
    # Keeping only domains with no-disambiguation
    only3 = list(filter(lambda x: len(x) == 3, lstBabelDomains))
    dic_only3 = {x[0]: x[1] for x in only3}
    with open(MainPath + 'chatbot_data/others/babeldomains3.json', 'w') as f:
        json.dump(dic_only3, f)
else:
    with open(MainPath + 'chatbot_data/others/'+'babeldomains3.json', 'r') as f: dic_only3 = json.load(f)

In [3]:
# Load the Data downloaded from KBS Server
with open('kbs_entries.json', 'r') as f: new = f.read()
json_kbs1M = json.loads(new)
dic_kbs1M = {x['HASH']: x for x in json_kbs1M}

In [4]:
print('Total number of obs: %s' %len(dic_kbs1M))

Total number of obs: 1128884


Remove observations without BabelNet IDs

In [5]:
noBabelID = list(filter(lambda x: 'bn:' not in dic_kbs1M[x]['c1'] or 'bn:' not in dic_kbs1M[x]['c2'],
       dic_kbs1M.keys()))

In [6]:
print('Total number of obs: %s' %(len(dic_kbs1M)- len(noBabelID)))
print ('Observations with No-Babel ID for one of the concepts: %s'%len(noBabelID))
print ('Portion of discarded observations: %s'%(len(noBabelID)/len(dic_kbs1M)))

Total number of obs: 949185
Observations with No-Babel ID for one of the concepts: 179699
Portion of discarded observations: 0.15918287441402304


In [7]:
newKeys = set(list(dic_kbs1M.keys())) - set(noBabelID)
dic_kbs1M = {x : dic_kbs1M[x] for x in newKeys}

In [8]:
# Distribution of the number of concepts involved in each question
countBabelID = Counter([dic_kbs1M[x]['c1'].count('bn:') + dic_kbs1M[x]['c2'].count('bn:') for x in dic_kbs1M] )
tuplesCounter = sorted(countBabelID.items(), key = lambda x: x[1], reverse = True)
dfCounter = pd.DataFrame.from_records(tuplesCounter, columns=[['TotalConcepts', 'Frequency']])
dfCounter['Ratio'] = dfCounter['Frequency']/len(dic_kbs1M)
dfCounter.head()

Unnamed: 0,TotalConcepts,Frequency,Ratio
0,2,843074,0.888208
1,3,20526,0.021625
2,4,19703,0.020758
3,5,16219,0.017087
4,6,11946,0.012586


Since more than the 88% of the observations involve only 2 concept, we prefer to discard all the others.

In [9]:
HashConceptsRaw = [(hash_code,d['c1'], d['c2']) for (hash_code, d) in dic_kbs1M.items() 
                  if d['c1'].count('bn:') == 1 and d['c2'].count('bn:') == 1]
HashConcepts = [(hash_code, 'bn:' + c1.split('bn:')[1][:9], 'bn:' + c2.split('bn:')[1][:9]) 
                for (hash_code, c1, c2) in HashConceptsRaw]

In [10]:
print('Total number of obs: %s' %len(HashConcepts))
print ('Observations with more than 2 concepts: %s'%(len(dic_kbs1M)- len(HashConcepts)))
print ('Portion of discarded observations: %s'%(1 - len(HashConcepts)/len(dic_kbs1M)))

Total number of obs: 843074
Observations with more than 2 concepts: 106111
Portion of discarded observations: 0.11179169498043062


Now, looking at the Domains already retrived, we connect (C1, C2) - Domains

In [11]:
HashConceptsDomains = {}
for (hash_code, c1, c2) in HashConcepts:
    new_d = []
    if c1 in dic_only3:
        new_d.append(dic_only3[c1])
    if c2 in dic_only3:
        new_d.append(dic_only3[c2])
    new_d = list(set(new_d))
    if new_d != []:
        HashConceptsDomains[(hash_code, c1, c2)] = new_d

In [12]:
print('Total number of obs: %s' %len(HashConceptsDomains))
print('Observations without edge Domain - (C1, C2): %s'%(len(HashConcepts) - len(HashConceptsDomains)))
print ('Portion of discarded observations: %s'%(len(HashConcepts)/len(HashConceptsDomains) - 1))

Total number of obs: 658946
Observations without edge Domain - (C1, C2): 184128
Portion of discarded observations: 0.2794280563202447


We lost the 28% of observations, since they don't have an association with any domain.

# IdxRelations, IdxDomains and DataDiz saved

In [13]:
path = MainPath + 'BabelDomains_full/'
with open(path + 'domain_list.txt', 'r') as f: lst_domains = list(map(lambda x: x.strip(), 
                                                                      f.readlines()))

IdxDomains = {value: idx for idx, value in enumerate(lst_domains)}
with open(MainPath +'chatbot_data/others/IdxDomains.json', 'w') as f: json.dump(IdxDomains, f)

DataDiz = {}
for hc in HashConceptsDomains:
    obs = {
        'question' : dic_kbs1M[hc[0]]['question'],
        'relation' : dic_kbs1M[hc[0]]['relation'],
        'domain' : HashConceptsDomains[hc],
        'answer' : dic_kbs1M[hc[0]]['answer'],
        'concepts': (hc[1], hc[2])
    }
    DataDiz[hc[0]] = obs    
    
IdxRelations = {value: idx for idx, value in enumerate(sorted(set([DataDiz[hashcode]['relation'] 
                                                         for hashcode in DataDiz])))}
with open(MainPath +'chatbot_data/others/IdxRelations.json', 'w') as f: json.dump(IdxRelations, f)



path = MainPath + 'chatbot_data/others/'
with open(path + 'DataDiz.json', 'w') as outfile: json.dump(DataDiz, outfile)
pickle.dump(HashConceptsRaw, open(path + 'HashConceptsRaw.p', "wb" ))