# Query Unfolding

In [1]:
import json
import Datalog_Parsing as dp;

In [2]:
# read metadictionary.json
# the with-block closes the file handle as soon as the JSON has been parsed
with open('metadictionary.json') as md_file:
    md = json.load(md_file)

In [3]:
def unfold_datalog_body(datalog, metadictionary):
    '''Unfold one mediated-schema subgoal into its source-level subgoals.

    datalog        -- subgoal string (predicate and atoms), e.g.
                      'mlfeatures(n,y,month,_,sales,_,_,sent)'
    metadictionary -- mapping metadictionary; the entry stored under the
                      subgoal's predicate supplies 'datalog' (the mediated
                      relation's atom names, in position order) and 'mapping'
                      (one entry per source relation, each with 'source',
                      'table' and 'source.datalog').

    Returns a string of 'source.table(atom,...)' subgoals joined with commas,
    in the order the mappings are listed in the metadictionary.

    Raises KeyError when the predicate has no metadictionary entry.  Indexing
    the parsed atoms by position fails (IndexError, assuming
    dp.parsePredicateAtoms returns them as a list) when the subgoal has fewer
    atoms than the mediated relation declares.
    '''
    from collections import OrderedDict

    parsed_dl = dp.parsePredicateAtoms(datalog)
    entry = metadictionary[parsed_dl['predicate']]
    atoms = parsed_dl['atoms']

    # mediated atom name -> atom written at that position in the subgoal
    mapping_dict = {}
    for pos, name in enumerate(entry['datalog']):
        mapping_dict[name] = atoms[pos]

    # OrderedDict keeps the subgoals in mapping order on every interpreter
    # (a plain dict has arbitrary order on Python 2); a repeated source.table
    # still keeps only its last mapping, as before.
    src_dict = OrderedDict()
    for m in entry['mapping']:
        src_key = m['source'] + '.' + m['table']
        # a source atom without a counterpart in the mediated relation is
        # emitted as the don't-care placeholder '_' instead of raising KeyError
        src_dict[src_key] = [mapping_dict.get(s, '_') for s in m['source.datalog']]

    # convert filled source dictionary to datalog
    return ','.join(k + '(' + ','.join(v) + ')' for k, v in src_dict.items())

In [4]:
# example: unfold a single mlfeatures subgoal with the metadictionary in md;
# the bare call on the last line makes the notebook display the result
b = 'mlfeatures(n,y,month,_,sales,_,_,sent)'
unfold_datalog_body(b, md)

u'S1.mlfeatures(n,y,month,_,sales,_,_),S2.sentiment(y,month,n,sent)'

In [5]:
# predicates that the group_by rewriting further down checks against to
# decide whether a subgoal is a mediated-schema relation
ms_relations = ['mlfeatures']
ms_relations

['mlfeatures']

In [7]:
# unfold datalog - unfolds bodies only
def unfold_datalog(datalog, metadictionary):
    '''Unfold every body subgoal of a datalog program via the metadictionary.

    The program is parsed with dp.processDatalog; for each entry of
    'single_parts' the 'body' subgoals are replaced by their unfolded form
    (see unfold_datalog_body) and the rule is rebuilt with
    dp.buildDatalogString.  The rebuilt rules are joined with '.' and the
    whole string is lower-cased before it is returned.
    '''
    parsed = dp.processDatalog(datalog)
    # parsed a second time so the rewritten bodies go into a separate structure
    rewritten = dp.processDatalog(datalog)

    rule_strings = []
    for pos, rule in enumerate(parsed['single_parts']):
        new_body = [unfold_datalog_body(subgoal, metadictionary)
                    for subgoal in rule['body']]

        target_rule = rewritten['single_parts'][pos]
        target_rule['body'] = new_body
        rule_strings.append(dp.buildDatalogString(target_rule))

    return '.'.join(rule_strings).lower()

# test using ml_predict_query
datalog = dp.ml_predict_query
# print(...) with a single argument prints the same text on Python 2 and is
# also valid syntax on Python 3 (the bare print statement is not)
print(datalog)
unfold_datalog(datalog, md)

Ans (nodeId, yr, mn, sales, vol, pm_sales, pm_vol, p3m_sales, p3m_vol, 
            p12m_sales, p12m_vol, pm_numreviews, pm_avgrating, p3m_numreviews, p3m_avgrating, p12m_numreviews, 
            p12m_avgrating, pm_avgsntp, p3m_avgsntp, p12m_avgsntp ) :-
        mlfeatures ( nodeId, yr, mn, sales, vol, pm_sales, pm_vol, p3m_sales, p3m_vol, p12m_sales, 
            p12m_vol, pm_numreviews, pm_avgrating, p3m_numreviews, p3m_avgrating, p12m_numreviews, 
            p12m_avgrating, pm_avgsntp, p3m_avgsntp, p12m_avgsntp ) , 
        nodeId in (15, 45, 121), 
        mn=12, 
        yr=2015.


u'ans(nodeid,yr,mn,sales,vol,pm_sales,pm_vol,p3m_sales,p3m_vol,p12m_sales,p12m_vol,pm_numreviews,pm_avgrating,p3m_numreviews,p3m_avgrating,p12m_numreviews,p12m_avgrating,pm_avgsntp,p3m_avgsntp,p12m_avgsntp):-s1.mlfeatures(nodeid,yr,mn,sales,vol,pm_sales,pm_vol),s2.sentiment(yr,mn,nodeid,p3m_sales),mn=12,yr=2015,nodeid in (15,45,121)'

In [9]:
# test datalog with a mediated schema relation (mlfeatures) as the subgoal of group_by
# NOTE(review): the filter below names `nodeid`, but no atom called `nodeid`
# appears in the head or in the mlfeatures subgoal - confirm which variable
# is intended.
datalog  = '''ans (n, y, m, agg_sales) :-
    group_by(mlfeatures ( n, y, m, s, v, pm_sales, pm_vol, p3m_sales, p3m_vol, p12m_sales,
    p12m_vol, pm_numreviews, pm_avgrating, p3m_numreviews, p3m_avgrating, p12m_numreviews,
    p12m_avgrating, pm_avgsntp, p3m_avgsntp, p12m_avgsntp ) , [n], agg_sales=sum(s)),
    nodeid in (1,2,3),
    m=12,
    y=2015.
    '''

In [10]:
# work in progress - eventually turn into function

# turn mediated schema predicates within orderby, groupby, topn into intermediate steps:
# each rule whose group_by subgoal is a mediated-schema relation (listed in
# ms_relations) gets an extra rule emitted in front of it that materialises
# that subgoal under the name <predicate>_gb_intermed.
#datalog = dp.analytic_query_1

processed = dp.processDatalog(datalog)
# second parse, reserved for the rewritten group-by step (see TODO below)
processed_intermed_step = dp.processDatalog(datalog)

datalog_strings = []

for idx,part in enumerate(processed['single_parts']):

    gb_parsed = part['groupby.parsed']

    if gb_parsed is not None:
        gb_subgoal = gb_parsed['predicate']
        if gb_subgoal['predicate'] in ms_relations:
            # build datalog for new intermediate step
            gb_pred = gb_subgoal['predicate']
            gb_atoms = gb_subgoal['atoms']
            intermed_step_body = gb_pred + '(' + ','.join(gb_atoms) + ')'
            intermed_step_head = gb_pred + '_gb_intermed(' + ','.join(gb_atoms) + '):-' # how to ensure always unique?
            intermed_step_datalog = intermed_step_head + intermed_step_body

            # update group by step - use same group by but replace ms_relation subgoal with head of intermediate step
            #processed_intermed_step = ...

            # emit the intermediate step only when one was built for this rule;
            # previously a group_by over a non-mediated relation hit a NameError
            # (or reused the step left over from an earlier rule)
            datalog_strings.append(intermed_step_datalog)

    # do same "if ob_parsed is not None" block with orderby, topn

    # every rule is kept in the output, including rules without a group_by
    # (these used to be dropped from the result)
    datalog_strings.append(dp.buildDatalogString(part))

# joined once after the loop so the result is defined even when the program
# contains no rules
datalog_result = '.'.join(datalog_strings).lower()

datalog_result

'mlfeatures_gb_intermed(n,y,m,s,v,pm_sales,pm_vol,p3m_sales,p3m_vol,p12m_sales,p12m_vol,pm_numreviews,pm_avgrating,p3m_numreviews,p3m_avgrating,p12m_numreviews,p12m_avgrating,pm_avgsntp,p3m_avgsntp,p12m_avgsntp):-mlfeatures(n,y,m,s,v,pm_sales,pm_vol,p3m_sales,p3m_vol,p12m_sales,p12m_vol,pm_numreviews,pm_avgrating,p3m_numreviews,p3m_avgrating,p12m_numreviews,p12m_avgrating,pm_avgsntp,p3m_avgsntp,p12m_avgsntp).ans(n,y,m,agg_sales):-group_by(mlfeatures(n,y,m,s,v,pm_sales,pm_vol,p3m_sales,p3m_vol,p12m_sales,p12m_vol,pm_numreviews,pm_avgrating,p3m_numreviews,p3m_avgrating,p12m_numreviews,p12m_avgrating,pm_avgsntp,p3m_avgsntp,p12m_avgsntp),[n],agg_sales=sum(s)),m=12,y=2015,nodeid in (1,2,3)'