In [1]:
import numpy as np
import pandas as pd
import os
import json
import markdown

In [2]:
def new_data(directory='/home/aditya/Documents/citibot/newdata', data_name="table.csv"):
    """Load the new dataset file into a pandas DataFrame.

    Parameters
    ----------
    directory : str
        Folder that contains the dataset file.
    data_name : str
        File name of the dataset. The extension selects the reader:
        ``.csv`` -> ``pd.read_csv``, ``.xls``/``.xlsx`` (and the legacy
        ``.xlx`` spelling) -> ``pd.read_excel``, ``.json`` -> ``json.load``.

    Returns
    -------
    pandas.DataFrame
        The loaded data. A JSON file is loaded as a single row with index 1.

    Raises
    ------
    ValueError
        If the file extension is not one of the supported formats.
    """
    dataset_loc = os.path.join(directory, data_name)

    # read the data file - csv, excel and json
    if data_name.endswith('.csv'):
        return pd.read_csv(dataset_loc)

    # '.xlx' is kept for backward compatibility; '.xls' is the real extension.
    if data_name.endswith(('.xls', '.xlx', '.xlsx')):
        return pd.read_excel(dataset_loc)

    if data_name.endswith('.json'):
        # Use a context manager so the file handle is always closed.
        with open(dataset_loc, 'r') as json_file:
            record = json.load(json_file)
        return pd.DataFrame(record, index=[1])

    # Previously this fell through and failed with UnboundLocalError.
    raise ValueError("Unsupported dataset format: {!r}".format(data_name))

In [4]:
# This function should be triggered by a listener

DATASET = new_data()

# Cast the 'Specimen No' column to str before the columns are renamed.
DATASET['Specimen No'] = DATASET['Specimen No'].astype('str')

# Rename every column by position: column_0, column_1, ...
DATASET.columns = [
    ("column " + str(position)).replace(" ", "_")
    for position in range(len(DATASET.columns))
]

# dict of columns in new data with corresponding dtypes
FEATURES = {name: DATASET[name].dtype for name in DATASET.columns}
DATASET.head()

In [73]:
# List all the possible entities
def get_entities(threshold_value, dataset=None, features=None):
    """Collect entity values and primary-key columns from a dataset.

    Only columns whose dtype compares equal to ``'O'`` or ``'int64'`` are
    considered. For each such column, at most ``threshold_value`` unique
    values (in order of first appearance) are stored as its entity values.
    A column whose number of unique values equals the number of rows is
    additionally reported as a primary key.

    Parameters
    ----------
    threshold_value : int
        Maximum number of unique values kept per column.
    dataset : pandas.DataFrame, optional
        Data to inspect. Defaults to the module-level ``DATASET``.
    features : dict, optional
        Mapping of column name -> dtype. Defaults to the module-level
        ``FEATURES``.

    Returns
    -------
    (dict, list)
        ``entities`` maps column name -> list of unique values, and
        ``primary_keys`` lists the columns whose values are all distinct.
    """
    if dataset is None:
        dataset = DATASET
    if features is None:
        features = FEATURES

    primary_keys = []
    entities = {}
    row_count = dataset.shape[0]

    for col, dtype in features.items():
        if not (dtype == 'O' or dtype == 'int64'):
            continue

        # Compute the unique values once instead of once per branch.
        unique_values = dataset[col].unique()

        if unique_values.shape[0] == row_count:
            primary_keys.append(col)  # Make this the PRIMARY KEY

        # Slicing is a no-op when there are no more than threshold_value
        # unique values, so one expression covers every case.
        entities[col] = unique_values[:threshold_value].tolist()

    return entities, primary_keys

# Upper bound on how many unique values get_entities keeps per column.
threshold_value = 20
# Build the entity values and the primary-key column list via get_entities.
ENTITIES, PRIMARY_KEY = get_entities(threshold_value)

In [66]:
# For query generation, any number of previously determined entities (values) can be used.
# For each intent, what should differ between the sets of queries is the meaning each query conveys.
# For example, an intent "cost" should have questions like:
#     what is the cost {intent} for entity_1 {value}?
# Since the intent / feature name is used as part of the query itself, an important thing to consider is
# that feature names should be simple and directly convey their purpose.
# For example, "cost" works as a feature name, but with a name like "cost per head" the intent name
# is no longer the same as the feature name; we would then have to extract the real intent
# from the feature name, which is not possible right now.

def get_questions(intent, entity="{}"):
    """Build the list of example questions for an intent.

    Each template has two slots: the intent name and an entity value.

    Parameters
    ----------
    intent : str
        Intent (feature) name inserted into every question.
    entity : str, optional
        Text for the entity slot. Defaults to a literal ``"{}"`` so the
        returned questions stay templates whose entity can be filled later.

    Returns
    -------
    list of str
        One question per template, in template order.
    """
    templates = (
        "what is {} for {}",
        "Tell me something about {} with {}",
        "Give me information about {} for the {}",
    )
    # The original passed only `intent` to two-slot templates, which raised
    # IndexError on every call; both slots are now always supplied.
    return [template.format(intent, entity) for template in templates]

# Map every feature (column) name to the list of questions generated for it.

INTENTS = {feature: get_questions(feature) for feature in FEATURES}
INTENTS

In [87]:
# Scratch check: import `lis` from the `actions` module and print its value.
from actions import lis

print(lis)

[9, 0]


In [80]:
# The current nlu.md file has the following sections: (1) intents, (2) lookups, (3) synonyms.
# Three functions are defined below, one to update each of these sections.
# Future optimisation: append the new data instead of rewriting the whole file.


def synonyms_to_md(data_md, entity_dict):
    """Append one synonym section per entity value to ``data_md``.

    Entities whose ``FEATURES`` dtype equals ``'int64'`` and entities listed
    in ``PRIMARY_KEY`` are skipped. For every remaining value a section of
    the form ``" synonym:<value>"`` followed by its lower-case, upper-case
    and title-case variants is produced.

    Parameters
    ----------
    data_md : list of str
        Sections of the markdown file; extended in place.
    entity_dict : dict
        Mapping of entity name -> list of values.

    Returns
    -------
    list of str
        The same ``data_md`` list, with the new sections appended.
    """
    sections = []
    for entity, values in entity_dict.items():
        if FEATURES[entity] == 'int64' or entity in PRIMARY_KEY:
            continue

        # Now the only entities left are character dtype with limited uniques
        # For each value in entity add synonyms
        for val in values:
            # Coerce to str so non-string values (e.g. NaN in an object
            # column) no longer raise AttributeError on .lower().
            text = str(val)
            variants = [text.lower(), text.upper(), text.title()]

            section = " synonym:{}\n".format(val)
            section += "".join("- {}\n".format(variant) for variant in variants)
            section += "\n"
            sections.append(section)

    data_md += sections
    return data_md

# lookup should not include primary_key entity, integer entities

def lookups_to_md(data_md, entity_dict):
    """Append one lookup section per eligible entity to ``data_md``.

    An entity is skipped when its ``FEATURES`` dtype equals ``'int64'`` or
    when it is listed in ``PRIMARY_KEY``. Every other entity yields a
    section made of a ``" lookup:<entity>"`` header, one ``"- <value>"``
    line per value and a trailing blank line.

    ``data_md`` is extended in place and also returned.
    """
    sections = []
    for entity, values in entity_dict.items():
        header = " lookup:{}\n".format(entity)

        excluded = FEATURES[entity] == 'int64' or entity in PRIMARY_KEY
        if excluded:
            continue

        parts = [header]
        parts.extend("- {}\n".format(val) for val in values)
        parts.append("\n")
        sections.append("".join(parts))

    data_md += sections
    return data_md

def intents_to_md(data_md, intent_dict):
    """Append one intent section per intent to ``data_md``.

    A newline is first added to the last existing section so the appended
    sections start on a fresh line. Each intent then yields a section made
    of a ``" intent:<name>"`` header, one ``"- <question>"`` line per
    question and a trailing blank line.

    ``data_md`` is modified in place and also returned.
    """
    data_md[-1] += '\n'

    sections = []
    for intent, questions in intent_dict.items():
        question_lines = "".join("- {}\n".format(ques) for ques in questions)
        sections.append(" intent:{}\n".format(intent) + question_lines + "\n")

    data_md.extend(sections)
    return data_md

# Read the existing NLU file and split it into its '##'-delimited sections.
# The context manager closes the handle, which the bare open() never did.
with open('/home/aditya/Documents/citibot/newdata/nlu.md', 'r') as nlu:
    s = nlu.read().split('##')

# Each helper extends the same list in place and returns it.
nlu_intent = intents_to_md(s, INTENTS)
nlu_lookup = lookups_to_md(nlu_intent, ENTITIES)
# nlu_synonyms = synonyms_to_md(nlu_lookup, ENTITIES)

new_nlu = '##'.join(nlu_lookup)

# Write the regenerated NLU markdown; the file is closed even on error.
with open("demofile2.md", "w") as f:
    f.write(new_nlu)

In [79]:
# the current stories.md file has storylines with respect to a classified intent
# For the dynamic addition of data, contextual storylines cannot be added due to their increased complexity
# name : ## {intent} path

def new_stories(story, intent_dict):
    """Append one single-step story per intent and collect the action names.

    For every intent a section ``" <intent> path 1"`` is created in which
    the intent triggers the action ``action_<intent>``.

    Parameters
    ----------
    story : list of str
        Sections of the stories file; extended in place.
    intent_dict : dict
        Mapping whose keys are the intent names.

    Returns
    -------
    (list of str, list of str)
        The same ``story`` list with the new sections appended, and the list
        of generated action names in intent order.
    """
    sections = []
    actions = []
    for intent in intent_dict:
        action = "action_{}".format(intent)
        sections.append(" {} path 1\n* {}\n  - {}\n\n".format(intent, intent, action))
        actions.append(action)

    story += sections
    # Return the full list: the original returned only the last action
    # string, and raised UnboundLocalError when intent_dict was empty.
    return story, actions
    
# Read the existing stories file and split it into its '##'-delimited sections.
# The context manager closes the handle, which the bare open() never did.
with open('/home/aditya/Documents/citibot/newdata/stories.md', 'r') as story:
    s = story.read().split('##')

story_text, Actions = new_stories(s, INTENTS)
new_story = '##'.join(story_text)

# Write the regenerated stories markdown; the file is closed even on error.
with open("story.md", "w") as f:
    f.write(new_story)

In [53]:
# Scratch experiment: append two hand-written intent sections to the parsed
# nlu.md sections and join them back with '##'.
# The context manager closes the handle, which the bare open() never did.
with open('/home/aditya/Documents/citibot/newdata/nlu.md', 'r') as mdf:
    s = mdf.read().split('##')

h, p = "heyaaaa", "how"
string = ""

# Each section is a header line, one example line and a trailing blank line.
string1 = " intent:{}\n- {}\n\n".format(h, p)
string2 = " intent:{}\n- {}\n\n".format(p, h)

ss = [string1, string2]
print(ss)

# End the last existing section with a newline before appending new ones.
s[-1] = s[-1] + '\n'
s += ss
print(s)
v = '##'.join(s)

['', ' intent:greet\n- hey\n- hello\n- hi\n- good morning\n- good evening\n- hey there\n\n', ' intent:hello_world\n- hello world\n- programming\n- c++\n- java\n\n', ' intent:goodbye\n- bye\n- goodbye\n- see you around\n- see you later\n\n', ' intent:affirm\n- yes\n- indeed\n- of course\n- that sounds good\n- correct\n\n', " intent:deny\n- no\n- never\n- I don't think so\n- don't like that\n- no way\n- not really\n\n", " intent:mood_great\n- perfect\n- very good\n- great\n- amazing\n- wonderful\n- I am feeling very good\n- I am great\n- I'm good\n\n", ' intent:mood_unhappy\n- sad\n- very sad\n- unhappy\n- bad\n- very bad\n- awful\n- terrible\n- not very good\n- extremely sad\n- so sad\n\n', ' intent:bot_challenge\n- are you a bot?\n- are you a human?\n- am I talking to a bot?\n- am I talking to a human?\n\n', ' intent:mycolor\n- I will choose the color [green](color)\n- how is [blue](color)\n- color is [black](color)\n- color was [blue](color)\n\n', ' lookup:color\n- green\n- black\n- b