In [1]:
import numpy as np
import pandas as pd
import os
import json
import markdown

In [2]:
def new_data(directory='/home/aditya/Documents/citibot/newdata', data_name="table.csv"):
    """Load the new dataset file into a pandas DataFrame.

    Parameters
    ----------
    directory : str
        Folder that contains the dataset file. Defaults to the location the
        notebook was originally written against.
    data_name : str
        Name of the dataset file. Its extension (case-insensitive) selects the
        reader: ``.csv``, ``.xls`` / ``.xlsx`` or ``.json``.

    Returns
    -------
    pandas.DataFrame
        The loaded data. A JSON file is loaded as a single row whose index
        label is 1.

    Raises
    ------
    ValueError
        If the file extension is not one of the supported formats.
    """
    DATASET_LOC = os.path.join(directory, data_name)
    extension = os.path.splitext(data_name)[1].lower()

    # read the data file - csv, excel and json
    if extension == '.csv':
        return pd.read_csv(DATASET_LOC)

    # '.xlx' is kept only because earlier versions of this function accepted it;
    # the real legacy Excel extension is '.xls'.
    if extension in ('.xls', '.xlsx', '.xlx'):
        return pd.read_excel(DATASET_LOC)

    if extension == '.json':
        # Context manager so the file handle is closed even if parsing fails.
        with open(DATASET_LOC, 'r') as json_file:
            return pd.DataFrame(json.load(json_file), index=[1])

    # Fail loudly instead of hitting an unbound local variable at `return`.
    raise ValueError(
        "Unsupported dataset format {!r}; expected .csv, .xls, .xlsx or .json".format(data_name)
    )

In [3]:
# This function should be triggered by a listener.
# For now it is called by hand: load the new dataset and preview its first rows.

DATASET = new_data()
DATASET.head()

Unnamed: 0,Specimen No,WORKING GAP IN MM,Magnetic tool Rotational Speed,Finishing Time (sec),Feed mm/min,Unnamed: 5,Output
0,SPECIMEN1,1.0,300.0,36.0,1.25,10,0.0027
1,SPECIMEN2,2.0,450.0,48.0,1.25,10,0.0029
2,SPECIMEN3,5.0,600.0,64.0,1.25,10,0.0022
3,SPECIMEN4,1.0,300.0,48.0,1.66,20,0.0008
4,SPECIMEN5,2.0,450.0,64.0,1.66,20,0.0012


In [4]:
# Normalise the freshly loaded frame, then record each column's dtype.

# Force the specimen identifier to string. The guard keeps this cell re-runnable:
# once the headers have been renamed below, 'Specimen No' no longer exists and an
# unguarded lookup would raise KeyError.
if 'Specimen No' in DATASET.columns:
    DATASET['Specimen No'] = DATASET['Specimen No'].astype('str')

# Replace the original headers with positional names: column_0, column_1, ...
DATASET.columns = ["column_" + str(i) for i in range(len(DATASET.columns))]

# dict of columns in new data with corresponding dtypes
FEATURES = {col: DATASET[col].dtype for col in DATASET}

In [5]:
# List all the possible entities 
def get_entities(dataset=None, features=None, threshold_value=20):
    """Collect candidate entity values for every object/int64 column.

    Parameters
    ----------
    dataset : pandas.DataFrame, optional
        Frame to scan. When omitted, the notebook-level ``DATASET`` is used.
    features : dict, optional
        Mapping of column name -> dtype. When omitted, the notebook-level
        ``FEATURES`` is used if ``dataset`` was omitted too; otherwise the
        mapping is derived from ``dataset``.
    threshold_value : int, optional
        Maximum number of distinct values kept per column (default 20).

    Returns
    -------
    dict
        Column name -> list of at most ``threshold_value`` distinct values, in
        order of first appearance. Columns whose dtype is neither object ('O')
        nor 'int64' are left out.
    """
    if dataset is None:
        dataset = DATASET
        if features is None:
            features = FEATURES
    elif features is None:
        features = {col: dataset[col].dtype for col in dataset}

    entities = {}
    for col, dtype in features.items():
        # Only object and int64 columns are treated as entity sources.
        if not (dtype == 'O' or dtype == 'int64'):
            continue

        # Whether a column is fully unique (a primary-key candidate), has more
        # than `threshold_value` distinct values, or fewer, the stored result
        # is always the first `threshold_value` distinct values, so one slice
        # covers every case and unique() runs only once per column.
        distinct_values = dataset[col].unique()
        entities[col] = distinct_values[:threshold_value].tolist()
    return entities

In [6]:
# Column name -> list of distinct values for the object/int64 columns of DATASET.
ENTITIES = get_entities()

In [10]:
# For query generation, any number of the previously determined entities (values) can be used.
# For each intent, what should differ between the sets of queries is the meaning each set conveys.
# E.g. if we have an intent "cost", it should have questions like:
# what is the cost {intent} for entity_1 {value}?
# Since the intent / feature name is used as part of the query itself, an important thing to consider is
# that feature names should be simple and directly convey their purpose.
# E.g. a feature name can be "cost", but if the feature name is "cost per head" the issue is that
# the intent name won't be the same as the feature name; once this happens we would have to extract the
# real intent from the feature names, which is not possible right now.

def get_questions(intent):
    """Return the template questions for `intent`, with its name filled in."""
    templates = (
        "what is {}",
        "Tell me something about {}",
        "Give me information about {}",
    )
    return [template.format(intent) for template in templates]

In [13]:
# Map every feature name (used as the intent name) to its list of template questions.

INTENTS = {feature: get_questions(feature) for feature in FEATURES}
INTENTS

{'column_0': ['what is column_0',
  'Tell me something about column_0',
  'Give me information about column_0'],
 'column_1': ['what is column_1',
  'Tell me something about column_1',
  'Give me information about column_1'],
 'column_2': ['what is column_2',
  'Tell me something about column_2',
  'Give me information about column_2'],
 'column_3': ['what is column_3',
  'Tell me something about column_3',
  'Give me information about column_3'],
 'column_4': ['what is column_4',
  'Tell me something about column_4',
  'Give me information about column_4'],
 'column_5': ['what is column_5',
  'Tell me something about column_5',
  'Give me information about column_5'],
 'column_6': ['what is column_6',
  'Tell me something about column_6',
  'Give me information about column_6']}

In [92]:
# Read nlu.md and split it on '##' so that each element of `s` holds one section
# (e.g. an `intent:` or `lookup:` block). The context manager closes the file
# handle, which the previous bare open() never did.
with open('/home/aditya/Documents/citibot/newdata/nlu.md', 'r') as mdf:
    s = mdf.read().split('##')
s

['',
 ' intent:greet\n- hey\n- hello\n- hi\n- good morning\n- good evening\n- hey there\n\n',
 ' intent:hello_world\n- hello world\n- programming\n- c++\n- java\n\n',
 ' intent:goodbye\n- bye\n- goodbye\n- see you around\n- see you later\n\n',
 ' intent:affirm\n- yes\n- indeed\n- of course\n- that sounds good\n- correct\n\n',
 " intent:deny\n- no\n- never\n- I don't think so\n- don't like that\n- no way\n- not really\n\n",
 " intent:mood_great\n- perfect\n- very good\n- great\n- amazing\n- wonderful\n- I am feeling very good\n- I am great\n- I'm good\n\n",
 ' intent:mood_unhappy\n- sad\n- very sad\n- unhappy\n- bad\n- very bad\n- awful\n- terrible\n- not very good\n- extremely sad\n- so sad\n\n',
 ' intent:bot_challenge\n- are you a bot?\n- are you a human?\n- am I talking to a bot?\n- am I talking to a human?\n\n',
 ' intent:mycolor\n- I will choose the color [green](color)\n- how is [blue](color)\n- color is [black](color)\n- color was [blue](color)\n\n',
 ' lookup:color\n- green\n- 

In [104]:
h, p = "heyaaaa", "how"
string = ""

# Each section is an " intent:<name>" header, one "- <example>" line and a blank line.
string1 = "".join([" intent:{}\n".format(h), "- {}\n".format(p), "\n"])
string2 = "".join([" intent:{}\n".format(p), "- {}\n".format(h), "\n"])

# The leading empty string makes the joined text start with '##'.
sections = (string, string1, string2)
v = '##'.join(sections)

In [105]:
# Write the assembled markdown to demofile2.md. The context manager closes the
# file even if write() raises.
with open("demofile2.md", "w") as f:
    f.write(v)