## Imports

In [1]:
import ast
import glob
import json
import os
import re

import pandas as pd

from src.main import InsightsPro
from core.utils.client_utils import get_vectordb_client
from src.query_insights.entity_extraction import EntityExtraction
from core.utils.client_utils import get_model_type,get_entity_extraction_client
from core.model.model_factory import ModelFactory
from core.utils.read_config import config,initialize_config

  from .autonotebook import tqdm as notebook_tqdm


## Input paths

In [2]:
# Domain name: identifier of the domain being processed (the same "mcd" appears as a
# folder name in the data paths used further down).
domain_name = "mcd"

### Config Initialization

In [3]:
# Load the four configuration objects (user, data, model, debug) that the rest of the notebook reads.
user_config, data_config, model_config, debug_config = initialize_config()

### User Inputs

In [4]:
# Sample user inputs.
# NOTE(review): in the cells shown, `additional_context` and `language` are never read again and
# `question` is reassigned before it is first used — confirm whether this cell is still needed.
question = "What was the sales of my stores in the last 1 year?"
additional_context = None
language = "english"

### Creating Entity Extraction Data Dictionary 

In [5]:
# Seed the entity-extraction data dictionary with a copy of every base data dictionary.
# Each file is parsed and re-serialised rather than byte-copied, so malformed JSON fails here.
source_dir = "../../data/data_dictionary/"
target_dir = "../../data/entity_extraction_data_dictionary/"
os.makedirs(target_dir, exist_ok=True)
for name in glob.glob(os.path.join(source_dir, "*.json")):
    print(name)
    with open(name, "r") as file:
        json_file = json.load(file)
    # Build the destination from the file name only: substituting 'data_dictionary' across the
    # whole path would also rewrite a file name that happens to contain that text.
    filename = os.path.join(target_dir, os.path.basename(name))
    with open(filename, "w") as file:
        json.dump(json_file, file)

../../data/data_dictionary/playbook_table.json
../../data/data_dictionary/gc_table.json
../../data/data_dictionary/recommendation_table.json
../../data/data_dictionary/summary_category_channel_table.json
../../data/data_dictionary/snapshot_table.json
../../data/data_dictionary/summary_item_channel_price_table.json
../../data/data_dictionary/summary_store_table.json
../../data/data_dictionary/summary_item_channel_view.json


### InsightsPro Initialization

In [6]:
# Build the InsightsPro entry point from the four configuration objects.
insightspro = InsightsPro(
    user_config=user_config,
    data_config=data_config,
    model_config=model_config,
    debug_config=debug_config,
)

### Initializing parameters

In [8]:
# Objects that InsightsPro has already loaded.
business_overview = insightspro.generic_initializations.business_overview
data_dictionary = insightspro.dataloader.data_dictionary

# Prompts and model parameters for the formula-extraction step.
prompts = model_config.extract_formulas.prompts
model_params = model_config.extract_formulas.model_params

# File-system locations used below.
data_dictionary_path = data_config.path.data_dictionary_path
entity_extraction_data_dictionary_path = data_config.path.entity_extraction_data_dictionary_path
categorical_dict_path = "../../data/output_folder/categorical_values.json"

# Entity-extraction helper and its model client.
entityextraction = EntityExtraction()
entity_extraction_model = get_entity_extraction_client(
    config.entity_extraction_details,
    data_dictionary,
)

# Categorical values, read from the JSON file at categorical_dict_path.
with open(categorical_dict_path, "r") as file:
    categorical_dict = json.load(file)

## Extraction of formulas from document using LLM

In [11]:
# Select the "openai" model type and request the formula extraction; `business_overview`
# is the only document-like argument passed to the client.
config["llm_model_type"] = "openai"
model_client = get_model_type(
    config,
    prompts,
    None,
    None,
    user_config.connection_params,
    user_config,
    None,
    None,
    business_overview,
    None,
    None,
)
model_factory = ModelFactory(model_client)
(
    formula_data_dictionary,
    model_finish,
    model_tokens,
    error_message,
) = model_factory.model_type.model_response(model_params)

In [12]:
# Convert the model's textual answer into a Python dict.
# ast.literal_eval accepts literals only, so text returned by the LLM can never execute code
# (eval would). The isinstance guard keeps this cell re-runnable once the value is parsed.
if isinstance(formula_data_dictionary, str):
    formula_data_dictionary = ast.literal_eval(formula_data_dictionary)

In [13]:
# Display the parsed mapping: metric name -> list of operand names used to compute it.
formula_data_dictionary

{'new_units': ['current_units',
  'price_elasticity',
  'new_price',
  'current_price'],
 'Sales': ['Units', 'Price'],
 'new_sales': ['new_price', 'new_units'],
 'current_sales': ['current_price', 'current_units'],
 'Change in sales': ['new_sales', 'current_sales'],
 'Margin': ['Price', 'Cost', 'Units'],
 'new_margin': ['new_price', 'current_fpc', 'new_units'],
 'current_margin': ['current_price', 'current_fpc', 'current_units'],
 'Increase in margin': ['new_margin', 'current_margin'],
 'Margin impact': ['new_margin', 'current_margin'],
 'WAP at current_price': ['current_price', 'current_units'],
 'WAP at new_price': ['new_price', 'current_units'],
 'WAP impact': ['new_price',
  'current_units',
  'sum_current_units',
  'current_price'],
 'Impact on wap': ['new_price',
  'current_units',
  'sum_current_units',
  'current_price'],
 'new_gc': ['current_gc_per_week', 'gc_elasticity', 'new_wap', 'current_wap'],
 'GC impact': ['gc.current_gc',
  'gc.gc_elasticity',
  'wp.new_wap',
  'wp.cur

# Preparation of Data Dictionary to run Entity Extraction Model

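#### To run the entity extraction model, the following data has to be added to each column of the data dictionary:
1) Create an `entity_datecolumn` key, set to `Yes` if the description contains a date pattern and `No` otherwise.<br>
2) Create a `metrics` key listing the metrics that are calculated from the column.<br>
3) Append the metric information to the description if the column is used in a metric calculation.<br>
4) Create an `entity_name` key from the processed column name: a trailing `_id` or `_name` is removed and the result is lemmatized (the processing also covers lowercasing and replacing hyphens and underscores).<br>
5) Create an `entity_description` key that stores the list of entities the entity extraction model finds in the description.<br>
6) Create an `entity_categorical_values` key holding the column's categorical values, lower-cased and with hyphens and underscores replaced by spaces.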

In [15]:
def _without_suffix(name, suffixes=("_id", "_name")):
    """Return `name` with the first matching trailing suffix removed (unchanged if none match)."""
    for suffix in suffixes:
        if name.endswith(suffix):
            return name[: -len(suffix)]
    return name


def _formula_columns(formula_data_dictionary):
    """Return [(metric, {operand, ...}), ...] with names lower-cased and table prefixes dropped."""
    normalised = []
    for lhs, rhs in formula_data_dictionary.items():
        operands = set()
        for operand in rhs:
            operand = operand.lower()
            if "." in operand:
                # 'table.column' -> 'column'
                operand = operand.split(".")[1]
            operands.add(operand.strip())
        normalised.append((lhs.lower(), operands))
    return normalised


def _unique(items):
    """Drop duplicates while keeping first-seen order, so the JSON written is stable across runs."""
    return list(dict.fromkeys(items))


def add_entities_to_data_dictionaries(entity_extraction_data_dictionary_path, formula_data_dictionary, entityextraction, entity_extraction_model, categorical_dict):
    """Enrich, in place, every ``*.json`` data dictionary found under a folder.

    For each entry of a file's ``columns`` list the following keys are written:

    * ``entity_datecolumn`` - ``'Yes'``/``'No'`` from ``entityextraction._contain_datetime``
      applied to the lower-cased description.
    * ``metrics`` - metrics (formula keys) whose operands include the column name; the
      description also gets a "'<metric>' is calculated using this column." sentence.
    * ``entity_name`` - one-element list with the lemmatized column name, after a trailing
      ``_id`` or ``_name`` has been removed.
    * ``entity_description`` - entities the extraction model returns for the lemmatized description.
    * ``entity_categorical_values`` - normalised values from ``categorical_dict``.

    Running the function again on files it has already processed leaves them unchanged:
    the metric sentence is not appended twice and list values are de-duplicated.

    Args:
        entity_extraction_data_dictionary_path: Folder prefix; ``"*.json"`` is appended to it
            directly, so it must end with a path separator.
        formula_data_dictionary: Mapping of metric name to a list of operand names, each
            either ``column`` or ``table.column``.
        entityextraction: Object providing ``_contain_datetime(text)`` and ``_lemmatization(text)``.
        entity_extraction_model: Currently replaced by a freshly created client for every
            column (see the note in the body); kept for interface compatibility.
        categorical_dict: Mapping of ``"table,column"`` keys to lists of categorical values.

    Returns:
        None. Each JSON file is overwritten with its enriched content.
    """
    # The formulas do not depend on the file or column, so normalise them once.
    formula_columns = _formula_columns(formula_data_dictionary)

    for name in glob.glob(entity_extraction_data_dictionary_path + "*.json"):
        print(name)
        with open(name, "r") as file:
            json_file = json.load(file)

        for column in json_file['columns']:
            # 1) Flag columns whose description contains a date pattern.
            has_date = entityextraction._contain_datetime(column['description'].lower())
            column['entity_datecolumn'] = 'Yes' if has_date else 'No'

            # 2) + 3) Record the metrics computed from this column and mention each in the description.
            column.setdefault('metrics', [])
            for metric, operands in formula_columns:
                if column['name'] not in operands:
                    continue
                note = "'" + metric + "' is calculated using this column."
                # Skip the sentence when it is already there (e.g. the file was processed before).
                if note not in column['description']:
                    column['description'] = column['description'] + ". " + note
                column['metrics'].append(metric)

            column['description'] = column['description'].replace("..", ".").replace(".. ", ". ")
            column['metrics'] = _unique(column['metrics'])

            # 4) Entity name: strip only the trailing '_id' / '_name'; an "_id" occurring in the
            #    middle of the name stays. Fall back to the full name when nothing would be left.
            stem = _without_suffix(column['name']) or column['name']
            column['entity_name'] = [entityextraction._lemmatization(stem)]

            # 5) Entities found in the description.
            # NOTE(review): a new client is built for every column from the notebook-level `config`
            # and `data_dictionary`, replacing the `entity_extraction_model` argument. Kept as-is
            # because it is not visible here whether `get_entities` carries state between calls;
            # if it does not, reuse the argument and drop this line.
            entity_extraction_model = get_entity_extraction_client(config.entity_extraction_details, data_dictionary)
            entity_extraction_model.get_entities(entityextraction._lemmatization(column['description']))
            column['entity_description'] = _unique(tup[0] for tup in entity_extraction_model.entities)

            # 6) Categorical values registered under the "<table>,<column>" key, normalised to
            #    lower case with '-' and '_' replaced by spaces; falsy values are skipped.
            column.setdefault('entity_categorical_values', [])
            for key, cat_values in categorical_dict.items():
                key_parts = key.split(",")
                tab_name, col_name = key_parts[0], key_parts[1]
                if tab_name == json_file['table_name'] and col_name == column['name']:
                    column['entity_categorical_values'].extend(
                        cat_value.replace("-", " ").replace("_", " ").lower()
                        for cat_value in cat_values
                        if cat_value
                    )
            column['entity_categorical_values'] = _unique(column['entity_categorical_values'])

        with open(name, "w") as file:
            json.dump(json_file, file)

# Enrich every JSON file under the entity-extraction data dictionary folder; the files are overwritten in place.
add_entities_to_data_dictionaries(entity_extraction_data_dictionary_path,formula_data_dictionary,entityextraction,entity_extraction_model,categorical_dict)

../../data/entity_extraction_data_dictionary/playbook_table.json


../../data/entity_extraction_data_dictionary/gc_table.json
../../data/entity_extraction_data_dictionary/recommendation_table.json
../../data/entity_extraction_data_dictionary/summary_category_channel_table.json
../../data/entity_extraction_data_dictionary/snapshot_table.json
../../data/entity_extraction_data_dictionary/summary_item_channel_price_table.json
../../data/entity_extraction_data_dictionary/summary_store_table.json
../../data/entity_extraction_data_dictionary/summary_item_channel_view.json


## Run Till This Cell To Use Table Selection

## Extraction of logic and examples from document using LLM

In [10]:
# Read the complex-questions glossary of the selected domain as one string.
# The folder is derived from `domain_name` instead of repeating the literal domain here.
with open(f"../../data/{domain_name}/complex_questions_glossary.txt") as file:
    complex_questions_glossary = file.read()

In [30]:
# Select the "openai" model type and switch to the prompts / parameters of the
# complex-question logic extraction; the glossary text is passed as the document argument.
config["llm_model_type"] = "openai"
prompts = model_config.complex_questions_extract_logic.prompts
model_params = model_config.complex_questions_extract_logic.model_params

model_client = get_model_type(
    config,
    prompts,
    None,
    None,
    user_config.connection_params,
    user_config,
    None,
    None,
    complex_questions_glossary,
    None,
    None,
)
model_factory = ModelFactory(model_client)
(
    logic_complex_questions,
    model_finish,
    model_tokens,
    error_message,
) = model_factory.model_type.model_response(model_params)


In [31]:
# Parse the model's answer into a dict after dropping its first 13 and last 11 characters
# (presumably wrapper text around the dict literal).
# NOTE(review): these fixed offsets depend on the exact wrapper the prompt produces — confirm
# them whenever the prompt or the model changes.
# ast.literal_eval accepts literals only, so LLM text cannot execute code (eval would); the
# isinstance guard keeps this cell re-runnable once the value has been parsed.
if isinstance(logic_complex_questions, str):
    logic_complex_questions = ast.literal_eval(logic_complex_questions[13:-11])

In [34]:
# Display the parsed mapping: logic description -> list of example questions.
logic_complex_questions

{'Logic 1: The number of items recommended in a store depends on the number of items with low sensitivity or self-elasticity. Items falling under low sensitivity/self -elasticity bin are likely to receive price change compared to items falling under medium and high sensitivity bins. The reasons behind a price recommendation or lack of it is present in the reason code column. columns required - playbook_table.selfelasticity_bin,playbook_table.reason_code': ['Why does big mac not have a recommendation in my store in delivery channel?',
  'Only 5 cents increase is recommended in item 5 in instore channel. Why is that?',
  'Why does a low sensitive item not get recommendation in my store in delivery channel?',
  'Item 5 has a negative impact in instore channel. Why did it get recommendation?'],
 'Logic 2: To calculate margin impact of a metric if we increase the price of an item by x cents, pls follow the below steps. 1. Increase the new_price for item_y by x cents to get new_price_after_p

### Generating similar questions for the questions in logic

In [None]:
# Build the table of text chunks to embed:
#   * one row per example question and per LLM-generated paraphrase of it, tagged with the
#     logic it belongs to and the tables/columns that logic names;
#   * one row per non-empty line of the business overview, tagged as glossary.
config["llm_model_type"] = "openai"
prompts = model_config.generate_similar_questions.prompts
model_params = model_config.generate_similar_questions.model_params

columns_marker = 'columns required - '
# Rows are collected in a list and turned into a DataFrame once, instead of enlarging the
# frame row by row with a hand-maintained index counter.
question_rows = []
for logic, questions_list in logic_complex_questions.items():
    # The tables/columns hint is the text after the marker; it is identical for every question
    # of this logic. A logic without the marker gets an empty hint instead of an IndexError.
    logic_parts = logic.split(columns_marker)
    table_column = logic_parts[1].strip() if len(logic_parts) > 1 else ""
    for ques in questions_list:
        model_client = get_model_type(config, prompts, ques, None, user_config.connection_params, user_config, None, None, None, None, None)
        model_factory = ModelFactory(model_client)

        output_, model_finish, model_tokens, error_message = model_factory.model_type.model_response(model_params)
        # Drop the first 9 / last 7 characters of the answer and split it into candidate questions.
        # NOTE(review): fixed offsets tied to the answer format — confirm against the prompt.
        output_ = output_[9:-7].split(",\n")
        # Keep candidates longer than 15 characters, with double quotes removed.
        output_ = [x.replace("\"", "") for x in output_ if len(x) > 15]
        # Original question first, then its paraphrases; digits are stripped from the chunk text.
        for question_text in [ques] + output_:
            question_rows.append((logic, re.sub(r'\d+', '', question_text), table_column))
for text in business_overview.split("\n"):
    if text != "":
        question_rows.append(("Glossary : " + text, text, ""))

question_df = pd.DataFrame(question_rows, columns=['chunk_type', 'chunk', 'tables_columns'])


### Generating Embeddings for questions

In [None]:
# Give every chunk a 1-based id (index + 1) and make it the first column.
question_df = question_df.assign(unique_id=question_df.index + 1)[
    ['unique_id', 'chunk_type', 'chunk', 'tables_columns']
]

# Hand the chunks to the vector-database client and insert them.
vector_db = get_vectordb_client(config.embedding_details, question_df)
vector_db.insert_data()

# Work on the client's frame and turn each embedding into a plain Python list
# (presumably so it serialises cleanly in the export below — confirm).
chunk_embeddings = vector_db.data_frame
chunk_embeddings['embeddings'] = chunk_embeddings['embeddings'].apply(lambda vector: vector.tolist())


Connected to Milvus at localhost:19530
Creating Collection: my_collection_chunk
Creating Index for chunk
Inserting data into chunk


In [None]:
# Persist the chunks and their embeddings as an Excel sheet (without the index column).
# NOTE(review): this path starts with "../data" while every other path in the notebook
# starts with "../../data" — confirm the intended output folder.
chunk_embeddings.to_excel("../data/mcd/chunk_embeddings.xlsx",index=False)

In [1]:
from core.utils.client_utils import get_entity_extraction_client

In [22]:
# Create an entity-extraction client from the data dictionary held by InsightsPro.
entity_extraction = get_entity_extraction_client(
                config.entity_extraction_details, insightspro.dataloader.data_dictionary)


In [23]:
# Sample question used to try out entity extraction (replaces the earlier `question` value).
question ="What is the distribution of price change by Category for store 84000006?"

In [24]:
# Run extraction on the question; the result is read from `entity_extraction.entities` below.
entity_extraction.get_entities(question)

In [26]:
# Display the extracted entities as (text, score) tuples.
entity_extraction.entities

[('price', 0.417),
 ('84000006', 0.372),
 ('store', 0.3677),
 ('distribution', 0.3673),
 ('category', 0.3376),
 ('price change', 0.6122),
 ('distribution price', 0.5793),
 ('store 84000006', 0.5361),
 ('category store', 0.4711),
 ('price', 0.417)]

In [34]:
# For every column of every table, print the extracted entities that occur inside one of the
# column's categorical values (case-insensitive substring match).
# Each entity is escaped so it is matched as literal text (an entity such as "c++" or "(a"
# would otherwise be an invalid or misleading regex), and compiled once up front instead of
# once per column.
entity_patterns = [
    (entity, re.compile(re.escape(entity[0]), re.IGNORECASE))
    for entity in entity_extraction.entities
]
for table_name, table_data in insightspro.dataloader.data_dictionary.items():
    for dictionary in table_data["columns"]:
        # Columns that carry no categorical values are skipped rather than raising KeyError.
        categorical_values = dictionary.get('entity_categorical_values', [])
        for entity, pattern in entity_patterns:
            matching_items = [item for item in categorical_values if pattern.search(item)]
            if matching_items:
                print(entity,matching_items)

('84000006', 0.372) ['84000006']
('store', 0.3677) ['instore']
('84000006', 0.372) ['84000006']
('store', 0.3677) ['instore']
('84000006', 0.372) ['84000006']
('store', 0.3677) ['instore']
('price', 0.417) ['no price change : adjustment beyond optimizer to preserve price architecture rule', 'price change : adjustment beyond optimizer to fix a business rule', 'partial increase: recommended increase to maintain price architecture rules', 'no increase: price architecture rules restrict any recommended increase', 'full increase: recommended increase to maintain price architecture rules', 'no increase: held to avoid high price changes for aligning price architecture', 'partial increase: positive margin and revenue impact limited by price architecture rules']
('store', 0.3677) ['no increase: triggers store performance safeguards, not enough positive revenue and margin impact to justify risk', 'partial increase: further increase limited by store performance safeguards']
('price change', 0.612

In [30]:
# Debugging aid: show the last regex compiled by the matching loop.
pattern

re.compile(r'entity', re.IGNORECASE|re.UNICODE)