In [1]:
!pip install pandas tqdm datasets



In [2]:
import os, pandas as pd, json, sys
import time
# import openai
from tqdm import tqdm
import random
import copy
from collections import Counter
from pprint import pprint
import csv
import math
from datasets import Dataset, DatasetDict, load_dataset, concatenate_datasets

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Tags placed in front of a natural-language request to name the dashboard widget it targets.
# Presumably passed as the `prefix` argument of create_dashboard_training_data -- confirm at the call site.
trending_widget_prefix = "@AvantQueryEDW2:trending_widget - "
top_accounts_prefix = "@AvantQueryEDW2:top_accounts_widget - "
# Seed handed to the randomization helpers so generated data can be reproduced.
seed_value = 626

###  Training Data Prompts

In [4]:
## Prompt - Human Language Query -> Complete Report JSON - Data Creation
# Instruction text (Markdown) describing how to turn a natural-language EDW2 query into a
# report JSON query. The text is runtime data: editing it changes every example built from it.
# NOTE(review): the identifier misspells "language" ("langage"); it is left unchanged here
# because other cells may reference the name.
natural_langage_query_to_complete_report_json = """
## Purpose
Your task is to convert Avantlink EDW2 natural language queries ( @AvantQueryEDW2 ), into properly structured JSON queries suitable for the Avantlink API.

## Understanding the Query
1. **Identify Key Elements**: Carefully read the query to identify important elements such as data fields, filters, date ranges, and special conditions like record limits.
2. **Clarify Ambiguities**: If any part of the query is unclear, make reasonable assumptions based on the context.

## Constructing the JSON Query
1. **Initialize a Basic JSON Template**: Start with a JSON structure containing necessary keys like `"performance_summary"`, `"cols"`, and `"filters"`.
2. **Populate Data Fields**:
   - For each required data field in the query, add an object in the `"cols"` array.
   - Assign `id`, `alias`, and `format` for each field, conforming to Avantlink's standards.
3. **Incorporate Filters**:
   - For each filter condition, add an object in the `"filters"` array.
   - Include attributes like `field`, `op`, `values`, and any specific requirements (e.g., `"case_insensitive"`: `true`).
4. **Handle Date Range**:
   - Specifically manage date ranges, ensuring correct formatting and value inclusion.
5. **Apply Special Conditions**:
   - Address any special conditions mentioned in the query, such as limits on the number of records.

## Validation and Finalization
- **Check JSON Syntax**: Ensure that the JSON query is syntactically correct.
- **Verify Accuracy**: Cross-verify that every aspect of the natural language query is accurately reflected in the JSON structure.
- **Ensure API Compatibility**: Make sure the JSON query aligns with the latest specifications of the Avantlink API.

## Continuous Improvement
- **Learn from Examples**: Use examples of successful translations to improve your understanding and approach.
- **Update Regularly**: Stay informed about any changes to Avantlink's API and adjust your method accordingly.

## Additional Notes
- Prioritize the context and intent of the query while translating.
- Be adaptive and open to refining your approach based on feedback and new information.

---

This instruction set is designed to guide you, as an LLM, in accurately translating natural language queries into structured JSON queries for the Avantlink API. It emphasizes understanding the query's context, meticulous construction of the JSON structure, and continuous improvement in your approach.
"""

In [5]:

# Instruction strings for the column-definition tasks. create_col_definition_dict,
# create_table_id_dict and create_column_id_dict each read one of them as the "instruction" field.
## Prompt - Human Language Description -> Column JSON - Columns Definition JSON Data Creation
description_to_column_definition_training_prompt = "Given a description of a column in @Avantlink:EDW2, provide the corresponding JSON definition for that column. Column Definitions are unique, but not universal between EDW2 and EDW3."
## Prompt - Human Language Description -> Fact/Dim Columns I.D. (Human Readable Name Definition) Data Creation
description_to_table_id_training_prompt = "Given a description of a column in an @Avantlink:EDW2, provide the corresponding table I.D. for that column. The Table I.D. is unique, but not universal between EDW2 and EDW3."
## Prompt - Table I.D. -> Fact/Dim Columns Definition (JSON) Data Creation
table_id_to_definition_training_prompt = "Given the table_id of a column in an @Avantlink:EDW2, provide the corresponding JSON definition for that column. The Table I.D. is unique, but not universal between EDW2 and EDW3."

# Instruction strings for the merchant / affiliate name <-> I.D. lookup tasks.
# Each string names the entity it maps and states that the I.D. space is shared by EDW2 and EDW3.

## Prompt - Human Readable Merchant Name -> Merchant I.D.
merchant_name_to_id_training_prompt = "Given an @Avantlink:merchant_name, provide the corresponding @Avantlink:merchant_id. Merchant I.D.s are unique and universal between EDW2 and EDW3."
## Prompt - Merchant I.D. -> Human Readable Merchant Name
merchant_id_to_name_training_prompt = "Given an @Avantlink:merchant_id, provide the corresponding human readable @Avantlink:merchant_name. Merchant I.D.s are unique and universal between EDW2 and EDW3."
## Prompt - Human Readable Affiliate Name -> Affiliate I.D.
affiliate_name_to_id_training_prompt = "Given an @Avantlink:affiliate_name, provide the corresponding @Avantlink:affiliate_id. Affiliate I.D.s are unique and universal between EDW2 and EDW3."
## Prompt - Affiliate I.D. -> Human Readable Affiliate Name
# The closing sentence previously said "Merchant I.D.s" (copied from the merchant prompt);
# it now names affiliates, matching affiliate_name_to_id_training_prompt.
affiliate_id_to_name_training_prompt = "Given an @Avantlink:affiliate_id, provide the corresponding human readable @Avantlink:affiliate_name. Affiliate I.D.s are unique and universal between EDW2 and EDW3."

# Instruction strings for the premade dashboard-widget report tasks. Per the prompt text, only
# the date range and the merchant uuid filter vary between requests of the same widget report.
## Prompt - Premade Dashboard Reports - Human Language Direction -> Performance Report JSON (Performance Reports) - Data Creation - Trending Widget
trending_widget_report_json_training_prompt = "Given a human readable request for an @Avantlink:EDW2:trending_widget report, provide the corresponding JSON query for the specified trending widget report. Include the appropriate date range, and merchant uuid filter for the report. The rest of this kind of report is premade."
## Prompt - Premade Dashboard Reports - Human Language Direction -> Performance Report JSON (Performance Reports) - Data Creation - Top Accounts Widget
top_accounts_widget_report_json_training_prompt = "Given a human readable request for an @Avantlink:EDW2:top_accounts_widget report, provide the corresponding JSON query for the specified top accounts widget report. Include the appropriate date range, and merchant uuid filter for the report. The rest of this kind of report is premade."

In [6]:
# General-purpose safety preamble; create_smangrul_prompt places it ahead of the task
# instruction in every generated prompt.
# The literal "\n" sequences sit inside a non-raw string, so each one adds a newline
# in addition to the physical line break that follows it.
smangrul_data_system_preprompt = """You are a helpful, respectful and honest assistant. 
Always answer as helpfully as possible, while being safe. Your 
answers should not include any harmful, unethical, racist, 
sexist, toxic, dangerous, or illegal content. Please ensure that 
your responses are socially unbiased and positive in nature.\n
\n
If a question does not make any sense, or is not factually 
coherent, explain why instead of answering something not correct. 
If you don’t know the answer to a question, please don’t share 
false information.\n
"""
# Role markers and the end-of-turn token used when assembling chat-formatted prompts
# (see create_smangrul_prompt, which wraps the user and assistant turns with them).
smangrul_system_prefix = '<|system|>'
smangrul_user_prefix = '<|prompter|>'
smangrul_assistant_prefix = '<|assistant|>'
smangrul_end_decorator = '<|endoftext|>'

In [7]:
def extract_content(json_str):
    """Pull ``choices[0].message.content`` out of a JSON-encoded response string.

    Args:
        json_str: Expected to be a JSON document; any other value is tolerated.

    Returns:
        The nested content value when the path exists. Otherwise ``json_str`` is
        handed back untouched (not JSON, not a string, or path missing).
    """
    try:
        payload = json.loads(json_str)
        return payload['choices'][0]['message']['content']
    except (KeyError, IndexError, TypeError, json.JSONDecodeError):
        # Best effort: values that are already plain text pass straight through.
        return json_str

In [8]:
def extract_alternatives(alternatives_text):
    """Split a two-alternative text blob into its first two alternatives.

    The text is split on blank lines (``'\\n\\n'``). The ``'Alternative 1: '`` label is
    removed from the first part and ``'Alternative 2: '`` from the second.

    Returns:
        ``(alternative_1, alternative_2)``, or ``(None, None)`` when the input is not a
        string or contains fewer than two blank-line-separated parts.
    """
    if not isinstance(alternatives_text, str):
        return None, None

    parts = alternatives_text.split('\n\n')
    if len(parts) < 2:
        return None, None

    first, second = parts[0], parts[1]
    return first.replace('Alternative 1: ', ''), second.replace('Alternative 2: ', '')

In [9]:
def create_report_query_data_list(merged_df, task_instruction, prefix='@AvantQueryEDW2 - '):
    """Build instruction/input/output examples from the merged report-query frame.

    Every row contributes one example for its ``output`` phrasing, plus one more for each
    of ``alternative_1`` / ``alternative_2`` that is not null. All examples from a row
    share that row's ``code`` as their output.

    Args:
        merged_df: DataFrame with ``output``, ``code``, ``alternative_1`` and
            ``alternative_2`` columns.
        task_instruction: Text stored under ``"instruction"`` in every example.
        prefix: Text prepended to each phrasing to form ``"input"``.

    Returns:
        List of ``{"instruction", "input", "output"}`` dicts in row order.
    """
    def _example(phrasing, code):
        return {
            "instruction": task_instruction,
            "input": prefix + phrasing,
            "output": code
        }

    examples = []
    for _, record in merged_df.iterrows():
        # The primary phrasing is always emitted; alternatives only when present.
        examples.append(_example(record['output'], record['code']))
        for column in ('alternative_1', 'alternative_2'):
            if pd.notnull(record[column]):
                examples.append(_example(record[column], record['code']))

    return examples

In [10]:
def remove_quotes(s):
    """Strip one pair of enclosing double quotes from ``s``.

    Args:
        s: Value to clean. Non-string values (for example the NaN pandas uses for
            empty cells) are returned unchanged instead of raising.

    Returns:
        ``s`` without its first and last character when it is a string of length two
        or more that both starts and ends with ``"``; otherwise ``s`` unchanged.
    """
    if not isinstance(s, str):
        return s
    # Require two characters: a lone '"' is not a quoted pair and must not be erased.
    if len(s) >= 2 and s.startswith('"') and s.endswith('"'):
        return s[1:-1]
    return s

In [11]:
def create_col_definition_dict(row):
    """Turn one prepared-column row into a description -> column-definition example.

    Args:
        row: Mapping with a ``definition`` entry holding a JSON array (its first element
            is used) and an ``output`` entry holding the description text.

    Returns:
        ``{"instruction", "input", "output"}`` dict whose output is the parsed definition.
    """
    first_definition = json.loads(row['definition'])[0]
    example = {"instruction": description_to_column_definition_training_prompt}
    example["input"] = row['output']
    example["output"] = first_definition
    return example

In [12]:
def create_table_id_dict(row):
    """Turn one prepared-column row into a description -> table I.D. example.

    Args:
        row: Mapping with a ``definition`` entry holding a JSON array (its first element
            is used) and an ``output`` entry holding the description text.

    Returns:
        ``{"instruction", "input", "output"}`` dict whose output is the definition's
        ``id``. When a key lookup fails, the error is printed and ``None`` is returned.
    """
    definition = json.loads(row['definition'])[0]
    try:
        table_id = definition['id']
        example = {
            "instruction": description_to_table_id_training_prompt,
            "input": row['output'],
            "output": table_id
        }
    except KeyError as err:
        print("KeyError: ", err, definition)
        return None
    return example

In [13]:
def create_column_id_dict(row):
    """Turn one prepared-column row into a table I.D. -> column-definition example.

    Args:
        row: Mapping with a ``definition`` entry holding a JSON array; its first element
            supplies both the ``id`` (input) and the full definition (output).

    Returns:
        ``{"instruction", "input", "output"}`` dict. When the definition has no ``id``,
        the error is printed and ``None`` is returned.
    """
    definition = json.loads(row['definition'])[0]
    try:
        column_id = definition['id']
        example = {
            "instruction": table_id_to_definition_training_prompt,
            "input": column_id,
            "output": definition
        }
    except KeyError as err:
        print("KeyError: ", err, definition)
        return None
    return example

In [14]:
def refactor_widget_objects(widget_objects):
    """Flatten ``{category: {report: payload}}`` into ``{"<category>_<report>": payload}``.

    The input mapping is left intact. (Earlier versions moved each payload out with
    ``pop``, which emptied the caller's nested dictionaries as a side effect.)

    Args:
        widget_objects: Two-level mapping of category name -> report name -> payload.

    Returns:
        New flat dict, in category-then-report order, whose values are the same payload
        objects held by ``widget_objects`` (they are not copied).
    """
    return {
        category + "_" + report: payload
        for category, reports in widget_objects.items()
        for report, payload in reports.items()
    }

In [15]:
def process_dashboard_objects(file_path):
    """Load the dashboard-objects JSON file and flatten its two widget sections.

    Args:
        file_path: Path of a JSON file whose top level contains ``"trending_widget"``
            and ``"top_affiliates_widget"`` entries.

    Returns:
        ``(trending_widget_dict, top_accounts_dict)``: each section after being passed
        through ``refactor_widget_objects``.
    """
    with open(file_path, 'r') as handle:
        dashboard_objects = json.load(handle)

    # Look both sections up before flattening so a missing key fails before any work is done.
    trending_section = dashboard_objects["trending_widget"]
    top_accounts_section = dashboard_objects["top_affiliates_widget"]

    return refactor_widget_objects(trending_section), refactor_widget_objects(top_accounts_section)


In [16]:
def create_training_data(dataframe, instruction, input_col, output_col):
    """Map every row of ``dataframe`` to an instruction/input/output example.

    Args:
        dataframe: Source DataFrame.
        instruction: Text stored under ``"instruction"`` in every example.
        input_col: Column whose value becomes ``"input"``.
        output_col: Column whose value becomes ``"output"``.

    Returns:
        List of example dicts, one per row, in row order.
    """
    return [
        {
            "instruction": instruction,
            "input": record[input_col],
            "output": record[output_col]
        }
        for _, record in dataframe.iterrows()
    ]

In [17]:
# Build merged_df: one row per report query, carrying its description ('output'), its JSON
# ('code') and up to two alternative phrasings.
df = pd.read_csv('./OpenAI_Json_query_output.csv') # base query descriptions
alternatives_df = pd.read_csv('./report_query_variations.csv')
# Unwrap the 'alternatives' cells: extract_content returns choices[0].message.content when the
# cell is a JSON response, and the cell itself otherwise.
alternatives_df['alternatives'] = alternatives_df['alternatives'].apply(extract_content)
# Outer join on the shared columns, so rows that exist in only one of the files are kept.
merged_df = pd.merge(df, alternatives_df, on=['name', 'code', 'output'], how='outer', suffixes=('', '_alt'))
# NOTE(review): 'content' repeats the unwrapping done above and is dropped below without
# being read -- it looks like leftover scaffolding.
merged_df['content'] = merged_df['alternatives'].apply(extract_content)
# extract_alternatives yields an (alternative_1, alternative_2) pair per row; zip(*...) turns
# the sequence of pairs into the two new columns.
merged_df['alternative_1'], merged_df['alternative_2'] = zip(*merged_df['alternatives'].apply(extract_alternatives))
merged_df.drop(columns=['content', 'alternatives'], inplace=True)
# extract_alternatives only strips the labels when they are followed by a space; remove any
# remaining "Alternative N:" labels here.
merged_df.loc[:, 'alternative_1'] = merged_df['alternative_1'].str.replace('Alternative 1:', '', regex=False)
merged_df.loc[:, 'alternative_2'] = merged_df['alternative_2'].str.replace('Alternative 2:', '', regex=False)
# The primary description may also be a raw JSON response; unwrap it the same way.
merged_df['output'] = merged_df['output'].apply(extract_content)

In [18]:
# Load the prepared column catalogue and keep the columns the definition tasks read.
prepared_columns = pd.read_csv('./data/prepared_columns_processed_v200.csv')
# .copy() makes the subset an independent frame, so the assignment below neither raises
# SettingWithCopyWarning nor depends on whether pandas returned a view of prepared_columns.
prepared_column_definition_training_data = prepared_columns[['table_id','definition', 'description', 'output', 'display_groups']].copy()
# Descriptions may arrive wrapped in double quotes; strip one enclosing pair.
prepared_column_definition_training_data.loc[:, 'output'] = prepared_column_definition_training_data['output'].apply(remove_quotes)

In [19]:
# Affiliate lookup table, reduced to the I.D. and display-name columns.
affiliate_columns = ['affiliate_id', 'affiliate_name']
affiliates = pd.read_csv('./data/source_data/affiliates.csv')[affiliate_columns]

In [20]:
# Merchant lookup table, reduced to the I.D. and display-name columns. The merchant helpers
# further down read this module-level frame directly.
merchant_columns = ['merchant_id', 'merchant_name']
merchants = pd.read_csv('./data/source_data/merchants.csv')[merchant_columns]

In [21]:
# Flattened "<category>_<report>" -> report-object maps for the two dashboard widget families.
trending_widget_dict, top_accounts_dict = process_dashboard_objects('./data/source_data/EDW2_dashboard_objects.json')

In [22]:
# Named relative-date filter configurations. Later helpers iterate this object's keys as
# timeframe names and read relative_timeframes[name] as the filter fields for that timeframe.
# Presumably the JSON file has one top-level entry per timeframe -- confirm against the file.
relative_timeframes = pd.read_json('./data/source_data/relative_time_filters.json')

In [23]:
# Fetch the "smangrul/code-chat-assistant-v1" dataset through datasets.load_dataset.
smangrul_dataset = load_dataset("smangrul/code-chat-assistant-v1")

In [24]:
def reverse_relative_timeframes(relative_timeframes):
    """Invert a timeframe table so each configuration maps back to its name.

    Every configuration is serialized with ``json.dumps(..., sort_keys=True)`` and that
    string becomes the key; the original key becomes the value. When two names share a
    configuration, the one visited last wins.

    Args:
        relative_timeframes: Object exposing ``.items()`` that yields
            ``(name, configuration)`` pairs; pandas Series configurations are converted
            to plain dicts first.

    Returns:
        Dict mapping serialized configuration -> name.
    """
    def _canonical(config):
        plain = config.to_dict() if isinstance(config, pd.Series) else config
        return json.dumps(plain, sort_keys=True)

    return {_canonical(config): name for name, config in relative_timeframes.items()}

# Serialized-configuration -> timeframe-name lookup built from the table loaded above.
reversed_relative_timeframes = reverse_relative_timeframes(relative_timeframes)

In [25]:
def extract_relative_date_filter_for_matching(dashboard_dict):
    """Find the relative-date filter of a wrapped report object.

    Args:
        dashboard_dict: Mapping whose first key holds a report with a ``"filters"`` list.

    Returns:
        The first filter whose ``"op"`` equals ``"relative_date"``, or ``None`` when the
        report has no such filter.

    Raises:
        IndexError: ``dashboard_dict`` has no keys (the mapping is printed first).
    """
    report_names = list(dashboard_dict.keys())
    try:
        first_report = report_names[0]
    except IndexError:
        print(dashboard_dict)
        raise

    for candidate in dashboard_dict[first_report]["filters"]:
        if candidate["op"] == "relative_date":
            return candidate
    return None

In [26]:
def match_relative_date_filter(filter1):
    """Return the name of the known timeframe whose configuration equals ``filter1``.

    Both sides are compared with their ``"alias"`` entry removed, so only the remaining
    fields decide the match. Candidates come from the module-level ``relative_timeframes``.

    Args:
        filter1: Filter dict containing an ``"alias"`` key; it is not modified.

    Returns:
        The first matching key of ``relative_timeframes``, or ``None`` if nothing matches.
    """
    wanted = copy.deepcopy(filter1)
    del wanted["alias"]

    for timeframe_name in relative_timeframes:
        # dict(...) gives a plain mapping even when the entry is a pandas Series.
        candidate = copy.deepcopy(dict(relative_timeframes[timeframe_name]))
        del candidate["alias"]
        if wanted == candidate:
            return timeframe_name
    return None

In [27]:
def randomize_relative_dates(dashboard_dict, relative_timeframes, seed_value):
    """Return a copy of the reports with each relative-date filter re-drawn at random.

    Each report is deep-copied before its filters are rewritten, so ``dashboard_dict`` is
    left untouched. (A shallow ``report.copy()`` shares the ``"filters"`` list and its
    dicts with the input, which silently rewrote the caller's reports as well.)

    Args:
        dashboard_dict: ``{key: {report_key: report}}`` where ``report["filters"]`` is a
            list of filter dicts, each having an ``"op"`` entry.
        relative_timeframes: Mapping-like table (dict or DataFrame) of timeframe name ->
            filter fields; it must support ``.keys()``, ``[name]`` and ``.items()`` on
            the entries.
        seed_value: Passed to ``random.seed`` on every call, so two calls with the same
            seed draw the same sequence of timeframes.

    Returns:
        ``(randomized_dashboard_dict, distribution)`` where ``distribution`` maps each
        chosen timeframe name to the number of filters it was applied to.
    """
    random.seed(seed_value)
    timeframe_names = list(relative_timeframes.keys())

    randomized_dashboard_dict = {}
    picks = Counter()

    for key, wrapped in dashboard_dict.items():
        report_key = list(wrapped.keys())[0]
        report = copy.deepcopy(wrapped[report_key])

        for report_filter in report["filters"]:
            if report_filter["op"] != "relative_date":
                continue
            chosen = random.choice(timeframe_names)
            # Overlay the chosen configuration field by field; copy values so reports
            # never share mutable objects with the timeframe table.
            for field, value in relative_timeframes[chosen].items():
                report_filter[field] = copy.deepcopy(value)
            picks[chosen] += 1

        randomized_dashboard_dict[key] = {report_key: report}

    return randomized_dashboard_dict, dict(picks)

In [28]:
def insert_date_with_preposition(phrase, timeframe):
    """Attach a "for <timeframe>" clause to ``phrase``.

    "today" and "yesterday" are used bare ("for today"); every other timeframe gets an
    article ("for the last 7 days"). When ``phrase`` ends in ``.``, ``?`` or ``!`` the
    clause goes in front of that final character; otherwise it is appended.

    Args:
        phrase: Request text.
        timeframe: Timeframe label to mention.

    Returns:
        ``phrase`` with the date clause added.
    """
    article = "" if timeframe in ["today", "yesterday"] else "the "
    date_phrase = f"for {article}{timeframe}"

    if not phrase.endswith((".", "?", "!")):
        return phrase + " " + date_phrase

    body, closing_mark = phrase[:-1], phrase[-1]
    return body + " " + date_phrase + closing_mark

In [29]:
def create_dashboard_training_data(dashboard_dict, instruction, prefix, seed=None):
    """Generate natural-language request examples for premade dashboard reports.

    For every ``"<category>_<report_name>"`` key, five request phrasings are drawn at
    random from a fixed template list. Draws are independent, so the same template can be
    picked more than once for a report. Each request gets the report's relative timeframe
    appended and is paired with a canned answer that embeds the report as JSON.

    Args:
        dashboard_dict: Mapping of ``"<category>_<report_name>"`` -> wrapped report object.
        instruction: Text stored under ``"instruction"`` in every example.
        prefix: Text placed in front of every generated request.
        seed: When not ``None``, passed to ``random.seed`` before any template is drawn.

    Returns:
        ``(training_data, distribution)``: the list of example dicts, and a dict mapping
        each used template to its use count plus a ``'seed'`` entry holding ``seed``.
    """
    if seed is not None:
        random.seed(seed)

    request_phrasing_list = [
        "Could I get the {report_name} report under {category}?", 
        "I would like to see the {report_name} from the {category} category.", 
        "Can you provide the {report_name} report from {category}?",
        "Show me the {report_name} report from {category}, please",
        "I'm interested in viewing the {report_name} under {category}.",
        "Please generate the {report_name} report from {category}",
        "I need the {report_name} report from {category} for analysis.",
        "Fetch the {report_name} report categorized under {category}.",
        "Could you pull up the {report_name} from {category}?",
        "Display the {report_name} report found in {category}",
        "Access the {report_name} report within the {category} section.",
        "May I have a look at the {report_name} report from {category}?",
        "Bring up the {report_name} report from {category} for review",
        "I’d like to review the {report_name} report from {category}.",
        "Generate a {report_name} report from {category}, if you will.",
        "Can I check out the {report_name} report from {category}?",
        "Let's see the {report_name} report from {category}",
        "I require the {report_name} report from {category}, please.",
        "Pull the {report_name} report from {category}",
        "Load up the {report_name} report from the {category} category."
    ]
    phrasings_per_report = 5

    examples = []
    template_usage = Counter()

    for key, wrapped_report in dashboard_dict.items():
        # Name of the timeframe currently configured on the report's relative-date filter.
        relative_date_filter = extract_relative_date_filter_for_matching(wrapped_report)
        relative_timeframe = match_relative_date_filter(relative_date_filter)

        # NOTE(review): unpacking into exactly two names assumes the key contains a single
        # underscore; a category or report name with its own "_" raises ValueError here.
        category, report_name = key.split("_")

        # The answer is the same for all phrasings of one report.
        answer_preface = f"Sure! Here's the {report_name} report from {category} for {relative_timeframe}:\n\n"
        answer = f"{answer_preface}{json.dumps(wrapped_report)}"

        for _ in range(phrasings_per_report):
            template = random.choice(request_phrasing_list)
            template_usage[template] += 1

            request = prefix + template.format(report_name=report_name, category=category)
            examples.append({
                "instruction": instruction,
                "input": insert_date_with_preposition(request, relative_timeframe),
                "output": answer
            })

    usage_report = dict(template_usage)
    usage_report['seed'] = seed

    return examples, usage_report


In [30]:
def insert_merchant_name_into_inputs(list_of_dicts):
    """Append the merchant's display name to each example's input text, in place.

    The report JSON is read from the part of ``"output"`` after the first blank line.
    For every ``dim_merchant-merchant_uuid`` / ``eq`` filter, the first filter value is
    looked up in the module-level ``merchants`` frame and
    `` - @Avantlink:merchant_name='<name>'`` is appended to ``"input"``.

    Args:
        list_of_dicts: Examples with ``"input"`` and ``"output"`` entries; modified in place.

    Returns:
        The same list object.
    """
    for entry in list_of_dicts:
        payload = json.loads(entry['output'].split('\n\n')[1])
        report = payload[list(payload.keys())[0]]

        for report_filter in report['filters']:
            if not (report_filter['field'] == 'dim_merchant-merchant_uuid' and report_filter['op'] == 'eq'):
                continue
            merchant_id = report_filter['values'][0]
            # .iloc[0] raises IndexError when the id is absent from the merchants table.
            name_matches = merchants.loc[merchants['merchant_id'] == merchant_id, 'merchant_name']
            merchant_name = name_matches.iloc[0]
            entry['input'] += f" - @Avantlink:merchant_name='{merchant_name}'"
    return list_of_dicts

In [31]:
def create_randomized_merchant_distribution(list_of_dicts, seed=None):
    """Plan a random merchant assignment for every merchant-uuid filter in the examples.

    Nothing is rewritten here; the function only records which randomly sampled merchant
    (from the module-level ``merchants`` frame) is assigned to which example index.

    Args:
        list_of_dicts: Examples whose ``"output"`` holds a preface, a blank line, then
            the report JSON.
        seed: Passed to ``random.seed``; each sample's ``random_state`` is then drawn
            from ``random.randint(0, 10000)``.

    Returns:
        ``{'seed': seed, 'merchant_counts': {name: {'count', 'indices', 'uuid'}}}``
        where ``indices`` lists the example positions assigned to that merchant.
    """
    random.seed(seed)
    merchant_counts = {}
    distribution = {'seed': seed, 'merchant_counts': merchant_counts}

    # Work on a deep copy so the caller's examples are never touched.
    for position, entry in enumerate(copy.deepcopy(list_of_dicts)):
        payload = json.loads(entry['output'].split('\n\n')[1])
        report = payload[list(payload.keys())[0]]

        for report_filter in report['filters']:
            if not (report_filter['field'] == 'dim_merchant-merchant_uuid' and report_filter['op'] == 'eq'):
                continue

            picked = merchants.sample(n=1, random_state=random.randint(0, 10000)).iloc[0]
            picked_id = picked['merchant_id']
            picked_name = picked['merchant_name']

            # First sighting of a name fixes its uuid; later sightings only add to the tally.
            stats = merchant_counts.setdefault(picked_name, {
                'count': 0, 
                'indices': [],
                'uuid': picked_id
            })
            stats['count'] += 1
            stats['indices'].append(position)

    return distribution

In [32]:
def execute_randomized_merchant_distribution(list_of_dicts, distribution):
    """Apply a planned merchant assignment to a copy of the examples.

    For each example, every ``dim_merchant-merchant_uuid`` / ``eq`` filter gets its first
    value replaced by the uuid of the first merchant in ``distribution`` whose ``indices``
    contain the example's position. Every ``"output"`` is rebuilt as the original preface,
    a blank line, and the re-serialized report JSON.

    Args:
        list_of_dicts: Examples whose ``"output"`` is ``"<preface>\\n\\n<report JSON>"``.
        distribution: Plan shaped like the result of
            ``create_randomized_merchant_distribution``.

    Returns:
        A deep-copied, updated list; ``list_of_dicts`` itself is not modified.
    """
    updated = copy.deepcopy(list_of_dicts)

    for position, entry in enumerate(updated):
        segments = entry['output'].split('\n\n')
        preface = segments[0]
        payload = json.loads(segments[1])
        report = payload[list(payload.keys())[0]]

        for report_filter in report['filters']:
            if not (report_filter['field'] == 'dim_merchant-merchant_uuid' and report_filter['op'] == 'eq'):
                continue
            # Lazily stop at the first merchant that owns this position.
            owner = next(
                (stats for stats in distribution['merchant_counts'].values() if position in stats['indices']),
                None,
            )
            if owner is not None:
                report_filter['values'][0] = owner['uuid']

        entry['output'] = f"{preface}\n\n{json.dumps(payload)}"

    return updated

In [33]:
def write_dict_of_lists_to_csv(training_data_dict, seed_value):
    """Write each dataset to its own CSV file inside a fresh, versioned directory.

    The directory is ``./data/training/<seed_value>_v<version>``; the version starts at
    1.0 and increases in steps of 0.1 until an unused path is found.

    Non-empty lists of dicts are written with ``csv.DictWriter`` (columns taken from the
    first dict); DataFrames and Series are written with ``to_csv(index=False)``. Any other
    value, including an empty list, is reported and skipped. "File written" is printed
    only for files that were actually written in full.

    Args:
        training_data_dict: Mapping of dataset name -> data.
        seed_value: Used in both the directory name and every file name.

    Returns:
        Path of the directory that was created.
    """
    version = 1.0
    dir_path = f'./data/training/{seed_value}_v{version}'

    # round() keeps accumulated float error out of the directory name.
    while os.path.exists(dir_path):
        version += 0.1
        dir_path = f'./data/training/{seed_value}_v{round(version, 1)}'

    os.makedirs(dir_path)
    print(f"Directory '{dir_path}' was created.")

    for data_name, data in training_data_dict.items():
        file_path = os.path.join(dir_path, f'{data_name}_{seed_value}.csv')

        try:
            if isinstance(data, list) and data and isinstance(data[0], dict):
                with open(file_path, 'w', newline='', encoding='utf-8') as csvfile:
                    writer = csv.DictWriter(csvfile, fieldnames=data[0].keys())
                    writer.writeheader()
                    writer.writerows(data)
            elif isinstance(data, (pd.DataFrame, pd.Series)):
                data.to_csv(file_path, index=False)
            else:
                print(f"Data format not recognized for {data_name}, skipping file write.")
                continue
        except ValueError as e:
            # DictWriter raises ValueError for a row with keys missing from the header;
            # rows written before the failure remain in the file.
            print(f"Error writing file {data_name}: {e}")
            continue

        print(f"File written: {file_path}")

    return dir_path

In [34]:
def generate_testing_dict_list(df, seed_value=None, ratio=1.0):
    """Sample rows of ``df`` and strip their ``output`` column to form a test set.

    ``ceil(len(df) * ratio)`` row positions are drawn without replacement using
    ``random.sample`` after seeding with ``seed_value``; the rows are returned in the
    order drawn. ``df`` itself is not modified.

    Args:
        df: Source DataFrame.
        seed_value: Seed for ``random.seed``.
        ratio: Fraction of rows to keep.

    Returns:
        ``(sampled_df, distribution)`` where ``distribution`` has ``'seed'``,
        ``'modified_counts'`` (index label -> 1 for every sampled row) and
        ``'total_modified'`` (number of sampled rows, or 0 if there was no ``output``
        column to remove).
    """
    random.seed(seed_value)

    row_total = len(df)
    num_rows = math.ceil(row_total * ratio)
    picked_positions = random.sample(range(row_total), num_rows)
    sampled = df.iloc[picked_positions].copy(deep=True)

    distribution = {'seed': seed_value, 'modified_counts': {}, 'total_modified': 0}

    print(f"Removing 'output' from {num_rows} rows.")

    if 'output' in sampled.columns:
        sampled = sampled.drop(columns='output')
        distribution['total_modified'] = len(sampled)

    # Every sampled row counts as modified exactly once.
    distribution['modified_counts'] = {label: 1 for label in sampled.index}

    return sampled, distribution

In [35]:
def get_first_row_of_files(directory):
    """Read every file in ``directory`` as CSV and collect its first data row.

    Sub-directories are ignored. A file that cannot be read, or that has no data rows, is
    reported with a printed message and left out of the result.

    Args:
        directory: Path of the directory to scan (not recursive).

    Returns:
        Dict mapping file name -> first row as a ``{column: value}`` dict.
    """
    first_rows = {}
    for entry_name in os.listdir(directory):
        candidate = os.path.join(directory, entry_name)
        if not os.path.isfile(candidate):
            continue
        try:
            first_rows[entry_name] = pd.read_csv(candidate).iloc[0].to_dict()
        except Exception as err:
            print(f"Error processing file {entry_name}: {err}")

    return first_rows

In [36]:
def create_smangrul_prompt(row_dict):
    """Render one example as a chat-formatted prompt string.

    Layout: a system turn (role marker, safety preamble, task instruction), a user turn,
    and, when the example has an ``"output"``, an assistant turn. Every turn starts with
    its role marker and ends with ``smangrul_end_decorator``.

    The system turn now carries ``smangrul_system_prefix``; it was previously emitted
    without a role marker, unlike the user and assistant turns.

    Args:
        row_dict: Example with ``"instruction"`` and ``"input"`` entries and an optional
            ``"output"`` entry.

    Returns:
        The assembled prompt text.
    """
    system_turn = (
        f"{smangrul_system_prefix}{smangrul_data_system_preprompt} \n "
        f"{row_dict['instruction']}{smangrul_end_decorator}\n"
    )
    user_turn = f"{smangrul_user_prefix}{row_dict['input']}{smangrul_end_decorator}\n"
    if 'output' in row_dict:
        assistant_turn = f"{smangrul_assistant_prefix}{row_dict['output']}{smangrul_end_decorator}"
    else:
        # Test-set rows have no answer; the prompt then ends after the user turn.
        assistant_turn = ""
    return system_turn + user_turn + assistant_turn

In [37]:
def create_smangrul_dataset(data_dict):
    """Render every dataset in ``data_dict`` as chat-formatted prompt records.

    Each value may be a list of dicts, a DataFrame (converted to row dicts) or another
    object exposing ``tolist()``, such as a Series. Rows that are dicts are passed to
    ``create_smangrul_prompt`` and wrapped as ``{"content": <prompt>}``; other rows are
    skipped. For every non-empty dataset a line is printed with the number and
    percentage of skipped rows.

    Args:
        data_dict: Mapping of dataset name -> rows. It is deep-copied before use.

    Returns:
        Dict mapping dataset name -> list of ``{"content": ...}`` records.
    """
    smangrul_data = {}

    for key, value in copy.deepcopy(data_dict).items():
        # DataFrame -> list of row dicts.
        if isinstance(value, pd.DataFrame):
            value = value.to_dict(orient='records')
        # Anything else that is not already a list (e.g. a Series) -> list of its elements.
        if not isinstance(value, list):
            value = value.tolist()

        prompts = [
            {"content": create_smangrul_prompt(row)}
            for row in value
            if isinstance(row, dict)
        ]
        smangrul_data[key] = prompts

        total_rows = len(value)
        if total_rows > 0:
            skipped_rows = total_rows - len(prompts)
            skipped_percentage = (skipped_rows / total_rows) * 100
            print(f"{skipped_rows} out of {total_rows} rows were skipped in key '{key}' -- {skipped_percentage:.2f}%")

    return smangrul_data

In [38]:
def _swap_quoted_merchant_name(text, new_name, row_index):
    """Return ``text`` with its single-quoted merchant name replaced by ``new_name``."""
    marker = "merchant_name='"
    marker_pos = text.rfind(marker)
    if marker_pos != -1:
        # Tagged form (... - @Avantlink:merchant_name='<name>'): the name runs from the marker
        # to the LAST quote, so apostrophes earlier in the sentence ("I'm", "Let's") or inside
        # the name itself cannot shift the span.
        # NOTE(review): assumes the tagged name is the final quoted text in the input — confirm.
        start = marker_pos + len(marker)
        end = text.rfind("'")
    else:
        # Untagged fallback: the first quoted segment, i.e. what split("'")[1] used to select
        first_quote = text.find("'")
        start = first_quote + 1 if first_quote != -1 else -1
        end = text.find("'", start) if start != -1 else -1

    if start == -1 or end < start:
        # IndexError is kept because that is what the previous split("'")[1] lookup raised
        raise IndexError(f"No single-quoted merchant name found in input at index {row_index}: {text!r}")

    # Splice only the located span; other occurrences of the old name are left alone
    return text[:start] + new_name + text[end:]


def replace_merchant_names_by_distribution(merchants_df, dict_list, seed_value=None, ratio=1.0):
    """
    Build test rows by swapping the quoted merchant name in each input for a random merchant.

    The first ``ceil(len(dict_list) * ratio)`` rows (clamped to the list length) are copied;
    in each copy the merchant name is replaced with one drawn from ``merchants_df`` and the
    'output' key is dropped if present. ``dict_list`` itself is never modified.

    Args:
        merchants_df: DataFrame with 'merchant_id' and 'merchant_name' columns.
        dict_list: list of row dicts, each with an 'input' string holding a quoted merchant name.
        seed_value: seed for the module-level ``random`` generator (reseeded on every call).
        ratio: fraction of ``dict_list`` (taken from the front) to process.

    Returns:
        (modified_rows, distribution) where distribution is
        ``{'seed': seed_value, 'merchant_counts': {name: {'count', 'indices', 'uuid'}}}``.

    Raises:
        IndexError: if a processed row's input contains no single-quoted merchant name.
    """
    random.seed(seed_value)  # Set the seed for reproducibility

    # Clamp so a ratio outside [0, 1] cannot yield a negative or oversized row count
    num_rows = max(0, min(len(dict_list), math.ceil(len(dict_list) * ratio)))
    # Only the rows that are returned need copying
    modified_list = copy.deepcopy(dict_list[:num_rows])
    distribution = {'seed': seed_value, 'merchant_counts': {}}

    print(f"Replacing merchant names for {num_rows} rows.")

    for dict_index, item in enumerate(modified_list):
        # Randomly select a merchant; the per-row random_state is drawn from the seeded RNG
        random_merchant_row = merchants_df.sample(n=1, random_state=random.randint(0, 10000)).iloc[0]
        random_merchant_id = random_merchant_row['merchant_id']
        random_merchant_name = random_merchant_row['merchant_name']

        item['input'] = _swap_quoted_merchant_name(item['input'], random_merchant_name, dict_index)
        # Test rows carry no expected answer; tolerate rows that never had one
        item.pop('output', None)

        # Track how often, and at which row indices, each merchant was used
        merchant_stats = distribution['merchant_counts'].setdefault(
            random_merchant_name,
            {'count': 0, 'indices': [], 'uuid': random_merchant_id},
        )
        merchant_stats['count'] += 1
        merchant_stats['indices'].append(dict_index)

    return modified_list, distribution



In [39]:
## Randomize Request Object Relative Dates
# randomize_relative_dates is defined elsewhere in the notebook; each call is unpacked into a
# (dashboard data, distribution) pair, and both calls share the same seed_value.
trending_dashboard_dict, trending_dashboard_distribution_dict = randomize_relative_dates(trending_widget_dict, relative_timeframes, seed_value)
# NOTE(review): despite the `_trending` suffix this holds the top-accounts data; the name is
# read again when the top-accounts widget training data is created, so rename both together.
top_accounts_dashboard_dict_trending, top_accounts_dashboard_distribution_dict = randomize_relative_dates(top_accounts_dict, relative_timeframes, seed_value)

In [74]:
# Show how often each relative timeframe was assigned in the two dashboards
print("Trending Dashboard Relative Date Distribution:\n", trending_dashboard_distribution_dict)
print("\nTop Accounts Dashboard Relative Date Distribution:\n", top_accounts_dashboard_distribution_dict)

Trending Dashboard Relative Date Distribution:
 {'last month': 3, 'today': 5, 'quarter to date': 3, 'yesterday': 3, 'previous 3 months': 5, 'month to date': 1, 'previous 7 days': 2, 'last quarter': 1, 'previous 12 months': 5, 'last year': 5, 'previous 30 days': 1, 'year to date': 2}

Top Accounts Dashboard Relative Date Distribution:
 {'last month': 3, 'today': 5, 'quarter to date': 4, 'yesterday': 3, 'previous 3 months': 5, 'month to date': 1, 'previous 7 days': 2, 'last quarter': 1, 'previous 12 months': 5, 'last year': 5, 'previous 30 days': 1, 'year to date': 3}


#### Merchant/Affiliate - Name <--> I.D. - training data creation

In [40]:
# create_training_data is defined elsewhere in the notebook. Judging by the records printed in
# the next cell, the third argument names the column used as 'input' and the fourth the column
# used as 'output', so each entity gets one data set per lookup direction.
## Merchant Name Training Data Creation (Human Readable Merchant Name -> Merchant I.D.)
merchant_name_training_data = create_training_data(merchants, merchant_name_to_id_training_prompt, 'merchant_name', 'merchant_id')
## Merchant I.D. Training Data Creation (Merchant I.D. -> Human Readable Merchant Name)
merchant_id_training_data = create_training_data(merchants, merchant_id_to_name_training_prompt, 'merchant_id', 'merchant_name')
## Affiliate Name Training Data Creation (Human Readable Affiliate Name -> Affiliate I.D.)
affiliate_name_training_data = create_training_data(affiliates, affiliate_name_to_id_training_prompt, 'affiliate_name', 'affiliate_id')
## Affiliate I.D. Training Data Creation (Affiliate I.D. -> Human Readable Affiliate Name)
affiliate_id_training_data = create_training_data(affiliates, affiliate_id_to_name_training_prompt, 'affiliate_id', 'affiliate_name')

In [41]:
# Spot-check the first record of each merchant/affiliate lookup set
pprint(merchant_name_training_data[0])
print('\n')
pprint(merchant_id_training_data[0])
print('\n')
pprint(affiliate_name_training_data[0])
print('\n')
pprint(affiliate_id_training_data[0])

{'input': 'The Robert Axle Project',
 'instruction': 'Given an @Avantlink:merchant_name, provide the corresponding '
                '@Avantlink:merchant_id. Merchant I.D.s are unique and '
                'universal between EDW2 and EDW3.',
 'output': 'ce411983-7346-4dd1-95f2-0d296df100dc'}


{'input': 'ce411983-7346-4dd1-95f2-0d296df100dc',
 'instruction': 'Given an @Avantlink:merchant_id, provide the corresponding '
                'human readable @Avantlink:merchant_name. Merchant I.D.s are '
                'unique and universal between EDW2 and EDW3.',
 'output': 'The Robert Axle Project'}


{'input': 'Good Pedals, LLC',
 'instruction': 'Given an @Avantlink:affiliate_name, provide the corresponding '
                '@Avantlink:affiliate_id. Affiliate I.D.s are unique and '
                'universal between EDW2 and EDW3.',
 'output': '97d20a78-fece-4291-8704-22beb32f0f91'}


{'input': '97d20a78-fece-4291-8704-22beb32f0f91',
 'instruction': 'Given an @Avantlink:affiliate_id, pro

#### Column Name (table_id), Description, and Definition - training data creation

In [42]:
# All three row-wise apply() results are converted with .tolist() so every *_data_list is a
# plain list of row dicts. Left as pandas Series, the last two would not behave like the other
# training sets wherever a list of dicts is expected (e.g. pd.DataFrame(value) turns a Series
# of dicts into a single column of dicts instead of instruction/input/output columns).
## Column Definition Training Data Creation (Human Language Description -> Column Definition JSON)
col_definition_data_list = prepared_column_definition_training_data.apply(create_col_definition_dict, axis=1).tolist()
## Table I.D. Training Data Creation (Human Language Description -> Table I.D.)
table_id_data_list = prepared_column_definition_training_data.apply(create_table_id_dict, axis=1).tolist()
## Column Definition Training Data Creation (Table I.D. -> Column Definition JSON)
column_id_data_list = prepared_column_definition_training_data.apply(create_column_id_dict, axis=1).tolist()

In [43]:
# Spot-check the first record of each column-related training set
pprint(col_definition_data_list[0])
print('\n')
pprint(table_id_data_list[0])
print('\n')
pprint(column_id_data_list[0])

{'input': 'Avantmetrics: Summarised count of channel orders',
 'instruction': 'Given a description of a column in @Avantlink:EDW2, provide '
                'the corresponding JSON definition for that column. Column '
                'Definitions are unique, but not universal between EDW2 and '
                'EDW3.',
 'output': {'aggregate': [{'distinct': False, 'func': 'sum'}],
            'alias': 'channel_summary_order_count',
            'fact': True,
            'id': 'fact_order_channel_summary-order_count',
            'name': 'Orders'}}


{'input': 'Avantmetrics: Summarised count of channel orders',
 'instruction': 'Given a description of a column in an @Avantlink:EDW2, '
                'provide the corresponding table I.D. for that column. The '
                'Table I.D. is unique, but not universal between EDW2 and '
                'EDW3.',
 'output': 'fact_order_channel_summary-order_count'}


{'input': 'fact_order_channel_summary-order_count',
 'instruction': 'Given t

In [44]:
# create_dashboard_training_data is defined elsewhere in the notebook; each call is unpacked
# into (training rows, distribution of the phrasing templates used).
## Trending Widget Training Data Creation (Human Language Direction -> Performance Report JSON)
trending_widget_training_data, trending_widget_distribution_json = create_dashboard_training_data(trending_dashboard_dict, trending_widget_report_json_training_prompt, trending_widget_prefix, seed_value)
## Top Accounts Widget Training Data Creation (Human Language Direction -> Performance Report JSON)
# NOTE(review): top_accounts_dashboard_dict_trending is the top-accounts data despite its suffix
top_accounts_training_data, top_accounts_distribution_json = create_dashboard_training_data(top_accounts_dashboard_dict_trending, top_accounts_widget_report_json_training_prompt, top_accounts_prefix, seed_value)

In [77]:
# Display how many trending-widget rows used each phrasing template
trending_widget_distribution_json

{'May I have a look at the {report_name} report from {category}?': 7,
 'Show me the {report_name} report from {category}, please': 6,
 'Fetch the {report_name} report categorized under {category}.': 11,
 'Could I get the {report_name} report under {category}?': 10,
 "I'm interested in viewing the {report_name} under {category}.": 7,
 'Access the {report_name} report within the {category} section.': 8,
 "Let's see the {report_name} report from {category}": 11,
 'Bring up the {report_name} report from {category} for review': 10,
 'I need the {report_name} report from {category} for analysis.': 8,
 'Can I check out the {report_name} report from {category}?': 5,
 'Load up the {report_name} report from the {category} category.': 4,
 'Display the {report_name} report found in {category}': 15,
 'Generate a {report_name} report from {category}, if you will.': 8,
 'Can you provide the {report_name} report from {category}?': 12,
 'Could you pull up the {report_name} from {category}?': 11,
 'I re

In [45]:
# Spot-check the first trending-widget record (before a merchant name is added to its input)
pprint(trending_widget_training_data[0])

{'input': '@AvantQueryEDW2:trending_widget - May I have a look at the Sales '
          'report from Sales for the last month?',
 'instruction': 'Given a human readable request for an '
                '@Avantlink:EDW2:trending_widget report, provide the '
                'corresponding JSON query for the specified trending widget '
                'report. Include the appropriate date range, and merchant uuid '
                'filter for the report. The rest of this kind of report is '
                'premade.',
 'output': "Sure! Here's the Sales report from Sales for last month:\n"
           '\n'
           '{"trending_widget": {"cols": [{"id": "dim_date-mm_dd_yyyy", '
           '"name": "Day", "alias": "mm_dd_yyyy", "aggregate": [{"func": '
           '"range"}]}, {"id": "calculation", "calc": "sales + adjustments", '
           '"fact": true, "name": "Sales", "vars": {"sales": {"id": '
           '"fact_order_avantlink-order_amount", "aggregate": [{"func": "sum", '
           '

In [46]:
# Trending widget: build a seeded merchant distribution, then apply it to the training rows.
# NOTE(review): both helpers are defined elsewhere in the notebook; this description is
# inferred from their names and arguments — confirm against their definitions.
randomized_merchant_trending_widget_distribution = create_randomized_merchant_distribution(trending_widget_training_data, seed_value)
randomized_merchant_trending_widget_training_data = execute_randomized_merchant_distribution(trending_widget_training_data, randomized_merchant_trending_widget_distribution)

In [47]:
# Top accounts widget: same two-step merchant randomization as for the trending widget,
# using the same seed_value.
# NOTE(review): helpers are defined elsewhere in the notebook — confirm their behaviour there.
randomized_merchant_top_accounts_distribution = create_randomized_merchant_distribution(top_accounts_training_data, seed_value)
randomized_merchant_top_accounts_training_data = execute_randomized_merchant_distribution(top_accounts_training_data, randomized_merchant_top_accounts_distribution)

In [48]:
# Add the merchant name to each row's input; the sample displayed in the next cell shows the
# resulting suffix form: " - @Avantlink:merchant_name='<name>'".
formatted_trending_widget_training_data = insert_merchant_name_into_inputs(randomized_merchant_trending_widget_training_data)
formatted_top_accounts_training_data = insert_merchant_name_into_inputs(randomized_merchant_top_accounts_training_data)

In [49]:
# Spot-check the first formatted trending-widget record (input now carries the merchant name)
formatted_trending_widget_training_data[0]

{'instruction': 'Given a human readable request for an @Avantlink:EDW2:trending_widget report, provide the corresponding JSON query for the specified trending widget report. Include the appropriate date range, and merchant uuid filter for the report. The rest of this kind of report is premade.',
 'input': "@AvantQueryEDW2:trending_widget - May I have a look at the Sales report from Sales for the last month? - @Avantlink:merchant_name='Primary Arms'",
 'output': 'Sure! Here\'s the Sales report from Sales for last month:\n\n{"trending_widget": {"cols": [{"id": "dim_date-mm_dd_yyyy", "name": "Day", "alias": "mm_dd_yyyy", "aggregate": [{"func": "range"}]}, {"id": "calculation", "calc": "sales + adjustments", "fact": true, "name": "Sales", "vars": {"sales": {"id": "fact_order_avantlink-order_amount", "aggregate": [{"func": "sum", "distinct": true}], "required_groups": ["sales"]}, "adjustments": {"id": "fact_order_adjustment-order_combined_adjustment_amount", "aggregate": [{"func": "sum", "d

In [50]:
## Report Query Training Data Creation (Natural Language Query -> Complete Report JSON)
# NOTE(review): `natural_langage_query_to_complete_report_json` is spelled this way where the
# prompt is defined near the top of the notebook; fix the typo in both places together.
custom_report_query_data_list = create_report_query_data_list(merged_df, natural_langage_query_to_complete_report_json)

In [51]:
# Spot-check the first custom report query record
custom_report_query_data_list[0]

{'instruction': '\n## Purpose\nYour task is to convert Avantlink EDW2 natural language queries ( @AvantQueryEDW2 ), into properly structured JSON queries suitable for the Avantlink API.\n\n## Understanding the Query\n1. **Identify Key Elements**: Carefully read the query to identify important elements such as data fields, filters, date ranges, and special conditions like record limits.\n2. **Clarify Ambiguities**: If any part of the query is unclear, make reasonable assumptions based on the context.\n\n## Constructing the JSON Query\n1. **Initialize a Basic JSON Template**: Start with a JSON structure containing necessary keys like `"performance_summary"`, `"cols"`, and `"filters"`.\n2. **Populate Data Fields**:\n   - For each required data field in the query, add an object in the `"cols"` array.\n   - Assign `id`, `alias`, and `format` for each field, conforming to Avantlink\'s standards.\n3. **Incorporate Filters**:\n   - For each filter condition, add an object in the `"filters"` ar

### With the training data created, let's take a moment to consider the order in which the data should be learned.
###### Progressing from Basic To Advanced (affiliate + merchant names) --> (prepared column definitions) --> (Trending/Top Accounts Reports - note timeframe) --> (Custom AvantMetrics Reports)

In [52]:
# Gather all training data into a single dict (data set name -> rows), ordered from basic to
# advanced - Affiliate data commented out for now to reduce training time/cost
# Each value is expected to be a collection of {'instruction', 'input', 'output'} row dicts.
training_data_dict = {
    'merchant_name_training_data': merchant_name_training_data,
    'merchant_id_training_data': merchant_id_training_data,
    # 'affiliate_name_training_data': affiliate_name_training_data,
    # 'affiliate_id_training_data': affiliate_id_training_data,
    'column_definition_data_list': col_definition_data_list,
    'table_id_data_list': table_id_data_list,
    'column_id_data_list': column_id_data_list,
    'formatted_trending_widget_training_data': formatted_trending_widget_training_data,
    'formatted_top_accounts_training_data': formatted_top_accounts_training_data,
    'custom_report_query_data_list': custom_report_query_data_list
}


In [53]:
# Persist every training set to CSV. Per this cell's recorded output, one file per key is
# written as <key>_<seed_value>.csv under ./data/training/<seed_value>_v1.0, and the directory
# path is returned (shown as the cell result).
write_dict_of_lists_to_csv(training_data_dict, seed_value)

Directory './data/training/626_v1.0' was created.
File written: ./data/training/626_v1.0/merchant_name_training_data_626.csv
File written: ./data/training/626_v1.0/merchant_id_training_data_626.csv
File written: ./data/training/626_v1.0/column_definition_data_list_626.csv
File written: ./data/training/626_v1.0/table_id_data_list_626.csv
File written: ./data/training/626_v1.0/column_id_data_list_626.csv
File written: ./data/training/626_v1.0/formatted_trending_widget_training_data_626.csv
File written: ./data/training/626_v1.0/formatted_top_accounts_training_data_626.csv
File written: ./data/training/626_v1.0/custom_report_query_data_list_626.csv


'./data/training/626_v1.0'

In [54]:
# Build widget test rows by swapping in randomly drawn merchants.
# Assumes `merchants` is a large dataframe with many merchants
widget_training_data = training_data_dict['formatted_top_accounts_training_data'] + training_data_dict['formatted_trending_widget_training_data']
ratio = 0.5
# NOTE(review): this call hard-codes ratio=0.4 instead of using `ratio`, and keeps the result as
# an un-unpacked (modified_list, distribution) tuple — confirm whether it is still needed.
avantlink_test_data = replace_merchant_names_by_distribution(merchants, widget_training_data, seed_value=seed_value, ratio=0.4)
# replace_merchant_names_by_distribution() returns the modified list and the distribution dictionary
generated_test_performance_reports, generated_test_performance_reports_distribution = replace_merchant_names_by_distribution(merchants, widget_training_data, seed_value, ratio)

Replacing merchant names for 148 rows.
Replacing merchant names for 185 rows.


In [55]:
# Render every training row into a {'content': prompt} record, keyed like training_data_dict
avantlink_smangrul_data_dict = create_smangrul_dataset(training_data_dict)

0 out of 4970 rows were skipped in key 'merchant_name_training_data' -- 0.00%
0 out of 4970 rows were skipped in key 'merchant_id_training_data' -- 0.00%
0 out of 164 rows were skipped in key 'column_definition_data_list' -- 0.00%
0 out of 164 rows were skipped in key 'table_id_data_list' -- 0.00%
0 out of 164 rows were skipped in key 'column_id_data_list' -- 0.00%
0 out of 180 rows were skipped in key 'formatted_trending_widget_training_data' -- 0.00%
0 out of 190 rows were skipped in key 'formatted_top_accounts_training_data' -- 0.00%
0 out of 1167 rows were skipped in key 'custom_report_query_data_list' -- 0.00%


In [56]:
# Convert existing Datasets to pandas DataFrames
# NOTE(review): train_df / test_df are not used by the other statements in this cell — confirm
# they are needed further down.
train_df = smangrul_dataset['train'].to_pandas()
test_df = smangrul_dataset['test'].to_pandas()

# Stack the rendered prompts of every data set into one single-column ('content') DataFrame.
# pd.concat is called without ignore_index, so each frame keeps its own index labels.
avantlink_smangrul_training_dataframes = pd.concat([pd.DataFrame(value) for key, value in avantlink_smangrul_data_dict.items()])
# Stack the raw rows of every data set; despite its name this is a DataFrame, not a list.
# NOTE(review): pd.DataFrame(value) only yields instruction/input/output columns when `value`
# is a list of dicts; a pandas Series of dicts would become a single column of dicts — make
# sure every value in training_data_dict is a list.
avantlink_data_list = pd.concat([pd.DataFrame(value) for key, value in training_data_dict.items()])
# generate_testing_dict_list is defined elsewhere in the notebook; it is unpacked into the
# sampled test rows and their distribution.
avantlink_smangrul_test_data, avantlink_smangrul_test_data_distribution = generate_testing_dict_list(avantlink_data_list, seed_value, ratio=0.22)

Removing 'output' from 2634 rows.


In [57]:
# Compare the total number of raw rows with the number sampled for testing
print(len(avantlink_data_list))
print(len(avantlink_smangrul_test_data))

11969
2634


In [58]:
# Wrap the test rows under a single key so create_smangrul_dataset can process them
avantlink_smangrul_test_obj = {"data": avantlink_smangrul_test_data}
# NOTE(review): this rebinds avantlink_smangrul_test_data from the sampled rows to a
# {'data': [...]} dict, so this cell cannot be re-run on its own — re-run the cell that
# samples the test rows first.
avantlink_smangrul_test_data = create_smangrul_dataset(avantlink_smangrul_test_obj)

0 out of 2634 rows were skipped in key 'data' -- 0.00%


In [59]:
# Single-column ('content') DataFrame holding the rendered test prompts
avantlink_smangrul_test_dataframe = pd.DataFrame(avantlink_smangrul_test_data['data'])

In [70]:
# Sanity check before merging: sizes and types of the new AvantLink frames versus the
# existing smangrul_dataset splits.
print(len(avantlink_smangrul_test_dataframe))
print(len(avantlink_smangrul_training_dataframes))
print('\n')

print(type(avantlink_smangrul_test_dataframe))
print(type(avantlink_smangrul_training_dataframes))
print('\n')

print(len(smangrul_dataset["test"]))
print(len(smangrul_dataset["train"]))
print('\n')

print(type(smangrul_dataset["test"]))
print(type(smangrul_dataset["train"]))


2634
11969


<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


818
10876


<class 'datasets.arrow_dataset.Dataset'>
<class 'datasets.arrow_dataset.Dataset'>


In [61]:
# Convert the AvantLink DataFrames to Dataset format
test_dataset = Dataset.from_pandas(avantlink_smangrul_test_dataframe)
train_dataset = Dataset.from_pandas(avantlink_smangrul_training_dataframes)

# Retrieve the existing test and train datasets
existing_test_dataset = smangrul_dataset['test']
existing_train_dataset = smangrul_dataset['train']

# Concatenate the AvantLink datasets onto the existing ones (existing rows come first).
# The retrieved splits are used here instead of repeating the smangrul_dataset[...] lookups.
combined_test_dataset = concatenate_datasets([existing_test_dataset, test_dataset])
combined_train_dataset = concatenate_datasets([existing_train_dataset, train_dataset])

# Create a new combined dataset
combined_smangrul_dataset = DatasetDict({
    'test': combined_test_dataset,
    'train': combined_train_dataset
})

In [62]:
# Drop the '__index_level_0__' helper column from the train split (presumably the pandas index
# carried over by Dataset.from_pandas — confirm). The membership check keeps the cell
# re-runnable: remove_columns raises once the column is already gone.
if '__index_level_0__' in combined_smangrul_dataset['train'].column_names:
    combined_smangrul_dataset['train'] = combined_smangrul_dataset['train'].remove_columns(['__index_level_0__'])
combined_smangrul_dataset

DatasetDict({
    test: Dataset({
        features: ['content'],
        num_rows: 3452
    })
    train: Dataset({
        features: ['content'],
        num_rows: 22845
    })
})

In [63]:
# combined_smangrul_dataset.save_to_disk(f'./data/training/avantlink_edw2_smangrul_training_dataset--no_affiliates_{seed_value}')

Saving the dataset (1/1 shards): 100%|██████████| 3452/3452 [00:00<00:00, 516673.35 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 22845/22845 [00:00<00:00, 559425.01 examples/s]


In [69]:
# combined_smangrul_dataset.push_to_hub("avant_assist")

Creating parquet from Arrow format: 100%|██████████| 4/4 [00:00<00:00, 424.33ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.03s/it]
Creating parquet from Arrow format: 100%|██████████| 23/23 [00:00<00:00, 218.76ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:08<00:00,  8.59s/it]


In [64]:
# Gather all Distribution Objects - Just for analysis
# Each key is the name of the variable it holds, so entries can be traced back to the cell
# that produced them.
all_distributions = {
    'trending_widget_distribution_json': trending_widget_distribution_json,
    'top_accounts_distribution_json': top_accounts_distribution_json,
    'randomized_merchant_trending_widget_distribution': randomized_merchant_trending_widget_distribution,
    'randomized_merchant_top_accounts_distribution': randomized_merchant_top_accounts_distribution,
    'avantlink_smangrul_test_data_distribution': avantlink_smangrul_test_data_distribution,
    'generated_test_performance_reports_distribution': generated_test_performance_reports_distribution,
    'trending_dashboard_distribution_dict': trending_dashboard_distribution_dict,
    'top_accounts_dashboard_distribution_dict': top_accounts_dashboard_distribution_dict
}

In [66]:
print(type(avantlink_smangrul_test_dataframe))
print(type(avantlink_smangrul_training_dataframes))

# Write the combined test/train splits to snappy-compressed parquet files named after
# seed_value under ./data/training/.
# NOTE(review): the assigned names hold whatever Dataset.to_parquet returns (presumably a
# byte count), not the data itself — confirm before reusing them.
smangrul_test_data = combined_smangrul_dataset['test'].to_parquet(f'./data/training/avantlink_edw2_smangrul_test_data_{seed_value}.parquet', compression='snappy')
smangrul_train_data = combined_smangrul_dataset['train'].to_parquet(f'./data/training/avantlink_edw2_smangrul_train_data_{seed_value}.parquet', compression='snappy')

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


Creating parquet from Arrow format: 100%|██████████| 4/4 [00:00<00:00, 464.25ba/s]
Creating parquet from Arrow format: 100%|██████████| 23/23 [00:00<00:00, 232.15ba/s]


In [75]:
# Display every collected distribution for inspection (the output is large)
all_distributions

{'trending_widget_distribution_json': {'May I have a look at the {report_name} report from {category}?': 7,
  'Show me the {report_name} report from {category}, please': 6,
  'Fetch the {report_name} report categorized under {category}.': 11,
  'Could I get the {report_name} report under {category}?': 10,
  "I'm interested in viewing the {report_name} under {category}.": 7,
  'Access the {report_name} report within the {category} section.': 8,
  "Let's see the {report_name} report from {category}": 11,
  'Bring up the {report_name} report from {category} for review': 10,
  'I need the {report_name} report from {category} for analysis.': 8,
  'Can I check out the {report_name} report from {category}?': 5,
  'Load up the {report_name} report from the {category} category.': 4,
  'Display the {report_name} report found in {category}': 15,
  'Generate a {report_name} report from {category}, if you will.': 8,
  'Can you provide the {report_name} report from {category}?': 12,
  'Could you pul