# Generation Token Compression

#### Load Helper Functions and Import Libraries

In [67]:
import datetime
import json
import time
import os
import datetime
import json
import time
from openai import AzureOpenAI
from dotenv import load_dotenv
import json
import copy
import textwrap

# Load environment variables so that the os.getenv() lookups further down
# (API_VERSION, AZURE_ENDPOINT, API_KEY, MODELGPT432k) can resolve.
# NOTE(review): presumably these live in a local .env file — confirm.
load_dotenv()

def aoai_call(system_message,prompt,model):
    """Send one system + user chat completion request to Azure OpenAI and time it.

    The client is configured from the API_VERSION, AZURE_ENDPOINT and API_KEY
    environment variables, and a new client is created on every call.

    Args:
        system_message: Content of the "system" message.
        prompt: Content of the "user" message.
        model: Value passed as ``model`` to ``chat.completions.create``.

    Returns:
        A tuple ``(result, prompt_tokens, completion_tokens, completion_text,
        e2e_time)``. ``result`` is the full response as a plain dict, the two
        token counts are read from ``result["usage"]``, ``completion_text`` is
        the message content of the first choice, and ``e2e_time`` is the
        elapsed time of the request in seconds.
    """
    client = AzureOpenAI(
        api_version=os.getenv("API_VERSION"),
        azure_endpoint=os.getenv("AZURE_ENDPOINT"),
        api_key=os.getenv("API_KEY"),
    )

    # Use the monotonic performance counter rather than the wall clock, so the
    # measured duration cannot be skewed by system clock adjustments.
    start_time = time.perf_counter()

    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": prompt},
        ],
    )

    e2e_time = time.perf_counter() - start_time

    # Round-trip through JSON to obtain a plain dict of the response.
    result = json.loads(completion.model_dump_json())
    usage = result["usage"]
    prompt_tokens = usage["prompt_tokens"]
    completion_tokens = usage["completion_tokens"]
    completion_text = result["choices"][0]["message"]["content"]

    return result, prompt_tokens, completion_tokens, completion_text, e2e_time

# Model identifier handed to aoai_call() in the cells below, read from the
# MODELGPT432k environment variable.
model=os.getenv("MODELGPT432k")

# Read the sales report to be summarised from a text file.
# The encoding is given explicitly so the result does not depend on the
# platform's default locale encoding.
# NOTE(review): assumes sales_report.txt is stored as UTF-8 — confirm.
with open('sales_report.txt', 'r', encoding='utf-8') as f:
    sales_report = f.read()

## Use case: Summarising a report

### A: Naive summarisation, using the model's default verbosity

**Time taken: 20 seconds**

The model has a natural level of verbosity – that is, how much it chooses to say. Some models give long explanations in answer to a question, while others tend to answer more succinctly. The total time the model takes to finish depends largely on this.

In [68]:
# Baseline run: ask for a summary without constraining the answer's length.
system_message = "\nYou are a helpful AI assistant.\n"
prompt = (
    "\nDocument to summarise:\n"
    f"{sales_report}\n"
    "Summarise this document. Include the type of game, the sales performance by region, "
    "the performance of the launch, and feedback about the launch.\n"
)

(result, prompt_tokens, completion_tokens, completion_text, e2e_time) = aoai_call(
    system_message, prompt, model
)

# Report token usage and end-to-end latency for this run.
print(
    f"Prompt Tokens: {prompt_tokens}",
    f"Completion Tokens: {completion_tokens}",
    f"Time taken: {e2e_time:.2f} seconds",
    sep="\n",
)


Prompt Tokens: 1554
Completion Tokens: 246
Time taken: 20.38 seconds


In [69]:
# Echo the summary returned by the previous cell as this cell's output.
completion_text

"The document discusses the global launch of a flagship real time strategy (RTS) video game. The game, known for its immersive gameplay, dynamic environment, and strategic planning elements, demonstrated impressive sales performance. \n\nIn the North American market, the game yielded sales of 1.5 million units, far exceeding the initial projections of one million units. The international markets mirrored this success. Europe recorded sales of 1.8 million units and Asia 2 million units. Though comparatively lower, the Latin America and Africa markets showed potential with sales of 0.5 million and 0.3 million units respectively. \n\nDespite the impressive sales, user feedback indicated issues with the game's micro-transaction system. Users across various regions perceived the system negatively, seeing it as a barrier to progress within the game without spending excessive real money. This criticism has led to the company noticing the need for immediate improvements in balancing user satis

### B: Summarisation with a focus on conciseness

**Time taken: 8.5 seconds** (timings vary between runs; the run recorded below took 14.86 seconds)

By simply asking the model to be more succinct, the LLM spends less time generating tokens, making the overall response much faster.

Of course, this may mean the answer is less complete, or doesn't fully meet the user's expectations. For backend processes, such as explaining the reason for a decision, this is often acceptable. For customer facing applications, this may also be ok, but testing is required.

In [70]:
# Same summarisation request as before, with an added instruction to keep the
# answer as short as possible.
system_message = "\nYou are a helpful AI assistant.\n"
prompt = (
    "\nDocument to summarise:\n"
    f"{sales_report}\n"
    "Summarise this document. Include the type of game, the sales performance by region, "
    "the performance of the launch, and feedback about the launch. "
    "Be as succint as possible, using as few words as possible.\n"
)

(result, prompt_tokens, completion_tokens, completion_text, e2e_time) = aoai_call(
    system_message, prompt, model
)

# Report token usage and latency, followed by the summary itself.
print(
    f"Prompt Tokens: {prompt_tokens}",
    f"Completion Tokens: {completion_tokens}",
    f"Time taken: {e2e_time:.2f} seconds",
    completion_text,
    sep="\n",
)


Prompt Tokens: 1568
Completion Tokens: 130
Time taken: 14.86 seconds
The document refers to the global launch of a leading Real Time Strategy (RTS) video game. The game has exceeded sales expectations in North America (1.5 million units), Europe (1.8 million units), and Asia (2 million units), with promising potential in Latin America (0.5 million units) and Africa (0.3 million units). However, feedback from players globally criticizes the game's micro-transaction system, which they find excessively expensive. Despite this, the game's strong sales reflect its overall success, and the company plans to revamp the micro-transaction system and tailor marketing strategies for underperforming markets.



### C: Few shot prompting

**Time taken: 8.2 seconds**

The best approach is to use few shot prompting to help guide the model, to better optimise the balance between succinctness and completeness. This also has the advantage of providing a structured output, which will be consistent across the application (for example, every report being summarised will now be in the same format).

In [71]:
# Few-shot run: the prompt carries one worked example (a document and its
# structured summary) ahead of the real document, then ends on "Summary:" so
# the model continues in the same format.
system_message = "\nYou are a helpful AI assistant.\n"
prompt = f"""


You must use the below structure for your summary.
START_EXAMPLE:
Document:
Product Briefing Document
I. Product Overview
Product: "Battlefield Conquerors"
Battlefield Conquerors is an immersive, action-packed First-Person Shooter (FPS) game that thrusts players into a gritty, fast-paced world of strategic warfare. Skill, precision, and quick thinking are the keys to victory in this adrenaline-fueled gaming experience.
II. Sales Performance
Battlefield Conquerors has achieved robust sales across multiple global regions, effectively penetrating the gaming market.
North America (NA)
In North America, the game has resonated particularly well, with a total of 1.5 million units sold. This success can be attributed to its strategic marketing campaign and the region's affinity for the FPS genre.
Europe (EU)
Europe has emerged as the game's most successful region in terms of sales, with an impressive 2.0 million units sold. The game's realistic graphics, dynamic gameplay, and stimulating storylines have been lauded by European gamers.
Asia-Pacific (APAC)
In the APAC region, Battlefield Conquerors has sold 1.0 million units. This solid performance is a testament to its broad appeal and the successful localization of the game's content for these markets.
III. Financial Performance
Battlefield Conquerors has not only met its financial targets but exceeded them. The game has achieved a performance of 1.2 times its initial budget, demonstrating its profitability and the successful return on investment.
IV. Customer Feedback
Customer feedback for Battlefield Conquerors has been overwhelmingly positive. Players have praised the game's innovative mechanics, immersive environment, and challenging gameplay.
However, some concerns have been raised regarding the game's loading times. These issues have been acknowledged and are currently being addressed by the development team to ensure a seamless and uninterrupted gaming experience for all users moving forward.
In conclusion, Battlefield Conquerors is a successful and profitable product, demonstrating strong sales performance across multiple regions. The few areas of improvement identified are being addressed to ensure continued success and customer satisfaction.

Summary:
Product Type:
FPS

Sales:
NA: 1.5M
EU: 2.0M
APAC: 1.0M

Performance:
1.2 times budget.

Feedback:
Overall positive. Some concerns around loading times.
END_EXAMPLE

Document:
{sales_report}

Summary:
"""

(result, prompt_tokens, completion_tokens, completion_text, e2e_time) = aoai_call(
    system_message, prompt, model
)

# Report token usage and latency, followed by the structured summary.
print(
    f"Prompt Tokens: {prompt_tokens}",
    f"Completion Tokens: {completion_tokens}",
    f"Time taken: {e2e_time:.2f} seconds",
    completion_text,
    sep="\n",
)


Prompt Tokens: 2004
Completion Tokens: 72
Time taken: 8.08 seconds
Product Type:
RTS

Sales:
NA: 1.5M
EU: 1.8M
Asia: 2.0M
Latin America: 0.5M
Africa: 0.3M

Performance:
1.5 times projections.

Feedback:
Mixed; praised for mechanics but criticized for micro-transaction system.


## Use case: Classification

The impact of this technique is proportional to the number of documents being classified in series. The same idea can be applied elsewhere: give the LLM short "codes" to emit in place of longer labels, so that it spends less time generating tokens.

For more advanced techniques, see parallelization. These techniques can be combined for even greater speed.

In [72]:
# Batch of short texts to classify, kept as one string laid out like a JSON
# array so it can be interpolated directly into the prompts below.
# The same block of 14 entries appears twice, giving 28 documents in total.
# NOTE: the trailing comma before the closing "]" means this is not strictly
# valid JSON; in the cells shown here it is only used as prompt text.
documents_to_classify = """[
    "Scientific research has led to significant advancements in medicine and healthcare.",  
    "CLIP (Contrastive Language-Image Pretraining) - OpenAI's model that understands images in the context of natural language",  
    "Science has contributed to our understanding of the natural world and the universe.",  
    "Codex - OpenAI's AI system that can understand and generate code, powering GitHub Copilot",  
    "GPT-4 - OpenAI's rumored next iteration of their language model with anticipated improvements",  
    "Azure AI - Microsoft's suite of AI services, including machine learning, cognitive services, and conversational AI",  
     "The collaboration and exchange of scientific knowledge across international borders have facilitated global progress in various fields." ,
     "Scientific innovations have improved communication and connectivity through technology.", 
    "Microsoft Turing Models - A series of large-scale language models developed by Microsoft",  
    "Microsoft Project Brainwave - Real-time AI platform for cloud and edge computing",  
    "Microsoft AI for Earth - A program applying AI to environmental challenges",  
    "Microsoft AI for Health - An initiative leveraging AI for health-related research",  
    "Scientific innovations have improved communication and connectivity through technology.",  
    "OpenAI's API - Providing access to GPT-3 and other models for various applications",   
    "Scientific research has led to significant advancements in medicine and healthcare.",  
    "CLIP (Contrastive Language-Image Pretraining) - OpenAI's model that understands images in the context of natural language",  
    "Science has contributed to our understanding of the natural world and the universe.",  
    "Codex - OpenAI's AI system that can understand and generate code, powering GitHub Copilot",  
    "GPT-4 - OpenAI's rumored next iteration of their language model with anticipated improvements",  
    "Azure AI - Microsoft's suite of AI services, including machine learning, cognitive services, and conversational AI",  
     "The collaboration and exchange of scientific knowledge across international borders have facilitated global progress in various fields." ,
     "Scientific innovations have improved communication and connectivity through technology.", 
    "Microsoft Turing Models - A series of large-scale language models developed by Microsoft",  
    "Microsoft Project Brainwave - Real-time AI platform for cloud and edge computing",  
    "Microsoft AI for Earth - A program applying AI to environmental challenges",  
    "Microsoft AI for Health - An initiative leveraging AI for health-related research",  
    "Scientific innovations have improved communication and connectivity through technology.",  
    "OpenAI's API - Providing access to GPT-3 and other models for various applications",   
]"""

### A: Classifying documents using the full category name

**Time taken: 12.5 seconds** (timings vary between runs; the run recorded below took 9.67 seconds)

Here the model is providing the classification labels as the full text of the category name. This takes additional time, as more tokens are required per category name classified. The model has already made the determination of the class, but it takes additional time to convey that information to the application.

In [73]:
# Classification run in which the model answers with the full category name
# for every document. The system message holds the instructions plus one
# worked example; the user message holds the batch to classify.
system_message = """
You are an helpful AI assistant that categorizes text in one of these categories : [A]: SCIENCE, [B]: ARTIFICIAL INTELLIGENCE, [C]: ART, [D]: HUMANITIES. 
Do not add any additional information.

DOCUMENTS_TO_CLASSIFY:
["GPT-3 (Generative Pre-trained Transformer 3) - OpenAI's powerful language model capable of writing like a human",  
"Pigeons’ Backflips Linked to Genetics Scientists have unraveled the genetic basis behind a fascinating avian behavior",
]
Example: [ARTIFICIAL INTELLIGENCE, SCIENCE]
"""
prompt = "\nDOCUMENTS_TO_CLASSIFY:\n" f"{documents_to_classify}\n"

(result, prompt_tokens, completion_tokens, completion_text, e2e_time) = aoai_call(
    system_message, prompt, model
)

# Report token usage and latency, followed by the returned labels.
print(
    f"Prompt Tokens: {prompt_tokens}",
    f"Completion Tokens: {completion_tokens}",
    f"Time taken: {e2e_time:.2f} seconds",
    completion_text,
    sep="\n",
)


Prompt Tokens: 694
Completion Tokens: 175
Time taken: 9.67 seconds
[SCIENCE, ARTIFICIAL INTELLIGENCE, SCIENCE, ARTIFICIAL INTELLIGENCE, ARTIFICIAL INTELLIGENCE, ARTIFICIAL INTELLIGENCE, SCIENCE, SCIENCE, ARTIFICIAL INTELLIGENCE, ARTIFICIAL INTELLIGENCE, ARTIFICIAL INTELLIGENCE, ARTIFICIAL INTELLIGENCE, SCIENCE, ARTIFICIAL INTELLIGENCE, SCIENCE, ARTIFICIAL INTELLIGENCE, SCIENCE, ARTIFICIAL INTELLIGENCE, ARTIFICIAL INTELLIGENCE, ARTIFICIAL INTELLIGENCE, SCIENCE, SCIENCE, ARTIFICIAL INTELLIGENCE, ARTIFICIAL INTELLIGENCE, ARTIFICIAL INTELLIGENCE, ARTIFICIAL INTELLIGENCE, SCIENCE, ARTIFICIAL INTELLIGENCE]


### B: Use Categories to reduce the number of tokens generated

**Time taken: 4 seconds** (timings vary between runs; the run recorded below took 7.18 seconds)

By assigning codes to each of the categories, the LLM only has to generate a single token per document classified. This is significantly faster. The codes can then be mapped back to the original class.

In [74]:
# Classification run in which the model answers with a short category code
# instead of the full category name. The system message holds the
# instructions plus one worked example; the user message holds the batch.
system_message = """
You are an helpful AI assistant that categorizes text in one of these categories : [A]: SCIENCE, [B]: ARTIFICIAL INTELLIGENCE, [C]: ART, [D]: HUMANITIES. 
Do not add any additional information. Only responde with the code for the category. For example, if it is SCIENCE, respond with [A].

DOCUMENTS_TO_CLASSIFY:
["GPT-3 (Generative Pre-trained Transformer 3) - OpenAI's powerful language model capable of writing like a human",  
"Pigeons’ Backflips Linked to Genetics Scientists have unraveled the genetic basis behind a fascinating avian behavior",
]
Example: ["B", "A"]
"""
prompt = "\nDOCUMENTS_TO_CLASSIFY:\n" f"{documents_to_classify}\n"

(result, prompt_tokens, completion_tokens, completion_text, e2e_time) = aoai_call(
    system_message, prompt, model
)

# Report token usage and latency, followed by the returned codes.
print(
    f"Prompt Tokens: {prompt_tokens}",
    f"Completion Tokens: {completion_tokens}",
    f"Time taken: {e2e_time:.2f} seconds",
    completion_text,
    sep="\n",
)


Prompt Tokens: 712
Completion Tokens: 84
Time taken: 7.18 seconds
["A", "B", "A", "B", "B", "B", "A", "A", "B", "B", "B", "B", "A", "B", "A", "B", "A", "B", "B", "B", "A", "A", "B", "B", "B", "B", "A", "B"]


In [75]:
# Reformat to original categories

# Parse the model's reply, which is expected to be a JSON array of category
# codes such as ["A", "B"].
document_classes_list=json.loads(completion_text)

# Define the dictionary
categories = {'A': 'SCIENCE', 'B': 'ARTIFICIAL INTELLIGENCE', 'C': 'ART', 'D': 'HUMANITIES'}

# Replace the letters with the categories.
# The prompt describes the codes both as "[A]" and as "A", so surrounding
# whitespace and square brackets are stripped and the letter upper-cased
# before the lookup. An unknown code still raises KeyError.
lst = [
    categories[str(code).strip().strip("[]").strip().upper()]
    for code in document_classes_list
]

print(lst)

['SCIENCE', 'ARTIFICIAL INTELLIGENCE', 'SCIENCE', 'ARTIFICIAL INTELLIGENCE', 'ARTIFICIAL INTELLIGENCE', 'ARTIFICIAL INTELLIGENCE', 'SCIENCE', 'SCIENCE', 'ARTIFICIAL INTELLIGENCE', 'ARTIFICIAL INTELLIGENCE', 'ARTIFICIAL INTELLIGENCE', 'ARTIFICIAL INTELLIGENCE', 'SCIENCE', 'ARTIFICIAL INTELLIGENCE', 'SCIENCE', 'ARTIFICIAL INTELLIGENCE', 'SCIENCE', 'ARTIFICIAL INTELLIGENCE', 'ARTIFICIAL INTELLIGENCE', 'ARTIFICIAL INTELLIGENCE', 'SCIENCE', 'SCIENCE', 'ARTIFICIAL INTELLIGENCE', 'ARTIFICIAL INTELLIGENCE', 'ARTIFICIAL INTELLIGENCE', 'ARTIFICIAL INTELLIGENCE', 'SCIENCE', 'ARTIFICIAL INTELLIGENCE']


## Use case: Generating structured data in lists instead of JSON

Often JSON is used as an output format of an LLM. It provides keys which are clear as to what the value being output is, and easy to use in downstream steps.

However, a significant number of tokens can end up being generated writing out the keys over and over. This takes a significant amount of time for the LLM.

A list is a more efficient data structure, as it simply uses the order of the elements to preserve the meaning. The list can then be restructured into a JSON (or dictionary) using code, if desired.

### A: Base Case: Using JSON

**Time taken: 66 seconds** (timings vary between runs; the run recorded below took 47.09 seconds)

This is the typical structure used in online guides, tutorials, and many production applications. An advantage is that the meaning of each value is clearer, and it can be easier to work with.

In [76]:
# Baseline generation run: ask for 20 cars as a JSON array of objects, so
# every record repeats the five attribute keys.
system_message = "\nYou are an helpful AI assistant.\n"
prompt = f"""
Generate 20 cars in a JSON format. Each car should have the following attributes: make, model, year, color, and price. The cars should be diverse in terms of make, model, and color.
Example:
[
    {{
        "make": "Toyota",
        "model": "Corolla",
        "year": 2022,
        "color": "blue",
        "price": 25000
    }},
    {{
        "make": "Ford",
        "model": "Mustang",
        "year": 2021,
        "color": "red",
        "price": 35000
    }},
    ...
]
"""

(result, prompt_tokens, completion_tokens, completion_text, e2e_time) = aoai_call(
    system_message, prompt, model
)

# Report token usage and latency, followed by the generated JSON.
print(
    f"Prompt Tokens: {prompt_tokens}",
    f"Completion Tokens: {completion_tokens}",
    f"Time taken: {e2e_time:.2f} seconds",
    completion_text,
    sep="\n",
)


Prompt Tokens: 153
Completion Tokens: 860
Time taken: 47.09 seconds
[
    {
        "make": "Toyota",
        "model": "Corolla",
        "year": 2022,
        "color": "blue",
        "price": 25000
    },
    {
        "make": "Ford",
        "model": "Mustang",
        "year": 2021,
        "color": "red",
        "price": 35000
    },
    {
        "make": "Chevrolet",
        "model": "Malibu",
        "year": 2019,
        "color": "black",
        "price": 22000
    },
    {
        "make": "Honda",
        "model": "Accord",
        "year": 2020,
        "color": "white",
        "price": 25000
    },
    {
        "make": "Nissan",
        "model": "Altima",
        "year": 2022,
        "color": "gray",
        "price": 26000
    },
    {
        "make": "Mazda",
        "model": "3",
        "year": 2022,
        "color": "blue",
        "price": 23000
    },
    {
        "make": "Subaru",
        "model": "Outback",
        "year": 2021,
        "color": "green",
        "

### B: Using a list

**Time taken: 28 seconds**

Here the output is a list of lists, where each list contains the relevant parameters of the car in an expected order. This is significantly faster, as the LLM does not need to generate output tokens for each key in the dictionary.

Once the task is completed, the list of lists can be converted into a list of dictionaries (JSON) programmatically, if so desired.

In [77]:
# Compact generation run: ask for the same 20 cars as a list of lists, where
# the position of each element (not a key) identifies the attribute.
system_message = "\nYou are an helpful AI assistant.\n"
prompt = f"""
Generate 20 cars in a list. Each car should have the following attributes: make, model, year, color, and price. The cars should be diverse in terms of make, model, and color. Output only exactly the list of cars, no additional text or comments.
Example:
[
    [
        "the first element is the make of the car",
        "the second element is the model of the car",
        "the third element is the year of the car",
        "the fourth element is the color of the car",
        "the fifth element is the price of the car"
    ],
    [
        "the first element is the make of the car",
        "the second element is the model of the car",
        "the third element is the year of the car",
        "the fourth element is the color of the car",
        "the fifth element is the price of the car"
    ],
    ...
]
"""

(result, prompt_tokens, completion_tokens, completion_text, e2e_time) = aoai_call(
    system_message, prompt, model
)

# Report token usage and latency, followed by the generated list.
print(
    f"Prompt Tokens: {prompt_tokens}",
    f"Completion Tokens: {completion_tokens}",
    f"Time taken: {e2e_time:.2f} seconds",
    completion_text,
    sep="\n",
)


Prompt Tokens: 210
Completion Tokens: 403
Time taken: 26.39 seconds
[
    ["Ford", "Mustang", 2020, "Red", 30000],
    ["Chevrolet", "Camero", 2019, "Blue", 28000],
    ["Dodge", "Charger", 2018, "Black", 26000],
    ["Toyota", "Camry", 2017, "White", 24000],
    ["Honda", "Accord", 2016, "Silver", 22000],
    ["Nissan", "Maxima", 2015, "Grey", 20000],
    ["BMW", "3 Series", 2022, "Blue", 40000],
    ["Audi", "A4", 2021, "Red", 38000],
    ["Mercedes", "C Class", 2020, "White", 36000],
    ["Lexus", "IS", 2019, "Silver", 34000],
    ["Acura", "TLX", 2018, "Black", 32000],
    ["Infiniti", "Q50", 2017, "Grey", 30000],
    ["Subaru", "Impreza", 2016, "Blue", 28000],
    ["Jeep", "Cherokee", 2022, "Red", 36000],
    ["Porsche", "911", 2021, "White", 120000],
    ["Maserati", "Ghibli", 2020, "Silver", 75000],
    ["Aston Martin", "DB11", 2019, "Black", 200000],
    ["Rolls Royce", "Ghost", 2018, "White", 320000],
    ["Bentley", "Continental", 2017, "Blue", 220000],
    ["Tesla", "Model 3

In [78]:
# Programmatically rebuild keyed records (dicts) from the positional lists.

# Attribute names, in the order the prompt asked the model to emit them.
keys = ["make", "model", "year", "color", "price"]

# The completion is expected to be a JSON array of arrays.
car_list = json.loads(completion_text)

# Pair every value with the key at the same position, one dict per car.
dict_list = [
    {name: value for name, value in zip(keys, row)}
    for row in car_list
]

print(dict_list)

[{'make': 'Ford', 'model': 'Mustang', 'year': 2020, 'color': 'Red', 'price': 30000}, {'make': 'Chevrolet', 'model': 'Camero', 'year': 2019, 'color': 'Blue', 'price': 28000}, {'make': 'Dodge', 'model': 'Charger', 'year': 2018, 'color': 'Black', 'price': 26000}, {'make': 'Toyota', 'model': 'Camry', 'year': 2017, 'color': 'White', 'price': 24000}, {'make': 'Honda', 'model': 'Accord', 'year': 2016, 'color': 'Silver', 'price': 22000}, {'make': 'Nissan', 'model': 'Maxima', 'year': 2015, 'color': 'Grey', 'price': 20000}, {'make': 'BMW', 'model': '3 Series', 'year': 2022, 'color': 'Blue', 'price': 40000}, {'make': 'Audi', 'model': 'A4', 'year': 2021, 'color': 'Red', 'price': 38000}, {'make': 'Mercedes', 'model': 'C Class', 'year': 2020, 'color': 'White', 'price': 36000}, {'make': 'Lexus', 'model': 'IS', 'year': 2019, 'color': 'Silver', 'price': 34000}, {'make': 'Acura', 'model': 'TLX', 'year': 2018, 'color': 'Black', 'price': 32000}, {'make': 'Infiniti', 'model': 'Q50', 'year': 2017, 'color': 