In [1]:
%pip install langchain langchain_openai langchain_mistralai pandas tqdm --upgrade --quiet

Note: you may need to restart the kernel to use updated packages.


## Generate Ground Truth Data with GPT-4 API

Creating known labels (ground truth data) can be time-consuming and expensive. You can use GPT-4 to _generate the ground truth data_ for you. This is useful for training your own models and for evaluating the performance of other models. You can then use these evals to test whether open source or smaller / faster / cheaper models perform as well as the larger / slower / more expensive models.

In [2]:
import pandas as pd
from tqdm import tqdm
import requests
import io

# Dataset URL:
url = "https://storage.googleapis.com/oreilly-content/transaction_data_with_expanded_descriptions.csv"

# Download the file from the URL.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() reports an HTTP error (404, 5xx, ...) right here instead of
# letting an error page be handed to the CSV parser below.
downloaded_file = requests.get(url, timeout=30)
downloaded_file.raise_for_status()

# Load the transactions dataset and only look at the first 20 transactions:
df = pd.read_csv(io.StringIO(downloaded_file.text)).head(20)
df.head()




Unnamed: 0,Transaction Description
0,cash deposit at local branch
1,cash deposit at local branch
2,withdrew money for rent payment
3,withdrew cash for weekend expenses
4,purchased books from the bookstore


In [3]:
# Sanity check: only 20 rows were kept when the dataset was loaded.
df.shape

(20, 1)

In [4]:
# Display the loaded sample of transaction descriptions that will be labelled.
df

Unnamed: 0,Transaction Description
0,cash deposit at local branch
1,cash deposit at local branch
2,withdrew money for rent payment
3,withdrew cash for weekend expenses
4,purchased books from the bookstore
5,tax refund deposit
6,refund from clothing store
7,withdrew money for rent payment
8,insurance claim refund
9,paid subscription service fee


In [5]:
# Run through the dataset using GPT-4 to correctly classify the transactions:
from langchain_openai.chat_models import ChatOpenAI
from langchain.output_parsers import PydanticOutputParser
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel
from typing import Literal, Union

from dotenv import load_dotenv

# Load environment variables from .env file
# (presumably where the OpenAI credentials live - no key is passed explicitly below).
load_dotenv()

# 1. Define the model; the extra kwarg asks the API for a JSON-object response:
json_response_kwargs = {"response_format": {"type": "json_object"}}
model = ChatOpenAI(model="gpt-4-1106-preview", model_kwargs=json_response_kwargs)

# The templates are spelled out line by line so that the exact text sent to the
# model (including the trailing spaces before some newlines) is visible.
system_prompt = (
    "You are are an expert at analyzing bank transactions, \n"
    "you will be categorising a single transaction. \n"
    "Always return a transaction type and category: do not return None.\n"
    "Format Instructions:\n"
    "{format_instructions}"
)

user_prompt = "Transaction Text:\n{transaction}"

# 2. Define the prompt: one system message followed by one user message.
prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("user", user_prompt)]
)

# 3. Define the pydantic model:
# Schema that the output parser validates each model reply against.
# Each field accepts one of the fixed labels listed below, or None.
# NOTE(review): documented with `#` comments rather than a class docstring on
# purpose - pydantic appears to copy a model's docstring into its generated
# schema, which would change the format instructions sent to the LLM. Confirm
# before adding a docstring.
class EnrichedTransactionInformation(BaseModel):
    # Kind of transaction; restricted to the five labels in the Literal.
    transaction_type: Union[
        Literal["Purchase", "Withdrawal", "Deposit", "Bill Payment", "Refund"], None
    ]
    # Category of the transaction; the label set includes "Other".
    transaction_category: Union[
        Literal["Food", "Entertainment", "Transport", "Utilities", "Rent", "Other"],
        None,
    ]


# 4. Define the output parser:
output_parser = PydanticOutputParser(pydantic_object=EnrichedTransactionInformation)

# 5. Create an LCEL chain:
chain = prompt | model | output_parser

# 6. Invoke the chain for the whole dataset.
# The format instructions are the same for every row, so build the string once
# instead of regenerating it on each iteration.
format_instructions = output_parser.get_format_instructions()

results = []

# Only the description column is needed, so iterate it directly rather than
# materialising a full row object per transaction with iterrows().
for transaction in tqdm(df["Transaction Description"], total=len(df)):
    result = chain.invoke(
        {
            "transaction": transaction,
            "format_instructions": format_instructions,
        }
    )
    results.append(result)

100%|██████████| 20/20 [00:28<00:00,  1.41s/it]


In [6]:
# 7. Add the results to the dataframe, as columns transaction type and transaction category
# Each parsed result exposes the two fields as attributes; collect them column-wise.
transaction_types = [parsed.transaction_type for parsed in results]
transaction_categories = [parsed.transaction_category for parsed in results]

df["transaction_type"] = transaction_types
df["transaction_category"] = transaction_categories
df.head()

Unnamed: 0,Transaction Description,transaction_type,transaction_category
0,cash deposit at local branch,Deposit,Other
1,cash deposit at local branch,Deposit,Other
2,withdrew money for rent payment,Withdrawal,Rent
3,withdrew cash for weekend expenses,Withdrawal,Other
4,purchased books from the bookstore,Purchase,Other


In [7]:
# Persist the GPT-4-labelled sample to a local CSV; index=False leaves the row index out of the file.
df.to_csv("transactions_with_enriched_data_gtd.csv", index=False)

## Obtain the Accuracy of Mistral API:

In [8]:
from dotenv import load_dotenv
# Load variables from the .env file again for this section; MISTRAL_API_KEY is
# read from the environment when the Mistral model is created further down.
load_dotenv()

True

In [9]:
import pandas as pd
from tqdm import tqdm
import requests
import io

# Dataset URL:
url = "https://storage.googleapis.com/oreilly-content/transactions_with_enriched_data.csv"

# Download the file from the URL.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() reports an HTTP error (404, 5xx, ...) right here instead of
# letting an error page be handed to the CSV parser below.
downloaded_file = requests.get(url, timeout=30)
downloaded_file.raise_for_status()

# Load the transactions dataset:
df = pd.read_csv(io.StringIO(downloaded_file.text))
df.head()

Unnamed: 0,Transaction Description,transaction_type,transaction_category
0,cash deposit at local branch,Deposit,Other
1,cash deposit at local branch,Deposit,Other
2,withdrew money for rent payment,Withdrawal,Rent
3,withdrew cash for weekend expenses,Withdrawal,Other
4,purchased books from the bookstore,Purchase,Other


In [None]:
from langchain_mistralai.chat_models import ChatMistralAI
from langchain.output_parsers import PydanticOutputParser
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel
from typing import Literal, Union
import os

# 1. Define the model:
# Indexing os.environ (rather than .get) raises KeyError immediately when the
# key is missing from the environment.
mistral_api_key = os.environ["MISTRAL_API_KEY"]

model = ChatMistralAI(
    model="mistral-small",
    mistral_api_key=mistral_api_key,
)

# 2. Define the prompt:
# The templates are spelled out line by line so that the exact text sent to the
# model (including the trailing spaces before some newlines) is visible.
system_prompt = (
    "You are are an expert at analyzing bank transactions, \n"
    "you will be categorising a single transaction. \n"
    "Always return a transaction type and category: do not return None.\n"
    "Format Instructions:\n"
    "{format_instructions}"
)

user_prompt = "Transaction Text:\n{transaction}"

prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("user", user_prompt)]
)

# 3. Define the pydantic model:
# Schema that the output parser validates each model reply against.
# Each field accepts one of the fixed labels listed below, or None; the
# whole-dataset loops later build an all-None instance when a call fails.
# NOTE(review): documented with `#` comments rather than a class docstring on
# purpose - pydantic appears to copy a model's docstring into its generated
# schema, which would change the format instructions sent to the LLM. Confirm
# before adding a docstring.
class EnrichedTransactionInformation(BaseModel):
    # Kind of transaction; restricted to the five labels in the Literal.
    transaction_type: Union[
        Literal["Purchase", "Withdrawal", "Deposit", "Bill Payment", "Refund"], None
    ]
    # Category of the transaction; the label set includes "Other".
    transaction_category: Union[
        Literal["Food", "Entertainment", "Transport", "Utilities", "Rent", "Other"],
        None,
    ]


# 4. Define the output parser:
output_parser = PydanticOutputParser(pydantic_object=EnrichedTransactionInformation)

# 5. Create an LCEL chain:
chain = prompt | model | output_parser

# 6. Invoke the chain for the first transaction:
transaction = df.iloc[0]["Transaction Description"]

# This will often fail because Mistral puts a backslash in the JSON keys
# i.e. "transaction\_type" instead of "transaction_type".
# The failure is what this cell demonstrates, so it is caught and displayed
# rather than raised: an uncaught exception here would stop a
# "Restart Kernel -> Run All" before the cells that work around the problem.
try:
    result = chain.invoke(
        {
            "transaction": transaction,
            "format_instructions": output_parser.get_format_instructions(),
        }
    )
    print(result)
except Exception as error:
    print(f"Mistral reply could not be parsed: {error!r}")

In [None]:
from langchain_core.output_parsers import StrOutputParser

# 7. Define a function to try to fix and remove the backslashes:
def remove_back_slashes(string):
    """Drop stray backslashes from model output while keeping valid JSON escapes.

    The model reply sometimes contains sequences such as ``transaction\\_type``
    where a backslash precedes a character that JSON does not allow to be
    escaped. Removing *every* backslash would also destroy legitimate escapes
    (for example an escaped double quote inside a string value) and turn
    otherwise valid JSON into invalid JSON, so only backslashes that do not
    start a JSON escape sequence are removed.

    Args:
        string: Raw text produced by the model.

    Returns:
        The text with each invalid ``\\<char>`` pair replaced by ``<char>`` and
        any dangling backslash at the very end removed. Pairs that begin a JSON
        escape (quote, backslash, slash, b, f, n, r, t, u) are left untouched.
    """
    import re  # local import so the function does not rely on another cell's imports

    json_escape_chars = '"\\/bfnrtu'

    def _fix_pair(match):
        # group(1) is the character following the backslash ('' at end of text).
        following = match.group(1)
        if following and following in json_escape_chars:
            return match.group(0)  # legitimate JSON escape: keep both characters
        return following  # stray backslash: keep only what followed it

    # Consuming backslash+character as a pair means an escaped backslash ("\\\\")
    # is treated as one unit and its second half is never mistaken for a new escape.
    cleaned_string = re.sub(r"\\(.?)", _fix_pair, string, flags=re.DOTALL)
    return cleaned_string

# Rebuild the chain so the raw model text is converted to a string and passed
# through remove_back_slashes before it reaches the pydantic parser.
chain = prompt | model | StrOutputParser() | remove_back_slashes | output_parser

# Try the repaired chain on the first transaction.
transaction = df.iloc[0]["Transaction Description"]
chain_inputs = {
    "transaction": transaction,
    "format_instructions": output_parser.get_format_instructions(),
}
result = chain.invoke(chain_inputs)
print(result)

In [None]:
# 8. Invoke the chain for the whole dataset:
# The format instructions are identical for every row, so build them once.
format_instructions = output_parser.get_format_instructions()

results = []
failed_rows = []  # (row index, exception) for every row that fell back to None

for i, row in tqdm(df.iterrows(), total=len(df)):
    transaction = row["Transaction Description"]
    try:
        result = chain.invoke(
            {
                "transaction": transaction,
                "format_instructions": format_instructions,
            }
        )
    except Exception as error:
        # Best effort: a row that cannot be classified is recorded as None/None
        # so the run continues. Catching Exception (not a bare `except:`) lets
        # KeyboardInterrupt still stop the loop, and the error is kept so that
        # failures are not silently hidden.
        failed_rows.append((i, error))
        result = EnrichedTransactionInformation(
            transaction_type=None, transaction_category=None
        )

    results.append(result)

if failed_rows:
    print(
        f"{len(failed_rows)} of {len(df)} transactions fell back to None; "
        f"first error: {failed_rows[0][1]!r}"
    )

# 9. Add the results to the dataframe, as columns transaction type and transaction category
transaction_types = [parsed.transaction_type for parsed in results]
transaction_categories = [parsed.transaction_category for parsed in results]

df["mistral_transaction_type"] = transaction_types
df["mistral_transaction_category"] = transaction_categories
df.head()

In [None]:
# 10. Evaluate answers using LangChain evaluators
from langchain.evaluation import load_evaluator

evaluator = load_evaluator("exact_match")

# Compare each Mistral prediction with the reference label, one field at a time.
# The per-row evaluator results are collected as-is; their 'score' entries are
# read when the accuracy is computed.
transaction_types = []
transaction_categories = []

for _, record in tqdm(df.iterrows(), total=len(df)):
    type_result = evaluator.evaluate_strings(
        prediction=record["mistral_transaction_type"],
        reference=record["transaction_type"],
    )
    category_result = evaluator.evaluate_strings(
        prediction=record["mistral_transaction_category"],
        reference=record["transaction_category"],
    )

    transaction_types.append(type_result)
    transaction_categories.append(category_result)

In [None]:
# Accuracy = sum of all type and category scores divided by the number of
# scored fields (two per transaction).
total_score = sum(
    type_result['score'] + category_result['score']
    for type_result, category_result in zip(transaction_types, transaction_categories)
)

accuracy_score = total_score / (len(transaction_types) * 2)
print(f"Accuracy score: {accuracy_score}")

## Compare Mistral API with GPT-3.5 API:

In [None]:
from langchain_openai.chat_models import ChatOpenAI

# 1. Define the model; the extra kwarg asks the API for a JSON-object response:
json_response_kwargs = {"response_format": {"type": "json_object"}}
model = ChatOpenAI(model="gpt-3.5-turbo-1106", model_kwargs=json_response_kwargs)

# Same prompt and parser as before; this chain has no backslash-cleaning step.
chain = prompt | model | output_parser

# 2. Invoke the chain for the first transaction:
transaction = df.iloc[0]["Transaction Description"]
chain_inputs = {
    "transaction": transaction,
    "format_instructions": output_parser.get_format_instructions(),
}
result = chain.invoke(chain_inputs)
print(result)

In [None]:
# 3. Invoke the chain for the whole dataset:
# The format instructions are identical for every row, so build them once.
format_instructions = output_parser.get_format_instructions()

results = []
failed_rows = []  # (row index, exception) for every row that fell back to None

for i, row in tqdm(df.iterrows(), total=len(df)):
    transaction = row["Transaction Description"]
    try:
        result = chain.invoke(
            {
                "transaction": transaction,
                "format_instructions": format_instructions,
            }
        )
    except Exception as error:
        # Best effort: a row that cannot be classified is recorded as None/None
        # so the run continues. Catching Exception (not a bare `except:`) lets
        # KeyboardInterrupt still stop the loop, and the error is kept so that
        # failures are not silently hidden.
        failed_rows.append((i, error))
        result = EnrichedTransactionInformation(
            transaction_type=None, transaction_category=None
        )

    results.append(result)

if failed_rows:
    print(
        f"{len(failed_rows)} of {len(df)} transactions fell back to None; "
        f"first error: {failed_rows[0][1]!r}"
    )

# 4. Add the results to the dataframe, as columns transaction type and transaction category
transaction_types = [parsed.transaction_type for parsed in results]
transaction_categories = [parsed.transaction_category for parsed in results]

df["gpt3.5_transaction_type"] = transaction_types
df["gpt3.5_transaction_category"] = transaction_categories

df.head()

In [None]:
# Loop through the dataframe and evaluate the predictions
# (uses the `evaluator` object created in an earlier cell).
transaction_types = []
transaction_categories = []

for _, record in tqdm(df.iterrows(), total=len(df)):
    type_result = evaluator.evaluate_strings(
        prediction=record["gpt3.5_transaction_type"],
        reference=record["transaction_type"],
    )
    category_result = evaluator.evaluate_strings(
        prediction=record["gpt3.5_transaction_category"],
        reference=record["transaction_category"],
    )

    transaction_types.append(type_result)
    transaction_categories.append(category_result)

# Accuracy = sum of all type and category scores divided by the number of
# scored fields (two per transaction).
total_score = sum(
    type_result['score'] + category_result['score']
    for type_result, category_result in zip(transaction_types, transaction_categories)
)

accuracy_score = total_score / (len(transaction_types) * 2)
print(f"Accuracy score: {accuracy_score}")

In [None]:
# Pairwise comparison between GPT-3.5 (prediction A) and Mistral (prediction B),
# judged against the reference labels already in the dataframe.
import json

from langchain.evaluation import load_evaluator
evaluator = load_evaluator("labeled_pairwise_string")

row = df.iloc[0]
transaction = row["Transaction Description"]
gpt3pt5_category = row["gpt3.5_transaction_category"]
gpt3pt5_type = row["gpt3.5_transaction_type"]
mistral_category = row["mistral_transaction_category"]
mistral_type = row["mistral_transaction_type"]
reference_category = row["transaction_category"]
reference_type = row["transaction_type"]

# put the data into JSON format for the evaluator
# json.dumps is used instead of hand-built f-strings so that a missing label
# (None) is emitted as JSON null rather than the string "None", and so that any
# special characters are escaped. default=str covers values json cannot encode.
gpt3pt5_data = json.dumps(
    {"transaction_category": gpt3pt5_category, "transaction_type": gpt3pt5_type},
    indent=4,
    default=str,
)

mistral_data = json.dumps(
    {"transaction_category": mistral_category, "transaction_type": mistral_type},
    indent=4,
    default=str,
)

reference_data = json.dumps(
    {"transaction_category": reference_category, "transaction_type": reference_type},
    indent=4,
    default=str,
)

# set up the prompt input for context for the evaluator
# (same wording the classifiers were given; text is unchanged, written line by
# line so the trailing spaces before some newlines are visible)
input_prompt = (
    "You are are an expert at analyzing bank transactions, \n"
    "you will be categorising a single transaction. \n"
    "Always return a transaction type and category: do not return None.\n"
    "Format Instructions:\n"
    "{format_instructions}\n"
    "Transaction Text:\n"
    "{transaction}\n"
)

evaluator.evaluate_string_pairs(
    prediction=gpt3pt5_data,
    prediction_b=mistral_data,
    input=input_prompt.format(
        format_instructions=output_parser.get_format_instructions(),
        transaction=transaction),
    reference=reference_data,
)

In [None]:
# Run through the whole dataset and add the pairwise comparison scores to the dataframe:
import json

# Loop-invariant pieces are built once instead of on every iteration.
format_instructions = output_parser.get_format_instructions()

# Context shown to the evaluator. Defined at top level so that the loop body's
# indentation does not leak into the text; the wording (including the trailing
# spaces before some newlines) matches the prompt the classifiers were given.
input_prompt = (
    "You are are an expert at analyzing bank transactions, \n"
    "you will be categorising a single transaction. \n"
    "Always return a transaction type and category: do not return None.\n"
    "Format Instructions:\n"
    "{format_instructions}\n"
    "Transaction Text:\n"
    "{transaction}\n"
)

pairwise_scores = []

for i, row in tqdm(df.iterrows(), total=len(df)):
    transaction = row["Transaction Description"]

    # put the data into JSON format for the evaluator
    # json.dumps emits a missing label (None) as JSON null rather than the
    # string "None"; default=str covers values json cannot encode natively.
    gpt3pt5_data = json.dumps(
        {
            "transaction_category": row["gpt3.5_transaction_category"],
            "transaction_type": row["gpt3.5_transaction_type"],
        },
        indent=4,
        default=str,
    )

    mistral_data = json.dumps(
        {
            "transaction_category": row["mistral_transaction_category"],
            "transaction_type": row["mistral_transaction_type"],
        },
        indent=4,
        default=str,
    )

    reference_data = json.dumps(
        {
            "transaction_category": row["transaction_category"],
            "transaction_type": row["transaction_type"],
        },
        indent=4,
        default=str,
    )

    # GPT-3.5 is prediction A and Mistral is prediction B.
    pairwise = evaluator.evaluate_string_pairs(
        prediction=gpt3pt5_data,
        prediction_b=mistral_data,
        input=input_prompt.format(
            format_instructions=format_instructions,
            transaction=transaction),
        reference=reference_data,
    )
    pairwise_scores.append(pairwise)

# Add the pairwise scores to the dataframe:
df['pairwise_reasoning'] = [score['reasoning'] for score in pairwise_scores]
df['pairwise_score'] = [score['score'] for score in pairwise_scores]

df.head()
    

In [None]:
# Show one example (the row at position 3): the transaction text, the three
# label pairs side by side, and the evaluator's reasoning for its verdict.
row = df.iloc[3]

print("Transaction: ", row['Transaction Description'])
print()

label_columns = [
    ("Reference:", 'transaction_type', 'transaction_category'),
    ("GPT-3.5 (A):", 'gpt3.5_transaction_type', 'gpt3.5_transaction_category'),
    ("Mistral (B):", 'mistral_transaction_type', 'mistral_transaction_category'),
]
for label, type_column, category_column in label_columns:
    print(label, row[type_column], row[category_column])

print()
print(row['pairwise_reasoning'])