<a href="https://colab.research.google.com/github/Dharshan4038/Invoice_Purchase_Order_Match/blob/main/weighted_inv_po.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install sentence_transformers transformers datasets

In [None]:
import pandas as pd
import torch
import numpy as np
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sentence_transformers import CrossEncoder, InputExample
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
from torch.optim import AdamW
from transformers import Trainer, TrainingArguments
from torch import nn

  from tqdm.autonotebook import tqdm, trange


In [None]:
df = pd.read_csv('/content/drive/MyDrive/Invoice_PO_Match/validation_formatted 1.csv')
df.head()

In [None]:
invoice = df["invoice_desc"]
purchase_order = df["PO_line_desc"]
true_mapping = df["true_mapping"]

In [None]:
import ast

# Parse the stringified list/dict columns into Python objects.
# ast.literal_eval only accepts Python literals, so unlike eval() it cannot
# execute code embedded in the CSV text. Each column is converted in one pass
# into a new Series instead of writing element by element into a Series that
# was sliced out of `df`.
invoice = invoice.apply(ast.literal_eval)
purchase_order = purchase_order.apply(ast.literal_eval)
true_mapping = true_mapping.apply(ast.literal_eval)

In [None]:
# Create a DataFrame with all possible invoice/purchase order pairs
final_data = []
for invoice_list, po_list, mapping in zip(invoice, purchase_order, true_mapping):
    # Every (invoice, PO) pair listed in the true mapping is a positive example.
    # A set of pairs is used instead of a dict keyed by invoice text, because a
    # dict keeps only the last entry when the same invoice description appears
    # more than once in a row.
    positive_pairs = {(inv, po) for d in mapping for inv, po in d.items()}
    # Distinct loop names so the `invoice` Series iterated above is not
    # overwritten by a plain string once this cell has run.
    for inv_desc in invoice_list:
        for po_desc in po_list:
            label = 1 if (inv_desc, po_desc) in positive_pairs else 0
            final_data.append([inv_desc, po_desc, label])

In [None]:
df = pd.DataFrame(final_data, columns=['Invoice', 'Purchase Order', 'Label'])

In [None]:
df.head()

Unnamed: 0,Invoice,Purchase Order,Label
0,25103 ICE CREAM MANGO SORBETTO TRAY,ICE CREAM MANGO SORBETTO TRAY,1
1,25103 ICE CREAM MANGO SORBETTO TRAY,ICE CREAM GELATO MACKINAC FDGE,0
2,25103 ICE CREAM MANGO SORBETTO TRAY,ICE CREAM COOKIE%CRM DUTCH FUD,0
3,25103 ICE CREAM MANGO SORBETTO TRAY,ICE CREAM GELATO BUTR PECAN,0
4,25103 ICE CREAM MANGO SORBETTO TRAY,ICE CREAM SORBET LEMONCELLO,0


In [None]:
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

In [None]:
model_name = "cross-encoder/stsb-roberta-base"
cross_encoder = CrossEncoder(model_name, num_labels=1)

In [None]:
# Tokenizer for the CrossEncoder model
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Prepare the data for InputExample format
train_examples = [InputExample(texts=[row['Invoice'], row['Purchase Order']], label=row['Label']) for _, row in train_df.iterrows()]
val_examples = [InputExample(texts=[row['Invoice'], row['Purchase Order']], label=row['Label']) for _, row in val_df.iterrows()]

In [None]:
# Define a custom data collator
from torch.nn.utils.rnn import pad_sequence

# Custom collate function to handle padding in the DataLoader
def custom_collate_fn(batch):
    """Merge a list of tokenized pair items into one padded batch.

    Every item is read through the keys 'input_ids', 'attention_mask',
    'invoice_idx' and 'po_idx'. Token ids are padded with the pad token of the
    notebook-level ``tokenizer``; attention masks are padded with 0.

    Returns a dict with the padded 'input_ids' / 'attention_mask' tensors
    (batch first) and the 'invoice_idx' / 'po_idx' values stacked into tensors.
    """
    def gather(key):
        # One field pulled out of every item, in batch order
        return [sample[key] for sample in batch]

    ids_batch = pad_sequence(
        gather('input_ids'),
        batch_first=True,
        padding_value=tokenizer.pad_token_id,
    )
    mask_batch = pad_sequence(gather('attention_mask'), batch_first=True, padding_value=0)

    collated = {"input_ids": ids_batch, "attention_mask": mask_batch}
    collated["invoice_idx"] = torch.tensor(gather('invoice_idx'))
    collated["po_idx"] = torch.tensor(gather('po_idx'))
    return collated

In [None]:
# Define class weights (adjust for class imbalance)
# Each weight is total_rows / rows_with_that_label, so the rarer label receives
# the larger weight. value_counts() is indexed by label value (0 and 1).
class_counts = df['Label'].value_counts()
total_samples = len(df)
class_weights = [total_samples / class_counts[0], total_samples / class_counts[1]]
# No dtype is given here; the tensor this notebook displays is float64.
# The tensor is placed on the same device as the cross-encoder's model.
class_weights = torch.tensor(class_weights).to(cross_encoder.model.device)

In [None]:
class_weights

tensor([ 1.0845, 12.8367], dtype=torch.float64)

In [None]:
class WeightedLossCrossEncoder(CrossEncoder):
    """CrossEncoder whose forward pass returns a positive-class-weighted BCE loss.

    The positive weight is read from the notebook-level ``class_weights`` tensor
    (index 1, i.e. the weight computed for label 1), so that tensor must exist
    before this class is instantiated.
    """

    def __init__(self, *args, **kwargs):
        super(WeightedLossCrossEncoder, self).__init__(*args, **kwargs)
        # class_weights is displayed as float64 in this notebook; use a float32
        # weight for the loss.
        self.loss_fn = nn.BCEWithLogitsLoss(pos_weight=class_weights[1].float())

    def forward(self, input_ids, attention_mask, labels):
        """Score a batch and compute the weighted loss.

        Args:
            input_ids: token id tensor passed to the wrapped model.
            attention_mask: attention mask tensor passed to the wrapped model.
            labels: one 0/1 target per pair in the batch.

        Returns:
            (loss, logits) where logits holds one score per pair.
        """
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        # Drop only the trailing label dimension. A bare squeeze() would also
        # remove the batch dimension for a batch of one, leaving a scalar whose
        # shape no longer matches `labels`.
        logits = outputs.logits.squeeze(-1)
        # BCEWithLogitsLoss expects floating-point targets of the logits' dtype.
        loss = self.loss_fn(logits, labels.to(logits.dtype))
        return loss, logits

In [None]:
cross_encoder = WeightedLossCrossEncoder(model_name, num_labels=1)

In [None]:
# Training hyperparameters for the Hugging Face Trainer; evaluation is requested once per epoch.
# NOTE(review): newer transformers releases name this argument `eval_strategy` —
# confirm against the installed version (the pip install above is unpinned).
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    weight_decay=0.01
)

# Create a Trainer instance
# NOTE(review): train_examples / val_examples are lists of InputExample objects, while
# custom_collate_fn indexes every item by 'input_ids', 'attention_mask', 'invoice_idx'
# and 'po_idx' — confirm the two are compatible before calling trainer.train().
# NOTE(review): the Trainer is given cross_encoder.model, so the weighted loss defined in
# WeightedLossCrossEncoder.forward does not appear to be used by this Trainer — confirm.
trainer = Trainer(
    model=cross_encoder.model,
    args=training_args,
    train_dataset=train_examples,
    eval_dataset=val_examples,
    data_collator=custom_collate_fn
)



In [None]:
# trainer.train()

In [None]:
# output_dir = "/content/drive/MyDrive/Invoice_PO_Match/saved_model"
# cross_encoder.model.save_pretrained(output_dir)
# tokenizer.save_pretrained(output_dir)

# print(f"Model saved to {output_dir}")

In [None]:
model_name = "/content/drive/MyDrive/Invoice_PO_Match/saved_model"
cross_encoder = CrossEncoder(model_name, num_labels=1)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
val_dataloader = DataLoader(val_examples, batch_size=8, shuffle=False, collate_fn=custom_collate_fn)

In [None]:
# Set the model to evaluation mode
cross_encoder.model.eval()

In [None]:
# # Make predictions on the validation set
# val_predictions = []
# val_labels = []
# val_logits = []  # Store logits for ROC-AUC calculation

# # Use a threshold to classify predictions
# threshold = 0.5

# # Evaluate predictions
# with torch.no_grad():
#     for batch in val_dataloader:
#         input_ids = batch["input_ids"].to(cross_encoder.model.device)
#         attention_mask = batch["attention_mask"].to(cross_encoder.model.device)
#         labels = batch["labels"].to(cross_encoder.model.device)

#         # Forward pass
#         outputs = cross_encoder.model(input_ids=input_ids, attention_mask=attention_mask)
#         logits = outputs.logits.squeeze().cpu().numpy()

#         # Save logits for ROC-AUC
#         val_logits.extend(logits)  # Ensure logits are appended correctly

#         # Apply threshold to logits for binary classification
#         predictions = (logits >= threshold).astype(int)
#         val_predictions.extend(predictions)
#         val_labels.extend(labels.cpu().numpy())

# # Check the lengths of val_labels and val_logits
# print(f"Number of labels: {len(val_labels)}, Number of logits: {len(val_logits)}")

In [None]:
# # Calculate metrics
# accuracy = accuracy_score(val_labels, val_predictions)
# precision = precision_score(val_labels, val_predictions)
# recall = recall_score(val_labels, val_predictions)
# f1 = f1_score(val_labels, val_predictions)
# roc_auc = roc_auc_score(val_labels, val_logits)

In [None]:
# print(f"Validation Accuracy: {accuracy:.4f}")
# print(f"Validation Precision: {precision:.4f}")
# print(f"Validation Recall: {recall:.4f}")
# print(f"Validation F1 Score: {f1:.4f}")
# print(f"Validation ROC-AUC: {roc_auc:.4f}")

In [None]:
# from sklearn.metrics import confusion_matrix
# import seaborn as sns
# import matplotlib.pyplot as plt

# # Confusion Matrix
# conf_matrix = confusion_matrix(val_labels, val_predictions)
# print("Confusion Matrix:")
# print(conf_matrix)

# # Optionally, plot the confusion matrix
# sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
# plt.ylabel('Actual')
# plt.xlabel('Predicted')
# plt.title('Confusion Matrix')
# plt.show()

In [None]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from scipy.optimize import linear_sum_assignment
import concurrent.futures

# Create a custom dataset to handle pairs of invoices and purchase orders
class InvoicePurchaseOrderDataset(Dataset):
    """Flat, row-major view over every (invoice, purchase order) combination.

    Item ``idx`` pairs invoice ``idx // len(purchase_orders)`` with purchase
    order ``idx % len(purchase_orders)`` and carries the tokenized pair plus
    both indices.
    """

    def __init__(self, invoices, purchase_orders, tokenizer):
        self.invoices, self.purchase_orders = invoices, purchase_orders
        self.tokenizer = tokenizer

    def __len__(self):
        n_invoices = len(self.invoices)
        n_orders = len(self.purchase_orders)
        return n_invoices * n_orders

    def __getitem__(self, idx):
        # divmod gives (row, column) of the pair in the invoice x PO grid
        invoice_idx, po_idx = divmod(idx, len(self.purchase_orders))
        invoice_text = self.invoices[invoice_idx]
        po_text = self.purchase_orders[po_idx]

        # Encode the two texts together as one sentence pair
        encoded = self.tokenizer(
            invoice_text,
            po_text,
            padding=True,
            truncation=True,
            return_tensors="pt",
        )

        # squeeze(0) removes the leading dimension of size one added by the tokenizer call
        item = {"invoice": invoice_text, "purchase_order": po_text}
        item["input_ids"] = encoded['input_ids'].squeeze(0)
        item["attention_mask"] = encoded['attention_mask'].squeeze(0)
        item["invoice_idx"] = invoice_idx
        item["po_idx"] = po_idx
        return item

In [None]:
def map_invoices_to_purchase_orders(invoices, purchase_orders, cross_encoder, tokenizer, batch_size=64):
    """Match invoice lines to purchase-order lines with an optimal one-to-one assignment.

    Every (invoice, purchase order) pair is scored with ``cross_encoder.model``;
    the raw logits fill a similarity matrix, which is then solved with
    ``linear_sum_assignment`` (Hungarian algorithm) on the negated scores.

    Args:
        invoices: sequence of invoice description strings.
        purchase_orders: sequence of purchase-order description strings.
        cross_encoder: object exposing a ``model`` with a ``device`` attribute.
        tokenizer: tokenizer used both to encode the pairs and to supply the pad token id.
        batch_size: number of pairs scored per forward pass.

    Returns:
        List of single-entry dicts ``{invoice: purchase_order}``, one per assigned pair.
    """
    # Local import keeps the function independent of which other cells have run
    from torch.nn.utils.rnn import pad_sequence

    num_invoices = len(invoices)
    num_pos = len(purchase_orders)

    def _collate(batch):
        """Pad the items of one batch, using the pad token of the `tokenizer` argument."""
        return {
            "input_ids": pad_sequence(
                [item['input_ids'] for item in batch],
                batch_first=True,
                padding_value=tokenizer.pad_token_id,
            ),
            "attention_mask": pad_sequence(
                [item['attention_mask'] for item in batch],
                batch_first=True,
                padding_value=0,
            ),
            "invoice_idx": torch.tensor([item['invoice_idx'] for item in batch]),
            "po_idx": torch.tensor([item['po_idx'] for item in batch]),
        }

    # Initialize the dataset and DataLoader
    dataset = InvoicePurchaseOrderDataset(invoices, purchase_orders, tokenizer)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, collate_fn=_collate)

    # Pre-allocate space for similarity matrix
    similarity_matrix = np.zeros((num_invoices, num_pos))

    # Batches are scored one after another. Executor.map collects its whole input
    # iterable up front, so the previous thread pool loaded every batch before
    # scoring started; a plain loop streams them instead.
    device = cross_encoder.model.device
    with torch.no_grad():
        for batch in tqdm(dataloader, total=len(dataloader), desc="Processing Batches"):
            outputs = cross_encoder.model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            )
            # reshape(-1) always yields one score per pair, including for a
            # batch of one, so no scalar special case is required.
            scores = outputs.logits.reshape(-1).cpu().numpy()
            similarity_matrix[batch['invoice_idx'].numpy(), batch['po_idx'].numpy()] = scores

    # Use the Hungarian algorithm (linear sum assignment) for optimal one-to-one matching
    cost_matrix = -similarity_matrix  # Convert to cost matrix for minimization
    row_idx, col_idx = linear_sum_assignment(cost_matrix)

    # Create the final result with one-to-one mappings
    return [{invoices[i]: purchase_orders[j]} for i, j in zip(row_idx, col_idx)]

In [None]:
import pandas as pd
import ast
import torch

# Load the original CSV containing the invoice and purchase order data
df = pd.read_csv('/content/drive/MyDrive/Invoice_PO_Match/validation_formatted 1.csv')

# Extract the necessary columns: 'invoice_id', 'invoice_desc', 'PO_line_desc', 'true_mapping'
df = df[['invoice_id', 'invoice_desc', 'PO_line_desc', 'true_mapping']]

# Turn the stringified 'invoice_desc', 'PO_line_desc' and 'true_mapping' cells
# into real Python lists/dicts, one column at a time.
for column in ('invoice_desc', 'PO_line_desc', 'true_mapping'):
    for i in range(len(df)):
        df.at[i, column] = ast.literal_eval(df.at[i, column])

In [None]:
# Function to compute and add the 'model_output' for each row
def compute_model_output(df, cross_encoder, tokenizer):
    """Run the one-to-one matcher on every row and store the result in 'model_output'.

    Reads the 'invoice_desc' and 'PO_line_desc' cell of each row, prints the row
    label and the resulting mapping, and writes all mappings into a new
    'model_output' column of ``df`` (the frame is modified in place and returned).
    """
    def match_row(label, record):
        inv_lines, po_lines = record['invoice_desc'], record['PO_line_desc']
        print(label)
        # Delegate the actual matching to map_invoices_to_purchase_orders
        matched = map_invoices_to_purchase_orders(inv_lines, po_lines, cross_encoder, tokenizer)
        print(matched)
        return matched

    # One mapping per row, in row order
    df['model_output'] = [match_row(label, record) for label, record in df.iterrows()]

    return df

In [None]:
# Count the rows whose invoice list and PO list contain the same number of lines.
# Iterating over the frame itself removes the hard-coded row count, so the check
# stays valid if the input file changes size.
c = sum(
    len(inv_lines) == len(po_lines)
    for inv_lines, po_lines in zip(df["invoice_desc"], df["PO_line_desc"])
)
print(c)

946


In [None]:
# Compute the model output for the dataframe
# compute_model_output adds the 'model_output' column to `df` itself and returns
# that same frame, so `df` and `df_with_model_output` both carry the new column.
df_with_model_output = compute_model_output(df, cross_encoder, tokenizer)

# Save the dataframe to a CSV file
# NOTE(review): the list/dict cells are presumably written as their string form,
# so they need parsing again after reading the file back — confirm.
output_csv_path = '/content/drive/MyDrive/Invoice_PO_Match/mapped_invoices_po_output.csv'
df_with_model_output.to_csv(output_csv_path, index=False)

print(f"Model output saved to: {output_csv_path}")

In [None]:
fin_df = pd.read_csv("/content/drive/MyDrive/Invoice_PO_Match/mapped_invoices_po_output.csv")
fin_df.head()

In [None]:
df["model_output"][74]

[{'BEEF': 'BEEF TACO FLLNG RED PPR'},
 {'BEEF': 'BEEF PATTY 3-1 BEST EVER'},
 {'PATTIE': 'BEEF PIZZA PATTIES 3.5OZ'},
 {'BRD': 'BEEF BURGER TNDRBT 81/19 3-1'},
 {'BEEF': 'BEEF GRND PTY 81\\19 SILVER IQF'},
 {'BEEF': 'BEEF PATTY SEASONED BIG BITE'},
 {'MEATBALL': 'MEATBALL BEEF PRCKD 1 OZ'},
 {'BEEF': 'BEEF ROAST POT CKD FRZ'},
 {'STEAK': 'STEAK SIRL RESD FRZN'},
 {'BEEF': 'CHICKEN TENDER BRD FRITT DIXIE'},
 {'BRD': 'CHICKEN BNLS BRD WINGS'}]

In [None]:
df["true_mapping"][74]

[{'BEEF': 'BEEF PATTY SEASONED BIG BITE'},
 {'BEEF': 'BEEF PATTY 3-1 BEST EVER'},
 {'PATTIE': 'BEEF PIZZA PATTIES 3.5OZ'},
 {'BRD': 'CHICKEN TENDER BRD FRITT DIXIE'},
 {'BEEF': 'BEEF BURGER TNDRBT 81/19 3-1'},
 {'BEEF': 'BEEF GRND PTY 81\\19 SILVER IQF'},
 {'MEATBALL': 'MEATBALL BEEF PRCKD 1 OZ'},
 {'BEEF': 'BEEF ROAST POT CKD FRZ'},
 {'STEAK': 'STEAK SIRL RESD FRZN'},
 {'BEEF': 'BEEF TACO FLLNG RED PPR'},
 {'BRD': 'CHICKEN BNLS BRD WINGS'}]

In [None]:
def compare_lists_of_dicts(list1, list2):
    """Return True when both lists hold the same dicts, ignoring list order.

    Each dict is reduced to its sorted (key, value) items and the resulting
    lists are sorted before comparison, so duplicates still count.
    """
    def canonical(dicts):
        rows = [sorted(entry.items()) for entry in dicts]
        rows.sort()
        return rows

    return canonical(list1) == canonical(list2)

In [None]:
compare_lists_of_dicts(df["model_output"][30],df["true_mapping"][30])

False

In [None]:
# Exact-match accuracy: a row counts as correct only when its whole predicted
# mapping equals the true mapping (order-insensitive, via compare_lists_of_dicts).
correct = 0
for i in range(len(df)):
  if compare_lists_of_dicts(df["model_output"][i],df["true_mapping"][i]):
    correct = correct + 1
  else:
    # Print the index of every mismatching row for manual inspection
    print(i)
# Fraction of rows matched exactly (between 0 and 1)
acc = correct/len(df)

In [None]:
# Display the accuracy as a percentage without overwriting `acc`, so running
# this cell again does not multiply the value by 100 a second time.
acc * 100

96.93446088794926

In [None]:
# Function to calculate precision, recall, and f1 score for a single sample
def calculate_f1_for_sample(predicted, actual):
    """Precision, recall and F1 of one predicted mapping against the true mapping.

    Both arguments are lists of dicts; they are flattened into sets of
    (key, value) pairs before counting. Any ratio whose denominator is zero
    is reported as 0.

    Returns:
        (precision, recall, f1)
    """
    predicted_pairs = {pair for mapping in predicted for pair in mapping.items()}
    actual_pairs = {pair for mapping in actual for pair in mapping.items()}

    hits = len(predicted_pairs.intersection(actual_pairs))      # true positives
    spurious = len(predicted_pairs.difference(actual_pairs))    # false positives
    missed = len(actual_pairs.difference(predicted_pairs))      # false negatives

    precision = 0
    if hits + spurious > 0:
        precision = hits / (hits + spurious)

    recall = 0
    if hits + missed > 0:
        recall = hits / (hits + missed)

    f1 = 0
    if precision + recall > 0:
        f1 = 2 * (precision * recall) / (precision + recall)

    return precision, recall, f1

In [None]:
# Per-sample metric accumulators (one entry per dataframe row)
precisions, recalls, f1_scores = [], [], []

# Score each row's predicted mapping against its ground-truth mapping
for i in range(len(df)):
    predicted_mapping, actual_mapping = df["model_output"][i], df["true_mapping"][i]
    precision, recall, f1 = calculate_f1_for_sample(predicted_mapping, actual_mapping)

    for bucket, value in ((precisions, precision), (recalls, recall), (f1_scores, f1)):
        bucket.append(value)

# Unweighted mean of the per-row scores
average_precision = sum(precisions) / len(precisions)
average_recall = sum(recalls) / len(recalls)
average_f1_score = sum(f1_scores) / len(f1_scores)

# Report the averaged metrics
print(f"Average Precision: {average_precision:.4f}")
print(f"Average Recall: {average_recall:.4f}")
print(f"Average F1 Score: {average_f1_score:.4f}")

Average Precision: 0.9898
Average Recall: 0.9898
Average F1 Score: 0.9898


# Human val data

In [None]:
def compute_similarity_cross_encoder(invoices, pos, cross_encoder, tokenizer):
    """Score every (invoice, PO) combination with the cross-encoder.

    Args:
        invoices: sequence of invoice description strings.
        pos: sequence of purchase-order description strings.
        cross_encoder: object whose ``predict`` takes a list of text pairs and
            returns one score per pair.
        tokenizer: kept for signature compatibility; it is not used here because
            ``cross_encoder.predict`` is given the raw text pairs.

    Returns:
        Array of shape (len(invoices), len(pos)); entry [i, j] is the score of
        invoice i paired with purchase order j.
    """
    # Generate all possible combinations of (invoice, po) pairs, invoice-major
    sentence_pairs = [(invoice, po) for invoice in invoices for po in pos]

    # Nothing to score when either side is empty; return an empty matrix of the
    # right shape instead of calling the model with no input.
    if not sentence_pairs:
        return np.zeros((len(invoices), len(pos)))

    # Predict similarity scores for the pairs
    similarity_scores = cross_encoder.predict(sentence_pairs)

    # Reshape the similarity scores into a matrix (len(invoices) x len(pos))
    return np.array(similarity_scores).reshape(len(invoices), len(pos))

In [None]:
# Refined function to map invoices to POs based on the similarity scores
def map_invoices_to_pos(invoices, pos, cross_encoder, tokenizer, threshold=0.5):
    """Greedy one-to-one matching of invoices to POs by descending similarity.

    Pairs are visited from the highest score down. An unmatched invoice takes
    the first still-free PO it meets if the score reaches ``threshold``;
    otherwise the invoice is mapped to 'NA'. Invoices that were never assigned
    are mapped to 'NA', and POs that were never taken are listed as
    ``{'NA': po}``.

    Returns:
        List of single-entry dicts describing the mapping.
    """
    scores = compute_similarity_cross_encoder(invoices, pos, cross_encoder, tokenizer)

    # (invoice index, PO index, similarity score) for every combination
    candidates = [
        (inv_i, po_j, scores[inv_i, po_j])
        for inv_i in range(len(invoices))
        for po_j in range(len(pos))
    ]
    # Highest score first; the sort is stable, so ties keep their original order
    candidates.sort(key=lambda triple: triple[2], reverse=True)

    mapping_result = []
    invoice_used, po_used = set(), set()

    for inv_i, po_j, score in candidates:
        if inv_i in invoice_used or po_j in po_used:
            continue
        # The invoice is settled here either way: matched, or declared 'NA'
        invoice_used.add(inv_i)
        if score >= threshold:
            mapping_result.append({invoices[inv_i]: pos[po_j]})
            po_used.add(po_j)
        else:
            mapping_result.append({invoices[inv_i]: 'NA'})

    # Leftovers on either side are paired with 'NA'
    mapping_result.extend(
        {invoices[k]: 'NA'} for k in range(len(invoices)) if k not in invoice_used
    )
    mapping_result.extend(
        {'NA': pos[k]} for k in range(len(pos)) if k not in po_used
    )

    return mapping_result

In [None]:
def compare_lists_of_dicts(list1, list2):
    """Order-insensitive equality check for two lists of dicts.

    NOTE(review): a function with this name and the same logic is already
    defined earlier in the notebook; this definition replaces it when run.
    """
    normalized = []
    for mappings in (list1, list2):
        # Sort each dict's items, then sort the list of item lists
        normalized.append(sorted(sorted(m.items()) for m in mappings))

    return normalized[0] == normalized[1]

In [None]:
val_df = pd.read_csv("/content/drive/MyDrive/Invoice_PO_Match/validation_formatted 1.csv")
val_df.head(2)

Unnamed: 0,invoice_id,invoice_desc,PO_line_desc,true_mapping
0,INV-49266647,"['25103 ICE CREAM MANGO SORBETTO TRAY', 'ICE C...","['ICE CREAM MANGO SORBETTO TRAY', 'ICE CREAM G...",[{'25103 ICE CREAM MANGO SORBETTO TRAY': 'ICE ...
1,INV-49266660,"['ICE CREAM GELATO MACKINAC FDGE 3/5LT', 'ICE ...","['ICE CREAM GELATO MACKINAC FDGE', 'ICE CREAM ...",[{'ICE CREAM GELATO MACKINAC FDGE 3/5LT': 'ICE...


In [None]:
hval = pd.read_csv("/content/drive/MyDrive/Invoice_PO_Match/Test-Data_red_prompt.csv")
hval.head(2)

Unnamed: 0,Cora_Case_Id,Inv_Product_Service_Description,PO_Line_Product_Service_Description,True_Value,Output
0,INV-49272221,['DMQ-21 DQ BLIZ 2023'],['CUP PAPER CLD 21 OZ DQ 2023'],[{'DMQ-21 DQ BLIZ 2023': 'CUP PAPER CLD 21 OZ ...,[{'DMQ-21 DQ BLIZ 2023': 'CUP PAPER CLD 21 OZ ...
1,INV-50292392,"['SMR-10 COMPASS STAND-', 'DMR-22 BURGERFI 202...","['CUP HOT PAPER 10OZ', 'CUP PAPER CLD 22 OZ BU...",[{'DMR-22 JIMMY JOHNS21': 'CUP POLY CLD JMYJ 2...,[{'SMR-10 COMPASS STAND-': 'CUP PAPER HOT SMR-...


In [None]:
# val_df = hval

In [None]:
# Run the greedy, threshold-based matcher on every row of val_df; `result`
# collects one mapping per row, in row order.
result = []

for i in range(len(val_df)):
    # print(i)
    # The description columns hold stringified Python lists, so parse them first
    invoice = ast.literal_eval(val_df["invoice_desc"][i])
    po = ast.literal_eval(val_df["PO_line_desc"][i])
    res = map_invoices_to_pos(invoice, po, cross_encoder, tokenizer, threshold=0.5)
    result.append(res)

In [None]:
val_df["new_output"] = result

In [None]:
# Exact-match count for the greedy matcher; prints the index of every row
# whose predicted mapping differs from the true mapping.
correct = 0
for i in range(len(val_df)):
  # ast.literal_eval parses the stored mapping without executing it as code
  # (unlike eval), matching how the same column is parsed elsewhere here.
  if compare_lists_of_dicts(ast.literal_eval(val_df["true_mapping"][i]),result[i]):
    correct = correct + 1
  else:
    print(i)

30
45
52
60
74
75
81
94
106
124
129
130
156
164
221
237
244
288
312
317
423
455
467
473
477
489
506
521
543
549
598
601
605
608
623
627
667
694
749
766
777
790
796
798
800
803
834
843
861
928
930
932


In [None]:
acc = correct/len(val_df)
acc

0.945031712473573

In [None]:
# Function to calculate precision, recall, and f1 score for a single sample
def calculate_f1_for_sample(predicted, actual):
    """Compute (precision, recall, f1) for one sample's predicted mapping.

    The lists of dicts are flattened to sets of (key, value) pairs; a ratio
    with a zero denominator is returned as 0.

    NOTE(review): a function with this name and the same logic is already
    defined earlier in the notebook; this definition replaces it when run.
    """
    def ratio(numerator, denominator):
        # Guarded division shared by all three metrics
        return numerator / denominator if denominator > 0 else 0

    predicted_set = set()
    for mapping in predicted:
        predicted_set.update(mapping.items())
    actual_set = set()
    for mapping in actual:
        actual_set.update(mapping.items())

    tp = sum(1 for pair in predicted_set if pair in actual_set)      # correct matches
    fp = len(predicted_set) - tp                                     # wrong predictions
    fn = sum(1 for pair in actual_set if pair not in predicted_set)  # missed matches

    precision = ratio(tp, tp + fp)
    recall = ratio(tp, tp + fn)
    f1 = ratio(2 * (precision * recall), precision + recall)

    return precision, recall, f1

In [None]:
# Initialize lists to store precision, recall, and f1 scores for all samples
# Per-row metric accumulators for the greedy matcher's output
precisions = []
recalls = []
f1_scores = []

# Iterate over the dataset to calculate precision, recall, and F1 for each sample
for i in range(len(val_df)):
    predicted_mapping = val_df["new_output"][i]
    # true_mapping is still a string in val_df (read straight from the CSV), so parse it
    actual_mapping = ast.literal_eval(val_df["true_mapping"][i])

    # Calculate precision, recall, and F1 for the current sample
    precision, recall, f1 = calculate_f1_for_sample(predicted_mapping, actual_mapping)

    # Append the results to the respective lists
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

In [None]:
# Calculate average precision, recall, and F1 score across all samples
average_precision = sum(precisions) / len(precisions)
average_recall = sum(recalls) / len(recalls)
average_f1_score = sum(f1_scores) / len(f1_scores)

In [None]:
# Output the results
print(f"Average Precision: {average_precision:.4f}")
print(f"Average Recall: {average_recall:.4f}")
print(f"Average F1 Score: {average_f1_score:.4f}")

Average Precision: 0.9824
Average Recall: 0.9830
Average F1 Score: 0.9826
