<a href="https://colab.research.google.com/github/Dharshan4038/Invoice_Purchase_Order_Match/blob/main/Invoice_PO_Match.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import re

In [2]:
# Load CSV file into a DataFrame
df = pd.read_csv('/content/drive/MyDrive/Invoice_PO_Match/invoice_purchase_order.csv')

In [3]:
df.head(5)

Unnamed: 0,Invoice Column,Purchase Order Column,Mapped Result
0,"['14329 ICE CREAM VANILLA BEAN 2/4LT', 'ICE CR...","['ICE CREAM VANILLA BEAN 2/4LT', 'ICE CREAM MI...",{'14329 ICE CREAM VANILLA BEAN 2/4LT': 'ICE CR...
1,"['46323 ICE CREAM SORBET LEMONCELLO 5/5LT', '3...","['ICE CREAM SORBET LEMONCELLO', 'ICE CREAM VAN...",{'46323 ICE CREAM SORBET LEMONCELLO 5/5LT': 'I...
2,"['14329 ICE CREAM VANILLA BEAN 2/4LT', 'ICE CR...","['ICE CREAM VANILLA BEAN 2/4LT', 'ICE CREAM MI...",{'14329 ICE CREAM VANILLA BEAN 2/4LT': 'ICE CR...
3,"['46323 ICE CREAM SORBET LEMONCELLO 5/5LT', '3...","['ICE CREAM SORBET LEMONCELLO', 'ICE CREAM VAN...",{'46323 ICE CREAM SORBET LEMONCELLO 5/5LT': 'I...
4,"['14329 ICE CREAM VANILLA BEAN 2/4LT', 'ICE CR...","['ICE CREAM VANILLA BEAN 2/4LT', 'ICE CREAM MI...",{'14329 ICE CREAM VANILLA BEAN 2/4LT': 'ICE CR...


## Data Cleaning

In [4]:
def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    # Replace common special characters (& and %) with their word equivalents
    text = text.replace('&', 'and').replace('%', 'percent')
    # Remove any character that is not a letter, number, or space
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Remove any extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [5]:
df['Invoice Column'] = df['Invoice Column'].apply(clean_text)
df['Purchase Order Column'] = df['Purchase Order Column'].apply(clean_text)

In [6]:
# Tokenization
def tokenize_text(text):
    # Tokenize the text by splitting on commas or spaces
    tokens = re.split(r',\s*|\s+', text)
    return tokens

In [7]:
# Apply tokenization
df['Invoice Tokens'] = df['Invoice Column'].apply(tokenize_text)
df['Purchase Order Tokens'] = df['Purchase Order Column'].apply(tokenize_text)

In [8]:
# Removing numbers and other unwanted tokens
def remove_noise(tokens):
    # Remove any token that is purely numeric
    tokens = [token for token in tokens if not token.isdigit()]
    return tokens

In [9]:
df['Invoice Clean Tokens'] = df['Invoice Tokens'].apply(remove_noise)
df['Purchase Order Clean Tokens'] = df['Purchase Order Tokens'].apply(remove_noise)

In [10]:
df[['Invoice Column', 'Purchase Order Column', 'Invoice Clean Tokens', 'Purchase Order Clean Tokens']].head()

Unnamed: 0,Invoice Column,Purchase Order Column,Invoice Clean Tokens,Purchase Order Clean Tokens
0,14329 ice cream vanilla bean 24lt ice cream mi...,ice cream vanilla bean 24lt ice cream mint cho...,"[ice, cream, vanilla, bean, 24lt, ice, cream, ...","[ice, cream, vanilla, bean, 24lt, ice, cream, ..."
1,46323 ice cream sorbet lemoncello 55lt 36489 i...,ice cream sorbet lemoncello ice cream vanilla ...,"[ice, cream, sorbet, lemoncello, 55lt, ice, cr...","[ice, cream, sorbet, lemoncello, ice, cream, v..."
2,14329 ice cream vanilla bean 24lt ice cream mi...,ice cream vanilla bean 24lt ice cream mint cho...,"[ice, cream, vanilla, bean, 24lt, ice, cream, ...","[ice, cream, vanilla, bean, 24lt, ice, cream, ..."
3,46323 ice cream sorbet lemoncello 55lt 36489 i...,ice cream sorbet lemoncello ice cream vanilla ...,"[ice, cream, sorbet, lemoncello, 55lt, ice, cr...","[ice, cream, sorbet, lemoncello, ice, cream, v..."
4,14329 ice cream vanilla bean 24lt ice cream mi...,ice cream vanilla bean 24lt ice cream mint cho...,"[ice, cream, vanilla, bean, 24lt, ice, cream, ...","[ice, cream, vanilla, bean, 24lt, ice, cream, ..."


In [11]:
df.to_csv('cleaned_invoice_purchase_order_dataset.csv', index=False)

In [12]:
# from google.colab import files
# files.download('cleaned_invoice_purchase_order_dataset.csv')

In [13]:
df = pd.read_csv("/content/drive/MyDrive/Invoice_PO_Match/cleaned_invoice_purchase_order_dataset.csv")
df.head()

Unnamed: 0,Invoice Column,Purchase Order Column,Mapped Result,Invoice Tokens,Purchase Order Tokens,Invoice Clean Tokens,Purchase Order Clean Tokens
0,14329 ice cream vanilla bean 24lt ice cream mi...,ice cream vanilla bean 24lt ice cream mint cho...,{'14329 ICE CREAM VANILLA BEAN 2/4LT': 'ICE CR...,"['14329', 'ice', 'cream', 'vanilla', 'bean', '...","['ice', 'cream', 'vanilla', 'bean', '24lt', 'i...","['ice', 'cream', 'vanilla', 'bean', '24lt', 'i...","['ice', 'cream', 'vanilla', 'bean', '24lt', 'i..."
1,46323 ice cream sorbet lemoncello 55lt 36489 i...,ice cream sorbet lemoncello ice cream vanilla ...,{'46323 ICE CREAM SORBET LEMONCELLO 5/5LT': 'I...,"['46323', 'ice', 'cream', 'sorbet', 'lemoncell...","['ice', 'cream', 'sorbet', 'lemoncello', 'ice'...","['ice', 'cream', 'sorbet', 'lemoncello', '55lt...","['ice', 'cream', 'sorbet', 'lemoncello', 'ice'..."
2,14329 ice cream vanilla bean 24lt ice cream mi...,ice cream vanilla bean 24lt ice cream mint cho...,{'14329 ICE CREAM VANILLA BEAN 2/4LT': 'ICE CR...,"['14329', 'ice', 'cream', 'vanilla', 'bean', '...","['ice', 'cream', 'vanilla', 'bean', '24lt', 'i...","['ice', 'cream', 'vanilla', 'bean', '24lt', 'i...","['ice', 'cream', 'vanilla', 'bean', '24lt', 'i..."
3,46323 ice cream sorbet lemoncello 55lt 36489 i...,ice cream sorbet lemoncello ice cream vanilla ...,{'46323 ICE CREAM SORBET LEMONCELLO 5/5LT': 'I...,"['46323', 'ice', 'cream', 'sorbet', 'lemoncell...","['ice', 'cream', 'sorbet', 'lemoncello', 'ice'...","['ice', 'cream', 'sorbet', 'lemoncello', '55lt...","['ice', 'cream', 'sorbet', 'lemoncello', 'ice'..."
4,14329 ice cream vanilla bean 24lt ice cream mi...,ice cream vanilla bean 24lt ice cream mint cho...,{'14329 ICE CREAM VANILLA BEAN 2/4LT': 'ICE CR...,"['14329', 'ice', 'cream', 'vanilla', 'bean', '...","['ice', 'cream', 'vanilla', 'bean', '24lt', 'i...","['ice', 'cream', 'vanilla', 'bean', '24lt', 'i...","['ice', 'cream', 'vanilla', 'bean', '24lt', 'i..."


## Fine Tuning the SBERT Model

In [None]:
!pip install -U sentence-transformers transformers

In [None]:
from sentence_transformers import SentenceTransformer, InputExample, losses, models
from torch.utils.data import DataLoader
from sentence_transformers import datasets
import random

In [16]:
def create_training_data(df):
    train_examples = []

    for idx, row in df.iterrows():
        # Ensure that 'Invoice Clean Tokens' and 'Purchase Order Clean Tokens' are processed as strings
        cleaned_invoice_items = ' '.join(eval(row['Invoice Clean Tokens']))  # Join tokens into a string
        cleaned_purchase_order_items = ' '.join(eval(row['Purchase Order Clean Tokens']))  # Join tokens into a string

        # Positive example: matching invoice-purchase order pair
        train_examples.append({
            'Text1': cleaned_invoice_items,
            'Text2': cleaned_purchase_order_items,
            'Label': 1.0  # Label for correct matches
        })

        # Negative example: mismatched pair (shuffle purchase orders)
        shuffled_po_items = random.sample(eval(row['Purchase Order Clean Tokens']), len(eval(row['Purchase Order Clean Tokens'])))
        shuffled_po_string = ' '.join(shuffled_po_items)

        if cleaned_invoice_items != shuffled_po_string:
            train_examples.append({
                'Text1': cleaned_invoice_items,
                'Text2': shuffled_po_string,
                'Label': 0.0  # Label for incorrect matches
            })

    return train_examples

In [17]:
# Now, we create the training data
train_examples = create_training_data(df)

In [18]:
train_df = pd.DataFrame(train_examples)

# Display a few examples
train_df.head(10)

Unnamed: 0,Text1,Text2,Label
0,ice cream vanilla bean 24lt ice cream mint cho...,ice cream vanilla bean 24lt ice cream mint cho...,1.0
1,ice cream vanilla bean 24lt ice cream mint cho...,vanilla 24lt gelato sorbet ice mint cream hone...,0.0
2,ice cream sorbet lemoncello 55lt ice cream van...,ice cream sorbet lemoncello ice cream vanilla ...,1.0
3,ice cream sorbet lemoncello 55lt ice cream van...,gelato cream sorbet dutch sorbet bean ice ice ...,0.0
4,ice cream vanilla bean 24lt ice cream mint cho...,ice cream vanilla bean 24lt ice cream mint cho...,1.0
5,ice cream vanilla bean 24lt ice cream mint cho...,ice cream ice cream cream sorbet ice mint chip...,0.0
6,ice cream sorbet lemoncello 55lt ice cream van...,ice cream sorbet lemoncello ice cream vanilla ...,1.0
7,ice cream sorbet lemoncello 55lt ice cream van...,cream ice honeycomb fudge raspberry cream crea...,0.0
8,ice cream vanilla bean 24lt ice cream mint cho...,ice cream vanilla bean 24lt ice cream mint cho...,1.0
9,ice cream vanilla bean 24lt ice cream mint cho...,ice cream 24lt ice cream mint cream cookiesand...,0.0


In [None]:
# Load pre-trained Sentence-BERT model
model = SentenceTransformer('all-mpnet-base-v2')

In [20]:
# Assume `train_df` is your DataFrame with columns: 'Text1', 'Text2', and 'Label'
train_examples = []

for idx, row in train_df.iterrows():
    train_examples.append(InputExample(
        texts=[row['Text1'], row['Text2']],
        label=row['Label']
    ))

# Create a DataLoader from your training examples
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)


In [None]:
!pip install datasets

In [22]:
model = SentenceTransformer('all-mpnet-base-v2')
# Define the loss function
train_loss = losses.CosineSimilarityLoss(model)

In [26]:
# Define the number of epochs and warmup steps (if needed)
num_epochs = 4
warmup_steps = int(len(train_dataloader) * num_epochs * 0.1)  # 10% of training steps as warm-up

# Fine-tune the model
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    output_path='/content/drive/MyDrive/Invoice_PO_Match/output/fine_tuned_model'  # Path where the fine-tuned model will be saved
)

Step,Training Loss
500,0.0001


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

In [27]:
# Load the fine-tuned model
fine_tuned_model = SentenceTransformer('/content/drive/MyDrive/Invoice_PO_Match/output/fine_tuned_model')

In [28]:
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load the fine-tuned model
model = SentenceTransformer('output/fine_tuned_model')

# Define your invoice orders and purchase orders
invoice_orders = [
    '14329 ICE CREAM VANILLA BEAN 2/4LT',
    'ICE CREAM MINT CHOCO CHIP 3/5LT',
    'ICE CREAM GELATO HONEYCOMB 3/5LT',
    'ICE CREAM SORBET RASPBERRY 3/5LT',
    'ICE CREAM GELATO STRAWBERRY 2/4LT',
    'ICE CREAM COOKIES&CREAM 2/4LT'
]

purchase_orders = [
    'ICE CREAM VANILLA BEAN 2/4LT',
    'ICE CREAM GELATO HONEYCOMB 3/5LT',
    'ICE CREAM MINT CHOCO CHIP',
    'ICE CREAM SORBET RASPBERRY',
    'ICE CREAM GELATO STRAWBERRY',
    'ICE CREAM COOKIES&CREAM'
]

# Encode the texts
invoice_embeddings = model.encode(invoice_orders, convert_to_tensor=True)
purchase_embeddings = model.encode(purchase_orders, convert_to_tensor=True)

# Move tensors to CPU and convert to numpy arrays
invoice_embeddings_np = invoice_embeddings.cpu().numpy()
purchase_embeddings_np = purchase_embeddings.cpu().numpy()

# Compute cosine similarity
similarity_matrix = cosine_similarity(invoice_embeddings_np, purchase_embeddings_np)

# Map each invoice order to the most similar purchase order
mapped_result = {}
for i, invoice in enumerate(invoice_orders):
    most_similar_idx = np.argmax(similarity_matrix[i])
    mapped_result[invoice] = purchase_orders[most_similar_idx]

print(mapped_result)

{'14329 ICE CREAM VANILLA BEAN 2/4LT': 'ICE CREAM VANILLA BEAN 2/4LT', 'ICE CREAM MINT CHOCO CHIP 3/5LT': 'ICE CREAM MINT CHOCO CHIP', 'ICE CREAM GELATO HONEYCOMB 3/5LT': 'ICE CREAM GELATO HONEYCOMB 3/5LT', 'ICE CREAM SORBET RASPBERRY 3/5LT': 'ICE CREAM SORBET RASPBERRY', 'ICE CREAM GELATO STRAWBERRY 2/4LT': 'ICE CREAM GELATO HONEYCOMB 3/5LT', 'ICE CREAM COOKIES&CREAM 2/4LT': 'ICE CREAM COOKIES&CREAM'}
