In [35]:
import json
import os
import re
from pathlib import Path

import fasttext
import fasttext.util
import numpy as np
import pandas as pd
import pdf2image
import pytesseract
from huggingface_hub import InferenceClient
from PIL import Image
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer

In [36]:
# Download the pre-trained English fastText vectors; if_exists='ignore' skips
# the download when the file is already present. download_model returns the
# name of the model file, so load exactly that file instead of repeating a
# hard-coded name that could drift from the language requested above.
ft_model_path = fasttext.util.download_model('en', if_exists='ignore')
ft_model = fasttext.load_model(ft_model_path)

In [37]:
# Read the Hugging Face token from the environment so that no credential is
# stored in the notebook. Set it before starting Jupyter, e.g.
#   export HF_TOKEN=hf_...
# NOTE(review): when HF_TOKEN is unset this passes token=None, which
# huggingface_hub documents as "use the locally saved token" - confirm that is
# the desired fallback for this project.
API_TOKEN = os.environ.get("HF_TOKEN")

client = InferenceClient(
    model="meta-llama/Meta-Llama-3.1-8B-Instruct",
    token=API_TOKEN,
)

In [51]:
def OCR_extraction(pages):
    """Run Tesseract OCR over each page image and collect the recognized text.

    Args:
        pages: iterable of page images, each passed as-is to
            ``pytesseract.image_to_string``.

    Returns:
        list: one OCR result per page, in the same order as ``pages``.
    """
    return [pytesseract.image_to_string(page_image) for page_image in pages]

def extract_line_items_llama(text):
    """Ask the chat model behind ``client`` to list the products found in OCR text.

    The OCR text is embedded in a fixed prompt that asks for a JSON-like list
    of ``{"product": ...}`` objects. The reply is requested as a stream and
    the streamed fragments are concatenated.

    Args:
        text: OCR output to analyse; it is substituted into the prompt as-is.

    Returns:
        str: the model's reply with surrounding whitespace removed. The reply
        is capped by ``max_tokens=1520``, so a long product list may come back
        cut off mid-item; callers should not assume it is valid JSON.
    """
    prompt_template = """Here's my OCR output from a pdf doc for the purpose of industrial quote generation. 
                        Generate a json file containing list of industrial product_name alphabetical only without special characters 
                        (eg. "Corner Bead paper faced 10"; "5 8 Firecode Core"; "Easy Clip S545") 
                        based on the parsed output (use common sense and imagining the parsed format from a table where related information may be parsed in separate lines), 
                        ignore the quantity (eg. 35,288.66) and unit (eg. 1,000 LF or 1,000 SF or 37.00 EA) in the parsed text:
                        
                        Only output something like this structured output without explanation and code:
                        [
                            {{ "product": "Product A"}},
                            {{ "product": "Product B"}},
                            {{ "product": "Product C"}}, 
                            ...
                        ]:
                        
                        {text}
                        """
    prompt = prompt_template.format(text=text)
    messages = [{"role": "system", "content": "You are a bot that responds only with the extracted product list."},
                {"role": "user", "content": prompt}]
    print("Waiting for Meta-Llama-3.1-8B-Instruct to generate output...")
    response = client.chat_completion(
        messages=messages,
        max_tokens=1520,
        stream=True,
        temperature=0.0,
    )

    # Collect fragments in a list and join once at the end.
    fragments = []
    for chunk in response:
        # A streamed chunk may carry no choices or a delta without text
        # (content is optional); concatenating None would raise TypeError.
        if not chunk.choices:
            continue
        fragment = chunk.choices[0].delta.content
        if fragment:
            fragments.append(fragment)

    return "".join(fragments).strip()


def extract_product_names(structured_line_items_str):
    """Pull every ``"product": "<name>"`` value out of the model's reply.

    A regular expression is used instead of a JSON parser so that names can
    still be recovered from a reply that is not well-formed JSON (for example
    one that was cut off mid-list).

    Args:
        structured_line_items_str: raw text returned by the model.

    Returns:
        list[str]: the product names in order of appearance. Escape sequences
        inside a name (such as ``\\"``) are returned as written, not decoded.
        An empty or ``None`` input yields an empty list.
    """
    if not structured_line_items_str:
        return []
    # The capture group isolates the name itself, so the result no longer
    # depends on how much whitespace follows the colon.
    return re.findall(r'"product":\s*"((?:[^"\\]|\\.)*)"', structured_line_items_str)

In [52]:
# Location of the input data. Override with the AUTOQUOTE_DATA_DIR environment
# variable; the default is the directory this notebook was developed against.
DATA_DIR = Path(os.environ.get('AUTOQUOTE_DATA_DIR', '/Users/evanwyf/Downloads/AutoQuote/Data'))
# Extracted names shorter than this many characters are discarded as noise.
MIN_PRODUCT_NAME_LEN = 5

# Convert PDF to images
request_1 = pdf2image.convert_from_path(str(DATA_DIR / 'Request-Response 1' / 'Request 1.pdf'))
request_2 = pdf2image.convert_from_path(str(DATA_DIR / 'Request-Response 2' / 'Request 2.pdf'))
product_db = pd.read_csv(DATA_DIR / 'ProductDB.csv')

# Extract text from each image
text = OCR_extraction(request_2)
# Combine the extracted text into a single string
combined_text = " ".join(text)

# Extract the line items from the combined text
structured_line_items_str = extract_line_items_llama(combined_text)
if not structured_line_items_str:
    # Fail here with a clear message; otherwise product_names, product_names_db
    # and tokenizer would stay undefined and the next cell would fail with a
    # NameError that hides the real cause.
    raise ValueError("The model returned no output; cannot extract line items.")

print("Extracted Items:", structured_line_items_str)
product_names = extract_product_names(structured_line_items_str)
product_names = [name for name in product_names if len(name) >= MIN_PRODUCT_NAME_LEN]  # Filter out short product names
print("# of extracted products:", len(product_names))

# Load the product database
product_names_db = product_db.iloc[:, 2].tolist()  # Assuming the product name is in the third column
print("# of DB products:", len(product_names_db))
# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Waiting for Meta-Llama-3.1-8B-Instruct to generate output...
Extracted Items: [
  { "product": "Easy Clip S545" },
  { "product": "Uni-Clip UCEC" },
  { "product": "Extended FastClip FCEC12" },
  { "product": "Extended FastClip FCEC8" },
  { "product": "2 16ga Flat Strapping FS216" },
  { "product": "10 x 2 x 16ga Ext Stud S1000S200-54" },
  { "product": "2-1/2 x 2 x 16ga Ext Stud S250S200-54" },
  { "product": "3-5/8 x 1-5/8 x 16ga Ext Stud S362S162-54" },
  { "product": "4 x 1-5/8 x 18ga Ext Stud S400S162-43" },
  { "product": "4 x 1-5/8 x 16ga Ext Stud S400S162-54" },
  { "product": "4 x 2 x 16ga Ext Stud S400S200-54" },
  { "product": "6 x 1 5/8 x 16ga Ext Stud S600S162-54" },
  { "product": "6 x 2 x 16ga Ext Stud S600S200-54" },
  { "product": "8 x 1-5/8 x 16ga Ext Stud S800S162-54" },
  { "product": "2-1/2 x 16ga Ext Track T250T125-54" },
  { "product": "3-5/8 x 16ga Ext Track T362T125-54" },
  { "product": "4 x 18ga Ext Track T400T125-43" },
  { "product": "4 x 16ga Ext Track T4



In [54]:
def custom_tokenize(text, tokenizer):
    """Split ``text`` into tokens by delegating to ``tokenizer.tokenize``.

    Args:
        text: the string to tokenize.
        tokenizer: any object exposing a ``tokenize(text)`` method (the
            notebook passes a BERT tokenizer).

    Returns:
        Whatever ``tokenizer.tokenize(text)`` returns.
    """
    return tokenizer.tokenize(text)

def _embed_names_fasttext(names, tokenizer, model):
    """Return a (len(names), dim) array holding one mean-pooled vector per name."""
    dim = model.get_dimension()
    embedded = []
    for name in names:
        tokens = custom_tokenize(name, tokenizer)
        if tokens:
            token_vectors = [model.get_word_vector(token) for token in tokens]
            embedded.append(np.mean(token_vectors, axis=0))
        else:
            # np.mean over an empty list yields a NaN scalar, which cannot be
            # stacked with real vectors. Use a zero vector so the name simply
            # has no direction (sklearn's cosine_similarity is expected to
            # score an all-zero row as 0 - verify if this matters).
            embedded.append(np.zeros(dim, dtype=np.float32))
    return np.array(embedded).reshape(len(embedded), dim)


def compute_similarity_scores_fasttext(extracted_names, product_names, tokenizer, model):
    """Score every extracted name against every catalogue name.

    Each name is tokenized with ``custom_tokenize``, every token is looked up
    with ``model.get_word_vector``, and the token vectors are averaged into a
    single vector per name. The two sets of vectors are then compared with
    cosine similarity.

    Args:
        extracted_names: names extracted from the request document.
        product_names: names from the product database.
        tokenizer: tokenizer handed to ``custom_tokenize``.
        model: object providing ``get_word_vector(token)`` and
            ``get_dimension()`` (the notebook passes the loaded fastText model).

    Returns:
        A 2-D array with one row per extracted name and one column per
        product name. When either input is empty an all-zero array of shape
        ``(len(extracted_names), len(product_names))`` is returned without
        calling the model.
    """
    extracted_names = list(extracted_names)
    product_names = list(product_names)
    if not extracted_names or not product_names:
        # cosine_similarity rejects empty inputs; return an empty score matrix.
        return np.zeros((len(extracted_names), len(product_names)))

    extracted_vectors = _embed_names_fasttext(extracted_names, tokenizer, model)
    product_vectors = _embed_names_fasttext(product_names, tokenizer, model)

    # Compute cosine similarity scores
    return cosine_similarity(extracted_vectors, product_vectors)

def get_top_n_recommendations_fasttext(similarity_scores, product_names, n=10):
    """Pick the best-scoring product names for each row of a score matrix.

    Args:
        similarity_scores: iterable of score rows; each row must support
            ``argsort()`` and holds one score per entry of ``product_names``.
        product_names: sequence of candidate names indexed like the columns of
            ``similarity_scores``.
        n: maximum number of names to return per row (default 10).

    Returns:
        list[list]: for every row, up to ``n`` product names ordered from the
        highest score to the lowest.

    Raises:
        ValueError: if ``n`` is negative.
    """
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")

    recommendations = []
    for scores in similarity_scores:
        # Reverse the ascending argsort first and truncate afterwards. Slicing
        # with [-n:] before reversing would select every element when n == 0.
        top_indices = scores.argsort()[::-1][:n]
        recommendations.append([product_names[i] for i in top_indices])
    return recommendations

In [55]:
# Score every extracted name against every database name: names are split
# with the BERT tokenizer and embedded with the fastText model.
similarity_scores = compute_similarity_scores_fasttext(
    product_names,
    product_names_db,
    tokenizer,
    ft_model,
)

# Keep the default top-10 database names per extracted name and report them.
top_10_recommendations = get_top_n_recommendations_fasttext(similarity_scores, product_names_db)
for i, extracted_name in enumerate(product_names):
    recommendations = top_10_recommendations[i]
    print(f"Top-10 Recommendations for '{extracted_name}': {recommendations}")

Top-10 Recommendations for 'Easy Clip S545': ['S414', 'S610', 'S310', 'MITEK S/PHD4', 'MITEK S/PHD9', 'MITEK S/PHD6', 'T12', 'TGHW4', 'TPA4G', 'NP-S']
Top-10 Recommendations for 'Uni-Clip UCEC': ['UCEC UNICLIP END CLIP', 'UCEC UNICLIP END CLIP', 'MP-UNMT', 'UNI-TRACK - 8FT', 'UNI-TRACK - 10 FT', 'FLAT DIAMOND SECURITY LATH - CDBS-SL', 'RCK-MNT-BKT L PLATE MID CHASSIS X STRONG', 'DS/DR58-FWS6678 NW-10', 'UXRC12 UNICLIP EXT RIGID CLIP', 'UXRC10 UNICLIP EXT RIGID CLIP']
Top-10 Recommendations for 'Extended FastClip FCEC12': ['FCS58-100EC', 'FCS25-100EC', 'FCS75-100EC', 'FCS38-100EC', 'FCS58-200EC', 'FCS58-150EC', 'FCS25-200EC', 'FCS25-150EC', 'FCS75-150EC', 'FCS75-200EC']
Top-10 Recommendations for 'Extended FastClip FCEC8': ['FCS78-100EC', 'FCS78-150EC', 'FCS75-100EC', 'FCS78-200EC', 'FCS58-100EC', 'FCS75-150EC', 'FCS58-150EC', 'FCS25-100EC', 'FCS25-150EC', 'FCS58-200EC']
Top-10 Recommendations for '2 16ga Flat Strapping FS216': ['FSB125-3IN 20G FLATSTRAPBACKER 1.25 TAPE', 'ProTRAK 22MIL