In [14]:
import json
import os
import re

import fasttext.util
import numpy as np
import pandas as pd
import pdf2image
import pytesseract
from huggingface_hub import InferenceClient
from PIL import Image
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer


In [15]:
# Hugging Face access token. The HF_TOKEN environment variable is preferred so
# the secret never has to be saved inside the notebook; the placeholder is only
# a fallback for local experimentation.
API_TOKEN = os.environ.get("HF_TOKEN", "hf_xxxxx")  # or replace with your API token

# Shared chat client; extract_line_items_llama() sends its requests through it.
client = InferenceClient(
    model="meta-llama/Meta-Llama-3.1-8B-Instruct",
    token=API_TOKEN,
)

In [16]:
def OCR_extraction(pages):
    """Run OCR on every page image and collect the results.

    Args:
        pages: Iterable of page images, each passed unchanged to
            ``pytesseract.image_to_string``.

    Returns:
        list: One OCR result per page, in the same order as ``pages``.
    """
    return [pytesseract.image_to_string(page_image) for page_image in pages]

def extract_line_items_llama(text):
    """Ask the chat model to turn OCR text into a JSON-style product list.

    The OCR text is embedded in a fixed prompt and sent through the
    module-level ``client`` with streaming enabled; the streamed pieces are
    joined into one string.

    Args:
        text: OCR output to be inserted into the prompt.

    Returns:
        str: The model's full reply with surrounding whitespace stripped
        (an empty string if the stream carried no text).
    """
    prompt_template = """Here's my OCR output from a pdf doc for the purpose of industrial quote generation. 
                        Generate a json file containing list of industrial product_name alphabetical only without special characters 
                        (eg. "Corner Bead paper faced 10"; "5 8 Firecode Core"; "Easy Clip S545") 
                        based on the parsed output (use common sense and imagining the parsed format from a table where related information may be parsed in separate lines), 
                        ignore the quantity (eg. 35,288.66) and unit (eg. 1,000 LF or 1,000 SF or 37.00 EA) in the parsed text:
                        
                        Only output something like this structured output without explanation and code:
                        [
                            {{ "product": "Product A"}},
                            {{ "product": "Product B"}},
                            {{ "product": "Product C"}}, 
                            ...
                        ]:
                        
                        {text}
                        """
    prompt = prompt_template.format(text=text)
    messages = [{"role": "system", "content": "You are a bot that responds only with the extracted product list."},
                {"role": "user", "content": prompt}]
    print("Waiting for Meta-Llama-3.1-8B-Instruct to generate output...")
    response = client.chat_completion(
        messages=messages,
        max_tokens=1650,
        stream=True,
        # temperature=0.0,
    )

    pieces = []
    for message in response:
        # A streamed chunk is not guaranteed to carry text: the choices list
        # can be empty and delta.content is an optional field, so skip those
        # chunks instead of concatenating None onto the result.
        if not message.choices:
            continue
        content = message.choices[0].delta.content
        if content:
            pieces.append(content)

    return "".join(pieces).strip()


def extract_product_names(structured_line_items_str):
    """Pull every product name out of the model's JSON-like reply.

    Each ``"product": "<name>"`` pair is matched independently with a regular
    expression, so the surrounding text does not need to be valid JSON.

    Args:
        structured_line_items_str: Reply text expected to contain
            ``"product": "<name>"`` pairs. ``None`` or an empty string yields
            an empty list.

    Returns:
        list[str]: Product names in order of appearance, with JSON escape
        sequences decoded where possible.
    """
    # Capture the quoted value itself rather than slicing a fixed number of
    # characters, so any amount of whitespace after the colon is handled.
    quoted_values = re.findall(
        r'"product":\s*("(?:[^"\\]|\\.)*")', structured_line_items_str or ""
    )

    product_names = []
    for quoted in quoted_values:
        try:
            # Decode JSON escapes (for example \" ) into the real characters.
            product_names.append(json.loads(quoted))
        except json.JSONDecodeError:
            # Not a valid JSON string (e.g. an unknown escape): keep the raw
            # text between the quotes instead of dropping the item.
            product_names.append(quoted[1:-1])
    return product_names

In [17]:
# Root folder for the request PDFs and the product database. Edit this single
# constant to run the notebook against data stored elsewhere.
DATA_DIR = '/Users/evanwyf/Downloads/AutoQuote/Data'

# Convert PDF to images
request_1 = pdf2image.convert_from_path(f'{DATA_DIR}/Request-Response 1/Request 1.pdf')
request_2 = pdf2image.convert_from_path(f'{DATA_DIR}/Request-Response 2/Request 2.pdf')
product_db = pd.read_csv(f'{DATA_DIR}/ProductDB.csv')

# Extract text from each image
text = OCR_extraction(request_2)
# Combine the extracted text into a single string
combined_text = " ".join(text)

# Extract the line items from the combined text
structured_line_items_str = extract_line_items_llama(combined_text)
if structured_line_items_str:
    print("Extracted Items:", structured_line_items_str)
    product_names = extract_product_names(structured_line_items_str)
    print("# of extracted products:", len(product_names))

    # Load the product database
    product_names_db = product_db.iloc[:, 2].tolist()  # Assuming the product name is in the third column
    print("# of DB products:", len(product_names_db))

    # Initialize BERT tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
else:
    # Make the skipped path visible: product_names, product_names_db and
    # tokenizer are not defined when the model returned no text.
    print("No line items were extracted from the request; skipping product matching setup.")


Waiting for Meta-Llama-3.1-8B-Instruct to generate output...
Extracted Items: [
    {"product": "2 x 4 Fire Treated"},
    {"product": "2 x 4 Pressure Treated Roof Blocking"},
    {"product": "2 x 4 Pressure Treated Window Blkg"},
    {"product": "2 x 6 Fire Treated"},
    {"product": "2 x 6 Pressure Treated Roof Blocking"},
    {"product": "2 x 6 Pressure Treated Window Bikg"},
    {"product": "2 x 8 Fire Treated"},
    {"product": "2 x 8 Pressure Treated Roof Blocking"},
    {"product": "2 x 8 Pressure Treated, Window Blkg"},
    {"product": "4 x 4 fire treated plywood (PLY 58FT)"},
    {"product": "4  x 8' (PLY12FT)"},
    {"product": "4' x 8' (PLY34FT)"},
    {"product": "4' x 8' CDX F/R Plywood"},
    {"product": "4' x 8' Plywood 11/8\" (PLY136FT)"},
    {"product": "4' x 8' T&G Underlayment (PLY 34UNDTG)"},
    {"product": "5  8\" Cement Backer Board ( 4' x 4' ) (CBB58)"},
    {"product": "5 8 fire treated plywood, 4’ x 8' (PLY58FT)"},
    {"product": "5/8  - Cement Board Sheathi



In [18]:
def custom_tokenize(text, tokenizer):
    """Split ``text`` into tokens by delegating to ``tokenizer.tokenize``.

    Args:
        text: String to tokenize.
        tokenizer: Object exposing a ``tokenize(text)`` method (the notebook
            passes a BERT tokenizer).

    Returns:
        Whatever ``tokenizer.tokenize(text)`` returns.
    """
    return tokenizer.tokenize(text)

def compute_similarity_scores_tfidf(extracted_names, product_names, tokenizer):
    """Score every extracted name against every catalogue name with TF-IDF.

    Both lists are vectorized together (so they share one vocabulary) using
    1- to 3-grams of the tokens produced by ``custom_tokenize``; the rows are
    then split back apart and compared with cosine similarity.

    Args:
        extracted_names: Sequence of names extracted from the request.
        product_names: Sequence of names from the product database.
        tokenizer: Tokenizer object forwarded to ``custom_tokenize``.

    Returns:
        2-D array of shape ``(len(extracted_names), len(product_names))``
        where entry ``[i, j]`` is the similarity between extracted name ``i``
        and product name ``j``. If either input is empty, an all-zero array of
        that shape is returned.
    """
    n_extracted = len(extracted_names)
    n_products = len(product_names)

    # Nothing to compare: return a correctly shaped matrix up front instead of
    # letting scikit-learn raise on empty input.
    if n_extracted == 0 or n_products == 0:
        return np.zeros((n_extracted, n_products))

    # Combine all names into a single list so both sides share one vocabulary
    all_names = list(extracted_names) + list(product_names)

    # Create a TF-IDF vectorizer with custom tokenization using BERT tokenizer.
    # token_pattern=None because the pattern is unused once a tokenizer is
    # supplied; leaving the default in place only produces a warning.
    vectorizer = TfidfVectorizer(
        tokenizer=lambda text: custom_tokenize(text, tokenizer),
        token_pattern=None,
        stop_words='english',
        ngram_range=(1, 3),
    )

    # Fit and transform the vectorizer on all names
    tfidf_matrix = vectorizer.fit_transform(all_names)

    # Split the TF-IDF matrix back into extracted names and product names
    extracted_tfidf = tfidf_matrix[:n_extracted]
    product_tfidf = tfidf_matrix[n_extracted:]

    # Compute cosine similarity scores
    return cosine_similarity(extracted_tfidf, product_tfidf)

def get_top_n_recommendations_tfidf(similarity_scores, product_names, n=10):
    """Pick the ``n`` best-scoring product names for each row of scores.

    Args:
        similarity_scores: Iterable of score rows; each row must provide
            ``argsort()`` and hold one score per entry of ``product_names``.
        product_names: Names indexed by the column positions of each row.
        n: How many names to keep per row (default 10).

    Returns:
        list[list]: For every row, the selected names ordered from highest to
        lowest score.
    """
    return [
        # argsort is ascending: keep the last n positions, then reverse them.
        [product_names[position] for position in row.argsort()[-n:][::-1]]
        for row in similarity_scores
    ]

In [19]:
# Score every extracted name against every database name (TF-IDF + cosine)
similarity_scores = compute_similarity_scores_tfidf(
    product_names,
    product_names_db,
    tokenizer,
)

# Keep the ten best database matches per extracted name and print them
top_10_recommendations = get_top_n_recommendations_tfidf(
    similarity_scores,
    product_names_db,
    n=10,
)
for i, recommendations in enumerate(top_10_recommendations):
    request_no = i + 1
    requested_name = product_names[i]
    print(f"Top-10 Recommendations for Request No.{request_no} '{requested_name}': {recommendations}")



Top-10 Recommendations for Request No.1 '2 x 4 Fire Treated': ['ANGL 2 X 4-16', 'ANGL 2 X 4-14', 'ANGL 2 X 4 16GA', 'CORNERITE - 2 IN X 2 IN X 4FT', 'ANGL 2 X 4 (18G)', 'ANGL 2 X 4.5-20', 'ANGL 2 X 4 (12GA)', 'DZF 2 X 4 X 2-16GA', 'DZF 2 X 4 X 2-20GA', 'ANGL 2 X 4 (25GA)']
Top-10 Recommendations for Request No.2 '2 x 4 Pressure Treated Roof Blocking': ['ANGL 2 X 4-16', 'ANGL 2 X 4-14', 'ANGL 2 X 4 16GA', 'CORNERITE - 2 IN X 2 IN X 4FT', 'ANGL 2 X 4 (18G)', 'ANGL 2 X 4.5-20', 'ANGL 2 X 4 (12GA)', 'DZF 2 X 4 X 2-16GA', 'DZF 2 X 4 X 2-20GA', 'ANGL 2 X 4 (25GA)']
Top-10 Recommendations for Request No.3 '2 x 4 Pressure Treated Window Blkg': ['ANGL 2 X 4-16', 'ANGL 2 X 4-14', 'ANGL 2 X 4 16GA', 'DCV58-200S BLK - 10FT', 'ANGL 2 X 4 (18G)', 'CORNERITE - 2 IN X 2 IN X 4FT', 'ANGL 2 X 4.5-20', 'ANGL 2 X 4 (12GA)', 'DZF 2 X 4 X 2-16GA', 'DZF 2 X 4 X 2-20GA']
Top-10 Recommendations for Request No.4 '2 x 6 Fire Treated': ['ANGL 2 X 6-16', 'ANGL 2 X 6-14', 'ANGL 2 X 6-12', 'CDMB 2 X 6', 'CDBV 2 X 6'