# Introduction

This project implements a full pipeline to extract text from scanned business card images,
classify the extracted text into structured fields (such as Name, Phone, Email, Address, Company, etc.),
and output organized results.

Using a combination of OpenCV, Pytesseract OCR, a PyTorch classifier, and rule-based post-processing,
the system achieves approximately 80% field extraction accuracy on real-world noisy business card images.


In [37]:
!pip install pytesseract
!pip install faker
import cv2
import pytesseract
from pytesseract import Output
from faker import Faker
import random
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from torch import tensor
from torch.nn.utils.rnn import pad_sequence
from collections import Counter
import re



In [38]:
# Device setup: run on the GPU when CUDA is available, otherwise on the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

## OpenCV + Pytesseract (image preprocessing)

In [39]:

# Extract the text lines of an image using OpenCV and pytesseract
def get_text_lines(image_path, min_confidence=60):
    """Run Tesseract OCR on an image file and return its text, one string per line.

    Args:
        image_path: Path of the image file, read with ``cv2.imread``.
        min_confidence: Words are kept only when their Tesseract confidence is
            strictly greater than this value (default 60).

    Returns:
        A list of strings. Each string joins, with single spaces, the accepted
        words that share the same (block, paragraph, line) numbers, ordered by
        their left pixel coordinate. Lines appear in the order Tesseract first
        reported them. An empty list is returned when the image cannot be read.
    """
    image = cv2.imread(image_path)

    if image is None:
        print("Unable to read image")
        return []

    # OpenCV loads images as BGR; convert to RGB before handing them to Tesseract
    rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    results = pytesseract.image_to_data(rgb, output_type=Output.DICT)

    # Group the accepted words by (block, paragraph, line) so that the words of
    # one printed line can be recombined into a single string.
    words_by_line = {}
    for i, text in enumerate(results['text']):
        if not text.strip():
            continue
        # Depending on the pytesseract/Tesseract version the confidence may be
        # an int, a float, or a string such as '96.43'; int() fails on the
        # latter, so parse through float() and skip values that cannot be read.
        try:
            confidence = float(results['conf'][i])
        except (TypeError, ValueError):
            continue
        if confidence <= min_confidence:
            continue
        key = (results['block_num'][i], results['par_num'][i], results['line_num'][i])
        words_by_line.setdefault(key, []).append((results['left'][i], text))

    # Within each line, order the words left-to-right and join them
    line_texts = []
    for words in words_by_line.values():
        ordered = sorted(words, key=lambda item: item[0])
        line_texts.append(" ".join(word for _, word in ordered))

    return line_texts


## Rule Based Classification

In [40]:
# Assign a category to one text line using hand-written rules
def rule_predict_label(line):
    """Classify a single OCR text line with keyword and pattern rules.

    The rules are tried in this order and the first match wins:

    1. Email   - the line contains exactly one "@".
    2. Website - starts with "www." or contains ".com", ".org" or ".net".
    3. Title   - contains a job-title keyword.
    4. Fax / Phone / Address - the line holds at least six digits.
    5. Address - street-like patterns around a 1-5 digit number.
    6. Name    - two capitalised words, or two or more all-caps words.
    7. Company - contains a company keyword.

    Email and Website are tested before Title because an address such as
    "engineering@acme.com" or "www.managertools.com" contains a title keyword
    as a substring and would otherwise be labelled "Title".

    Args:
        line: The text line to classify.

    Returns:
        One of "Email", "Website", "Title", "Fax", "Phone", "Address",
        "Name", "Company" or "Other".
    """
    line = line.strip()
    lower = line.lower()
    digits = re.sub(r"[^\d]", "", line)

    title_keywords = (
        "ceo", "cto", "manager", "engineer", "director", "president",
        "developer", "founder", "professor", "consultant", "attorney", "analyst",
    )
    company_keywords = (
        "inc", "llc", "corp", "solutions", "group", "technologies",
        "communications", "systems", "company", "university",
    )

    # check for @ as email
    if line.count("@") == 1:
        return "Email"

    # check for website
    if lower.startswith("www.") or ".com" in lower or ".org" in lower or ".net" in lower:
        return "Website"

    # check for keywords as title (substring match)
    if any(word in lower for word in title_keywords):
        return "Title"

    # check and separate fax and phone number
    if len(digits) >= 6:
        if "fax" in lower:
            return "Fax"
        # A leading "p", "ph" or "t" only counts as a phone marker when it is
        # not the start of a longer word ("P: 650...", "T 650..."); otherwise
        # lines such as "Palo Alto, CA 94306-1234" would be labelled Phone.
        has_phone_prefix = re.match(r"cell|(?:ph?|t)(?![a-z])", lower) is not None
        if has_phone_prefix or "phone" in lower or "tel" in lower:
            return "Phone"
        # Only digits and phone punctuation
        cleaned_line = re.sub(r"[-().+ ]", "", line)
        if cleaned_line.isdigit():
            return "Phone"
        # At least half of the non-space characters are digits
        char_count = len(line.replace(" ", ""))
        if char_count > 0 and (len(digits) / char_count) >= 0.5:
            return "Phone"
        # six or more digits but neither phone nor fax: treat it as address
        return "Address"

    # check for Address
    if re.match(r"^\d{1,5} [A-Za-z]", line):
        return "Address"
    if re.search(r"\b\w+\s+\d{1,5}\s+\w+\b", line):
        return "Address"

    # check for name
    if re.match(r"^[A-Z][A-Z]+( [A-Z][A-Z]+)+$", line) or re.match(r"^[A-Z][a-z]+ [A-Z][a-z]+$", line):
        return "Name"

    if any(word in lower for word in company_keywords):
        return "Company"

    return "Other"



# Generate synthetic training data (1000 samples per field type)

In [41]:
faker = Faker()

# (label, generator) pairs; each generator is a Faker method returning one fake value.
# NOTE(review): "Title" appears in the class list used for label encoding but has
# no generator here, so no "Title" samples are produced - confirm this is intended.
fields = [
      ("Name", faker.name),
      ("Phone", faker.phone_number),
      ("Address", faker.address),
      ("Email", faker.email),
      ("Company", faker.company),
      ("Website", faker.url),
      ("Other",faker.sentence),
  ]

# 1000 rounds, one (text, label) sample per field in every round
dataset = [
    (generate(), label)
    for _ in range(1000)
    for label, generate in fields
]

# Shuffle in place so the samples are not ordered by label
random.shuffle(dataset)


# Encode Labels

In [42]:
# Give every class name an integer index, and build the reverse lookup
classes = ["Name", "Phone", "Address", "Email", "Company", "Website", "Title", "Other"]
class_to_idx = dict(zip(classes, range(len(classes))))
idx_to_class = dict(enumerate(classes))

# Encode the label of every dataset sample; labels missing from the class list
# fall back to the index of "Other"
targets = torch.tensor(
    [class_to_idx.get(label, class_to_idx["Other"]) for _text, label in dataset]
)

## Tokenization (convert words to IDs)

In [43]:
# Count every lower-cased, whitespace-separated token in the dataset
word_counter = Counter(
    token
    for text, _label in dataset
    for token in text.lower().split()
)

# Build the vocabulary: id 0 is reserved for padding, id 1 for unknown words,
# and the remaining ids follow the order in which words were first counted
vocab = {"<pad>": 0, "<unk>": 1}
vocab.update((word, idx) for idx, word in enumerate(word_counter, start=2))

# Turn each text into a tensor of word ids (unknown words map to 1)
padded_sequences = [
    tensor([vocab.get(word, 1) for word in text.lower().split()])
    for text, _label in dataset
]
# Encode each label, falling back to "Other" for labels not in class_to_idx
label_list = [
    class_to_idx.get(label, class_to_idx["Other"])
    for _text, label in dataset
]

# Right-pad every sequence with 0 so they stack into one (batch, length) tensor
padded_inputs = pad_sequence(padded_sequences, batch_first=True, padding_value=0)
targets = torch.tensor(label_list)

# Create DataLoader

In [44]:
# Pair the padded inputs with their targets and serve them in shuffled batches
batch_size = 32
full_dataset = TensorDataset(padded_inputs, targets)
train_loader = DataLoader(dataset=full_dataset, batch_size=batch_size, shuffle=True)

# Build Model

In [45]:
# Bag-of-embeddings classifier for business card text lines
class BusinessCardClassifier(nn.Module):
    """Embed the word ids of a line, average them, and apply a linear layer.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each embedding vector.
        num_classes: Number of output logits.
    """

    def __init__(self, vocab_size, embed_dim, num_classes):
        super().__init__()
        # Index 0 is the padding id
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.fc = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        """Return class logits for a batch of word-id sequences.

        Args:
            x: Integer tensor of word ids indexed as (batch, position), where
                id 0 marks padding.

        Returns:
            Tensor of logits with one row per sequence and ``num_classes``
            columns.
        """
        embedded = self.embedding(x)
        # Average over real tokens only. A plain mean over dim=1 also divides
        # by the padding positions, so the same words would produce different
        # logits depending on how much padding the batch happened to need
        # (and training batches are padded while single-line inference is not).
        mask = (x != self.embedding.padding_idx).unsqueeze(-1).to(embedded.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1)  # avoid 0/0 for all-padding rows
        mean_embedded = (embedded * mask).sum(dim=1) / token_counts
        return self.fc(mean_embedded)

# Instantiate the classifier (64-dimensional embeddings) and move it to the chosen device
model = BusinessCardClassifier(
    vocab_size=len(vocab),
    embed_dim=64,
    num_classes=len(classes),
)
model = model.to(device)

# PyTorch Training

In [46]:
# Train the classifier with Adam and cross-entropy loss, reporting the summed
# batch loss and the training accuracy after every epoch
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

num_epochs = 20

for epoch in range(num_epochs):
    model.train()
    # Running statistics for this epoch
    total_loss, correct, total = 0, 0, 0

    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        # Standard step: clear gradients, forward, backward, update
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        preds = torch.argmax(outputs, dim=1)
        correct += preds.eq(labels).sum().item()
        total += labels.size(0)

    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}, Accuracy: {correct/total:.4f}")

Epoch 1, Loss: 330.6282, Accuracy: 0.4329
Epoch 2, Loss: 172.2023, Accuracy: 0.7147
Epoch 3, Loss: 93.7138, Accuracy: 0.8866
Epoch 4, Loss: 28.5673, Accuracy: 0.9884
Epoch 5, Loss: 6.5609, Accuracy: 0.9999
Epoch 6, Loss: 2.7519, Accuracy: 1.0000
Epoch 7, Loss: 1.5869, Accuracy: 1.0000
Epoch 8, Loss: 1.0434, Accuracy: 1.0000
Epoch 9, Loss: 0.7392, Accuracy: 1.0000
Epoch 10, Loss: 0.5494, Accuracy: 1.0000
Epoch 11, Loss: 0.4223, Accuracy: 1.0000
Epoch 12, Loss: 0.3323, Accuracy: 1.0000
Epoch 13, Loss: 0.2666, Accuracy: 1.0000
Epoch 14, Loss: 0.2172, Accuracy: 1.0000
Epoch 15, Loss: 0.1789, Accuracy: 1.0000
Epoch 16, Loss: 0.1488, Accuracy: 1.0000
Epoch 17, Loss: 0.1250, Accuracy: 1.0000
Epoch 18, Loss: 0.1055, Accuracy: 1.0000
Epoch 19, Loss: 0.0896, Accuracy: 1.0000
Epoch 20, Loss: 0.0766, Accuracy: 1.0000


# Save Model

In [47]:
import pickle

# Persist the trained weights
torch.save(model.state_dict(), "/content/business_card_classifier.pth")

# Persist the vocabulary and the index -> class mapping next to the weights
for _pickle_path, _mapping in (
    ("/content/vocab.pkl", vocab),
    ("/content/idx_to_class.pkl", idx_to_class),
):
    with open(_pickle_path, "wb") as f:
        pickle.dump(_mapping, f)

# Prediction Using Model

In [48]:
import zipfile
import os

zip_path = "/content/business_card.zip"

# Unpack the archive of business card images (ZipFile opens read-only by default)
with zipfile.ZipFile(zip_path) as zip_ref:
    zip_ref.extractall(path="/content/business_card_folder")

In [49]:
# Restore the vocabulary and the index -> class mapping saved after training
with open("/content/vocab.pkl", "rb") as f:
    vocab = pickle.load(f)
with open("/content/idx_to_class.pkl", "rb") as f:
    idx_to_class = pickle.load(f)

# --- Rebuild model and load weights ---
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Same architecture as during training: sizes come from the restored mappings
model = BusinessCardClassifier(len(vocab), 64, len(idx_to_class)).to(device)
_saved_weights = torch.load("/content/business_card_classifier.pth", map_location=device)
model.load_state_dict(_saved_weights)
# Switch to inference mode
model.eval()


BusinessCardClassifier(
  (embedding): Embedding(9410, 64, padding_idx=0)
  (fc): Linear(in_features=64, out_features=8, bias=True)
)

### Function for combining the two classification models

In [50]:
def predict_text_line_combined(line):
    """Label a text line with the rules first and the neural model as fallback.

    The rule-based label is used whenever it is anything other than "Other".
    Only when the rules return "Other" is the line tokenised (lower-cased,
    split on whitespace, unknown words mapped to id 1) and passed to the
    model, so the model is not evaluated for lines the rules already decide.

    Args:
        line: The text line to classify.

    Returns:
        The predicted label string.
    """
    # Rule-based prediction: trusted whenever it is confident (not "Other")
    rule_label = rule_predict_label(line)
    if rule_label != "Other":
        return rule_label

    tokens = line.lower().split()
    if not tokens:
        # Nothing to feed the model: an empty id list would build a float
        # tensor that the embedding layer rejects, so keep the rule's "Other".
        return rule_label

    # Model prediction
    word_ids = [vocab.get(word, 1) for word in tokens]
    input_tensor = torch.tensor([word_ids], dtype=torch.long, device=device)

    with torch.no_grad():
        output = model(input_tensor)

    pred_idx = output.argmax(dim=1).item()
    return idx_to_class[pred_idx]


### Post-prediction validation function

In [51]:
# Sanity-check a predicted label against the text it was assigned to
def validate_prediction(line, label):
    """Downgrade an implausible Phone/Email/Website label to "Other".

    Args:
        line: The text line that was classified.
        label: The label predicted for that line.

    Returns:
        "Other" when the label is "Phone" but the line has fewer than six
        digits, "Email" but the line has no "@", or "Website" but the line
        contains none of "www.", ".com", ".org", ".net" (case-insensitive).
        Every other label is returned unchanged.
    """
    stripped = line.strip()

    if label == "Phone":
        digit_count = len(re.sub(r"[^\d]", "", stripped))
        return "Other" if digit_count < 6 else label

    if label == "Email":
        return label if "@" in stripped else "Other"

    if label == "Website":
        lowered = stripped.lower()
        markers = ("www.", ".com", ".org", ".net")
        return label if any(marker in lowered for marker in markers) else "Other"

    return label


### Specific function for phone, fax number

In [52]:
# Extract phone and fax numbers from messy OCR lines.

# Deliberately loose number pattern: three or four digit groups, optionally
# separated by whitespace, "-" or ".", to tolerate OCR mistakes.
_PHONE_NUMBER_RE = re.compile(r"(\d{2,4}[\s\-\.]?\d{2,4}[\s\-\.]?\d{3,4}[\s\-\.]?\d{0,4})")

# Fax marker: "F", "Fa", "Fx" or "Fax" in any case, followed by optional spaces
# or colons. The look-arounds require that the marker is not part of a longer
# word, so the "f" in words like "Office" is not mistaken for a fax label.
_FAX_MARKER_RE = re.compile(r"(?<![A-Za-z])[Ff][aA]?[xX]?(?![A-Za-z])[ :]*")


def _first_number(text):
    """Return the first phone-like number in *text* (trailing separators removed), or None."""
    match = _PHONE_NUMBER_RE.search(text)
    if match is None:
        return None
    # The loose pattern may swallow one trailing separator; drop it
    return re.sub(r"[\s\-.]+$", "", match.group(1))


def extract_phone_fax_numbers_and_leftover(line):
    """Split a line into its phone number, fax number and remaining text.

    When the line contains a fax marker, the text before the first marker is
    searched for the phone number and the text after it for the fax number.
    Without a marker the first number found in the line is the phone number.

    Args:
        line: The text line to parse.

    Returns:
        A ``(phone, fax, leftover)`` tuple. ``phone`` and ``fax`` are strings
        or ``None`` when no number was found. ``leftover`` is the phone part
        of the line (the whole line when there is no fax marker) with every
        phone-like number removed and surrounding whitespace stripped.
    """
    line = line.strip()

    parts = _FAX_MARKER_RE.split(line, maxsplit=1)
    if len(parts) == 2:
        phone_part, fax_part = parts
        # rule: take only the first number on each side of the marker
        phone = _first_number(phone_part)
        fax = _first_number(fax_part)
        # Remove detected numbers from leftover
        leftover = _PHONE_NUMBER_RE.sub("", phone_part).strip()
        return phone, fax, leftover

    # No fax marker: the first number, if any, is the phone number. When no
    # number matches, the substitution leaves the (already stripped) line intact.
    phone = _first_number(line)
    leftover = _PHONE_NUMBER_RE.sub("", line).strip()
    return phone, None, leftover




In [53]:
#group the lines with same category together
def group_and_print_predictions(image_path, lines_and_labels):
    """Group classified lines by label and print one summary per business card.

    Args:
        image_path: Path of the card image; printed in the header line.
        lines_and_labels: Iterable of ``(line, label)`` pairs. Labels that are
            not one of the known fields are filed under "Other".

    Output (only non-empty fields are printed, in this order): Name, Title,
    Company, Address, Phone, Fax, Email, Website, Other. Company and Address
    lines are joined with spaces; Name, Title, Website and Other lines with
    " | ". Only the first Phone line, first Fax line and first Email line
    are used.
    """
    print(f"\nBusiness Card: {image_path}")

    # Initialize fields
    fields = {
        "Name": [],
        "Title": [],
        "Company": [],
        "Address": [],
        "Phone": [],
        "Fax": [],
        "Email": [],
        "Website": [],
        "Other": []
    }

    # Fill fields; an unexpected label goes to "Other" instead of raising KeyError
    for line, label in lines_and_labels:
        fields.get(label, fields["Other"]).append(line)

    merged_company = " ".join(fields["Company"]) if fields["Company"] else None
    merged_address = " ".join(fields["Address"]) if fields["Address"] else None

    if fields["Name"]:
        print(f"  Name: {' | '.join(fields['Name'])}")
    if fields["Title"]:
        print(f"  Title: {' | '.join(fields['Title'])}")
    if merged_company:
        print(f"  Company: {merged_company}")
    if merged_address:
        print(f"  Address: {merged_address}")

    # Handle phone and fax. A Phone line may also carry the fax number, and a
    # line labelled Fax may also carry the phone number ("Tel ... Fax ...").
    # Both fields are consulted, so a separate Fax line is no longer skipped
    # just because a Phone line exists.
    # NOTE(review): the text left after removing the numbers is not printed.
    # Previously it was appended to the Address list only after the address had
    # already been printed, so it never appeared in the output either.
    phone = None
    fax = None
    if fields["Phone"]:
        phone, fax, _leftover = extract_phone_fax_numbers_and_leftover(fields["Phone"][0])
    if fields["Fax"] and not fax:
        fax_line_phone, fax, _leftover = extract_phone_fax_numbers_and_leftover(fields["Fax"][0])
        if not phone:
            phone = fax_line_phone

    if phone:
        print(f"  Phone: {phone}")
    if fax:
        print(f"  Fax: {fax}")

    if fields["Email"]:
        print(f"  Email: {fields['Email'][0]}")
    if fields["Website"]:
        print(f"  Website: {' | '.join(fields['Website'])}")
    if fields["Other"]:
        print(f"  Other: {' | '.join(fields['Other'])}")


In [54]:
def find_deepest_folder_with_images(root):
    """Return the first directory under *root* that directly contains an image.

    Directories are visited in ``os.walk`` order. Any directory whose path
    contains "__MACOSX" is skipped, and files whose names start with "._" are
    not counted. A file counts as an image when its lower-cased name ends with
    .jpg, .jpeg or .png. When no directory qualifies, *root* is returned.
    """
    image_suffixes = ('.jpg', '.jpeg', '.png')

    for dirpath, _subdirs, filenames in os.walk(root):
        # Skip the __MACOSX metadata folder
        if "__MACOSX" in dirpath:
            continue
        for fname in filenames:
            if fname.startswith("._"):
                continue
            if fname.lower().endswith(image_suffixes):
                return dirpath

    # fallback if nothing found
    return root


image_folder = find_deepest_folder_with_images("/content/business_card_folder")

# Collect the image files in image_folder (ignoring "._" files), sorted by path
image_names = [
    f for f in os.listdir(image_folder)
    if not f.startswith("._") and f.lower().endswith(('.jpg', '.jpeg', '.png'))
]
image_paths = sorted(os.path.join(image_folder, f) for f in image_names)


# For every card: OCR the lines, classify each line, re-validate the label,
# then group and print the results
for image_path in image_paths:
    results = [
        (line, validate_prediction(line, predict_text_line_combined(line)))
        for line in get_text_lines(image_path)
    ]

    # Group and print results
    group_and_print_predictions(image_path, results)


Business Card: /content/business_card_folder/business_card/002.jpg
  Name: MEXICAN GRILL | CHRIS SALCEDO
  Address: 2675 EL CAMINO REAL
  Phone: 650.462.9154
  Other: APPRENTICE | CHIPOTLE MEXICAN GRILL, I! | PALO ALTO, CA 94306

Business Card: /content/business_card_folder/business_card/004.jpg
  Name: VIJAY CHANDRASEKHAR
  Title: Electrical Engineering
  Address: 17 Comstock Circle
  Phone: 916.221.0411
  Email: E-mail: vijayc@stanford.edu
  Other: STANFORD | Apt 101 | Stanford, CA 94305

Business Card: /content/business_card_folder/business_card/005.jpg
  Name: carol soh
  Company: ‘COMMUNICATIONS
  Email: carol@acecomm.sg
  Other: ace sdaytons

Business Card: /content/business_card_folder/business_card/006.jpg
  Name: RAFAEL ULATE
  Title: DIRECTOR OF ADMISSIONS | DEPARTMENT OF ELECTRICAL ENGINEERING | DAVID PACKARD ELECTRICAL ENGINEERING
  Email: STANFORD, CALIFORNIA 94305-9505 ulate@ee.stanford.edu

Business Card: /content/business_card_folder/business_card/007.jpg
  Address: Of