# In [None]:

import cv2
import pytesseract
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import os

# In [None]:


# 1. Initialize Tesseract path
# Tell pytesseract which Tesseract executable to invoke. This is a hard-coded
# Windows install location; adjust it when running on another machine.
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# 2. Load Roberta model
def load_model(model_path="roberta_spam_model.pt", base_model="roberta-base"):
    """Load the tokenizer and the fine-tuned two-label sequence classifier.

    Args:
        model_path: Path to a checkpoint holding the state dict that is
            loaded into the classifier.
        base_model: Name or path of the pretrained model used both for the
            tokenizer and for the classifier architecture.

    Returns:
        A ``(tokenizer, model)`` tuple; the model is in evaluation mode.
    """
    tokenizer = AutoTokenizer.from_pretrained(base_model)
    model = AutoModelForSequenceClassification.from_pretrained(base_model, num_labels=2)

    # map_location="cpu" lets a checkpoint that was saved from a GPU be loaded
    # on a machine without CUDA; without it torch.load tries to restore the
    # tensors onto their original device and fails when that is unavailable.
    # NOTE(security): torch.load may unpickle arbitrary objects -- only load
    # checkpoint files that come from a trusted source.
    state_dict = torch.load(model_path, map_location="cpu")
    model.load_state_dict(state_dict)

    # Evaluation mode (e.g. disables dropout) for inference.
    model.eval()
    return tokenizer, model

# Load the tokenizer and classifier once at module level; main() reads these
# two global names when it classifies the OCR text.
tokenizer, model = load_model()

# 3. Load and preprocess the image
def preprocess_image(image_path):
    """Read an image from disk and convert it to grayscale.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The grayscale image produced by ``cv2.cvtColor``.

    Raises:
        FileNotFoundError: If nothing exists at ``image_path``.
        ValueError: If the file exists but OpenCV could not read it.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports every failure by returning None, so distinguish
        # a missing file from one that exists but cannot be decoded.
        if not os.path.exists(image_path):
            raise FileNotFoundError(f"Image not found at {image_path}")
        raise ValueError(f"Image at {image_path} could not be read as an image")
    return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

# 4. Extract text via OCR
def extract_text_from_image(image):
    """Run Tesseract OCR on ``image`` and return the recognised text.

    Leading and trailing whitespace is removed from the OCR output before
    it is returned.
    """
    return pytesseract.image_to_string(image).strip()

# 5. Classify text using Roberta
def classify_text(text, tokenizer, model, max_length=128):
    """Score ``text`` with the classifier and report the predicted label.

    Args:
        text: The string to classify.
        tokenizer: Callable that converts ``text`` into the keyword inputs
            passed to ``model``.
        model: Callable whose result exposes a ``logits`` tensor; index 1 of
            the class dimension is reported as spam, index 0 as ham.
        max_length: Length the tokenizer pads and truncates to.

    Returns:
        A dict with the keys ``'text'``, ``'prediction'`` (``'Spam'`` or
        ``'Ham'``), ``'spam_probability'`` and ``'ham_probability'``.
    """
    batch = tokenizer(
        text,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
    )

    # Inference only, so skip gradient tracking.
    with torch.no_grad():
        model_output = model(**batch)

    logits = model_output.logits
    class_probs = torch.softmax(logits, dim=1)
    top_class = torch.argmax(logits, dim=1).item()

    if top_class == 1:
        label = 'Spam'
    else:
        label = 'Ham'

    return {
        'text': text,
        'prediction': label,
        'spam_probability': class_probs[0][1].item(),
        'ham_probability': class_probs[0][0].item(),
    }

# 6. Main Function
def main(image_path):
    """OCR the image at ``image_path``, classify its text and print the result.

    Uses the module-level ``tokenizer`` and ``model``. Any exception raised
    along the way is caught and reported as a single ``Error: ...`` line, so
    this function always returns ``None``.
    """
    try:
        gray_image = preprocess_image(image_path)
        detected_text = extract_text_from_image(gray_image)

        if detected_text:
            report = classify_text(detected_text, tokenizer, model)

            print(f"\nExtracted Text:\n{report['text']}")
            print(f"Prediction: {report['prediction']}")
            print(f"Spam Probability: {report['spam_probability']:.4f}")
            print(f"Ham Probability: {report['ham_probability']:.4f}")
        else:
            # Nothing to classify when OCR produced an empty string.
            print("No text detected in image.")

    except Exception as err:
        print(f"Error: {err}")

# 7. Run
if __name__ == "__main__":
    # Hard-coded sample image path; change it to classify a different file.
    img_path = r"C:\Users\jhanv\bd1\fda\test_image2.jpg"
    main(img_path)
    
