In [7]:
import re
from datetime import datetime
import torch
from transformers import AutoModel, AutoTokenizer

def extract_mrp_exp_date_(image_file):
    # Set device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load the pre-trained model and tokenizer
    model_name = "stepfun-ai/GOT-OCR2_0"
    model = AutoModel.from_pretrained(model_name, trust_remote_code=True).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    # Perform OCR on the input image
    res = model.chat(tokenizer, image_file, ocr_type='ocr')
    text = res  # Adjust based on the structure of your OCR output
    print("OCR Text:", text)

    # Function to correct common OCR errors
    def correct_ocr_errors(text):
        text = text.replace("O", "0")  # Correct 'O' to '0'
        text = text.replace("l", "1")  # Correct 'l' to '1'
        return text

    # Function to handle 'MR PRs.' to 'MRPRs.' and other MRP-related corrections
    def convert_to_mrpr(text):
        text = text.replace('MR PRs.', 'MRPRs.')
        # Use regex to find numbers and combine them like '2 0/-' to '20/-'
        match = re.search(r'(\d)\s*(0/\s*-)', text)
        if match:
            text = text.replace(match.group(0), '20/-')
        return text

    # Extract the MRP from the text
    def extract_mrp(text):
        text = convert_to_mrpr(text)
        text = correct_ocr_errors(text)
        mrp_pattern = r'(?i)(?:mr\s*r\s*|mr\s*prs?\s*|rs\.?|₹)\s*[:\-]?\s*([\d/]+)'
        match = re.search(mrp_pattern, text)

        if match:
            mrp = match.group(1)
            corrected_mrp = correct_ocr_errors(mrp)
            corrected_mrp = re.sub(r'[^\d.,]', '', corrected_mrp)
            return corrected_mrp
        else:
            return None



    # Preprocess text to normalize date-related keywords and remove noise
    def preprocess_text(text):
        keywords_map = {
            "EXP": ["XP", "EX", "EXR", "Expires"],
            "MFD": ["MFR", "MF", "MFG", "PROD"]
        }
        month_map = {
            "JAN": ["JAN", "JNA", "JANUARY"], "FEB": ["FEB", "FEBRUARY"],
            "MAR": ["MAR", "MARCH"], "APR": ["APR", "APRL", "APRIL"],
            "MAY": ["MAY", "MA"], "JUN": ["JUN", "JN", "JUNE"],
            "JUL": ["JUL", "JLY", "JULY"], "AUG": ["AUG", "AG", "AUGUST"],
            "SEP": ["SEP", "SEPT", "SEPTEMBER"], "OCT": ["OCT", "OCTOBER"],
            "NOV": ["NOV", "NOVEMBER"], "DEC": ["DEC", "DECM", "DECEMBER"]
        }

        for correct_word, variations in keywords_map.items():
            for variant in variations:
                text = re.sub(r'\b' + variant + r'\b', correct_word, text, flags=re.IGNORECASE)

        for correct_month, variations in month_map.items():
            for variant in variations:
                text = re.sub(r'\b' + variant + r'\b', correct_month, text, flags=re.IGNORECASE)

        text = text.upper()
        text = re.sub(r'[^A-Z0-9\s:.-/]', ' ', text, flags=re.IGNORECASE)
        text = re.sub(r'\s+', ' ', text).strip()

        return text

    # Function to parse date string into datetime object
    def parse_date(date_string):
        formats = ['%d/%m/%y', '%d/%m/%Y', '%d-%m-%y', '%d-%m-%Y', '%d %b %Y', '%d %b %y', '%b %d %Y', '%b %d, %Y']
        for fmt in formats:
            try:
                return datetime.strptime(date_string, fmt)
            except ValueError:
                continue
        return None

    # Find the most recent date in the text
    def find_most_recent_date(text):
        preprocessed_text = preprocess_text(text)
        print("Preprocessed Text for Date Extraction:", preprocessed_text)

        date_pattern = r'\b(\d{1,2}[-/]\d{1,2}[-/]\d{2,4}|\d{1,2}\s*[A-Z]{3}\s*\d{2,4}|\b[A-Z]{3,9}\s*\d{1,2},?\s*\d{4})\b'
        matches = re.findall(date_pattern, preprocessed_text)
        print("Potential Dates Found:", matches)

        dates = []
        for date_str in matches:
            parsed_date = parse_date(date_str)
            if parsed_date:
                dates.append(parsed_date)

        if dates:
            most_recent_date = max(dates)
            return most_recent_date.strftime('%d/%m/%Y')
        else:
            return "No valid date found"

    # Extract MRP and find the most recent expiry/manufacture date
    extracted_mrp = extract_mrp(text)
    most_recent_date = find_most_recent_date(text)

    # Output results
    if extracted_mrp:
        print("Extracted MRP: Rs.", extracted_mrp)
    else:
        print("MRP not found.")
    print("Most Recent Date (Expiry/Manufacture):", most_recent_date)

    result = {
        "MRP": extracted_mrp,
        "Expiry_Date": most_recent_date
    }

    return result


In [8]:
pip install verovio tiktoken



In [9]:
pip install ngrok flask flask-cors jsonify pyngrok pillow



In [None]:
from flask import Flask, request,jsonify
from pyngrok import ngrok
from flask_cors import CORS  # Import CORS
from PIL import Image
import io

# Set your ngrok authentication token
ngrok.set_auth_token("2ncfppNP1GrHBwgPQeI4wcm8j0y_866M6vWJHvA5m8MeonAeo")

app = Flask(__name__)

# Enable CORS for the app
CORS(app)

@app.route('/webhook/mrpexp', methods=['POST'])
def webhook():
    # Get the image from the request
    if 'file' not in request.files:
        print("No image part", 400)

    image_file = request.files['file']

    if image_file.filename == '':
        print("No image selected", 400)

    # Read the image using PIL
    image = Image.open(image_file)
    temp_image_path = "/tmp/temp_image.jpg"  # Temporary path to store the image
    image.save(temp_image_path)

    # Extract JSON data from the form
    data = request.form.to_dict()
    print("Received data:", data)

    # Process the image with the product_name function
    result = extract_mrp_exp_date_(temp_image_path)  # Ensure product_name is defined

    # return result, 200
    print(result)

    # Return the response from the second server to the client
    # if response.status_code == 200:
    # else:
    #     return "Error analyzing image on second server", response.status_code
    return jsonify({"analysis": result}), 200

# Start ngrok
ngrok_tunnel = ngrok.connect(5000)
print("Ngrok URL:", ngrok_tunnel.public_url)

# Run the Flask app
app.run(port=5000, debug=True, use_reloader=False)

Ngrok URL: https://c69b-35-240-158-120.ngrok-free.app
 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [18/Oct/2024 19:15:44] "[33mGET / HTTP/1.1[0m" 404 -


Received data: {'key': 'Front packet', 'another_key': 'dsfgs'}


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
INFO:werkzeug:127.0.0.1 - - [18/Oct/2024 19:16:20] "POST /webhook/mrpexp HTTP/1.1" 200 -


OCR Text: NET 0 TY.  MFD& USE BY PER PACK B. NO. :  NO. OF SERVES UNTIS AL EP RICE.  MR PRS. 2 O/ ( IN CL. OF ALL TAXES)  489 27109/24&24/02/25 24/ N 2270924 RS. 0.50/ - PER 9 88 08:20
Preprocessed Text for Date Extraction: NET 0 TY. MFD USE BY PER PACK B. NO. : NO. OF SERVES UNTIS AL EP RICE. MR PRS. 2 O/ IN CL. OF ALL TAXES 489 27109/24 24/02/25 24/ N 2270924 RS. 0.50/ PER 9 88 08:20
Potential Dates Found: ['24/02/25']
Extracted MRP: Rs. 20
Most Recent Date (Expiry/Manufacture): 24/02/2025
{'MRP': 20, 'Expiry_Date': '24/02/2025'}
