In [None]:
# Install all necessary packages

!pip install keras-ocr
!pip install matplotlib
!pip install opencv-python
!pip install --force-reinstall -v "tensorflow==2.15.1"
!pip install tensorflow==2.15
!pip install groq

# Import all necessary libraries

!pip install keras_ocr


Defaulting to user installation because normal site-packages is not writeable
Collecting keras-ocr
  Downloading keras_ocr-0.9.3-py3-none-any.whl.metadata (8.6 kB)
Collecting editdistance (from keras-ocr)
  Downloading editdistance-0.8.1-cp39-cp39-macosx_11_0_arm64.whl.metadata (3.9 kB)
Collecting efficientnet==1.0.0 (from keras-ocr)
  Downloading efficientnet-1.0.0-py3-none-any.whl.metadata (6.1 kB)
Collecting essential_generators (from keras-ocr)
  Downloading essential_generators-1.0-py3-none-any.whl.metadata (14 kB)
Collecting imgaug (from keras-ocr)
  Downloading imgaug-0.4.0-py2.py3-none-any.whl.metadata (1.8 kB)
Collecting pyclipper (from keras-ocr)
  Downloading pyclipper-1.3.0.post6-cp39-cp39-macosx_10_9_universal2.whl.metadata (9.0 kB)
Collecting shapely (from keras-ocr)
  Downloading shapely-2.0.7-cp39-cp39-macosx_11_0_arm64.whl.metadata (6.8 kB)
Collecting validators (from keras-ocr)
  Downloading validators-0.34.0-py3-none-any.whl.metadata (3.8 kB)
Collecting keras-applica

In [5]:
import math, keras_ocr,re, os
from groq import Groq



In [1]:
# Function to get distances for detections
def get_distance(predictions):
    x0, y0 = 0, 0
    detections = []
    for group in predictions:
        top_left_x, top_left_y = group[1][0]
        bottom_right_x = group[1][1][0]
        bottom_right_y = group[1][3][1] # correct bottom-right coordinates
        center_x = (top_left_x + bottom_right_x) / 2
        center_y = (top_left_y + bottom_right_y) / 2
        distance_from_origin = math.dist([x0, y0], [center_x, center_y])
        distance_y = center_y - y0
        detections.append({
            "text": group[0],
            "center_x": center_x,
            "center_y": center_y,
            "distance_from_origin": distance_from_origin,
            "distance_y": distance_y,
        })
    return detections

# Function to distinguish rows
def distinguish_rows(lst, thresh=15):
    sublists = []
    for i in range(len(lst) - 1):
        if lst[i + 1]["distance_y"] - lst[i]["distance_y"] <= thresh:
            if lst[i] not in sublists:
                sublists.append(lst[i])
            sublists.append(lst[i + 1])
        else:
            yield sublists
            sublists = [lst[i + 1]]
    yield sublists
    
# Check for brand keywords
def contains_expression(word_list, expressions):
    result_list = []
    for key in expressions:
        key_lower = re.escape(key.lower())
        # Check if the key exists as a complete word
        if re.search(rf'\b{key_lower}\b', word_list):
            result_list.append(key)
        # Check if any of the associated values exist as complete words
        for value in expressions[key]:
            value_lower = re.escape(value.lower())
            if re.search(rf'\b{value_lower}\b', word_list):
                result_list.append(key)
    if(not result_list):
        return None
    return result_list

In [6]:
# Define the brands to be detected
def process_images_in_directory(directory, brands, brands_dict, api_key):
    for dirname, _, filenames in os.walk(directory):
        for filename in filenames:
            image_path = os.path.join(dirname, filename)

            # Initialize OCR pipeline
            pipeline = keras_ocr.pipeline.Pipeline()

            # Read image
            try:
                print(image_path)
                read_image = keras_ocr.tools.read(image_path)
                if read_image is None or read_image.size == 0:
                    raise ValueError("Image is empty or None")
            except Exception as e:
                print(f"Failed to read image: {image_path}. Error: {e}")
                continue

            # Recognize text in image
            prediction_groups = pipeline.recognize([read_image])

            # Process predictions
            predictions = get_distance(prediction_groups[0])

            # Order text detections
            predictions = distinguish_rows(predictions)
            if predictions is None:
                predictions = []

            predictions = list(filter(lambda x: x != [], predictions))
            ordered_preds = [each["text"] for row in predictions for each in sorted(row, key=lambda x: x["distance_from_origin"])]
            ordered_preds = " ".join(ordered_preds).lower()

            result_list = contains_expression(ordered_preds, brands)
            if result_list:
                for result in result_list:
                    # Prepare message for Groq API
                    if result in ["apple", "oracle", "amazon", "tesla", "oracle", "visa", "zara", "ge", "ford", "corona", "intel", "linkedin", "hp", "hermes"]:
                        system_message = f"you decide if the sentence is about {result} as a brand or the context says otherwise. Your answers can be: 'yes, no, cannot decide'"

                        # Proceed with the API call
                        client = Groq(api_key=api_key)
                        chat_completion = client.chat.completions.create(
                            messages=[
                                {"role": "system", "content": system_message},
                                {"role": "user", "content": ' '.join(ordered_preds)}
                            ],
                            model="llama3-70b-8192",
                        )

                        response_content = chat_completion.choices[0].message.content
                        if "yes" in response_content.lower():
                            if filename not in brands_dict[result]:
                                brands_dict[result].append(filename)
                    else:
                        if result is not None and filename not in brands_dict[result]:
                            brands_dict[result].append(filename)

    print(brands_dict)

# Example usage
process_images_in_directory('fake_deploy', brands, brands_dict, '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')

Looking for /Users/beszabo/.keras-ocr/craft_mlt_25k.h5
Downloading /Users/beszabo/.keras-ocr/craft_mlt_25k.h5
Instructions for updating:
Use `tf.image.resize(...method=ResizeMethod.BILINEAR...)` instead.
Looking for /Users/beszabo/.keras-ocr/crnn_kurapan.h5
Downloading /Users/beszabo/.keras-ocr/crnn_kurapan.h5
fake_deploy/Image_40_Amazon images.jpg
Looking for /Users/beszabo/.keras-ocr/craft_mlt_25k.h5
Looking for /Users/beszabo/.keras-ocr/crnn_kurapan.h5
fake_deploy/Image_90_Amazon images.jpg
Looking for /Users/beszabo/.keras-ocr/craft_mlt_25k.h5
Looking for /Users/beszabo/.keras-ocr/crnn_kurapan.h5
fake_deploy/Amazon_image20_5_Amazon images.jpg
Looking for /Users/beszabo/.keras-ocr/craft_mlt_25k.h5
Looking for /Users/beszabo/.keras-ocr/crnn_kurapan.h5
fake_deploy/Image_68_Amazon images.png
Looking for /Users/beszabo/.keras-ocr/craft_mlt_25k.h5
Looking for /Users/beszabo/.keras-ocr/crnn_kurapan.h5
fake_deploy/Amazon_image78_16_Amazon images.jpg
Looking for /Users/beszabo/.keras-ocr/cr

In [10]:
non_empty_brands_dict = {key: value for key, value in brands_dict.items() if value}

print("Non-empty brands dictionary:")
for key, value in non_empty_brands_dict.items():
    print(f"{key}: {len(value)}")

Non-empty brands dictionary:
amazon: 16
sony: 1
hewlett packard enterprise: 2


In [3]:


# Brands dictionary
brands = {
    "apple": ["iphone", "macbook", "ipad"],
    "microsoft": ["windows", "teams", "xbox"],
    "amazon": ["echo", "kindle", "amazon prime"],
    "google": ["gmail", "chrome"],
    "samsung": ["galaxy s"],
    "toyota": ["camry", "corolla"],
    "mercedes-benz": ["glc-class", "amg", "mercedes"],
    "coca-cola": ["be less white", "coke", "cocacola", "coca cola"],
    "nike": ["air max", "air jordan", "dri-fit", "just do it"],
    "bmw": ["Rolls-Royce", "x5", "m3"],
    "mcdonald's": ["big mac", "mcnuggets", "happy meal", "mcdonalds", "mcflurries"],
    "tesla": ["model s", "cyber truck", "model x"],
    "disney": ["mickey mouse", "encanto", "doorables"],
    "louis vuitton": ["speedy bag", "neverfull tote", "keepall bag"],
    "cisco": ["catalyst switches", "webex", "meraki"],
    "instagram": ["reels", "igtv"],
    "adobe": ["photoshop", "acrobat", "premiere pro"],
    "ibm": ["spss", "watson"],
    "oracle": ["java"],
    "sap": ["s/4hana", "successfactors", "ariba"],
    "facebook": ["messenger"],
    "chanel": ["no. 5", "boy bag", "classic flap"],
    "hermes": ["birkin"],
    "intel": ["core i7", "xeon", "pentium"],
    "youtube": [],
    "j.p. morgan": ["jp morgan"],
    "honda": ["civic", "cr-v"],
    "american express": ["platinum card", "amex"],
    "ikea": ["tempelhof", "poäng chair", "kallax"],
    "accenture": [],
    "allianz": [],
    "hyundai": ["elantra"],
    "ups": [],
    "gucci": ["gg marmont", "princetown loafers", "dionysus bag"],
    "pepsi": [],
    "sony": ["playstation", "bravia", "xperia"],
    "visa": [],
    "salesforce": [],
    "netflix": [],
    "paypal": ["zettle", "venmo", "braintree"],
    "mastercard": [],
    "adidas": ["ultraboost", "nmd", "superstar"],
    "zara": ["red temptation"],
    "axa": [],
    "audi": ["q5", "q7", "a8"],
    "airbnb": [],
    "porsche": ["taycan", "cayenne", "panamera"],
    "starbucks": ["frappuccino"],
    "ge": ["general electric"],
    "volkswagen": ["passat", "jetta", "tiguan"],
    "ford": ["mach e", "mach-e", "bronco", "f150"],
    "nescafé": ["dolce gusto"],
    "siemens": ["healthineers"],
    "goldman sachs": ["goldmansachs"],
    "pampers": ["swaddlers"],
    "h&m": ["conscious collection", "h and m", "hem"],
    "l’oréal paris": [
        "metal detox",
        "wonder water",
        "dream lengths",
        "loreal",
        "l'oreal",
    ],
    "citi": ["citigroup", "citibank"],
    "lego": ["piece 26047", "piece 32557", "piece 11031", "legos"],
    "red bull": [],
    "budweiser": ["bud light"],
    "ebay": [],
    "nissan": ["altima", "sentra", "pathfinder"],
    "hp": ["reverb g2", "envy 6055", "victus"],
    "hsbc": ["citizens bank", "zelle"],
    "morgan stanley": ["shareworks", "morganstanley"],
    "nestle": ["kitkat", "maggi", "toll house"],
    "philips": ["3200 lattego", "sonicare", "avent"],
    "spotify": [],
    "ferrari": ["488 gtb", "sf90", "daytona sp3", "purosangue"],
    "nintendo": ["switch lite", "switch oled", "switch games", "zelda", "mario"],
    "gillette": ["king c", "exfoliating razor"],
    "colgate": ["hum toothbrush", "optic white", "overnight pen"],
    "cartier": ["tank watch", "juste un clou", "tank must"],
    "3m": ["mmm"],
    "dior": ["sauvage", "dossier"],
    "santander": [],
    "danone": ["activia", "evian", "oikos", "danon"],
    "kellogg's": ["frosted flakes", "special k", "kelloggs","kellogg", "kellogs"],
    "linkedin": ["linked in"],
    "corona": [],
    "fedex": [],
    "caterpillar": ["cat 229d3", "cat 259d3", "cat engine"],
    "dhl": [],
    "jack daniel's": [
        "old no. 7",
        "jack honey",
        "tennessee honey",
        "jack daniels",
        "jack whiskey",
    ],
    "prada": ["paradoxe", "cleo bag", "cloudbust"],
    "xiaomi": ["mi 11", "redmi", "mi band"],
    "kia": ["sorento", "sportage", "telluride"],
    "tiffany & co.": ["tiffany co", "tiffany and co"],
    "panasonic": ["lumix", "toughbook", "s5 ii"],
    "hewlett packard enterprise": [
        "proliant",
        "nimble storage",
        "hewlett hackard",
        "hpe",
    ],
    "huawei": ["mate 60", "p30", "matebook"],
    "hennessy": ["pink whitney", "casamigos"],
    "burberry": ["goddess perfume", "her elixir", "her london dream"],
    "kfc": ["kentucky fried", "zinger", "kentucky chicken"],
    "johnson & johnson": ["jnj", "johnson johnson", "johnson and johnson", "johnson 8 johnson"],
    "sephora": [],
    "nespresso": ["vertuoline", "originalline", "vertuo"],
    "heineken": [],
    "canon": ["ts3322", "pixma", "ts3522"],
}

brands_dict = {
    "apple": [],
    "microsoft": [],
    "amazon": [],
    "google": [],
    "samsung": [],
    "toyota": [],
    "mercedes-benz": [],
    "coca-cola": [],
    "nike": [],
    "bmw": [],
    "mcdonald's": [],
    "tesla": [],
    "disney": [],
    "louis vuitton": [],
    "cisco": [],
    "instagram": [],
    "adobe": [],
    "ibm": [],
    "oracle": [],
    "sap": [],
    "facebook": [],
    "chanel": [],
    "hermes": [],
    "intel": [],
    "youtube": [],
    "j.p. morgan": [],
    "honda": [],
    "american express": [],
    "ikea": [],
    "accenture": [],
    "allianz": [],
    "hyundai": [],
    "ups": [],
    "gucci": [],
    "pepsi": [],
    "sony": [],
    "visa": [],
    "salesforce": [],
    "netflix": [],
    "paypal": [],
    "mastercard": [],
    "adidas": [],
    "zara": [],
    "axa": [],
    "audi": [],
    "airbnb": [],
    "porsche": [],
    "starbucks": [],
    "ge": [],
    "volkswagen": [],
    "ford": [],
    "nescafé": [],
    "siemens": [],
    "goldman sachs": [],
    "pampers": [],
    "h&m": [],
    "l’oréal paris": [],
    "citi": [],
    "lego": [],
    "red bull": [],
    "budweiser": [],
    "ebay": [],
    "nissan": [],
    "hp": [],
    "hsbc": [],
    "morgan stanley": [],
    "nestle": [],
    "philips": [],
    "spotify": [],
    "ferrari": [],
    "nintendo": [],
    "gillette": [],
    "colgate": [],
    "cartier": [],
    "3m": [],
    "dior": [],
    "santander": [],
    "danone": [],
    "kellogg's": [],
    "linkedin": [],
    "corona": [],
    "fedex": [],
    "caterpillar": [],
    "dhl": [],
    "jack daniel's": [],
    "prada": [],
    "xiaomi": [],
    "kia": [],
    "tiffany & co.": [],
    "panasonic": [],
    "hewlett packard enterprise": ["hp enterprise", "hewlett packard"],
    "huawei": [],
    "hennessy": [],
    "burberry": [],
    "kfc": [],
    "johnson & johnson": [],
    "sephora": [],
    "nespresso": [],
    "heineken": [],
    "canon": []
}
