In [1]:
import joblib
from ensemble import MajorityVotingEnsemble 
# Load the persisted label encoder and text vectorizer used by the prediction cells below
# (presumably a fitted LabelEncoder and TF-IDF vectorizer, going by the file names — TODO confirm).
# NOTE(review): joblib.load unpickles arbitrary objects; only load artifacts from a trusted source.
label_encoder = joblib.load("label_encode_main.joblib")
vectorizer = joblib.load("tfidf_vectorizer_main.joblib")

In [2]:
def patch_ensemble(ensemble):
    """Fill in attributes that an older pickled ensemble object may be missing.

    Any of ``weights``, ``tie_breaker_order`` and ``n_classes`` that is absent
    is set to a default; attributes that already exist are left untouched.
    ``ensemble.models`` is only read when a size-dependent default is needed.

    Args:
        ensemble: Object to patch in place.

    Returns:
        The same ``ensemble`` object, for convenient chaining.
    """
    # Defaults whose value depends on the number of member models.
    sized_defaults = (
        ("weights", lambda count: [1.0] * count),
        ("tie_breaker_order", lambda count: list(range(count))),
    )
    for attr_name, make_default in sized_defaults:
        if hasattr(ensemble, attr_name):
            continue
        setattr(ensemble, attr_name, make_default(len(ensemble.models)))

    if not hasattr(ensemble, "n_classes"):
        setattr(ensemble, "n_classes", None)
    return ensemble

# ensemble = joblib.load("majority_ensemble.joblib")
# model = patch_ensemble(ensemble)

# # Usage
# import joblib
# ensemble = joblib.load("majority_ensemble.pkl")
# ensemble = patch_ensemble(ensemble)


In [3]:
# Load the ensemble (which contains its own model instances).
# NOTE(review): patch_ensemble() is defined in an earlier cell but is not applied to this
# object; confirm the pickled ensemble already carries every attribute it needs.
# NOTE(review): joblib.load unpickles arbitrary objects; only load trusted files.
model = joblib.load("majority_ensemble.pkl")
# if not hasattr(ensemble, "weights"):
#     ensemble.weights = [1.0] * len(ensemble.models)
# if not hasattr(ensemble, "tie_breaker_order"):
#     ensemble.tie_breaker_order = list(range(len(ensemble.models)))
# if not hasattr(ensemble, "n_classes"):
#     ensemble.n_classes = None

In [4]:
import pytesseract
import cv2
import pubchempy as pcp
from PIL import Image
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [5]:
import os

# Location of the Tesseract executable. The path is machine-specific, so it can be
# overridden with the TESSERACT_CMD environment variable; when that is unset the
# original hard-coded location is used.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    "TESSERACT_CMD",
    r'C:\Users\dhruv\AppData\Local\Programs\Tesseract-OCR\tesseract.exe',
)

In [6]:
# Load the label photo, convert it to grayscale, and lightly blur it before OCR.
image_path = 'Screenshot 2025-06-08 161620.png'
image = cv2.imread(image_path)
if image is None:
    # cv2.imread reports a missing or unreadable file by returning None instead of
    # raising, which would otherwise surface as an opaque error inside cvtColor.
    raise FileNotFoundError(f"Could not read image: {image_path!r}")
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
# remove noise and improve clarity
gray = cv2.GaussianBlur(gray, (3, 3), 0)

In [7]:
# OCR: extract text from the blurred grayscale image.
text = pytesseract.image_to_string(gray)
print("Extracted Ingredients:")
# Only the character count of the OCR result is printed here, not the text itself.
print(len(text))

Extracted Ingredients:
964


In [8]:
import re
#increase this preprocess step further
def preprocess_text(text):
    """Normalise raw text before vectorisation.

    Removes every run of digits, then every character that is neither a word
    character nor whitespace, trims leading/trailing whitespace, and
    lower-cases the result. Interior whitespace is left as it is, and removed
    characters are not replaced by a space.

    Args:
        text: Input string.

    Returns:
        The cleaned, lower-cased string.
    """
    # Patterns are applied in order: digits first, then punctuation/symbols.
    for pattern in (r'\d+', r'[^\w\s]'):
        text = re.sub(pattern, '', text)
    return text.strip().lower()
# NOTE(review): the return value of this call is discarded, so `text` is unchanged
# and the print below shows the raw OCR output rather than the cleaned text.
preprocess_text(text)
print((text))

CALIFORNIA PROPOSITION 65
stote of Colifornio to couse birth defects or other reproductive horm

Tngr “ym Silas Hota. Nicotine, FCC Grade Vegetable Glycerin,

[etre Fos, lv Ac. Westin conten 6 8 per
contri

Made in China.
Flavors mode in USA
©2012 LOEC, in.

blu™ ond blu eCigs® are trademarks of Lorillord Technologies, Inc.

1 serving per pack
Serving size 1 pack (32.35)
cesar appara

Calories 150

Trans Fat 09
Cholesterol m9
Sodium 1059 ot

INGREDIENTS
MILK CHOCOLATE (SUGAR
CHOCOLATE MILK. COCOA
BL T T. SOY
LE

(WHEAT FLOUR. NIACIN

IRON, AMINE M

RIBOFLAVIN, FOLIC ACID)

MODIFIED SAGO STARCH. SAI

SOYBEAN OIL, CORN SYRUP.

BARLEY MALT EXTRACT

LEAVENING (BAKING SODA. YEAS

AND/OR AMMONIUM

BICARBONATE)]. LESS THAN 2

CORNSTARCH, CORN SYRUP.

DEXTRIN. SALT. COLORING

(INCLUDES BLUE 1 LAKE, RED 40.
LLOW 6, YELLOW S$. BLUE 1, RED

40 LAKE, YELLOW 6 LAK!

SLAKE

CARNAUBA WAX. GUM ACACIA




In [9]:
def predict_product_category(text):
    """Predict a category for one piece of text with the notebook-level model.

    The text is cleaned with ``preprocess_text``, vectorised with the global
    ``vectorizer``, and passed to the global ``model``.

    Args:
        text: Raw input string.

    Returns:
        The first element of ``model.predict(...)`` as-is; no label decoding
        is applied here.
    """
    features = vectorizer.transform([preprocess_text(text)])
    return model.predict(features)[0]

In [10]:
# Hand-written ingredient strings for trying out the classifier.
ocr_output_text = "Aqua, Glycerin, Paraben, Fragrance"
ocr_output2 = "Calcium Carbonate, Microcrystalline Cellulose, Talc, Magnesium Stearate"

# NOTE(review): this classifies `text` (assigned from the OCR call in an earlier cell),
# not either of the sample strings defined just above.
predicted_category = predict_product_category(text)
print("Predicted Category:", predicted_category)
# chemical_list = ["Paraben", "Fragrance", "Glycerin"]
# for chem in chemical_list:
#     score = get_contextual_harm_score(chem, predicted_category)
#     print(f"{chem} in {predicted_category} → Harm Score: {score}%")


Predicted Category: 2


In [11]:
def predict_category_with_fallback(text, ensemble_model, vectorizer, label_encoder, threshold=0.45):
    """Classify ``text`` and fall back to ``"others"`` when confidence is low.

    Args:
        text: Raw input string; cleaned with ``preprocess_text`` before use.
        ensemble_model: Classifier. Its ``predict_proba`` is used when present;
            otherwise ``get_proba`` supplies the probability vector.
        vectorizer: Object whose ``transform`` turns a list of strings into features.
        label_encoder: Object whose ``inverse_transform`` maps a class index to a label.
        threshold: Minimum top probability required to accept the prediction.

    Returns:
        A ``(label, proba)`` tuple. ``label`` is ``"others"`` when the highest
        probability is below ``threshold``; otherwise it is the decoded label,
        or the bare class index if decoding raised. ``proba`` is the full
        probability vector for the single input.
    """
    features = vectorizer.transform([preprocess_text(text)])

    # Prefer the ensemble's own soft-voted probabilities when it exposes them.
    proba = (
        ensemble_model.predict_proba(features)[0]
        if hasattr(ensemble_model, "predict_proba")
        else get_proba(ensemble_model, features)
    )

    top_confidence = float(np.max(proba))
    top_index = int(np.argmax(proba))

    # Map the winning index back to its original label; keep the index if that fails.
    try:
        decoded = label_encoder.inverse_transform([top_index])[0]
    except Exception:
        decoded = top_index

    return ("others" if top_confidence < threshold else decoded), proba


In [12]:
def get_proba(model, X, encoder=None):
    """Return a 1-D class-probability vector for the first row of ``X``.

    When ``model`` exposes ``predict_proba`` its first row is returned
    unchanged. Otherwise a one-hot vector is built from the hard prediction.

    The hard prediction may be either an original class label or an
    already-encoded class index (models fitted on encoded targets return the
    index). The label lookup is tried first; if the encoder rejects the value
    and it is a valid integer index, it is used directly instead of raising.

    Args:
        model: Classifier with ``predict_proba`` and/or ``predict``.
        X: Feature matrix; only the first row's result is used.
        encoder: Optional label encoder providing ``classes_`` and
            ``transform``. Defaults to the notebook-level ``label_encoder``.

    Returns:
        A 1-D array of per-class probabilities.

    Raises:
        AttributeError: If the model has no ``predict_proba`` and the encoder
            has no ``classes_``.
        ValueError: If the hard prediction is neither a known label nor a
            valid class index.
    """
    if hasattr(model, "predict_proba"):
        return model.predict_proba(X)[0]

    pred = model.predict(X)[0]
    if encoder is None:
        # Same source the original implementation used.
        encoder = label_encoder
    if not hasattr(encoder, "classes_"):
        raise AttributeError("Cannot build probability vector: no predict_proba and no label encoder classes available.")

    n_classes = len(encoder.classes_)
    try:
        # transform() already yields the position of the label within classes_.
        idx = int(encoder.transform([pred])[0])
    except (ValueError, TypeError):
        # Not a known label: accept it only if it is already a class index.
        if isinstance(pred, (int, np.integer)) and 0 <= pred < n_classes:
            idx = int(pred)
        else:
            raise

    proba = np.zeros(n_classes, dtype=float)
    proba[idx] = 1.0
    return proba


In [13]:
# Classify the second sample string; `proba` receives the second element of the
# returned tuple (the full probability vector in the definition above).
category, proba = predict_category_with_fallback(ocr_output2, model, vectorizer, label_encoder)
print("Predicted Category:", category)

Predicted Category: stationery


In [14]:
# print("Confidence Scores:", dict(zip(model.classes_, proba)))
# Pair each class name with its probability.
# NOTE(review): assumes `proba` is ordered like label_encoder.classes_ — TODO confirm.
print("Confidence Scores:", dict(zip(label_encoder.classes_, proba)))


Confidence Scores: {'cleaning': 0.040672822044312495, 'cosmetic': 0.023018081011089125, 'food': 0.12923677488670474, 'household': 0.012567232254505003, 'medicine': 0.028478053859106917, 'personal_care': 0.0111475007332833, 'stationery': 0.7548795352109984}


In [15]:
import spacy
# Load the en_ner_bc5cdr_md pipeline and pull out entities labelled CHEMICAL.
nlp = spacy.load("en_ner_bc5cdr_md")
# Use a dedicated name so the OCR result held in the notebook-level `text`
# is not overwritten by this demo string.
ner_sample_text = "Sodium Lauryl Sulfate, Propylparaben, Aloe Vera"
doc = nlp(ner_sample_text)

chemicals = [ent.text for ent in doc.ents if ent.label_ == "CHEMICAL"]
print("Extracted chemicals:", chemicals)

  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


Extracted chemicals: ['Sodium Lauryl', 'Propylparaben', 'Aloe']


In [16]:
# Comma-separated sample string that is passed to the classifier in the next cell.
ocr_output = "nicotine, Nicotine, FCC, Glycerin, Cholesterol, Sodium"

In [17]:

def preprocess_text(text):
    """Lower-case ``text`` and turn every comma into a single space.

    NOTE: this redefinition replaces the earlier regex-based
    ``preprocess_text`` for every call made after this cell runs.

    Args:
        text: Input string.

    Returns:
        The lower-cased string with commas replaced by spaces.
    """
    lowered = text.lower()
    return lowered.translate({ord(","): " "})

def predict_category_with_fallback(text, model, vectorizer, label_encoder, threshold=0.45):
    """Classify ``text``, returning ``"others"`` when the top probability is too low.

    This definition replaces the earlier function of the same name once the
    cell runs; unlike that one it returns the scalar top probability rather
    than the whole probability vector, and it requires ``model.predict_proba``.

    Args:
        text: Raw input string; cleaned with ``preprocess_text`` before use.
        model: Classifier exposing ``predict_proba``.
        vectorizer: Object whose ``transform`` turns a list of strings into features.
        label_encoder: Object whose ``inverse_transform`` maps a class index to a label.
        threshold: Minimum top probability required to accept the prediction.

    Returns:
        A ``(label, confidence)`` tuple, where ``label`` is ``"others"`` if
        ``confidence`` is below ``threshold`` and the decoded label otherwise.
    """
    features = vectorizer.transform([preprocess_text(text)])
    class_probs = model.predict_proba(features)[0]

    top_prob = np.max(class_probs)
    top_index = np.argmax(class_probs)
    # Decoding happens before the threshold check, so a decoding error surfaces either way.
    decoded = label_encoder.inverse_transform([top_index])[0]

    label = "others" if top_prob < threshold else decoded
    return label, top_prob


# Classify the sample string with the redefined function above, which returns
# the label together with the top class probability.
category, confidence = predict_category_with_fallback(ocr_output, model, vectorizer, label_encoder)
print(f"Predicted Category: {category} | Confidence: {confidence:.2f}")


Predicted Category: cleaning | Confidence: 0.58


In [18]:
# Hand-written ingredient lists used as a quick sanity check of the classifier.
examples = [
    "Aqua, Glycerin, Paraben, Fragrance, Dimethicone",
    "Sodium Benzoate, Citric Acid, Natural Flavours, Xanthan Gum, Sugar",
    "Water, Sodium Laureth Sulfate, Cocamidopropyl Betaine, PEG-40, Perfume",
    "Paracetamol, Caffeine, Starch, Povidone, Magnesium Stearate",
    "Sodium Hypochlorite, Limonene, Alcohol Ethoxylate, Sodium Carbonate, Water",
    "Polyvinyl Alcohol, Acetone, Isopropyl Alcohol, Ethylene Glycol, Pigment Red 22",
    "Citronellol, Linalool, D-Limonene, Butylphenyl Methylpropional, Coumarin",
    "Polystyrene, Formaldehyde, Ethanolamine, Dipropylene Glycol, BHT"
]

# The loop variable is deliberately not called `text`, so the notebook-level
# `text` variable is not overwritten by the last example.
for example in examples:
    category, confidence = predict_category_with_fallback(example, model, vectorizer, label_encoder)
    print(f"Text: {example}\n → Predicted Category: {category} | Confidence: {confidence:.2f}\n")

Text: Aqua, Glycerin, Paraben, Fragrance, Dimethicone
 → Predicted Category: personal_care | Confidence: 0.51

Text: Sodium Benzoate, Citric Acid, Natural Flavours, Xanthan Gum, Sugar
 → Predicted Category: food | Confidence: 0.66

Text: Water, Sodium Laureth Sulfate, Cocamidopropyl Betaine, PEG-40, Perfume
 → Predicted Category: food | Confidence: 0.56

Text: Paracetamol, Caffeine, Starch, Povidone, Magnesium Stearate
 → Predicted Category: stationery | Confidence: 0.63

Text: Sodium Hypochlorite, Limonene, Alcohol Ethoxylate, Sodium Carbonate, Water
 → Predicted Category: cleaning | Confidence: 0.80

Text: Polyvinyl Alcohol, Acetone, Isopropyl Alcohol, Ethylene Glycol, Pigment Red 22
 → Predicted Category: cleaning | Confidence: 0.45

Text: Citronellol, Linalool, D-Limonene, Butylphenyl Methylpropional, Coumarin
 → Predicted Category: medicine | Confidence: 0.53

Text: Polystyrene, Formaldehyde, Ethanolamine, Dipropylene Glycol, BHT
 → Predicted Category: household | Confidence: 0.77

In [19]:
def calculate_harm_score(chemicals, category, harm_scores_csv):
    """Average the per-category harm scores of the given chemicals.

    Chemical names are matched against the ``chemical_name`` column
    case-insensitively and with surrounding whitespace ignored. Table entries
    whose score for the category is missing (NaN) are skipped, so they cannot
    turn the average into NaN.

    Args:
        chemicals: Iterable of chemical name strings.
        category: Category suffix; scores are read from the
            ``harm_score_<category>`` column.
        harm_scores_csv: Path or file-like object accepted by ``pd.read_csv``.
            The table must contain a ``chemical_name`` column.

    Returns:
        ``(avg_score, risk)`` where ``risk`` is ``"Low Risk"`` for an average
        of at most 30, ``"Moderate Risk"`` for at most 70 and ``"High Risk"``
        above that; ``(None, None)`` when no chemical has a usable score.

    Raises:
        ValueError: If the table has no column for ``category``.
    """
    harm_df = pd.read_csv(harm_scores_csv)

    # Column for the specific category
    category_col = f"harm_score_{category}"
    if category_col not in harm_df.columns:
        raise ValueError(f"Category '{category}' not found in harm score table.")

    # Normalise the lookup key once instead of on every loop iteration.
    known_names = harm_df['chemical_name'].str.strip().str.lower()

    scores = []
    for chem in chemicals:
        matched = harm_df.loc[known_names == chem.strip().lower(), category_col].dropna()
        if not matched.empty:
            scores.append(matched.iloc[0])

    if not scores:
        return None, None  # No chemicals matched

    avg_score = float(sum(scores) / len(scores))

    # Risk Level
    if avg_score <= 30:
        risk = "Low Risk"
    elif avg_score <= 70:
        risk = "Moderate Risk"
    else:
        risk = "High Risk"

    return avg_score, risk


# NOTE(review): " Natural Flavours" carries a leading space, so it can only match
# the table if the lookup ignores surrounding whitespace.
chemicals_list = ["Sodium Benzoate", "Citric Acid"," Natural Flavours", "Xanthan Gum", "Sugar"]  # From OCR
predicted_category = "cosmetic"

avg_score, risk_level = calculate_harm_score(
    chemicals_list, 
    predicted_category, 
    "chemical_harmness_category.csv"
)

# calculate_harm_score returns (None, None) when nothing matched; formatting
# None with ":.2f" would raise TypeError, so report that case explicitly.
if avg_score is None:
    print("No listed chemicals matched the harm score table.")
else:
    print(f"Average Harm Score: {avg_score:.2f}%")
    print(f"Risk Level: {risk_level}")
print(f"Category :{predicted_category}")

Average Harm Score: 20.00%
Risk Level: Low Risk
Category :cosmetic


In [20]:
import requests

def get_general_product_info(barcode, timeout=10):
    """Look up a barcode via the UPCitemdb trial endpoint and print a summary.

    Args:
        barcode: UPC code to look up; sent as the ``upc`` query parameter.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        A dict with ``title``, ``brand``, ``category`` and ``description``
        (each ``'N/A'`` when absent from the response), or ``None`` when the
        request fails, the response is not valid JSON, or no product is found.
    """
    url = "https://api.upcitemdb.com/prod/trial/lookup"

    try:
        # Passing the barcode through `params` lets requests URL-encode it.
        response = requests.get(url, params={"upc": barcode}, timeout=timeout)
    except requests.RequestException:
        print("❌ Failed to fetch data from UPCitemdb API.")
        return None
    if response.status_code != 200:
        print("❌ Failed to fetch data from UPCitemdb API.")
        return None

    try:
        data = response.json()
    except ValueError:
        # Body was not valid JSON (requests' JSON decode errors subclass ValueError).
        print("❌ Failed to fetch data from UPCitemdb API.")
        return None

    if not data.get('items'):
        print("❌ No product found for this barcode.")
        return None

    # Only the first returned item is reported.
    item = data['items'][0]
    title = item.get('title', 'N/A')
    brand = item.get('brand', 'N/A')
    category = item.get('category', 'N/A')
    description = item.get('description', 'N/A')

    print(f"✅ Product Title: {title}")
    print(f"Brand: {brand}")
    print(f"Category: {category}")
    print(f"Description: {description}")

    return {
        'title': title,
        'brand': brand,
        'category': category,
        'description': description
    }

# Example usage:
# get_general_product_info("028400642511")
# get_general_product_info("012993441012")  # Sample barcode
# get_general_product_info(data)

✅ Product Title: LaCroix Enhanced Sparkling Water Passionfruit - 12 fl oz
Brand: LaCroix
Category: Food, Beverages & Tobacco > Beverages > Water > Carbonated Water
Description: Passionfruit now available nationwide! Check your local retailers as our newest flavor is added to store shelves throughout early 2015 Gender: unisex.


{'title': 'LaCroix Enhanced Sparkling Water Passionfruit - 12 fl oz',
 'brand': 'LaCroix',
 'category': 'Food, Beverages & Tobacco > Beverages > Water > Carbonated Water',
 'description': 'Passionfruit now available nationwide! Check your local retailers as our newest flavor is added to store shelves throughout early 2015 Gender: unisex.'}