In [1]:
import pandas as pd

# Load the dataset
df_ingredients = pd.read_csv('/kaggle/input/ingredient-toxicity-and-health-impact-dataset/ingredient_effects.csv')

# Display the first few rows of the DataFrame
print("First 5 rows of df_ingredients:")
print(df_ingredients.head())

# Print the shape of the DataFrame
print(f"\nShape of df_ingredients: {df_ingredients.shape}")

First 5 rows of df_ingredients:
    Ingredient_Name  Harmfulness_Score  \
0             Water                  1   
1          Glycerin                  1   
2       Xanthan Gum                  1   
3          Carbomer                  1   
4  Sodium Gluconate                  1   

                                Effect_On_Human_Body  
0  Safe solvent used as a base for dissolving oth...  
1  Hydrates skin and helps maintain moisture balance  
2         Natural thickener; non-toxic and skin-safe  
3   Thickening agent; generally safe for topical use  
4    Chelating agent that improves product stability  

Shape of df_ingredients: (429, 3)


In [2]:
import re
import string

def clean_text(text):
    """Converts text to lowercase, removes punctuation, and strips whitespace."""
    text = text.lower()
    text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)
    text = text.strip()
    return text

df_ingredients['cleaned_effects'] = df_ingredients['Effect_On_Human_Body'].apply(clean_text)

print("Original 'Effect_On_Human_Body' sample:")
print(df_ingredients['Effect_On_Human_Body'].head().tolist())
print("\nCleaned 'Effect_On_Human_Body' sample:")
print(df_ingredients['cleaned_effects'].head().tolist())

Original 'Effect_On_Human_Body' sample:
['Safe solvent used as a base for dissolving other ingredients', 'Hydrates skin and helps maintain moisture balance', 'Natural thickener; non-toxic and skin-safe', 'Thickening agent; generally safe for topical use', 'Chelating agent that improves product stability']

Cleaned 'Effect_On_Human_Body' sample:
['safe solvent used as a base for dissolving other ingredients', 'hydrates skin and helps maintain moisture balance', 'natural thickener nontoxic and skinsafe', 'thickening agent generally safe for topical use', 'chelating agent that improves product stability']


In [3]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Instantiate a Tokenizer object. Limiting to top 1000 words for now, can be adjusted.
# The 'df_ingredients' variable is available from previous steps.
tokenizer = Tokenizer(num_words=1000, oov_token="<unk>")

# Fit the Tokenizer on the cleaned effects text
tokenizer.fit_on_texts(df_ingredients['cleaned_effects'])

# Convert text data into sequences of integers
encoded_docs = tokenizer.texts_to_sequences(df_ingredients['cleaned_effects'])

print(f"Vocabulary size: {len(tokenizer.word_index)}")
print("Sample of encoded documents (first 5):")
for i, doc in enumerate(encoded_docs[:5]):
    print(f"Doc {i+1}: {doc}")

2026-01-20 14:44:19.810844: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1768920260.172455      17 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1768920260.289600      17 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1768920261.179479      17 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1768920261.179540      17 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1768920261.179544      17 computation_placer.cc:177] computation placer alr

Vocabulary size: 442
Sample of encoded documents (first 5):
Doc 1: [5, 43, 107, 163, 234, 30, 24, 305, 135, 306]
Doc 2: [235, 3, 2, 47, 108, 62, 307]
Doc 3: [16, 36, 181, 2, 308]
Doc 4: [109, 9, 33, 5, 24, 58, 31]
Doc 5: [44, 9, 17, 37, 164, 309]


In [4]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Determine the maximum sequence length (e.g., max length in encoded_docs or a chosen value)
# For simplicity, let's find the max length in the current dataset
max_sequence_length = max([len(x) for x in encoded_docs])

# Pad the sequences to a uniform length
padded_sequences = pad_sequences(encoded_docs, maxlen=max_sequence_length, padding='post')

print(f"Shape of padded sequences: {padded_sequences.shape}")
print("Sample of padded sequences (first 5):")
for i, seq in enumerate(padded_sequences[:5]):
    print(f"Sequence {i+1}: {seq}")

Shape of padded sequences: (429, 12)
Sample of padded sequences (first 5):
Sequence 1: [  5  43 107 163 234  30  24 305 135 306   0   0]
Sequence 2: [235   3   2  47 108  62 307   0   0   0   0   0]
Sequence 3: [ 16  36 181   2 308   0   0   0   0   0   0   0]
Sequence 4: [109   9  33   5  24  58  31   0   0   0   0   0]
Sequence 5: [ 44   9  17  37 164 309   0   0   0   0   0   0]


In [5]:
from sklearn.model_selection import train_test_split

X = padded_sequences
y = df_ingredients['Harmfulness_Score']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of X_test: {X_test.shape}")
print(f"Shape of y_train: {y_train.shape}")
print(f"Shape of y_test: {y_test.shape}")

Shape of X_train: (343, 12)
Shape of X_test: (86, 12)
Shape of y_train: (343,)
Shape of y_test: (86,)


In [6]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input

# 2. Define vocabulary size
vocab_size = len(tokenizer.word_index) + 1

# 3. Define embedding dimension
embedding_dim = 100

# 4. Define input length (from previous step)
input_length = max_sequence_length # This was determined in the padding step

# 5. Instantiate a Sequential model
model = Sequential()

# Add an Input layer to explicitly define the input shape
model.add(Input(shape=(input_length,)))

# 6. Add an Embedding layer
# input_length is no longer needed here as it's defined by the Input layer
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim))

# 7. Add an LSTM layer, it will now infer its input shape from the Embedding layer
model.add(LSTM(units=128)) # Using 128 units as a reasonable starting point

# 8. Add a Dense output layer with 1 unit for regression
model.add(Dense(units=1, activation=None)) # No activation for regression tasks

# 9. Compile the model
model.compile(loss='mse', optimizer='adam')

# 10. Print the model summary
print("Deep Learning Model Summary:")
model.summary()

Deep Learning Model Summary:


2026-01-20 14:44:41.448202: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


In [7]:
from tensorflow.keras.callbacks import EarlyStopping

print("Training the deep learning model with EarlyStopping...")

# Instantiate EarlyStopping callback
early_stopping = EarlyStopping(
    monitor='val_loss', # Monitor validation loss
    patience=5,         # Number of epochs with no improvement after which training will be stopped
    restore_best_weights=True # Restore model weights from the epoch with the best value of the monitored quantity
)

# Train the model
history = model.fit(
    X_train,
    y_train,
    epochs=100,  # Maximum number of training epochs
    batch_size=32, # Batch size for training
    validation_data=(X_test, y_test), # Validation data to monitor performance
    callbacks=[early_stopping], # Include the EarlyStopping callback
    verbose=1 # Show progress bar during training
)

print("Model training complete.")

Training the deep learning model with EarlyStopping...
Epoch 1/100
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 51ms/step - loss: 20.0020 - val_loss: 5.5588
Epoch 2/100
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - loss: 5.9992 - val_loss: 4.7650
Epoch 3/100
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - loss: 5.0277 - val_loss: 4.2944
Epoch 4/100
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - loss: 4.4583 - val_loss: 3.4531
Epoch 5/100
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - loss: 3.6893 - val_loss: 2.4583
Epoch 6/100
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - loss: 2.5694 - val_loss: 1.3428
Epoch 7/100
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - loss: 1.0631 - val_loss: 0.9001
Epoch 8/100
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - loss: 0.7887 - v

In [8]:
y_pred = model.predict(X_test)
print("Predictions on test data generated.")

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 96ms/step
Predictions on test data generated.


In [9]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)

# Calculate Root Mean Squared Error (RMSE)
rmse = np.sqrt(mse)

print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

Mean Squared Error (MSE): 0.7422
Root Mean Squared Error (RMSE): 0.8615


In [10]:
import numpy as np

def predict_harm_score(raw_effect_text):
    """Predicts the harm score for a given raw effect text."""
    # 1. Clean the raw text
    cleaned_text = clean_text(raw_effect_text)

    # 2. Tokenize the cleaned text
    # texts_to_sequences expects a list of texts, so wrap cleaned_text in a list
    encoded_text = tokenizer.texts_to_sequences([cleaned_text])

    # 3. Pad the sequence to the maximum length
    padded_text = pad_sequences(encoded_text, maxlen=max_sequence_length, padding='post')

    # 4. Predict the harm score using the trained model
    predicted_score = model.predict(padded_text, verbose=0)[0][0]

    # 5. Clamp the score between 1 and 10
    clamped_score = np.clip(predicted_score, 1, 10)

    return clamped_score

# Test the function with a sample text
sample_text = 'Causes severe irritation and allergic reactions'
predicted_score_sample = predict_harm_score(sample_text)

print(f"Predicted Harm Score for '{sample_text}': {predicted_score_sample:.2f}")

Predicted Harm Score for 'Causes severe irritation and allergic reactions': 5.01


In [11]:
import numpy as np

def calculate_product_scores(ingredient_harm_scores):
    """Calculates average, maximum, and final scores for a product."""
    if not ingredient_harm_scores:
        return 0, 0, 0 # Handle empty list case

    average_score = np.mean(ingredient_harm_scores)
    maximum_score = np.max(ingredient_harm_scores)
    final_score = (average_score + maximum_score) / 2
    return average_score, maximum_score, final_score

def classify_product_risk(final_score):
    """Classifies product risk based on the final harm score."""
    if final_score <= 4:
        return 'Safe'
    elif final_score <= 7:
        return 'Moderate Risk'
    else:
        return 'Harmful'

# Test the functions with a sample list of ingredient harm scores
sample_harm_scores = [3, 6, 8, 5]
print(f"Sample Ingredient Harm Scores: {sample_harm_scores}")

average_score, maximum_score, final_score = calculate_product_scores(sample_harm_scores)
print(f"Calculated Average Score: {average_score:.2f}")
print(f"Calculated Maximum Score: {maximum_score:.2f}")
print(f"Calculated Final Score: {final_score:.2f}")

product_classification = classify_product_risk(final_score)
print(f"Product Risk Classification: {product_classification}")

Sample Ingredient Harm Scores: [3, 6, 8, 5]
Calculated Average Score: 5.50
Calculated Maximum Score: 8.00
Calculated Final Score: 6.75
Product Risk Classification: Moderate Risk


In [12]:
def analyze_product_safety(product_ingredients):
    """Analyzes a list of product ingredients for safety, predicting scores for unknown ones.
    Displays a comprehensive, color-coded safety analysis.
    """
    ingredient_harm_scores = []
    ingredient_details = []

    # ANSI escape codes for coloring output
    COLOR_GREEN = '\033[92m'
    COLOR_YELLOW = '\033[93m'
    COLOR_RED = '\033[91m'
    COLOR_END = '\033[0m'

    print("\n--- Product Safety Analysis ---")
    print(f"Analyzing product with ingredients: {', '.join(product_ingredients)}")
    print("-------------------------------")

    for ingredient_name in product_ingredients:
        # Check if the ingredient exists in the dataframe
        matching_ingredient = df_ingredients[df_ingredients['Ingredient_Name'].str.lower() == ingredient_name.lower()]

        if not matching_ingredient.empty:
            # Ingredient found in dataset
            harm_score = matching_ingredient['Harmfulness_Score'].iloc[0]
            effect_description = matching_ingredient['Effect_On_Human_Body'].iloc[0]
            print(f"  - {ingredient_name}: Known effect - '{effect_description}', Score: {harm_score}")
        else:
            # Ingredient not found, predict score
            effect_description = "Unknown compound, potential irritant with cumulative effects. For demonstration purposes."
            harm_score = predict_harm_score(effect_description)
            print(f"  - {ingredient_name}: *New/Unknown* - Predicted effect: '{effect_description}', Predicted Score: {harm_score:.2f}")

        ingredient_harm_scores.append(harm_score)
        ingredient_details.append({
            'name': ingredient_name,
            'effect': effect_description,
            'score': harm_score
        })

    # Calculate product level scores
    average_score, maximum_score, final_score = calculate_product_scores(ingredient_harm_scores)

    # Classify product risk
    product_classification = classify_product_risk(final_score)

    print("\n--- Product Summary ---")
    for detail in ingredient_details:
        print(f"  Ingredient: {detail['name']}")
        print(f"    Effect: {detail['effect']}")
        print(f"    Harm Score: {detail['score']:.2f}")

    print(f"\n  Average Ingredient Score: {average_score:.2f}")
    print(f"  Maximum Ingredient Score: {maximum_score:.2f}")
    print(f"  Final Product Score: {final_score:.2f}")

    # Apply color-coding to the final classification
    if product_classification == 'Safe':
        color = COLOR_GREEN
    elif product_classification == 'Moderate Risk':
        color = COLOR_YELLOW
    else:
        color = COLOR_RED

    print(f"  Product Risk Classification: {color}{product_classification}{COLOR_END}")
    print("-------------------------------")

# Get user input for ingredients
user_input_ingredients_str = input("Enter ingredient names separated by commas (e.g., Parabens, Aqua, New Chemical X): ")
user_product_ingredients = [ingredient.strip() for ingredient in user_input_ingredients_str.split(',')]

# Analyze the product safety with user-provided ingredients
analyze_product_safety(user_product_ingredients)

StdinNotImplementedError: raw_input was called, but this frontend does not support input requests.