<a href="https://colab.research.google.com/github/Arpita2025/Additive-Insight/blob/main/Untitled1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
import pickle
import sys

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Load the tab-separated Open Food Facts product export; the path is
# relative to the notebook's working directory.
df = pd.read_csv('en.openfoodfacts.org.products.tsv', sep='\t')

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
df.shape

(1934, 163)

In [None]:
df.columns

Index(['code', 'url', 'creator', 'created_t', 'created_datetime',
       'last_modified_t', 'last_modified_datetime', 'product_name',
       'generic_name', 'quantity',
       ...
       'fruits-vegetables-nuts_100g', 'fruits-vegetables-nuts-estimate_100g',
       'collagen-meat-protein-ratio_100g', 'cocoa_100g', 'chlorophyl_100g',
       'carbon-footprint_100g', 'nutrition-score-fr_100g',
       'nutrition-score-uk_100g', 'glycemic-index_100g',
       'water-hardness_100g'],
      dtype='object', length=163)

In [None]:
# Keep only the products that actually list ingredients; rows whose
# 'ingredients_text' is missing have nothing to tokenise later on.
has_ingredients = df['ingredients_text'].notna()
df_cleaned = df[has_ingredients]

In [None]:
# The only columns the rest of the notebook reads: the raw ingredient
# text, the additive count and the additive names.
useful_fields = ["ingredients_text", "additives_n", "additives_en"]
filtered_df = df_cleaned.loc[:, useful_fields]


In [None]:
filtered_df.shape

(1737, 3)

In [None]:
# frac=1 keeps every row, so this only shuffles the frame; the fixed
# random_state makes the shuffled order repeatable.
sampled_df = filtered_df.sample(frac=1, random_state=42)

In [None]:
sampled_df.shape

(1737, 3)

In [None]:
sampled_df

Unnamed: 0,ingredients_text,additives_n,additives_en
572,100% Soja-Protein-Isolat (_Soja_),0.0,
1691,Milk chocolate (sugar; cocoa butter: chocolate...,1.0,"E322 - Lecithins,E322i - Lecithin"
1118,"Caramel [tapioca syrup, invert sugar, sweetene...",2.0,"E322 - Lecithins,E322i - Lecithin,E407 - Carra..."
1173,"Milk chocolate [sugar milk, cocoa butter, choc...",1.0,"E322 - Lecithins,E322i - Lecithin"
853,"Chips (dehydrated potato, rich flour, soluble ...",1.0,E330 - Citric acid
...,...,...,...
1302,"Sugar, maltodextrin, tartaric acid, natural an...",6.0,"E110 - Sunset yellow FCF,E300 - Ascorbic acid,..."
1474,"Almonds, potato starch, cinnamon, natural flav...",2.0,"E950 - Acesulfame k,E955 - Sucralose"
1025,"Cucumbers, high fructose corn syrup, vinegar, ...",4.0,"E102 - Tartrazine,E211 - Sodium benzoate,E433 ..."
1642,"Sugar, whey, nonfat dry milk, cocoa powder (du...",2.0,"E340 - Potassium phosphates,E340ii - Dipotassi..."


# **Preprocessing**

In [None]:
def lower_case(text):
    """Return ``text`` with all cased characters converted to lowercase."""
    lowered = text.lower()
    return lowered

In [None]:
def remove_special_characters(text):
    """Delete every character that is not an ASCII letter, whitespace or a comma.

    Commas are preserved because they are the ingredient separator used by
    the tokenising step; digits, brackets and other punctuation are removed
    without leaving a replacement character behind.
    """
    disallowed = re.compile(r'[^a-zA-Z\s,]')
    return disallowed.sub('', text)

In [None]:
def tokenize(text):
    """Split a comma-separated string into clean tokens.

    Args:
        text: String whose items are separated by commas.

    Returns:
        A list of the items in their original order, each with surrounding
        whitespace removed. Items that are empty or whitespace-only (for
        example from an empty string, a trailing comma or a doubled comma)
        are left out, so no blank entries are passed on to later steps.
    """
    stripped_pieces = (piece.strip() for piece in text.split(','))
    return [piece for piece in stripped_pieces if piece]

In [None]:
def split_additives(text):
    """Split an additive entry on the ' - ' separator and return the parts as a list."""
    separator = ' - '
    parts = text.split(separator)
    return parts

In [None]:
stop_words = set(stopwords.words('english'))

def remove_stop_words(tokens, stop_word_set=None):
    """Remove stop words from each ingredient token and normalise its spacing.

    Args:
        tokens: Iterable of ingredient strings (one string per ingredient).
        stop_word_set: Optional collection of words to drop. When omitted,
            the notebook-level ``stop_words`` set is used.

    Returns:
        A list with one cleaned string per input token, in the same order.
        The surviving words are re-joined with single spaces, so leading or
        trailing blanks and repeated whitespace do not survive. A token made
        up only of stop words becomes an empty string.
    """
    active_stop_words = stop_words if stop_word_set is None else stop_word_set
    cleaned_tokens = []
    for item in tokens:
        # split() with no separator breaks on any run of whitespace, so a
        # word followed by a tab/newline or extra spaces is still compared
        # against the stop-word set on its own.
        kept_words = [word for word in item.split() if word not in active_stop_words]
        cleaned_tokens.append(' '.join(kept_words))
    return cleaned_tokens


In [None]:
def preprocess(ingredient_list):
    """Turn one raw ingredient string into a list of cleaned ingredient tokens.

    The steps run in this order: lower-case the text, remove special
    characters (including parentheses), split the text into tokens, and
    remove stop words from every token.
    """
    normalized_text = remove_special_characters(lower_case(ingredient_list))
    return remove_stop_words(tokenize(normalized_text))

In [None]:
# Run the preprocessing pipeline on every ingredient string; the result is
# a Series holding one list of tokens per product.
preprocessed_ingredients_data  = sampled_df['ingredients_text'].apply(preprocess)

# **ingredient to embedding**

In [None]:
# One row per ingredient token, laid out as [text, additive flag, source
# entry]. Tokens taken from the ingredient lists get flag 0 and the
# placeholder "null" because they do not come from the additives column.
corpus_3 = [
    [token, 0, "null"]
    for product_tokens in preprocessed_ingredients_data
    for token in product_tokens
]

In [None]:
# Append the known additives to the corpus with flag 1. Each entry of the
# comma-separated additives string is split on ' - ', and every resulting
# part becomes its own lower-cased row that remembers the full entry.
for additives in sampled_df['additives_en']:
    # Only string values are processed; anything else is skipped.
    if not isinstance(additives, str):
        continue
    for additive_entry in tokenize(additives):
        corpus_3.extend(
            [lower_case(part), 1, additive_entry]
            for part in split_additives(additive_entry)
        )

In [None]:
# Text column only; this is what gets vectorised.
corpus = [text for text, _flag, _source in corpus_3]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer # BERT , G-USE , BAG OF WORDS , (NATURAL LANGUAGE PROCESSING)

In [None]:
# min_df is given as a float, which scikit-learn reads as a minimum
# fraction of documents a term must appear in to be kept in the vocabulary.
vectorizer = TfidfVectorizer(min_df= 0.00005)

In [None]:
# Learn the vocabulary from the corpus and encode every row as a TF-IDF vector.
tfidf_matrix = vectorizer.fit_transform(corpus)

In [None]:
# Vocabulary terms, one per TF-IDF column. Not referenced again in the
# cells visible here; kept for inspection.
feature_names = vectorizer.get_feature_names_out()

In [None]:
tfidf_matrix.shape

(27858, 1548)

# **dimensionality reduction**

Candidate techniques: PCA, LDA, kernel PCA

In [None]:
from sklearn.decomposition import PCA


In [None]:
# Number of principal components kept after the reduction.
n_components = 700
# Seed the estimator: PCA's automatic solver selection can choose the
# randomized SVD, which would otherwise make the projection (and the
# clustering built on top of it) differ between runs. 42 matches the seed
# used elsewhere in this notebook.
pca = PCA(n_components=n_components, random_state=42)

In [None]:
# toarray() converts the TF-IDF matrix to a dense array before PCA is
# fitted on it; the result has one row per corpus entry and n_components columns.
tfidf_reduced = pca.fit_transform(tfidf_matrix.toarray())

In [None]:
print(tfidf_reduced)

[[-1.77208642e-02 -3.00115468e-02 -1.57255336e-02 ...  6.69347909e-03
   3.09464396e-02  1.92204374e-02]
 [ 1.06602900e-01  1.44439586e-01  1.40359462e-01 ... -1.83599384e-03
   6.95276030e-04 -1.84392347e-03]
 [ 4.68861418e-02  1.34011184e-01 -6.43962588e-02 ... -4.60774389e-03
  -1.26174697e-02  1.62770981e-03]
 ...
 [-2.11173292e-02 -3.63235425e-02 -2.15190258e-02 ... -3.78846346e-04
   1.66181881e-03 -3.85451497e-05]
 [-1.76217956e-02 -2.98106414e-02 -1.63789801e-02 ...  2.36891877e-02
  -7.85419214e-02 -3.05058121e-02]
 [-1.76596299e-02 -2.98799851e-02 -1.64298185e-02 ...  6.35108518e-03
  -6.15311094e-04  1.31180441e-03]]


# **clustering algorithm**

In [None]:
from sklearn.cluster import DBSCAN

In [None]:
# Density-based clustering of the reduced vectors.
# NOTE(review): eps=0.4 and min_samples=3 are hard-coded with no recorded
# rationale — confirm how they were chosen.
dbscan = DBSCAN(eps=0.4, min_samples=3)

In [None]:
import time

In [None]:
# Fit DBSCAN and time it. perf_counter() is a monotonic, high-resolution
# clock intended for measuring elapsed time; time.time() follows the system
# clock and can jump if that clock is adjusted mid-run.
start_time = time.perf_counter()
clusters = dbscan.fit_predict(tfidf_reduced)
end_time = time.perf_counter()
dbscan_training_time = end_time - start_time
print("DBSCAN training time:", dbscan_training_time)

DBSCAN training time: 42.03347206115723


In [None]:
# Serialise the fitted estimator and report the size of the pickle payload.
# len() is the number of serialised bytes; sys.getsizeof() would instead
# report the in-memory footprint of the bytes object, header included.
model_bytes = pickle.dumps(dbscan)
model_size = len(model_bytes)
model_size_mb = model_size / (1024 * 1024)
print(f"Model size: {model_size_mb:.2f} MB")

Model size: 137.86 MB


# **viewing clusters**

In [None]:
# DBSCAN marks points that belong to no cluster with the label -1.
is_noise = clusters == -1
noise_points = is_noise.sum()
noise_points

2072

In [None]:
# Number of distinct label values in `clusters`. This counts every value
# present, so the noise label -1 is included whenever noise points exist.
unique_labels = len(set(clusters))
unique_labels

1072

In [None]:
# Group the corpus rows by their DBSCAN label.
# Keys are created only for cluster labels that actually occur. Sizing the
# dictionary from the number of distinct labels would also count the noise
# label and leave one phantom, always-empty cluster behind.
cluster_labels = sorted({int(label) for label in clusters if label != -1})
clusters_grouped = {label: [] for label in cluster_labels}

# The noise bucket always exists and stays last, even when it is empty.
clusters_grouped[-1] = []

# Row i of corpus_3 received label clusters[i].
for row, label in zip(corpus_3, clusters):
    clusters_grouped[int(label)].append(row)

In [None]:
# Print the text of every member, cluster by cluster, with a separator line
# after each cluster.
for members in clusters_grouped.values():
    for row in members:
        text = row[0]
        print(text)
    print("------------------------------------")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
e300
e300
e300
e300
e300
e300
e300
e300
e300
e300
e300
e300
e300
e300
e300
e300
e300
e300
e300
e300
e300
e300
e300
e300
e300
e300
e300
e300
e300
e300
e300
e300
e300
e300
e300
e300
e300
e300
e300
e300
e300
e300
e300
e300
e300
e300
e300
e300
e300
e300
e300
e300
e300
e300
e300
e300
------------------------------------
e331
e331
e331
e331
e331
e331
e331
e331
e331
e331
e331
e331
e331
e331
e331
e331
e331
e331
e331
e331
e331
e331
e331
------------------------------------
sodium citrates
sodium citrates
sodium citrates
sodium citrates
sodium citrates
sodium citrates
sodium citrates
sodium citrates
sodium citrates
sodium citrates
sodium citrates
sodium citrates
sodium citrates
sodium citrates
sodium citrates
sodium citrates
sodium citrates
sodium citrates
sodium citrates
sodium citrates
sodium citrates
sodium citrates
sodium citrates
------------------------------------
e428
e428
e428
e428
e428
e428
e428
e428
e428
e428
e428
e428
e

In [None]:
# Label every corpus entry from the cluster it fell into. A cluster is
# labelled 1 when more than 20% of its rows carry flag 1 (i.e. came from the
# additives column); otherwise all of its rows are labelled 0.
corpus_classification = []
for members in clusters_grouped.values():
    member_count = len(members)
    flagged_count = sum(row[1] for row in members)

    # Empty groups fall through to label 0 and contribute no rows.
    if member_count > 0 and (flagged_count / member_count) > 0.2:
        group_label = 1
    else:
        group_label = 0

    corpus_classification.extend([row[0], group_label] for row in members)

Silhouette score

In [None]:
from sklearn.metrics import silhouette_score

In [None]:

def calculate_silhouette_score(tfidf_matrix, clusters):
    """Return the silhouette score of a clustering.

    Args:
        tfidf_matrix: Feature matrix handed to ``silhouette_score``.
        clusters: Cluster label of each row of ``tfidf_matrix``.

    Returns:
        The value returned by ``silhouette_score(tfidf_matrix, clusters)``.

    Raises:
        ValueError: If ``clusters`` holds at most one distinct label.
    """
    distinct_labels = set(clusters)
    if len(distinct_labels) <= 1:
        raise ValueError("Silhouette Score cannot be calculated because there is only one cluster or all points are noise.")
    return silhouette_score(tfidf_matrix, clusters)


In [None]:
 # Silhouette of the DBSCAN labelling, computed on the PCA-reduced features.
 # NOTE(review): `clusters` is passed unfiltered, so the noise label (-1) is
 # presumably scored as if it were a cluster of its own — confirm this is intended.
 score = calculate_silhouette_score(tfidf_reduced, clusters)
 print(f"Silhouette Score: {score:.4f}")


Silhouette Score: 0.5798


# **CLASSIFICATION**

In [None]:
# Separate the labelled corpus into texts (x) and their 0/1 labels (y).
x = [text for text, _label in corpus_classification]
y = [label for _text, label in corpus_classification]

In [None]:
# Encode the labelled texts as TF-IDF features.
# NOTE(review): this re-fits the vectorizer that was already fitted on
# `corpus`, and does so on all rows before the train/test split below —
# confirm that fitting on the test rows as well is acceptable here.
X = vectorizer.fit_transform(x)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Random forest with 100 trees, seeded so that training is repeatable.
classifier = RandomForestClassifier(n_estimators=100, random_state=42)

In [None]:
# Fit the classifier and time it. perf_counter() is a monotonic,
# high-resolution clock intended for measuring elapsed time; time.time()
# follows the system clock and can jump if that clock is adjusted mid-run.
start_time = time.perf_counter()
classifier.fit(X_train, y_train)
end_time = time.perf_counter()
random_forest_training_time = end_time - start_time
print("Random Forest training time:", random_forest_training_time)

Random Forest training time: 2.8284993171691895


In [None]:
# Serialise the fitted classifier and report the size of the pickle payload.
# len() is the number of serialised bytes; sys.getsizeof() would instead
# report the in-memory footprint of the bytes object, header included.
model_bytes = pickle.dumps(classifier)
model_size = len(model_bytes)
model_size_mb = model_size / (1024 * 1024)
print(f"Model size: {model_size_mb:.2f} MB")

Model size: 7.27 MB


In [None]:
# Predicted labels for the held-out rows.
y_pred = classifier.predict(X_test)

In [None]:
# Overall share of held-out rows that were labelled correctly.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Per-class precision / recall / F1, printed under its heading.
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

# Counts of true vs. predicted labels, printed under its heading.
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

Accuracy: 0.9964106245513281
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3582
           1       1.00      0.99      0.99      1990

    accuracy                           1.00      5572
   macro avg       1.00      1.00      1.00      5572
weighted avg       1.00      1.00      1.00      5572

Confusion Matrix:
[[3573    9]
 [  11 1979]]


# **TESTING**

In [None]:
# In practice only the ingredient list printed on the package is available,
# so the check starts from a single raw ingredient string.
# NOTE(review): this row is taken from filtered_df, the same frame the
# training corpus was built from — it is not an unseen example.
testing_ingredients = filtered_df['ingredients_text'].iloc[23]

# Clean and tokenise the string with the same pipeline used for training.
test_preprocess_ingredient = preprocess(testing_ingredients)

# Encode each token with the already-fitted TF-IDF vectorizer
# (transform only, no re-fit).
test_vectorizer = vectorizer.transform(test_preprocess_ingredient)

# Predict, per token, whether it is an additive (1) or not (0).
test_prediction = classifier.predict(test_vectorizer)

# Pair every token with its prediction for display.
test_result = np.column_stack((test_preprocess_ingredient, test_prediction))
print(test_result)

[['peanut butter dry roasted peanuts' '0']
 [' palm oil' '0']
 [' salt' '0']
 [' honey' '0']
 [' crispy brown rice brown rice flour rice flour' '0']
 [' rice bran' '0']
 [' honey' '0']
 [' calcium carbonate' '0']
 [' barley malt' '0']
 [' soy lecithin emulsifier' '0']
 [' sea salt' '0']
 [' locust bean gum' '0']
 [' carrageenan gum' '0']]
