In [None]:
# Install (or upgrade, -U) the Gemini Python SDK quietly (-q); version 0.8.3 or newer is required.
%pip install -U -q "google-generativeai>=0.8.3"

In [None]:
import time
import pandas as pd
import numpy as np
import google.generativeai as genai
from kaggle_secrets import UserSecretsClient
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm.auto import tqdm
# Patch tqdm with pandas to enable progress_apply
tqdm.pandas()
# Read the Gemini API key from Kaggle's secret store (nothing is hard-coded in the
# notebook) and configure the SDK with it.
user_secrets = UserSecretsClient()
GOOGLE_GEMINI_API_KEY = user_secrets.get_secret("GOOGLE_GEMINI_API_KEY")
genai.configure(api_key = GOOGLE_GEMINI_API_KEY)

In [None]:
# The file is read without a header row, so the two column names are supplied explicitly.
df = pd.read_csv(
    "/kaggle/input/ecommerce-text-classification/ecommerceDataset.csv",
    header=None,
    names=["category", "description"],
)

In [None]:
# Discard every row that has a missing value, then preview the first three rows.
df = df.dropna()
df.head(3)

In [None]:
# Print the frame summary (columns, dtypes, non-null counts) after the null rows were dropped.
df.info()

In [None]:
# Map each category name to an integer id and store it alongside the text.
label_encoder = LabelEncoder()
df["categorical_label"] = label_encoder.fit_transform(df["category"])

# Hold out 20% of the rows for testing; the split is seeded and stratified on "category".
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=0,
    stratify=df["category"],
)
print("Total Training Samples: ", len(df_train))
print("Total Testing Samples: ", len(df_test))

In [None]:
# Number of training rows per category.
df_train["category"].value_counts()

In [None]:
# Show every model whose supported generation methods include "embedContent".
embedding_model_names = [
    candidate.name
    for candidate in genai.list_models()
    if "embedContent" in candidate.supported_generation_methods
]
for model_name in embedding_model_names:
    print(model_name)

In [None]:
# Distinct category names present in the training split.
df_train["category"].unique()

In [None]:
# Creating Embeddings

def get_embedding(model, text):
    """Embed one piece of text and return the vector as a NumPy array.

    Args:
        model: Embedding model name, forwarded to ``genai.embed_content``.
        text: The content to embed.

    Returns:
        ``np.ndarray`` built from the ``"embedding"`` entry of the response.
        No retry or error handling is done here; exceptions propagate.
    """
    response = genai.embed_content(
        model=model,
        content=text,
        task_type="classification",
    )
    return np.array(response["embedding"])
    
def make_embed_text_fn(model, timeout=300.0, retry_interval=60.0, max_chunk_size=10000):
    """Build a one-argument embedding function with size guarding and retries.

    Args:
        model: Embedding model name, forwarded to ``genai.embed_content``.
        timeout: Maximum number of seconds to spend on a single text,
            including the waits between retries.
        retry_interval: Seconds to wait after a failed request before trying
            again. The wait is shortened so it never runs past ``timeout``.
        max_chunk_size: Maximum UTF-8 encoded size, in bytes, of a text that
            will be sent; larger texts are skipped.

    Returns:
        A function ``embed_fn(text)`` that returns the embedding as an
        ``np.ndarray``, or ``None`` when the text is too large or no request
        succeeded before the timeout.
    """

    def embed_fn(text):
        # Oversized inputs are skipped up front rather than sent to the API.
        if len(text.encode("utf-8")) > max_chunk_size:
            print("Skipping text, exceeds size limit")
            return None

        # A monotonic clock keeps the deadline immune to wall-clock adjustments.
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            try:
                response = genai.embed_content(
                    model=model,
                    content=text,
                    task_type="classification",
                )
                return np.array(response["embedding"])
            except Exception as exc:
                # Best-effort retry: any failure is reported and retried until the deadline.
                print(f"Error generating embedding: {exc}")
                remaining = deadline - time.monotonic()
                if remaining <= 0:
                    break
                # Never sleep past the deadline; the last wait may be shorter.
                wait = min(retry_interval, remaining)
                print(f"Retrying in {wait:.1f} seconds...")
                time.sleep(wait)

        print("Timeout reached while generating embeddings.")
        return None

    return embed_fn

def create_embeddings(df, model='models/text-embedding-004'):
    """Return a copy of ``df`` with an ``Embeddings`` column added.

    Each value of the ``description`` column is passed through the function
    built by ``make_embed_text_fn(model)``; rows for which that function
    returns ``None`` end up with a missing embedding.

    Args:
        df: Frame with a ``description`` column. It is not modified.
        model: Embedding model name; defaults to the model the notebook uses.

    Returns:
        A new DataFrame with the same rows plus the ``Embeddings`` column.
    """
    # Work on a copy: the caller's frame may be a slice of a larger frame, and
    # writing a column into it mutates hidden state and can raise
    # SettingWithCopyWarning.
    df = df.copy()
    # progress_apply is available because tqdm.pandas() was called at import time.
    df['Embeddings'] = df['description'].progress_apply(make_embed_text_fn(model))
    return df

# Embed both splits. Every description is run through the embedding function, so this
# is the slow, API-bound step of the notebook. The train result is bound before the
# test split is processed.
df_train = create_embeddings(df_train)
df_test = create_embeddings(df_test)

In [None]:
# Remove rows with missing values, i.e. texts whose embedding came back as None.
df_train = df_train.dropna()
df_test = df_test.dropna()

In [None]:
# Row counts of the two splits after rows without embeddings were dropped.
print(len(df_train))
print(len(df_test))

In [None]:
# Save the training split, embeddings included, to train.csv in the working directory.
# NOTE(review): the Embeddings column holds NumPy arrays, which CSV presumably stores
# as their text representation — confirm the file can be parsed back as needed.
df_train.to_csv("train.csv")

In [None]:
# Save the test split, embeddings included, to test.csv in the working directory.
# NOTE(review): the Embeddings column holds NumPy arrays, which CSV presumably stores
# as their text representation — confirm the file can be parsed back as needed.
df_test.to_csv("test.csv")

In [None]:
# Preview the first three training rows, now including the Embeddings column.
df_train.head(3)

In [None]:
# Length of one embedding vector, taken from the first training row.
first_embedding = df_train["Embeddings"].iloc[0]
print("Embedding Size: ", len(first_embedding))

In [None]:
import keras
from keras import layers
from keras.models import Model

# Build model function
def build_classification_model(input_size, num_classes):
    """Assemble a one-hidden-layer classifier over fixed-size embedding vectors.

    Args:
        input_size: Length of each input vector; also used as the width of
            the hidden layer.
        num_classes: Number of units in the softmax output layer.

    Returns:
        An uncompiled Keras ``Model`` mapping ``embeddings_input`` to
        ``output_probs``.
    """
    embeddings_in = layers.Input(shape=(input_size,), name="embeddings_input")
    # ReLU hidden layer with as many units as the input has dimensions.
    features = layers.Dense(input_size, activation="relu", name="hidden")(embeddings_in)
    # Softmax output with one unit per class.
    class_probs = layers.Dense(num_classes, activation="softmax", name="output_probs")(features)
    return Model(inputs=embeddings_in, outputs=class_probs)

In [None]:
# Both layer sizes come from the data: the embedding length and the number of categories.
input_size = len(df_train["Embeddings"].iloc[0])
num_classes = len(df["category"].unique())

# Build the network and print its layer summary.
classifier = build_classification_model(input_size=input_size, num_classes=num_classes)
classifier.summary()

# Sparse categorical cross-entropy loss, Adam at learning rate 0.001, accuracy metric.
loss_fn = keras.losses.SparseCategoricalCrossentropy()
adam = keras.optimizers.Adam(learning_rate=0.001)
classifier.compile(loss=loss_fn, optimizer=adam, metrics=["accuracy"])

In [None]:
NUM_EPOCHS = 100
BATCH_SIZE = 32

# Stack the per-row embedding arrays into 2-D feature matrices; the labels are the
# integer ids stored in "categorical_label".
x_train = np.array(df_train["Embeddings"].tolist())
y_train = np.array(df_train["categorical_label"])
x_test = np.array(df_test["Embeddings"].tolist())
y_test = np.array(df_test["categorical_label"])
print(x_train.shape)
print(x_test.shape)

# Stop on the held-out metric rather than training accuracy: training accuracy keeps
# rising while the model overfits, so it is a poor stopping signal. The best weights
# seen are restored so the final model is not the one from `patience` epochs later.
# NOTE(review): the validation data is the test split, so the score reported on it
# afterwards is optimistic — consider carving a validation set out of the training data.
early_stop = keras.callbacks.EarlyStopping(
    monitor="val_accuracy",
    patience=3,
    restore_best_weights=True,
)

# Train the model
history = classifier.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_test, y_test),
    callbacks=[early_stop],
    batch_size=BATCH_SIZE,
    epochs=NUM_EPOCHS
)

In [None]:
# Score the trained classifier on the test arrays; return_dict=True returns the results as a dict.
classifier.evaluate(x=x_test, y=y_test, return_dict=True)

In [None]:
# Display the training frame, including its index labels and the Embeddings column.
df_train

In [None]:
# Pull one raw description out of the training split for manual inspection.
# The result is bound to a descriptive name instead of `str`, which would shadow the
# built-in type for every later cell. The lookup falls back to the first row when
# the hard-coded label is not in the index (for example, dropped for a missing embedding).
EXAMPLE_ROW_LABEL = 21148
if EXAMPLE_ROW_LABEL in df_train.index:
    example_description = df_train.loc[EXAMPLE_ROW_LABEL, "description"]
else:
    example_description = df_train["description"].iloc[0]

In [None]:
# Hand-written product description used as a one-off input for the classifier below.
sample_description = """
TechClean Premium Microfiber Cloths for Electronics (Set of 4) - Color: Sleek Black & Gray

The TechClean Premium Microfiber Cloths are designed specifically for cleaning and maintaining your valuable electronic devices. Crafted with ultra-soft, lint-free fibers, these cloths are perfect for safely wiping down delicate screens, lenses, keyboards, and other tech surfaces. Whether you’re cleaning your smartphone, laptop, tablet, or TV, the microfiber material gently lifts dust, fingerprints, and smudges without scratching or damaging your devices.

This set includes four cloths in sleek black and gray, ensuring you always have a clean, designated cloth for each device. They’re reusable and machine washable, so you can keep them in top condition after each use. Thanks to the advanced microfiber weave, these cloths capture dust and oil particles effectively, leaving your electronics clean and streak-free with just a light wipe—no harsh chemicals required.

Each cloth measures 10 x 10 inches, the perfect size for easy handling while cleaning screens and other tech surfaces. Whether you're at home, in the office, or on the go, TechClean cloths provide a quick and efficient solution for maintaining your electronics in pristine condition.
"""

# Embed the sample with the same model and task type used for the training data,
# wrap it as a batch of one, and take the single row of class probabilities.
embedding_of_sample = get_embedding(model = "models/text-embedding-004",
                                   text = sample_description)
sample_input = np.array([embedding_of_sample])
[result] = classifier.predict(sample_input)

# The training labels came from label_encoder.fit_transform, so output unit i
# belongs to label_encoder.classes_[i]. Reading names from the first rows of
# df_test would attach arbitrary category names to the probabilities.
for category, probability in zip(label_encoder.classes_, result):
    print(f"{category}: {probability * 100:0.2f}%")