# 1.0 Imports

In [7]:
from tqdm import tqdm
from openai import OpenAI
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import math
import os
from dotenv import load_dotenv

# 2.0 Setting model

In [8]:
# Load environment variables from the .env file
load_dotenv()

# Retrieve the API key from the environment variable
openai_api_key = os.getenv("OPENAI_API_KEY")

# Instantiate the API client. classify_product() reads it through the
# module-level name `client`; without this assignment the first
# classification call fails with NameError.
client = OpenAI(api_key=openai_api_key)

# 3.0 Helper functions

In [9]:
def classify_product(product_name, categories, known_examples):
    """Ask the chat model to assign ``product_name`` to one of ``categories``.

    A few-shot prompt is built from ``known_examples`` and sent through the
    module-level ``client`` to the ``gpt-4o`` chat-completions endpoint with
    ``temperature=0.0``.

    Args:
        product_name: Text describing the product to classify.
        categories: Iterable of allowed category names (strings).
        known_examples: Mapping of product text -> category, rendered as the
            example classifications at the top of the prompt.

    Returns:
        The text of the first completion choice with surrounding whitespace
        stripped, or an empty string when that choice carries no text content.
        The reply is not checked against ``categories``.
    """
    # Join the allowed labels once; the same text appears three times in the prompt.
    category_list = ', '.join(categories)
    examples = "\n".join(
        f"Product: {title}, Category: {category}"
        for title, category in known_examples.items()
    )
    prompt = f"""Here are some examples of product classifications:

    {examples}

    Now, classify the following product into a category.
    Follow the step by step below:
    1. Use ONLY the categories {category_list} to classify.
    2. Just provide the classification, without any explanation.
    3. Use only the classification provided in {category_list} to classification.
    4. Don't create new categories.

    Product: {product_name}
    Possible categories: {category_list}
    Category:"""

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=4096,
        n=1,
        stop=None,
        temperature=0.0,
    )

    # `message.content` is nullable in the API response; guard it so a single
    # empty reply does not raise AttributeError and abort a whole batch.
    content = response.choices[0].message.content
    if content is None:
        return ""
    return content.strip()

# Function to classify products in a DataFrame
def classify_dataframe_products(df, product_column, batch_size=600):
    """Classify every value of ``df[product_column]`` and return the labels.

    Rows are walked in slices of ``batch_size`` so the progress bar advances
    once per slice. Each value is passed to ``classify_product`` together
    with the module-level ``categories`` and ``known_examples``.

    Args:
        df: DataFrame holding the products.
        product_column: Name of the column whose values are classified.
        batch_size: Number of rows per progress-bar step.

    Returns:
        A list with one prediction per row, in row order.
    """
    labels = []
    batch_starts = range(0, len(df), batch_size)
    for start in tqdm(batch_starts, desc="Classifying products"):
        chunk = df[product_column].iloc[start:start + batch_size]
        for product in chunk:
            labels.append(classify_product(product, categories, known_examples))
    return labels

# Function to calculate similarity using cosine similarity
def calculate_similarity(row, vectorizer):
    """Return the cosine similarity between a row's true and predicted label.

    Args:
        row: Mapping-like row providing 'categoryName' and 'categoria predita'.
        vectorizer: Fitted object whose ``transform`` turns the two strings
            into a two-row matrix.

    Returns:
        The single similarity value between the first and second row vectors.
    """
    actual = row['categoryName']
    predicted = row['categoria predita']
    encoded = vectorizer.transform([actual, predicted])
    # Slice (rather than index) so each operand stays two-dimensional.
    pairwise = cosine_similarity(encoded[0:1], encoded[1:2])
    return pairwise[0][0]

# 4.0 Load dataset

In [10]:
# Read the processed products CSV into a DataFrame; the path is relative,
# so it is resolved against the current working directory.
df_raw = pd.read_csv("dataset/amz_br_total_products_data_processed.csv")

# 5.0 Cleaning NaN values

In [11]:
# Replace every missing value, in every column, with the placeholder string
# 'sem_informacoes' (Portuguese for "no information").
# NOTE(review): a string placeholder written into a numeric column would turn
# that column into object dtype — confirm which columns actually contain NaN.
df_raw = df_raw.fillna('sem_informacoes')
# Sanity check: number of NaN values remaining per column.
df_raw.isna().sum()

asin                 0
title                0
imgUrl               0
productURL           0
stars                0
reviews              0
price                0
listPrice            0
categoryName         0
isBestSeller         0
boughtInLastMonth    0
dtype: int64

# 6.0 Pick 50 samples from the dataset

In [13]:
# Keep only the `title` and `categoryName` columns and draw 50 random rows.
# NOTE(review): no random_state is passed, so each run draws a different
# sample and the metrics computed from it are not reproducible.
df = df_raw.loc[:, ["title", "categoryName"]].sample(50)


# 7.0 Implementing the AI Classification Model

In [None]:
# Split the sampled frame: the first 40 rows are the labelled few-shot
# examples, the last 10 rows are held out to be classified.
df_known = df.head(40)
df_compare = df.tail(10)

# Few-shot lookup: product title -> known category
known_examples = {
    title: category
    for title, category in zip(df_known['title'], df_known['categoryName'])
}

# Allowed labels: every distinct category present in the sample
categories = df['categoryName'].unique().tolist()

# Test data: the held-out rows with the true label removed
df_new_products = df_compare.drop(columns='categoryName')

# Predict a category for each held-out title
df_new_products['categoria predita'] = classify_dataframe_products(df_new_products, 'title')

# Put the true and predicted labels side by side, aligned on the row index
df_final = df_compare.loc[:, ["title", "categoryName"]].merge(
    df_new_products.loc[:, ["categoria predita"]],
    how='inner',
    left_index=True,
    right_index=True,
)
df_final

# 8.0 Validation metrics

In [None]:
# Fit one TF-IDF vocabulary over the true and the predicted labels together
combined_text = pd.concat([df_final['categoryName'], df_final['categoria predita']])
vectorizer = TfidfVectorizer()
vectorizer.fit(combined_text)

# Row-wise cosine similarity between the true and the predicted label
df_final['similaridade'] = df_final.apply(calculate_similarity, axis=1, args=(vectorizer,))

# Report the mean similarity across the evaluated rows
average_similarity = df_final['similaridade'].mean()
print(f"\nAverage Similarity: {average_similarity:.2f}")

# Flag rows whose prediction matches the true label exactly
df_final['validacao'] = df_final['categoryName'].eq(df_final['categoria predita'])

# Accuracy is the share of exact matches
accuracy = df_final['validacao'].mean()
print(f"Accuracy: {accuracy:.2f}")