In [None]:
import json
from langchain.document_loaders import CSVLoader
from langchain.llms import OpenAI
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
import pandas as pd


# 1. Carregar os dados
file = 'inputs/OutdoorClothingCatalog_1000.csv'
loader = CSVLoader(file_path=file, encoding="utf-8")
docs = loader.load()

documents_dict = [{doc.page_content} for doc in docs]

In [54]:
# 2. Definir o LLM e criar o prompt
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

prompt_template = """
You are a product classification assistant. Below are details about a product from an outdoor clothing catalog. 
The dataset contains structured information about outdoor products, including clothing and accessories, 
with attributes such as name and huge description.

Product details:
Product Name: {product_name}
Description: {description}

Classify this product into categories:

Return only the categories as a valid JSON list, in this format:
["Category1", "Category2", "Category3"]
Do not add any other text, only the JSON list.
"""

prompt = PromptTemplate(
    input_variables=["product_name", "description"],
    template=prompt_template
)

# 3. Criar o pipeline de classificação
chain = LLMChain(llm=llm, prompt=prompt)

# 4. Aplicar classificação nos dados e salvar resultados
classified_products = []
names = []
descriptions = []

for docs in documents_dict:
    for element in docs:
        match = re.search(r'name: (.*?)\n', element, re.IGNORECASE)
        if match:
            product_name = match.group(1)  # Extract the matched name

        match = re.search(r'description: (.*?)\n', element, re.IGNORECASE)
        if match:
            description = match.group(1)  # Extract the matched name


        names.append(product_name)
        descriptions.append(description)

        classification = chain.run({
            "product_name": product_name,
            "description": description,
        })
        
        # Print the model's output to debug
        print("Classification output:", classification)

        # Attempt to parse it as JSON only if it's valid
        try:
            categories = json.loads(classification)
        except json.JSONDecodeError:
            print("Error decoding JSON from the classification output.")
            categories = []

    classified_products.append(categories)


data = [names, descriptions, classified_products]

Classification output: ["Footwear", "Casual Shoes", "Women's Shoes"]
Classification output: ["Pet Accessories", "Dog Mats", "Home & Garden"]
Classification output: ["Clothing", "Swimwear", "Infant and Toddler"]
Classification output: ["Swimwear", "Tankini", "Watersports Clothing"]
Classification output: ["Clothing", "Pants", "Outdoor Gear"]
Classification output: ["Clothing", "Shirts", "Outdoor Apparel"]
Classification output: ["Clothing", "Gloves", "Outdoor Gear"]
Classification output: ["Clothing", "Women's Clothing", "Sweatshirts"]
Classification output: ["Clothing", "Outerwear", "Fleece Jackets"]
Classification output: ["Camping Equipment", "Outdoor Furniture", "Tables"]
Classification output: ["Clothing", "Loungewear", "Sweaters"]
Classification output: ["Clothing", "Outerwear", "Jackets"]
Classification output: ["Bedding", "Home Textiles", "Flannel Sheets"]
Classification output: ["Clothing", "Outerwear", "Shirts"]
Classification output: ["Clothing", "Shirts", "Short-Sleeve Shirt

In [64]:
# Combine the lists into a dictionary
data = {
    'names': names,
    'descriptions': descriptions,
    'classified_products': classified_products
}

# Convert the dictionary to a DataFrame
df = pd.DataFrame(data)

# Save the results to a JSON file
results_filename = f"inputs/ProductsCategories_OutdoorClothingCatalog_1000_results.csv"

# Save the DataFrame to a CSV file
df.to_csv(results_filename, index=False)  # `index=False` prevents writing row numbers

print("Documents have been saved to {results_filename}")

Documents have been saved to {results_filename}


In [None]:
file = 'inputs/OutdoorClothingCatalog_1000.csv'
OutdoorClothingCatalog = pd.read_csv(file)
OutdoorClothingCatalog

file = 'inputs/ProductsCategories_OutdoorClothingCatalog_1000_results.csv'
ProductsCategories_OutdoorClothingCatalog_1000_results = pd.read_csv(file)
ProductsCategories_OutdoorClothingCatalog_1000_results.rename(columns={'names': 'name'}, inplace=True)
ProductsCategories = ProductsCategories_OutdoorClothingCatalog_1000_results[['name','classified_products']]
ProductsCategories

finalDatabase = pd.merge(OutdoorClothingCatalog, ProductsCategories, on='name', how='inner')

# Save the results to a JSON file
results_filename = f"inputs/OutdoorClothingCatalog_1000_withCategories.csv"

# Save the DataFrame to a CSV file
finalDatabase.to_csv(results_filename, index=False)  # `index=False` prevents writing row numbers

print("Documents have been saved to {results_filename}")


Documents have been saved to {results_filename}
