# Categorize Items
This notebook takes the baseline descriptions and assigns each one a short (1–3 word) category so that the model's performance can be analyzed per category.

## Import Mistral-7B

In [1]:
from google.colab import drive
import pandas as pd

In [2]:
from transformers import pipeline

> Set `DIR` to the directory in Google Drive where the data files are read from and written to.

In [3]:
# Placeholder: replace with the path of the data directory inside the mounted Drive
DIR = 'path-to-directory-in-drive'

In [4]:
# Mount Google Drive into the Colab filesystem
drive.mount('/content/drive')

# Switch the working directory to DIR so the relative file names used below resolve there
%cd "$DIR"

Mounted at /content/drive
/content/drive/MyDrive/CS/WGU/Computer Science Capstone - C964/vlm-image-to-description-generator/data


In [None]:
# Authenticate with the Hugging Face Hub through its CLI before the model is downloaded
!hf auth login

In [None]:
# Load Mistral-7B-v0.1 as a text-generation pipeline; `pipe` is used by the labeling step below.
# Use a GPU runtime when loading this model.
# NOTE(review): no device or dtype argument is passed here, so placement and precision are
# left to the transformers defaults — confirm the model actually ends up on the GPU.
from transformers import pipeline

pipe = pipeline("text-generation", model="mistralai/Mistral-7B-v0.1")

## Read in baseline descriptions

In [7]:
# Read the baseline descriptions; the first CSV column becomes the index.
df = pd.read_csv("baseline_descriptions.csv", index_col=0)

# Preview the first rows to confirm the load.
df.head()

Unnamed: 0,item_number,Description
0,1,Impact-branded photo backdrop kit including a ...
1,2,Interfit Stellar X flash lighting unit with ad...
2,3,Interfit Stellar X 300 studio strobe light wit...
3,4,Interfit Stellar X 300 studio flash unit equip...
4,5,Interfit COR 751 lighting kit with a wheeled c...


## Define labeling instructions for model

In [8]:
def label_descriptions(data, generator=None):
  """Label every row of ``data`` with a short category generated by the model.

  For each row, the text in the ``Description`` column is inserted into a fixed
  instruction prompt and passed to a text-generation pipeline. The first
  non-blank line of the generated text (stripped of surrounding whitespace)
  becomes that row's category; if the output contains no non-blank line, the
  category is an empty string.

  Rows are processed one by one in frame order, so ``item_number`` values do
  not have to be contiguous (the previous ``range(min, max + 1)`` lookup raised
  ``IndexError`` on any gap) and an empty frame is handled without error.

  Args:
    data: DataFrame with a ``Description`` column. Modified in place: a
      ``category`` column is added, or overwritten if it already exists.
    generator: Optional callable used in place of the notebook-level ``pipe``.
      It must accept the prompt plus the keyword arguments built below, expose
      ``tokenizer.eos_token_id``, and return a list whose first element is a
      dict with a ``"generated_text"`` string.

  Returns:
    None. Results are written to ``data['category']``.
  """
  if generator is None:
    generator = pipe

  # These arguments are the same for every row, so build them once.
  # NOTE(review): the transformers text-generation pipeline appears to turn
  # `stop_sequence` into an `eos_token_id` override based on only the first
  # token of the sequence — confirm against the installed transformers version.
  generation_kwargs = dict(
    max_new_tokens=20,
    do_sample=False,
    return_full_text=False,
    eos_token_id=generator.tokenizer.eos_token_id,
    stop_sequence="Description:",
  )

  categories = []
  for description in data['Description']:
    prompt = f"""
    You are categorizing auction items.

    Task:
    - Output exactly one category name.
    - The category must be 1–3 words only.
    - It must be a generic, broad category (e.g., "Photography Equipment", "Furniture", "Tools").
    - Do not provide multiple categories, alternatives, or synonyms.
    - Do not use parentheses, slashes, or the word "or".
    - Answer only with the category name. No explanation, no punctuation, no extra text.

    Description:
    {description}
    Answer:"""

    result = generator(prompt, **generation_kwargs)

    # Keep only the first non-blank line; anything after it is model run-on.
    raw = result[0]["generated_text"]
    category = next((line.strip() for line in raw.split("\n") if line.strip()), "")
    categories.append(category)

  # Single positional assignment: one category per row, in row order.
  data['category'] = categories

In [None]:
# Run the model over the descriptions; this writes a 'category' column into df in place
label_descriptions(df)

In [10]:
# Inspect the generated categories to spot any that need correcting by hand
df

Unnamed: 0,item_number,Description,category
0,1,Impact-branded photo backdrop kit including a ...,Photography Equipment
1,2,Interfit Stellar X flash lighting unit with ad...,Photography Equipment
2,3,Interfit Stellar X 300 studio strobe light wit...,Photography Equipment
3,4,Interfit Stellar X 300 studio flash unit equip...,Photography Equipment
4,5,Interfit COR 751 lighting kit with a wheeled c...,Photography Equipment
5,6,Pair of Impact floodlight fixtures mounted on ...,Photography Equipment
6,7,Assorted hand tool collection including Sears ...,Tools
7,8,Box of assorted hand tools including multiple ...,Tools
8,9,Clear storage bin filled with grinding and cut...,Metalworking
9,10,"Heavy-duty Armstrong slide hammer puller set, ...",Tools


## Correct some categories by hand

In [11]:
def correct_categories(df, corrections):
  """Overwrite the category of selected items with hand-picked values.

  Args:
    df: DataFrame with an ``item_number`` column; its ``category`` column is
      updated in place.
    corrections: Iterable of ``(item_number, category)`` pairs. Pairs are
      applied in order, so a later pair for the same item wins. A pair whose
      item number matches no row changes nothing.

  Returns:
    The same ``df`` object that was passed in.
  """
  for pair in corrections:
    target_item, new_label = pair
    matching_rows = df['item_number'].eq(target_item)
    df.loc[matching_rows, 'category'] = new_label
  return df

In [22]:
# Manual overrides as (item_number, category) pairs; each replaces the
# model-generated category for that item.
corrections = [
  (9, 'Tools'),
  (21, 'Lighting'),
  (32, 'Art'),
  (38, 'Collectibles'),
  (39, 'Art'),
  (40, 'Decorative'),
  (41, 'Kitchen Appliance'),
  (42, 'Decorative'),
  (44, 'Rugs'),
  (47, 'Flatware'),
  (50, 'Tools'),
  (53, 'Tools'),
  (54, 'Collectibles'),
  (55, 'Collectibles'),
  (57, 'Collectibles'),
  (56, 'Tools')
]

# Apply the overrides to the 'category' column
df = correct_categories(df, corrections)

In [23]:
# Display the frame again to confirm the corrections were applied
df

Unnamed: 0,item_number,Description,category
0,1,Impact-branded photo backdrop kit including a ...,Photography Equipment
1,2,Interfit Stellar X flash lighting unit with ad...,Photography Equipment
2,3,Interfit Stellar X 300 studio strobe light wit...,Photography Equipment
3,4,Interfit Stellar X 300 studio flash unit equip...,Photography Equipment
4,5,Interfit COR 751 lighting kit with a wheeled c...,Photography Equipment
5,6,Pair of Impact floodlight fixtures mounted on ...,Photography Equipment
6,7,Assorted hand tool collection including Sears ...,Tools
7,8,Box of assorted hand tools including multiple ...,Tools
8,9,Clear storage bin filled with grinding and cut...,Tools
9,10,"Heavy-duty Armstrong slide hammer puller set, ...",Tools


## Save data to a CSV file in the data folder

In [24]:
# Write the labeled descriptions to the current working directory; the DataFrame
# index is written as the first column (pandas default)
df.to_csv('auction_13_categories.csv')