# import libraries

In [1]:
# !pip install openai --upgrade

In [1]:
import os
import json
import re
import pandas as pd
from C_D_create_embeddings import embed_caption_api

# Define Category and Filenames

In [2]:
# Category to process; rerun the notebook once per category (dress, outerwear).
fashion_category = "dress"

# Embedding size suffix; it is part of the embeddings filename below.
emb_type = "small"

# Folder that holds the images of the selected category. The captions CSV and
# the embeddings JSON are resolved relative to this folder.
category_folder_path = f"C:/Users/Esra/Desktop/Deep_Learning/Image_Classification/Fashion/Classify_ThreadUp_Images/data/threadup/{fashion_category}/"

# CSV file that stores the image captions, and its full path.
captions_csv = f'thread_descriptions_{fashion_category}.csv'
captions_csv_path = os.path.join(category_folder_path, captions_csv)

# JSON file that stores the embeddings of the image captions.
# NOTE(review): the filename hard-codes "dress" even when fashion_category is
# something else — confirm this is intended before running other categories.
embeddings_json_path = os.path.join(
    category_folder_path,
    f'embeddings_dict_batch_dress_{emb_type}.json',
)

# Create Database

In [3]:
# Load the captions CSV file into a DataFrame; the code below reads its
# 'description' column.
# NOTE(review): the displayed output contains an 'Unnamed: 0' column,
# presumably an index that was saved with the CSV — consider index_col=0 if
# that column is not needed.
captions = pd.read_csv(captions_csv_path)

# Fields extracted from every caption, in output order.
_DESCRIPTION_FIELDS = (
    'Overall Description',
    'Specific Category',
    'Color',
    'Material',
    'Features',
    'Vibe',
    'Cut',
    'Occasion',
)

# Matches one "- Field name: value" line; compiled once instead of per call.
_FIELD_LINE = re.compile(r'- (.*?): (.*)')


def parse_description(description):
    """Split a bulleted caption into one value per known field.

    Each line of the form ``- <field>: <value>`` whose field name is one of
    ``_DESCRIPTION_FIELDS`` contributes its stripped value; all other lines
    are ignored. If a field appears more than once, the last occurrence wins.

    Args:
        description: The caption text. Non-string input (for example the NaN
            that pandas uses for an empty CSV cell) is treated as a caption
            without any fields instead of raising.

    Returns:
        A ``pd.Series`` indexed by the field names; fields that were not
        found hold a missing value.
    """
    parsed_data = dict.fromkeys(_DESCRIPTION_FIELDS)

    # Missing captions cannot be split; return all-missing fields so the row
    # can be removed by a later dropna() instead of crashing the apply().
    if not isinstance(description, str):
        return pd.Series(parsed_data)

    for line in description.split('\n'):
        match = _FIELD_LINE.match(line)
        if match is None:
            continue
        field_name = match.group(1).strip()
        # Only keep fields we know about; unknown bullet points are skipped.
        if field_name in parsed_data:
            parsed_data[field_name] = match.group(2).strip()

    return pd.Series(parsed_data)

# Parse every caption into one column per field.
parsed_descriptions = captions['description'].apply(parse_description)

# Attach the parsed columns, remove the raw 'description' column, discard the
# rows (images) that contain missing values, and renumber the index.
captions = (
    captions.join(parsed_descriptions)
    .drop('description', axis=1)
    .dropna()
    .reset_index(drop=True)
)
# captions = captions.iloc[50:52]
captions.head()

Unnamed: 0,filename,Overall Description,Specific Category,Color,Material,Features,Vibe,Cut,Occasion
0,item154048282.jpg,A long-sleeved black dress with a modest scoop...,Long-sleeved dress.,Solid black with no visible patterns or designs.,"Appears to be a knit fabric, likely a stretch ...","Simple scoop neckline, long fitted sleeves, an...","A classic, versatile, and understated elegance.","Bodycon style, designed to closely follow the ...","Appropriate for various occasions, ranging fro..."
1,item160883176.jpg,An elegant mid-length dress in a deep black co...,Cocktail dress.,Solid deep black with a glossy shine and semi-...,Appears to be made of a satin-like fabric with...,"Sweetheart neckline, ruched bodice, sleeveless...","Classic and sophisticated, with a hint of vint...",Fit-and-flare silhouette with a defined waist ...,"Suitable for formal events, such as cocktail p..."
2,item160884729.jpg,A calf-length dress with vibrant and colorful ...,Midi dress.,"Bright hues of pink, green, and red dominate w...","Appears to be a lightweight, possibly silk-lik...",Long sleeves with a gentle puff at the shoulde...,"The dress exudes a lively, tropical vibe that'...",The dress has a loose and flowing cut with a c...,"Ideal for spring and summer events, such as re..."
3,item160885991.jpg,"A mid-calf length, coral red dress with puffed...",Midi dress.,A solid coral red with a vibrant hue.,"Appears to be a lightweight, semi-sheer fabric...",The dress features short puffed sleeves with e...,It exudes a playful yet elegant summer vibe wi...,The dress has a loose and flowing cut that lik...,"Perfect for casual summer events, such as brun..."
4,item160886021.jpg,"A sleeveless, knee-length lace dress in a cris...",Lace dress.,"Pure white with a homogenous color throughout,...","Lace fabric with a floral pattern, possibly a ...","Sleeveless with a high neckline, a visible sea...",Exhibits a classic and elegant vibe with a hin...,Straight cut that gently contours around the b...,"Ideal for formal daytime events, such as brida..."


# Call Embedding Function

In [5]:
# `captions` is assumed to be a pandas DataFrame with text data in each column.

# Reuse the embeddings saved in the JSON file when it exists; otherwise start
# with one empty list per caption column.
try:
    with open(embeddings_json_path, 'r') as json_file:
        embeddings_dict = json.load(json_file)
except FileNotFoundError:
    embeddings_dict = {
        column: []
        for column in (
            'Overall Description',
            'Specific Category',
            'Color',
            'Material',
            'Features',
            'Vibe',
            'Cut',
            'Occasion',
        )
    }

# Number of texts sent to the OpenAI API in one request.
batch_size = 50

# Function to process a batch of texts and store their embeddings
def process_batch(text_list, subembed_key):
    """Embed one batch of texts and checkpoint all embeddings to disk.

    Args:
        text_list: Texts passed to ``embed_caption_api`` in a single call.
        subembed_key: Key of ``embeddings_dict`` whose list is extended with
            the result (presumably one embedding per text — confirm against
            ``embed_caption_api``).

    Side effects:
        Extends ``embeddings_dict[subembed_key]`` in place and rewrites
        ``embeddings_json_path`` with the whole dictionary.
    """
    batch_embeddings = embed_caption_api(text_list, model=f"text-embedding-3-{emb_type}")
    embeddings_dict[subembed_key].extend(batch_embeddings)

    # Save embeddings regularly. Write to a temporary file first and then swap
    # it in, so an interruption during the dump cannot leave a truncated JSON
    # file that the next run would fail to load.
    tmp_path = f"{embeddings_json_path}.tmp"
    with open(tmp_path, 'w') as f:
        json.dump(embeddings_dict, f, indent=4)
    os.replace(tmp_path, embeddings_json_path)
    
# Loop through each pandas column (subembed_key) and get embeddings for the
# entries that are not embedded yet. embeddings_dict may already hold results
# (it is loaded from the JSON file when that exists), so each column resumes
# after the stored embeddings instead of appending duplicates on a re-run.
for subembed_key in embeddings_dict:
    # Clean the texts: newlines are replaced by spaces.
    column_texts = [text.replace("\n", " ") for text in captions[subembed_key]]

    # Assumes the stored embeddings belong, in order, to the first rows of
    # `captions` (true when they were produced by an earlier run of this loop
    # on the same data).
    already_embedded = len(embeddings_dict[subembed_key])
    if already_embedded > len(column_texts):
        print(
            f"Warning: '{subembed_key}' already has {already_embedded} embeddings "
            f"but captions has only {len(column_texts)} rows; skipping this column."
        )
        continue

    # Send the remaining texts in chunks of at most batch_size; the last chunk
    # may be shorter, and nothing is sent when the column is already complete.
    for start in range(already_embedded, len(column_texts), batch_size):
        process_batch(column_texts[start:start + batch_size], subembed_key)


In [6]:
# Sanity check: number of embeddings stored for the "Occasion" column.
len(embeddings_dict["Occasion"])

200

# Save embeddings - create an SQLite database and connect to it from here!

In [None]:

# with open(embeddings_json_path, 'w') as f:
#     json.dump(embeddings_dict, f, indent=4)
    
# Maybe in the future, when the database is bigger, JSON might become too slow; consider another format then (YAML was the original idea, but it is typically slower to parse than JSON).