In [None]:
import pandas as pd
import re

# Input workbooks: the AI-categorised items and the reference workbook of allowed values.
ai_completed_file_path = r"C:\Users\bened\OneDrive\Documents\Businesses\Relationship Predicting\Tesco Clubcards\4 - Processed Data Files\Categorisations\categorised_items_280524_2154.xlsx"
category_file_path = r"C:\Users\bened\OneDrive\Documents\Businesses\Relationship Predicting\Tesco Clubcards\2 - Training Data\Categorised Fine Tuning.xlsx"

# Extract the "<6 digits>_<4 digits>" stamp from the AI file path so the output
# file can carry the same stamp (presumably a date/time stamp - confirm).
datestring_match = re.search(r'\d{6}_\d{4}', ai_completed_file_path)
if datestring_match is None:
    # Fail with an actionable message rather than AttributeError on None.
    raise ValueError(
        "ai_completed_file_path does not contain a '<6 digits>_<4 digits>' stamp: "
        f"{ai_completed_file_path!r}"
    )
datestring = datestring_match.group()

# Output workbook, named after the stamp of the input it was derived from.
parsed_filepath = rf"C:\Users\bened\OneDrive\Documents\Businesses\Relationship Predicting\Tesco Clubcards\4 - Processed Data Files\Parsed Data\{datestring}_parsedfile.xlsx"

In [15]:
# Load the AI-categorised items.
ai_completed_df = pd.read_excel(ai_completed_file_path)

# Load the reference sheet of categories, flavours and tags.
category_df = pd.read_excel(category_file_path, sheet_name="LLM Fine Tuning")

# Every column except "Line Item" contributes allowed values.
columns_to_consider = category_df.drop(columns=['Line Item'])

# Flatten those columns into a single list of unique, non-null allowed values.
category_list = columns_to_consider.unstack().dropna().unique().tolist()

# Columns to validate: everything after the first column.
target_list = ai_completed_df.columns[1:].tolist()

# Flag rows where any non-null cell in the validated columns is not an allowed value.
# Done as a vectorised mask rather than a row-wise apply: it avoids a Python-level
# list scan per cell and still yields a boolean Series when the frame has no rows.
target_cells = ai_completed_df[target_list]
unrecognised_cells = target_cells.notna() & ~target_cells.isin(category_list)
# The helper column is kept on ai_completed_df, as before, so the rejected rows
# can still be inspected there.
ai_completed_df['drop_row'] = unrecognised_cells.any(axis=1)
cleaned_df = ai_completed_df[~ai_completed_df['drop_row']].drop(columns=['drop_row'])

# Identify the flavour and tag columns by the substring in their names.
flavour_columns = [name for name in cleaned_df.columns if 'Flavour' in name]
tag_columns = [name for name in cleaned_df.columns if 'Tag' in name]


def _non_null_values(row):
    """Return the non-null entries of one row as a plain Python list."""
    return row.dropna().tolist()


# Working frame: item name and category carried over as-is, with the separate
# flavour / tag columns collapsed into one list-valued column each.
process_df = pd.DataFrame({
    'Item Name': cleaned_df['Item Name'],
    'Category': cleaned_df['Category'],
    'Flavours': cleaned_df[flavour_columns].apply(_non_null_values, axis=1),
    'Tags': cleaned_df[tag_columns].apply(_non_null_values, axis=1),
})

In [16]:
# Categories whose items are treated as animal-derived unless their tags say
# otherwise (used by add_tags to decide when to add "Animal Produce").
meat_cats = [
    "Finfish And Alternatives",
    "Eggs",
    "Pizza",
    "Cheese",
    "Milk And Milk Drinks",
    "Poultry And Alternatives",
    "Prepared Sandwiches, Sushi, Pasta, Salads And Deli Food",
    "Processed Meats And Alternatives",
    "Ready Meals",
    "Red Meat And Alternatives",
    "Shellfish And Alternatives",
    "Tofu And Other Meat Alternatives",
    "Yoghurt And Cream",
]

# Categories that contain alcohol; an "Alcohol Free" tag on an item outside
# these categories is treated as rogue and removed.
alcohol_cats = [
    "Wine",
    "Beer",
    "Cider",
    "Mixed Drinks And Cocktails",
]

# Define the function to remove duplicates
def remove_duplicates(row):
    """Drop repeated entries from the row's "Tags" and "Flavours" lists.

    The first occurrence of each value is kept and the original order is
    preserved. Order matters because the lists are later spread across
    numbered columns, and a set-based de-duplication would place values in an
    arbitrary order that can differ from run to run.

    Args:
        row: Mapping-like row (e.g. a pandas Series) with iterable "Tags" and
            "Flavours" entries of hashable values.

    Returns:
        The same row object with both entries replaced by de-duplicated lists.
    """
    # dict.fromkeys keeps insertion order, giving a stable de-duplication.
    row["Tags"] = list(dict.fromkeys(row["Tags"]))
    row["Flavours"] = list(dict.fromkeys(row["Flavours"]))
    return row

# Add "Animal Produce" if the category suggests animal-derived food and the tags
# mention neither meat, seafood nor vegan.
# NOTE(review): this tags every such item in the listed categories, including
# untagged items in e.g. "Tofu And Other Meat Alternatives" - check for mis-tagging.
def add_tags(row, meat_categories=None):
    """Add the "Animal Produce" tag to rows whose category implies it.

    The tag is added when the row's category is one of ``meat_categories`` and
    its tags contain none of "Contains Meat", "Contains Seafood", "Vegan" or
    "Animal Produce".

    The previous second branch (appending "Vegan") could only run when "Vegan"
    was already present, so it only ever produced a duplicate; it has been
    removed. Rows are never given a "Vegan" tag by this function.

    Args:
        row: Mapping-like row (e.g. a pandas Series) with a "Category" entry
            and a list-valued "Tags" entry.
        meat_categories: Optional collection of category names to apply the
            rule to. Defaults to the module-level ``meat_cats``.

    Returns:
        The same row object; "Tags" is replaced by a new list when a tag is added.
    """
    if meat_categories is None:
        meat_categories = meat_cats

    tags = row['Tags']
    already_classified = any(
        tag in tags
        for tag in ('Contains Meat', 'Contains Seafood', 'Vegan', 'Animal Produce')
    )
    if row['Category'] in meat_categories and not already_classified:
        # Build a new list instead of appending in place: the list object is
        # shared with the source frame, so an in-place append would leak into
        # it and stack up duplicate tags when the cell is re-run.
        row['Tags'] = tags + ['Animal Produce']
    return row

# Function to remove "Animal Produce" if "Contains Meat" or "Contains Seafood" are in tags,
# and "Vegan" if any of "Contains Meat", "Contains Seafood" or "Animal Produce" are in tags
def remove_animal_vegan(tags):
    """Remove tags that are redundant or contradictory given the other tags.

    Two independent rules are applied, both evaluated against the incoming tags:

    * "Animal Produce" is removed when "Contains Meat" or "Contains Seafood"
      is present.
    * "Vegan" is removed when "Contains Meat", "Contains Seafood" or
      "Animal Produce" is present.

    Previously the second rule sat in an ``elif`` and was skipped whenever the
    first rule matched, so a row tagged both "Contains Meat" and "Vegan" kept
    its "Vegan" tag.

    Args:
        tags: List of tag strings.

    Returns:
        The original list object when nothing needs removing, otherwise a new
        list without the removed tags (order of the remaining tags preserved).
    """
    has_meat_or_seafood = "Contains Meat" in tags or "Contains Seafood" in tags
    has_animal_origin = has_meat_or_seafood or "Animal Produce" in tags

    to_remove = set()
    if has_meat_or_seafood:
        to_remove.add("Animal Produce")
    if has_animal_origin:
        to_remove.add("Vegan")

    if not to_remove:
        return tags
    return [tag for tag in tags if tag not in to_remove]

def remove_alcohol_free(row):
    """Strip a misplaced "Alcohol Free" tag from a row.

    The tag is removed only when the row's category is not one of
    ``alcohol_cats``; rows in those categories are returned unchanged.

    Args:
        row: Mapping-like row (e.g. a pandas Series) with "Category" and "Tags".

    Returns:
        The same row object, with "Tags" replaced by a filtered list when the
        tag was removed.
    """
    unwanted = "Alcohol Free"

    # Nothing to do for alcohol categories or for rows without the tag.
    if row["Category"] in alcohol_cats or unwanted not in row["Tags"]:
        return row

    row["Tags"] = [tag for tag in row["Tags"] if tag != unwanted]
    return row

# Tag clean-up pipeline, applied in this order:
#   1. add_tags            - add tags implied by the category
#   2. remove_animal_vegan - drop tags made redundant by other tags
#   3. remove_alcohol_free - drop a misplaced "Alcohol Free" tag
#   4. remove_duplicates   - de-duplicate tags and flavours (must run last)
process_df = (
    process_df
    .apply(add_tags, axis=1)
    .assign(Tags=lambda frame: frame['Tags'].apply(remove_animal_vegan))
    .apply(remove_alcohol_free, axis=1)
    .apply(remove_duplicates, axis=1)
)

In [18]:
#Turn lists back into columns
def create_data_dict(row):
    """Flatten one row's flavour and tag lists into numbered fields.

    Args:
        row: Mapping-like row with 'Item Name', 'Category', 'Flavours' and
            'Tags' entries; the last two must be lists.

    Returns:
        A dict holding 'Item Name' and 'Category', followed by 'Flavour1',
        'Flavour2', ... and then 'Tag1', 'Tag2', ... (one key per list element,
        numbered from 1).

    Raises:
        ValueError: If 'Tags' or 'Flavours' is not a list.
    """
    item_name = row['Item Name']
    item_category = row['Category']
    flavour_values = row['Flavours']
    tag_values = row['Tags']

    if not (isinstance(tag_values, list) and isinstance(flavour_values, list)):
        raise ValueError("Tags and flavours must be lists.")

    # Fixed fields first, then the numbered flavour fields, then the numbered tag fields.
    data = {'Item Name': item_name, 'Category': item_category}
    data.update(
        {f'Flavour{position}': value for position, value in enumerate(flavour_values, start=1)}
    )
    data.update(
        {f'Tag{position}': value for position, value in enumerate(tag_values, start=1)}
    )
    return data

# Apply the create_data_dict function to each row, expanding the returned
# dict keys into columns.
parsed_df = process_df.apply(create_data_dict, axis=1, result_type='expand')

# Fixed output layout. reindex (rather than plain column selection) is used so
# that a column no row populated - e.g. "Tag6" when no item has six tags - is
# created empty instead of raising KeyError.
# NOTE(review): columns outside this list (a third flavour, a seventh tag) are
# dropped here, exactly as before.
output_columns = ["Item Name", "Category", "Flavour1", "Flavour2", "Tag1", 'Tag2', 'Tag3', 'Tag4', 'Tag5', 'Tag6']
parsed_df = parsed_df.reindex(columns=output_columns)

In [19]:

# Write the parsed table to parsed_filepath as an Excel workbook, omitting the DataFrame index.
parsed_df.to_excel(parsed_filepath, index=False)

In [None]:
"""


"""