In [1]:
import pandas as pd

In [13]:
def clean_ingredients_list(ingredients_string, sep=';'):
    """
    Normalise one recipe's ingredients into a de-duplicated list of strings.

    Each ingredient is stripped of surrounding whitespace and lower-cased;
    entries that are empty after stripping are dropped, and repeated
    ingredients are removed while keeping first-seen order.

    Parameters
    ----------
    ingredients_string : str, list, tuple, None or NaN
        Usually a ``sep``-separated string of ingredients. A list/tuple of
        ingredient strings is also accepted and re-normalised, so applying
        the function to already-cleaned data is safe. Missing values
        (None / NaN / pd.NA) produce an empty list.
    sep : str, default ';'
        Separator used to split a string input.

    Returns
    -------
    list of str
        The cleaned, unique ingredients in their original order.
    """
    # The sequence check must come before pd.isna(): on a list, pd.isna()
    # returns an element-wise array whose truth value is ambiguous.
    if isinstance(ingredients_string, (list, tuple)):
        raw_items = ingredients_string
    elif pd.isna(ingredients_string):
        return []
    else:
        raw_items = ingredients_string.split(sep)

    # Strip and lower-case once per item, then drop the empty results.
    normalised = (item.strip().lower() for item in raw_items)

    # dict.fromkeys keeps insertion order, so this de-duplicates stably.
    return list(dict.fromkeys(item for item in normalised if item))

In [14]:
# Load the raw recipes and normalise both text columns in one chained step.
df = pd.read_csv('recipes_data.csv').assign(
    # Missing ingredient strings are replaced by '' before cleaning, so the
    # cleaner is applied to every row.
    ingredients=lambda frame: frame['ingredients'].fillna('').apply(clean_ingredients_list),
    # Rows without instructions get the literal placeholder 'N/A'.
    instructions=lambda frame: frame['instructions'].fillna('N/A'),
)

# Persist the cleaned frame without the index column.
# NOTE(review): the ingredient lists are written to CSV as text, so a reader
# of this file will get strings back rather than lists — confirm downstream
# consumers parse them.
df.to_csv('cleaned_recipes.csv', index=False)

In [15]:
# Quick sanity check: show a heading followed by the first 5 rows of the frame.
print("First 5 rows of the cleaned data:", df.head(), sep="\n")

First 5 rows of the cleaned data:
                                               title  \
0                    Creamy Tomato and Spinach Pasta   
1  Tulingan Bistek (Braised Bullet Tuna in Soy Sa...   
2                    Ginisang Sitaw with Bell Pepper   
3                                           Sarciado   
4  Creamy Coconut Milk Fish Stew (Ginataang Isda ...   

                                         ingredients  \
0  [linguine, extra virgin olive oil, garlic, oni...   
1  [tulingan, calamansi, garlic, ginger, green on...   
2  [string beans, pork belly, red bell pepper, to...   
3  [fish, maggi magic sarap, onion, tomatoes, gar...   
4  [round scad, coconut milk, baby bok choy, chin...   

                                        instructions  \
0  Heat the olive oil in a large pan over medium ...   
1  Combine tulingan, calamansi juice, soy sauce, ...   
2  Heat a wok over medium heat. Add the pork bell...   
3  Rub Maggi Magic Sarap all over the fish. Heat ...   
4  Rub Maggi

In [16]:
# Check the data types and non-null counts.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line afterwards.
print("\nDataFrame information after cleaning:")
df.info()


DataFrame information after cleaning:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2008 entries, 0 to 2007
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   title         2008 non-null   object
 1   ingredients   2008 non-null   object
 2   instructions  2008 non-null   object
 3   url           2008 non-null   object
dtypes: object(4)
memory usage: 62.9+ KB
None
