#### Library

In [10]:
import pandas as pd
import re
from fuzzywuzzy import fuzz, process

In [14]:
# Load the recipe dataset; "main.csv" is resolved relative to the current working directory.
foodData = pd.read_csv("main.csv")
# NOTE(review): head() only renders when it is the last expression of its cell — confirm
# that this line and info() below live in separate cells.
foodData.head()

# Summarise column names, non-null counts and dtypes.
foodData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7250 entries, 0 to 7249
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   sno          7250 non-null   int64 
 1   _id          7250 non-null   object
 2   name         7250 non-null   object
 3   ingredients  7250 non-null   object
 4   diet         7250 non-null   object
 5   prep_time    7250 non-null   object
 6   cook_time    7250 non-null   object
 7   course       7250 non-null   object
 8   state        7250 non-null   object
dtypes: int64(1), object(8)
memory usage: 509.9+ KB


### Function to find exact duplicates

In [15]:
def show_all_exact_duplicates(col, df=None):
    """
    Return every row whose value(s) in `col` occur more than once.

    Parameters
    ----------
    col : label or list of labels
        Column(s) to check for exact duplicates.
    df : pandas.DataFrame, optional
        Frame to inspect. When omitted, the notebook-level `foodData` is used,
        which keeps existing one-argument calls working unchanged.

    Returns
    -------
    pandas.DataFrame
        All members of each duplicate group (keep=False keeps the first
        occurrence too), sorted by `col` so duplicates sit next to each other.
        Empty when no duplicates exist.
    """
    if df is None:
        df = foodData
    return df[df.duplicated(col, keep=False)].sort_values(col)


# Function to find close matches

# def find_close_matches_export(col, threshold=85, limit=1000):
#     # work only on first N rows
#     names = foodData[col].head(limit).tolist()
    
#     results = []
    
#     for name in names:
#         close = process.extract(name, names, scorer=fuzz.token_set_ratio)
#         # Filter only close matches above threshold
#         close_filtered = [x for x in close if x[0] != name and x[1] >= threshold]
        
#         for match, score in close_filtered:
#             results.append([name, match, score])
    
#     # Convert to dataframe
#     df_results = pd.DataFrame(results, columns=['Name', 'Close Match', 'Match Score'])
    
#     # Export to Excel
#     df_results.to_excel("close_match_results.xlsx", index=False)
    
#     return df_results

# output_df = find_close_matches_export('name', threshold=85, limit=1000)
# output_df.head()

# Checked the close matches manually and found no false positives.

# Report every row whose `name` value appears more than once in the dataset.
print("EXACT DUPLICATE ROWS:")
print(show_all_exact_duplicates('name'))

# Every duplicate was cleaned by hand in the Excel sheet.

EXACT DUPLICATE ROWS:
Empty DataFrame
Columns: [sno, _id, name, ingredients, diet, prep_time, cook_time, course, state]
Index: []


#### Detect Foreign Language in Names or Ingredients

In [19]:
def contains_foreign_language(text):
    """
    Report whether `text` contains a letter outside the English alphabet.

    Only alphabetic characters other than A-Z / a-z are flagged (for example
    Devanagari letters, or accented Latin letters such as the n-tilde in
    "jalapeño"). Digits, underscores, whitespace, punctuation and symbols such
    as () [] {} !@#$%^&*_-+=;:'",.<>?/ never trigger a match.

    Parameters
    ----------
    text : str or scalar
        Cell value to inspect. Missing values (None / NaN) yield False.
        Other non-string scalars are converted with str() before matching
        instead of raising TypeError.

    Returns
    -------
    bool
        True if at least one non-English letter is present, else False.
    """
    if not isinstance(text, str):
        if pd.isna(text):
            return False
        # Numeric or other scalar cells: inspect their text form rather than crash.
        text = str(text)

    # Negated class: a character matches only if it is a word character (not \W),
    # is not a decimal digit, is not "_" and is not an ASCII letter -- i.e. a
    # non-ASCII letter (or a non-decimal numeric character such as a fraction sign).
    pattern = r"[^\W\d_a-zA-Z]"

    return bool(re.search(pattern, text))


def detect_foreign_in_df(df):
    """
    Return a copy of `df` with two boolean detection columns added:
      - foreign_in_name         (result of contains_foreign_language on "name")
      - foreign_in_ingredients  (result of contains_foreign_language on "ingredients")

    The input frame is left untouched, so re-running the calling cell never
    changes the frame that earlier cells loaded or displayed.

    Raises KeyError if `df` lacks a "name" or "ingredients" column.
    """
    # assign() builds a new frame instead of writing columns into the caller's object.
    return df.assign(
        foreign_in_name=df["name"].apply(contains_foreign_language),
        foreign_in_ingredients=df["ingredients"].apply(contains_foreign_language),
    )



def export_to_excel(df, filename="foreign_language_output.xlsx"):
    """
    Write the rows flagged by the foreign-language check to an Excel file.

    A row is exported when its "foreign_in_name" or "foreign_in_ingredients"
    value equals True; all other rows are left out. The index is not written.
    A confirmation line naming the target file is printed afterwards.
    """
    name_flagged = df["foreign_in_name"].eq(True)
    ingredients_flagged = df["foreign_in_ingredients"].eq(True)

    # Keep a row if either flag is set.
    flagged_rows = df[name_flagged | ingredients_flagged]

    flagged_rows.to_excel(filename, index=False)
    print(f"Foreign-language rows exported: {filename}")



In [18]:
# Apply detection: flag rows whose name or ingredients contain non-English letters.
processed_df = detect_foreign_in_df(foodData)

# Export to Excel: only the flagged rows are written to the workbook.
# NOTE(review): the recorded output below ("File successfully exported: ...") does not match
# the message export_to_excel currently prints ("Foreign-language rows exported: ...");
# re-run this cell after the definitions cell to refresh it.
export_to_excel(processed_df, "foreign_ingredients_check.xlsx")


File successfully exported: foreign_ingredients_check.xlsx


#### Function to filter recipes by ingredient, diet, and course

In [None]:
import pandas as pd

def filter_recipes(df, ingredients_list=None, diet_type=None, course_type=None):
    """
    Filter recipes by case-insensitive substring match and suggest other ingredients.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain "ingredients", "diet" and "course" text columns.
        Any index is accepted (e.g. an already-filtered frame).
    ingredients_list : list of str, optional
        Every term must occur somewhere in a recipe's "ingredients" text.
        Partial terms work ("mil" matches "Milk"). Blank terms are ignored.
    diet_type : str, optional
        Text that must occur in the "diet" column.
    course_type : str, optional
        Text that must occur in the "course" column.

    Returns
    -------
    (pandas.DataFrame, list of str)
        1. The rows of `df` that satisfy every supplied filter.
        2. Ingredient suggestions: the unique, lower-cased, comma-separated
           phrases found anywhere in `df["ingredients"]` that do not contain
           any of the user's terms, sorted by phrase length (shortest first).

    Raises
    ------
    ValueError
        If `ingredients_list` is given but is not a list.
    """
    if ingredients_list and not isinstance(ingredients_list, list):
        raise ValueError("ingredients_list must be a list, e.g. ['milk', 'sugar']")

    # A blank term would match every recipe and would also be "contained" in
    # every phrase, wiping out all suggestions -- so drop such terms up front.
    terms = [term for term in (ingredients_list or []) if term.strip()]

    def column_contains(column, text):
        # regex=False: user input is plain text, so characters such as "(" or "+"
        # are matched literally instead of being parsed as a regular expression.
        return df[column].str.contains(text, case=False, na=False, regex=False)

    # Build the mask on df's own index; a fresh 0..n-1 index would misalign
    # with any frame whose index is not the default RangeIndex.
    mask = pd.Series(True, index=df.index, dtype=bool)

    # --------------------------
    # Ingredient matching (all terms must be present)
    # --------------------------
    for term in terms:
        mask &= column_contains("ingredients", term)

    # --------------------------
    # Diet matching
    # --------------------------
    if diet_type:
        mask &= column_contains("diet", diet_type)

    # --------------------------
    # Course matching
    # --------------------------
    if course_type:
        mask &= column_contains("course", course_type)

    filtered_df = df[mask]

    # -----------------------------------------------------
    # BUILD CLEAN SUGGESTIONS (BASED ON COMMAS ONLY)
    # -----------------------------------------------------
    all_ingredient_phrases = (
        df["ingredients"]
        .str.lower()
        .str.split(",")
        .explode()
        .str.strip()
        .dropna()        # missing / non-text cells would otherwise break `in` and len()
        .unique()
        .tolist()
    )

    lowered_terms = [term.lower() for term in terms]

    # Keep non-empty phrases that mention none of the user's terms,
    # shortest phrases first (stable for equal lengths).
    suggestions = sorted(
        (
            phrase
            for phrase in all_ingredient_phrases
            if phrase and not any(term in phrase for term in lowered_terms)
        ),
        key=len,
    )

    return filtered_df, suggestions


#### Sample Testing

In [52]:
# Partial terms are enough: ingredient matching is case-insensitive and substring-based,
# so "mil" matches "milk" and "sug" matches "sugar".
ingredients = ["mil", "sug"]

# Only the ingredient filter is used here; diet_type and course_type keep their None defaults.
result, suggestions = filter_recipes(
    foodData,
    ingredients_list=ingredients
)

print("Filtered Recipes:")
print(result[["name", "ingredients"]])

# Suggestions are the remaining ingredient phrases that contain none of the terms above.
print("\nSuggested Ingredients (cleaned):")
print(suggestions)


Filtered Recipes:
                   name                                        ingredients
2        Gajar ka halwa       Carrots, milk, sugar, ghee, cashews, raisins
3                Ghevar  Flour, ghee, kewra, milk, clarified butter, su...
4           Gulab jamun  Milk powder, plain flour, baking powder, ghee,...
8              Kalakand                        Milk, cottage cheese, sugar
9                 Kheer                    Milk, rice, sugar, dried fruits
...                 ...                                                ...
7123          Chin Chin                                 Flour, Milk, Sugar
7124  Chin Chin (Jumbo)  Flour, milk, sugar, chili peppers (large quant...
7125  Chin Chin (Plain)                                 Flour, milk, sugar
7126  Chin Chin (Spicy)                  Flour, milk, sugar, chili peppers
7178                Ogi              Fermented cereal, milk, sugar, fruits

[548 rows x 2 columns]

Suggested Ingredients (cleaned):
['0', 'oil', 'sev', 'gur