In [1]:
import pandas as pd
import numpy as np

In [2]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import nltk

In [3]:
df = pd.read_csv('amazon_df_final.csv')
df.head()

Unnamed: 0,title,ingredients,url
0,Nakd Fruit & Nut Bar Variety Pack - Vegan - He...,"Cocoa Orange Ingredients: Dates 40%, CASHEWS 4...",https://amazon.co.uk/Nakd-Mind-Blown-Mixed-Cas...
1,TREK High Protein Flapjack Cocoa Oat - Gluten ...,"Gluten free OATS (25%), Rice syrup, SOYA prote...",https://amazon.co.uk/Trek-Cocoa-Oat-Protein-Fl...
2,Nourish Organic Cacao Coconut Macaroons - Keto...,,https://amazon.co.uk/Nourish-Cacao-Macaroons-1...
3,"Eat Natural Protein Bars, Protein Packed Peanu...",,https://amazon.co.uk/Natural-Protein-Packed-Pa...
4,HappyHome Ultimate Hampers Gluten Free Hamper ...,,https://amazon.co.uk/HappyHome-Ultimate-Hamper...


In [4]:
df.iloc[0].tolist()

['Nakd Fruit & Nut Bar Variety Pack - Vegan - Healthy Snack - Gluten Free - 35g x 18 bars',
 'Cocoa Orange Ingredients: Dates 40%, CASHEWS 40%, Raisins 14%, Cocoa 5%, Orange oil, Natural flavouring. Blueberry Muffin Ingredients: Dates 58%, CASHEWS 15%, Raisins 15%, ALMONDS 10%, Blueberries 2, Natural flavouring. Salted Caramel Ingredients: Dates 54%, Raisins 21%, CASHEWS 12%, PEANUTS 12%, sea salt 0,4%, Natural flavouring. Peanut Delight Ingredients: Dates 53%, PEANUTS 46%, Sea salt, Natural flavouring. Berry Delight Ingredients: Dates 49%, CASHEWS 30%, Raisins 17%, Raspberries 3%, Natural flavouring. Cocoa Delight Ingredients: Dates 48%,CASHEWS 30%, Raisins 16%, Cocoa 6%, Natural flavouring.',
 'https://amazon.co.uk/Nakd-Mind-Blown-Mixed-Case/dp/B07LFRLNVM/ref=sr_1_1?crid=2G1IQMWSJ30CR&keywords=gluten+free+snacks&qid=1705342939&sprefix=gluten+free+snacks%2Caps%2C271&sr=8-1']

In [5]:
df['title'] = df['title'].str.lower()
df['ingredients'] = df['ingredients'].str.lower()

In [6]:
df.head()

Unnamed: 0,title,ingredients,url
0,nakd fruit & nut bar variety pack - vegan - he...,"cocoa orange ingredients: dates 40%, cashews 4...",https://amazon.co.uk/Nakd-Mind-Blown-Mixed-Cas...
1,trek high protein flapjack cocoa oat - gluten ...,"gluten free oats (25%), rice syrup, soya prote...",https://amazon.co.uk/Trek-Cocoa-Oat-Protein-Fl...
2,nourish organic cacao coconut macaroons - keto...,,https://amazon.co.uk/Nourish-Cacao-Macaroons-1...
3,"eat natural protein bars, protein packed peanu...",,https://amazon.co.uk/Natural-Protein-Packed-Pa...
4,happyhome ultimate hampers gluten free hamper ...,,https://amazon.co.uk/HappyHome-Ultimate-Hamper...


In [7]:
def preprocess_text(text):
    # Replace specific phrases with a single token
    text = text.replace("gluten free", "glutenfree")
    text = text.replace("allergen free", "allergenfree")
    return text

In [8]:
def extract_words(text):
    if pd.isna(text):  # Check for NaN values
        return []

    # Preprocess the text to handle specific phrases
    text = preprocess_text(text)
    
    # Use NLTK's word_tokenize to handle phrases
    words = word_tokenize(text)
    return words

In [9]:
# Apply the custom function to create a new column for each original column
df['word_list_column1'] = df['title'].apply(extract_words)
df['word_list_column2'] = df['ingredients'].apply(extract_words)

# Combine the lists from both columns into a new column
df['final_word_list'] = df['word_list_column1'] + df['word_list_column2']

df = df.drop(['word_list_column1', 'word_list_column2'], axis=1)
df.head()

Unnamed: 0,title,ingredients,url,final_word_list
0,nakd fruit & nut bar variety pack - vegan - he...,"cocoa orange ingredients: dates 40%, cashews 4...",https://amazon.co.uk/Nakd-Mind-Blown-Mixed-Cas...,"[nakd, fruit, &, nut, bar, variety, pack, -, v..."
1,trek high protein flapjack cocoa oat - gluten ...,"gluten free oats (25%), rice syrup, soya prote...",https://amazon.co.uk/Trek-Cocoa-Oat-Protein-Fl...,"[trek, high, protein, flapjack, cocoa, oat, -,..."
2,nourish organic cacao coconut macaroons - keto...,,https://amazon.co.uk/Nourish-Cacao-Macaroons-1...,"[nourish, organic, cacao, coconut, macaroons, ..."
3,"eat natural protein bars, protein packed peanu...",,https://amazon.co.uk/Natural-Protein-Packed-Pa...,"[eat, natural, protein, bars, ,, protein, pack..."
4,happyhome ultimate hampers gluten free hamper ...,,https://amazon.co.uk/HappyHome-Ultimate-Hamper...,"[happyhome, ultimate, hampers, glutenfree, ham..."


In [10]:
def clean_and_lower(word_list):
    cleaned_list = [word.lower() for word in word_list if not word.isdigit()]
    return cleaned_list

# Apply the cleaning function to the final word list column
df['final_word_list'] = df['final_word_list'].apply(clean_and_lower)
df.head()

Unnamed: 0,title,ingredients,url,final_word_list
0,nakd fruit & nut bar variety pack - vegan - he...,"cocoa orange ingredients: dates 40%, cashews 4...",https://amazon.co.uk/Nakd-Mind-Blown-Mixed-Cas...,"[nakd, fruit, &, nut, bar, variety, pack, -, v..."
1,trek high protein flapjack cocoa oat - gluten ...,"gluten free oats (25%), rice syrup, soya prote...",https://amazon.co.uk/Trek-Cocoa-Oat-Protein-Fl...,"[trek, high, protein, flapjack, cocoa, oat, -,..."
2,nourish organic cacao coconut macaroons - keto...,,https://amazon.co.uk/Nourish-Cacao-Macaroons-1...,"[nourish, organic, cacao, coconut, macaroons, ..."
3,"eat natural protein bars, protein packed peanu...",,https://amazon.co.uk/Natural-Protein-Packed-Pa...,"[eat, natural, protein, bars, ,, protein, pack..."
4,happyhome ultimate hampers gluten free hamper ...,,https://amazon.co.uk/HappyHome-Ultimate-Hamper...,"[happyhome, ultimate, hampers, glutenfree, ham..."


In [11]:
porter = PorterStemmer()

# Function to apply stemming
def apply_stemming(word_list):
    return [porter.stem(word) for word in word_list]

# Apply the stemming function to the final word list column
df['final_word_list'] = df['final_word_list'].apply(apply_stemming)


In [12]:
# Function to remove duplicates from the lists
def remove_duplicates(word_list):
    return list(set(word_list))

# Apply the function to remove duplicates
df['final_word_list'] = df['final_word_list'].apply(remove_duplicates)


In [13]:
# Function to filter rows based on the words in the specified string
def filter_rows_exclude(input_string):
    input_words = apply_stemming(clean_and_lower(extract_words(preprocess_text(input_string.lower()))))
    filtered_df = df[~df['final_word_list'].apply(lambda x: any(word in x for word in input_words))]
    
    # Reset the index to get new numbers
    filtered_df = filtered_df.reset_index(drop=True)
    
    return filtered_df

def filter_rows_include(input_string):
    input_words = apply_stemming(clean_and_lower(extract_words(preprocess_text(input_string.lower()))))
    filtered_df = df[df['final_word_list'].apply(lambda x: any(word in x for word in input_words))]
    
    # Reset the index to get new numbers
    filtered_df = filtered_df.reset_index(drop=True)
    
    return filtered_df

# Example usage
include_string = "Gluten free protein"
exclude_string = "emulsifier peanuts"
include_result = filter_rows_include(include_string)
exclude_result = filter_rows_exclude(exclude_string)


In [14]:
include_result.head(3)

Unnamed: 0,title,ingredients,url,final_word_list
0,nakd fruit & nut bar variety pack - vegan - he...,"cocoa orange ingredients: dates 40%, cashews 4...",https://amazon.co.uk/Nakd-Mind-Blown-Mixed-Cas...,"[-, %, date, cashew, ., healthi, snack, oil, s..."
1,trek high protein flapjack cocoa oat - gluten ...,"gluten free oats (25%), rice syrup, soya prote...",https://amazon.co.uk/Trek-Cocoa-Oat-Protein-Fl...,"[-, %, *, cane, protein, oat, ., crunchi, flap..."
2,nourish organic cacao coconut macaroons - keto...,,https://amazon.co.uk/Nourish-Cacao-Macaroons-1...,"[-, cacao, free, healthi, snack, dairi, vegan,..."


In [15]:
print(include_result.iloc[0].tolist())

['nakd fruit & nut bar variety pack - vegan - healthy snack - gluten free - 35g x 18 bars', 'cocoa orange ingredients: dates 40%, cashews 40%, raisins 14%, cocoa 5%, orange oil, natural flavouring. blueberry muffin ingredients: dates 58%, cashews 15%, raisins 15%, almonds 10%, blueberries 2, natural flavouring. salted caramel ingredients: dates 54%, raisins 21%, cashews 12%, peanuts 12%, sea salt 0,4%, natural flavouring. peanut delight ingredients: dates 53%, peanuts 46%, sea salt, natural flavouring. berry delight ingredients: dates 49%, cashews 30%, raisins 17%, raspberries 3%, natural flavouring. cocoa delight ingredients: dates 48%,cashews 30%, raisins 16%, cocoa 6%, natural flavouring.', 'https://amazon.co.uk/Nakd-Mind-Blown-Mixed-Case/dp/B07LFRLNVM/ref=sr_1_1?crid=2G1IQMWSJ30CR&keywords=gluten+free+snacks&qid=1705342939&sprefix=gluten+free+snacks%2Caps%2C271&sr=8-1', ['-', '%', 'date', 'cashew', '.', 'healthi', 'snack', 'oil', 'sea', 'raspberri', '35g', 'vegan', 'delight', 'glut

In [16]:
exclude_result.head(3)

Unnamed: 0,title,ingredients,url,final_word_list
0,nourish organic cacao coconut macaroons - keto...,,https://amazon.co.uk/Nourish-Cacao-Macaroons-1...,"[-, cacao, free, healthi, snack, dairi, vegan,..."
1,happyhome ultimate hampers gluten free hamper ...,,https://amazon.co.uk/HappyHome-Ultimate-Hamper...,"[vegans-contain, free, healthi, snack, bundl, ..."
2,bon bag - gluten-free fizzy and fizz-free pick...,,https://amazon.co.uk/Large-Litre-Gluten-Free-S...,"[-, bag, 800g, pick, ., mix, candi, sweet, ,, ..."


In [17]:
print(exclude_result.iloc[0].tolist())

['nourish organic cacao coconut macaroons - keto snacks - vegan, gluten free, dairy free healthy snacks made with natural ingredients - 140g (pack of 1)', nan, 'https://amazon.co.uk/Nourish-Cacao-Macaroons-140-g/dp/B07656GYHP/ref=sr_1_3?crid=2G1IQMWSJ30CR&keywords=gluten+free+snacks&qid=1705342939&sprefix=gluten+free+snacks%2Caps%2C271&sr=8-3', ['-', 'cacao', 'free', 'healthi', 'snack', 'dairi', 'vegan', 'glutenfre', 'natur', ',', 'with', '(', 'macaroon', 'ingredi', '140g', 'of', 'pack', 'coconut', ')', 'organ', 'nourish', 'made', 'keto']]


In [18]:
def filter_rows_both(include_string, exclude_string):
    include_words = apply_stemming(clean_and_lower(extract_words(preprocess_text(include_string.lower()))))
    exclude_words = apply_stemming(clean_and_lower(extract_words(preprocess_text(exclude_string.lower()))))
    
    included_rows = df[df['final_word_list'].apply(lambda x: any(word in x for word in include_words))]
    excluded_rows = df[~df['final_word_list'].apply(lambda x: any(word in x for word in exclude_words))]
    
    # Combine the results
    filtered_df = pd.merge(included_rows, excluded_rows, how='inner', on='title')
    
    # Reset the index to get new numbers
    filtered_df = filtered_df.reset_index(drop=True)
    
    return filtered_df.iloc[:,:4]

In [22]:
result_both = filter_rows_both("gluten free vegan allergen free", "emulsifier peanuts dairy cashew almond ")
result_both.head(20)

Unnamed: 0,title,ingredients_x,url_x,final_word_list_x
0,kallo organic belgian milk chocolate rice cake...,,https://amazon.co.uk/Kallo-Organic-Milk-Chocol...,"[preserv, artifici, thin, healthi, snack, pack..."
1,purely plantain chips - low fat plantain crisp...,"plantains, sunflower oil, salt",https://amazon.co.uk/Purely-Plantain-Chips-Nut...,"[-, crisp, healthi, snack, altern, sea, oil, n..."
2,"kallo beetroot & balsamic veggie cakes, lentil...","lentil flour (76%), rapeseed oil, beetroot bal...",https://amazon.co.uk/Kallo-Beetroot-Balsamic-V...,"[%, pepper, artifici, season, healthi, 5.7, sn..."
3,emily - veg crisps - rainbow roots - gluten fr...,"sweet potato, carrot, beetroot, sunflower oil,...",https://amazon.co.uk/Emily-Veg-Crisps-Rainbow-...,"[-, crisp, free, oil, potato, sea, vegan, sunf..."
4,the happy snack company chocolate chickpeas ta...,,https://amazon.co.uk/Happy-Snack-Company-Choco...,"[portion, 20g, chickpea, tasti, cal, free, sna..."
5,trek high protein flapjack original oat - glut...,"gluten free oats (29%), rice syrup, soya prote...",https://amazon.co.uk/Trek-Protein-Flapjack-Bar...,"[-, %, *, protein, oat, flapjack, plant, snack..."
6,"kallo protein packed lentil cakes, low fat hea...","lentil flour (48%), corn flour, potato starch,...",https://amazon.co.uk/Kallo-Protein-Packed-Lent...,"[%, corn, artifici, protein, free, healthi, sn..."
7,scratch my pork bulk pork crackling - lightly ...,,https://amazon.co.uk/Scratch-Pork-Bulk-Crackli...,"[-, %, bag, rind, protein, food, ., snack, glu..."
8,bear strawberry yoyos - dried fruit rolls - he...,"apples, pears, strawberries",https://amazon.co.uk/BEAR-Strawberry-Pure-Frui...,"[-, fruit, 20g, (, bear, ), pear, roll, health..."
9,bear variety pack yoyos - dried fruit rolls - ...,"ingredients: apples, pears, strawberries, blac...",https://amazon.co.uk/BEAR-Variety-Pack-Yoyos-L...,"[-, 20g, ., pear, healthi, strawberri, raspber..."


In [26]:
print(result_both.iloc[20].tolist())

['gluten free snack bars | creative nature vegan snack bars | berry blend flavour | healthy snacks, lunchbox snacks | top 14 allergen free | 20 x 38g bars', 'gluten free oats, dried fruit (67%)(apricots, cranberries, sultanas, sour cherries, goji berries, mango), chicory fibre, vegetable glycerine, natural flavouring', 'https://amazon.co.uk/Creative-Nature-Goodness-Vegan-Flapjack/dp/B082Q149VR/ref=sr_1_53?crid=2QJW2OV8ZTB2Y&dib=eyJ2IjoiMSJ9.H87tE2j8bdAC9NriA5bvgCnaoISIYIiB0ei3Q1FVbsn8ajMafSvFvurjjVARxLT3Hbwa1qiniLh2SsJlTUPxwSvpHW3qhWaUSyBeOOm0W-6JpFJDFfF3DM7Rl8uYI08VZ6ohsAnXmUeRdJHkhlj2uv6ZZ0lHNpgl6crdczkrpe0y7qB-_Kb2543RTWOJ1xwuasuCjbidJgqMdxapE3C3kt8n3srQdomb6OyM1IoTniI.owVGWbCLWnbDBPWQ_qar8ZrktngmmOyn8X9GMPnvGWo&dib_tag=se&keywords=gluten+free+snacks&qid=1705345458&sprefix=gluten+free+snacks%2Caps%2C734&sr=8-53', ['%', 'allergenfre', 'oat', 'healthi', 'snack', 'chicori', 'vegan', 'creativ', 'glutenfre', 'flavour', 'natur', 'top', ',', 'apricot', 'goji', 'dri', '(', 'cranberri', 'ber

In [21]:
result_both[['title', 'url_x']]

Unnamed: 0,title,url_x
0,kallo organic belgian milk chocolate rice cake...,https://amazon.co.uk/Kallo-Organic-Milk-Chocol...
1,purely plantain chips - low fat plantain crisp...,https://amazon.co.uk/Purely-Plantain-Chips-Nut...
2,"kallo beetroot & balsamic veggie cakes, lentil...",https://amazon.co.uk/Kallo-Beetroot-Balsamic-V...
3,emily - veg crisps - rainbow roots - gluten fr...,https://amazon.co.uk/Emily-Veg-Crisps-Rainbow-...
4,the happy snack company chocolate chickpeas ta...,https://amazon.co.uk/Happy-Snack-Company-Choco...
...,...,...
146,farmgio organic dried apricot 4x (250 g) | cer...,https://amazon.co.uk/Farmgio-Certified-Agricul...
147,"nugo protein bar, vanilla yogurt, 11g protein,...",https://amazon.co.uk/Nugo-Nutrition-NuGo-Vanil...
148,go-low easy wrap keto mix | low carb | high fi...,https://amazon.co.uk/GO-LOW-Grain-Diabetic-fri...
149,pride of india - mung bean plain papadum lenti...,https://amazon.co.uk/Pride-India-Papadum-Lenti...
