In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
import pickle


In [8]:
# Fetch the NLTK resources that tokenize() relies on: the English stop-word
# list and the Punkt models behind word_tokenize.
nltk.download('stopwords')
# Recent NLTK releases load the 'punkt_tab' resource instead of the pickled
# 'punkt' models, so fetch both; releases that still use 'punkt' presumably
# just ignore the extra package.
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Anish\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Anish\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [9]:
# Load the product catalogue; the path is relative to the notebook's working directory.
# NOTE(review): the preview of this frame shows an 'Unnamed: 0' column, presumably
# an index saved by an earlier to_csv call - confirm before relying on it.
df = pd.read_csv(filepath_or_buffer='food.csv')

In [10]:
# Preview the first five rows to sanity-check the columns that were loaded.
df.head(5)

Unnamed: 0,asin,title,description,main_cat,price
0,B008FZZHT6,Petit Guava Nectar with Vitamin C L,Petit Guava Nectar with Vitamin C L Petit Gua...,Grocery,$4.95
1,B00U6QO43M,VitamaltVitamalt Non Alcohol Malt Beer Taste B...,Enjoy the pleasant time of nonalcoholicanytime,Grocery,$26.87
2,B00182NNGC,igourmet Friesian Clove Nagelkaas ounce,Clove cheese is a unique Dutch specialty thoug...,Grocery,$8.99
3,B002L24CJA,The Republic of Tea Ginger Peach Red Tea Bags,Red Tea Collection Caffeinefree rooibos is pai...,Grocery,$13.77
4,B0001M110K,Frontier Seasoning Blends Saltfree Italian Sea...,Seasoning blends Premium quality All natural F...,Grocery,$6.85


In [11]:
# Tokenization function
# Cache for the English stop-word set. It is filled on the first tokenize()
# call so the stop-word corpus is loaded once instead of once per document.
_ENGLISH_STOP_WORDS = None


def tokenize(text):
    """Split ``text`` into lower-cased alphabetic tokens with stop words removed.

    Parameters
    ----------
    text : str
        Raw text to tokenize with NLTK's ``word_tokenize``.

    Returns
    -------
    list of str
        Tokens in their original order. Tokens containing anything other than
        letters (digits, punctuation, ...) are dropped, as are English stop
        words.
    """
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        # Loading the corpus is loop-invariant work; do it once and reuse it.
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))

    # Keep purely alphabetic tokens, lower-case them, then filter stop words.
    lowered = (word.lower() for word in word_tokenize(text) if word.isalpha())
    return [word for word in lowered if word not in _ENGLISH_STOP_WORDS]

In [12]:
# TF-IDF Vectorizer
# A custom tokenizer replaces the default regex tokenization, so token_pattern
# is set to None explicitly; this avoids scikit-learn's warning that the
# pattern will not be used.
tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenize, token_pattern=None)
# A missing description (NaN) would make fit_transform raise, so treat it as an
# empty document. Rows stay aligned one-to-one with df.
tfidf_matrix = tfidf_vectorizer.fit_transform(df['description'].fillna(''))

# Compute the cosine similarity matrix
# NOTE(review): this is an (n_products x n_products) matrix, which gets large
# for big catalogues. get_top_similar_products scores queries against
# tfidf_matrix directly and does not read this matrix - confirm whether any
# other consumer still needs it.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Function to get the top N similar products
def get_top_similar_products(query, cosine_sim_matrix, df, top_n=3):
    """Return the rows of ``df`` whose descriptions best match ``query``.

    The query is vectorized with the module-level ``tfidf_vectorizer`` and
    scored against every row of the module-level ``tfidf_matrix``. Both must
    already exist. ``df`` is indexed by position, so its rows must be in the
    same order as the documents the matrix was built from.

    Parameters
    ----------
    query : str
        Free-text search string.
    cosine_sim_matrix : object
        Not used; kept only so existing callers keep working.
    df : pandas.DataFrame
        Product table aligned row-for-row with ``tfidf_matrix``.
    top_n : int, default 3
        Maximum number of products to return.

    Returns
    -------
    pandas.DataFrame
        At most ``top_n`` rows of ``df``, best match first. Rows whose
        similarity to the query is zero are never returned, so the result is
        empty when the query shares no vocabulary with any description or
        when ``top_n`` is not positive.
    """
    if top_n <= 0:
        # A negative top_n would otherwise slice from the wrong end of the ranking.
        return df.iloc[[]]

    # Transform the query using the TF-IDF vectorizer
    query_vector = tfidf_vectorizer.transform([query])

    # Similarity of the query to every product description
    cosine_similarities = linear_kernel(query_vector, tfidf_matrix).flatten()

    # Sort on the negated scores with a stable algorithm: highest score first,
    # ties resolved deterministically in favour of the earlier row.
    top_indices = (-cosine_similarities).argsort(kind='stable')[:top_n]

    # A zero score means no shared terms; such rows are not real matches.
    top_indices = top_indices[cosine_similarities[top_indices] > 0]

    # Return the top N similar products
    return df.iloc[top_indices]

# Testing predictions
# Smoke test: search with a title taken from the catalogue preview and show
# the columns a shopper would care about for the returned matches.
user_query = "The Republic of Tea Ginger Peach Red Tea Bags"
top_similar_products = get_top_similar_products(
    query=user_query,
    cosine_sim_matrix=cosine_sim,
    df=df,
)
print(top_similar_products.loc[:, ['title', 'description', 'price']])



                                              title  \
3799                         Sweet Ginger Peach Tea   
10522  Stash Tea Green Ginger Peach Matcha Pack of    
11245      The Republic of Tea Vanilla Almond Count   

                                             description   price  
3799                         Sweet Ginger Peach Tea  Bag  $10.49  
10522  Ginger Peach with Matcha Tea  Bags by Stash Te...   $7.79  
11245           The Republic of Tea Vanilla Almond Count  $21.96  


In [13]:
#Saving the model
# Each artefact is pickled into its own file in the current working directory,
# in the order listed below.
# NOTE(review): the vectorizer references the notebook's tokenize function;
# presumably whoever loads tfidf_vectorizer.pkl needs a matching tokenize
# definition available - confirm in the consuming code.
for filename, artefact in (
    ('tfidf_vectorizer.pkl', tfidf_vectorizer),
    ('tfidf_matrix.pkl', tfidf_matrix),
    ('df.pkl', df),
    ('cosine_sim.pkl', cosine_sim),
):
    with open(filename, 'wb') as file:
        pickle.dump(artefact, file)