In [None]:
import pandas as pd
import ast

# Build the cleaned recipes table in a single pipeline:
#   1. read the raw CSV,
#   2. drop rows missing a name, description or ingredient list,
#   3. lower-case and trim the free-text columns,
#   4. parse each stringified ingredient list into a real Python list,
#   5. drop repeated (id, name) pairs, keeping the first occurrence.
recipes_df = (
    pd.read_csv("RAW_recipes.csv")
    .dropna(subset=['name', 'description', 'ingredients'])
    .assign(
        name=lambda frame: frame['name'].str.lower().str.strip(),
        description=lambda frame: frame['description'].str.lower().str.strip(),
        # literal_eval only accepts Python literals, so nothing in the cell is executed
        ingredients=lambda frame: frame['ingredients'].apply(ast.literal_eval),
    )
    .drop_duplicates(subset=['id', 'name'])
)

# Report the resulting shape, then preview the first rows as the cell output
print(f"Cleaned RAW_recipes dataset: {recipes_df.shape}")
recipes_df.head()

In [None]:
# Build the cleaned interactions table in a single pipeline:
#   1. read the raw CSV,
#   2. drop rows missing a review or a rating,
#   3. lower-case and trim the review text,
#   4. keep only ratings inside the inclusive range 1..5,
#   5. drop repeated (user_id, recipe_id, review) triples, keeping the first.
interactions_df = (
    pd.read_csv("RAW_interactions.csv")
    .dropna(subset=['review', 'rating'])
    .assign(review=lambda frame: frame['review'].str.lower().str.strip())
    .loc[lambda frame: frame['rating'].between(1, 5)]
    .drop_duplicates(subset=['user_id', 'recipe_id', 'review'])
)

# Report the resulting shape, then preview the first rows as the cell output
print(f"Cleaned RAW_interactions dataset: {interactions_df.shape}")
interactions_df.head()

In [None]:
# Persist both cleaned tables to CSV (without the index column) so that
# later cells can reload them from disk.
for frame, out_path in (
    (recipes_df, "cleaned_RAW_recipes.csv"),
    (interactions_df, "cleaned_RAW_interactions.csv"),
):
    frame.to_csv(out_path, index=False)

In [None]:
import ast

import pandas as pd

# Upper bound on the number of rows kept in the reduced dataset, and the seed
# that makes the random sample reproducible.
SAMPLE_SIZE = 50_000
RANDOM_STATE = 42


def _join_ingredients(raw: str) -> str:
    """Turn a stringified ingredient list (as stored in the CSV) into 'a, b, c'.

    ast.literal_eval is used instead of eval: it only accepts Python literals,
    so arbitrary code hidden in a CSV cell can never be executed.
    """
    return ', '.join(ast.literal_eval(raw))


# Load cleaned datasets
recipes_df = pd.read_csv("cleaned_RAW_recipes.csv")
interactions_df = pd.read_csv("cleaned_RAW_interactions.csv")

# Merge datasets: one row per interaction, enriched with its recipe's columns.
# The inner join drops interactions whose recipe_id has no matching recipe id.
merged_df = pd.merge(interactions_df, recipes_df, left_on='recipe_id', right_on='id', how='inner')

# Keep all original columns and add the combined_text.
# Built with vectorised string concatenation rather than a row-wise apply;
# astype(str) renders missing values as "nan", matching what the previous
# f-string formatting produced.
merged_df['combined_text'] = (
    "Recipe Name: " + merged_df['name'].astype(str)
    + ". Description: " + merged_df['description'].astype(str)
    + ". Ingredients: " + merged_df['ingredients'].map(_join_ingredients)
    + ". User Review: " + merged_df['review'].astype(str)
    + "."
)

# Reduce the dataset to at most SAMPLE_SIZE rows (random sampling).
# DataFrame.sample raises ValueError when n exceeds the number of rows
# (with replace=False), so cap n at the size of the merged table.
reduced_df = merged_df.sample(n=min(SAMPLE_SIZE, len(merged_df)), random_state=RANDOM_STATE)

# Save the reduced dataset including all original columns
reduced_df.to_csv("reduced_dataset.csv", index=False)

print("Reduced dataset saved as 'reduced_dataset.csv'")
print("Number of rows in the reduced dataset:", len(reduced_df))



Reduced dataset saved as 'reduced_dataset.csv'
Number of rows in the reduced dataset: 50000


In [None]:
pip install -U langchain-community



In [None]:
pip install sentence_transformers



In [None]:
pip install faiss-gpu



In [None]:
from langchain_community.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from sentence_transformers import SentenceTransformer

# Load reduced dataset
reduced_df = pd.read_csv("reduced_dataset.csv")

# Initialize embedding function
model_name = "all-MiniLM-L6-v2"
embedding_function = HuggingFaceEmbeddings(model_name=model_name)

# Materialise the texts and their metadata once so both stay aligned by position
texts = reduced_df['combined_text'].tolist()
metadatas = [{'recipe_id': str(recipe_id)} for recipe_id in reduced_df['recipe_id']]

# Generate embeddings for combined_text (the expensive step — done exactly once)
embeddings = embedding_function.embed_documents(texts)

# Create FAISS index with metadata from the precomputed vectors.
# FAISS.from_texts would call embed_documents again on every text; passing
# (text, vector) pairs to from_embeddings reuses the vectors computed above.
# embedding_function is still supplied so the store can embed queries later.
vector_store = FAISS.from_embeddings(
    text_embeddings=list(zip(texts, embeddings)),
    embedding=embedding_function,
    metadatas=metadatas,
)

# Save FAISS index
vector_store.save_local("recipes_faiss_index")
print("FAISS index successfully created and saved!")






FAISS index successfully created and saved!


In [None]:
from google.colab import files
import shutil

# Pack the saved index directory into recipes_faiss_index.zip
shutil.make_archive(
    base_name='recipes_faiss_index',
    format='zip',
    root_dir='recipes_faiss_index',
)

# Ask Colab to download the archive through the browser
files.download('recipes_faiss_index.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>