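# password: [redacted] — do not keep credentials in the notebook; supply the Elasticsearch password through the ELASTIC_PASSWORD environment variable.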

In [1]:
from elasticsearch import Elasticsearch
import pandas as pd
from elasticsearch.helpers import bulk
import re

In [3]:
# Load the recipes CSV into `df` and print its column labels to see which fields exist.
csv_file = 'resource/recipes.csv'
df = pd.read_csv(csv_file)
print(df.columns)

Index(['RecipeId', 'Name', 'AuthorId', 'AuthorName', 'CookTime', 'PrepTime',
       'TotalTime', 'DatePublished', 'Description', 'Images', 'RecipeCategory',
       'Keywords', 'RecipeIngredientQuantities', 'RecipeIngredientParts',
       'AggregatedRating', 'ReviewCount', 'Calories', 'FatContent',
       'SaturatedFatContent', 'CholesterolContent', 'SodiumContent',
       'CarbohydrateContent', 'FiberContent', 'SugarContent', 'ProteinContent',
       'RecipeServings', 'RecipeYield', 'RecipeInstructions'],
      dtype='object')


In [3]:
# Rebind `df` to the contents of the completed-recipes CSV (any frame previously held in `df` is replaced).
df = pd.read_csv('resource/completed_recipes.csv')

In [6]:
# Remove the existing "Images" column from `df`.
# NOTE: not idempotent — running this cell again once the column is gone raises KeyError.
df = df.drop("Images", axis=1)

In [9]:
# Rename the 'image_link' column to 'Images' in place, then write the frame to CSV without the index.
# NOTE(review): this writes "full_recipes.csv" to the working directory, while the Indexer class
# further down reads "resource/full_recipes.csv" — confirm the two paths are meant to agree.
df.rename(columns={'image_link':'Images'},inplace=True)
df.to_csv("full_recipes.csv", index=False)

In [11]:
# Preview the first five values of the 'Images' column.
df['Images'][:5]

0    c("https://img.sndimg.com/food/image/upload/w_...
1    c("https://img.sndimg.com/food/image/upload/w_...
2    c("https://img.sndimg.com/food/image/upload/w_...
3    c("https://img.sndimg.com/food/image/upload/w_...
4    "https://img.sndimg.com/food/image/upload/w_55...
Name: Images, dtype: object

In [10]:
# Persist the current `df` as a pickle file alongside the other resources.
df.to_pickle("resource/full_recipes.pkl")

In [18]:
# Select the 'Images' values of all rows whose Name is exactly 'Cookie Salad' and print the
# second match (iloc[1]); this raises IndexError when fewer than two rows match.
result = df[df['Name'] == 'Cookie Salad']["Images"]
print(result.iloc[1])

"https://img.sndimg.com/food/image/upload/w_555,h_416,c_fit,fl_progressive,q_95/v1/img/submissions/recipe/42265412/fyWEz4cwSf2poDqTo81M_image.jpg"


In [5]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from concurrent.futures import ThreadPoolExecutor, as_completed

# Load the dataset
df = pd.read_csv('resource/recipes.csv')

# A row has a usable image only when the cell is present AND is not the
# 'character(0)' placeholder. The NaN test has to be explicit: comparing NaN
# with `!= 'character(0)'` evaluates to True, which would classify an empty
# cell as a valid image and let it be chosen as a donor for other rows.
has_image = df['Images'].notna() & (df['Images'] != 'character(0)')

# Identify rows with missing images
missing_image_rows = df[~has_image]

# Identify rows with valid images
valid_image_rows = df[has_image]

# Combine the text columns used for similarity comparison (Name, Description, Keywords);
# missing values become empty strings so the concatenation never yields NaN.
df['text_data'] = df['Name'].fillna('') + ' ' + df['Description'].fillna('') + ' ' + df['Keywords'].fillna('')

# Fit TF-IDF on every row so missing-image and valid-image rows share one vocabulary.
print('Transforming data...')
vectorizer = TfidfVectorizer()
text_vectors = vectorizer.fit_transform(df['text_data'])
print('Transforming done.')


Transforming data...
Transforming done.


In [None]:

# Row labels of the recipes that already have an image, as a plain Python list.
# process_missing_image uses this list both to select candidate rows from `text_vectors`
# and to translate the argmax position back into a row label.
# NOTE(review): the labels are used as positions into `text_vectors`, which presumably relies
# on `df` keeping the default 0..n-1 index from read_csv — confirm.
valid_image_indexes = valid_image_rows.index.tolist()

# Function to process each missing image row
def process_missing_image(idx, text_vectors, df, valid_image_indexes):
    """Pick an image for row ``idx`` from the most similar row that has one.

    The vector at ``text_vectors[idx]`` is compared (cosine similarity) with the
    vectors selected by ``valid_image_indexes``; the entry of
    ``valid_image_indexes`` at the highest-scoring position is the donor row.

    Returns:
        ``(idx, donor_index, donor_image)`` on success, where ``donor_image`` is
        ``df.at[donor_index, 'Images']``; ``(idx, None, None)`` if anything
        raised, after printing the error.
    """
    try:
        scores = cosine_similarity(
            text_vectors[idx],
            text_vectors[valid_image_indexes],
        ).flatten()

        # argmax is a position within the candidate list, not a row label
        top_position = scores.argmax()
        donor_index = valid_image_indexes[top_position]
        donor_image = df.at[donor_index, 'Images']
    except Exception as e:
        print(f"Error processing row {idx}: {e}")
        return idx, None, None
    else:
        return idx, donor_index, donor_image

# Initialize ThreadPoolExecutor to parallelize
with ThreadPoolExecutor() as executor:
    # Submit each missing image row to be processed in parallel
    futures = [
        executor.submit(process_missing_image, idx, text_vectors, df, valid_image_indexes)
        for idx in missing_image_rows.index
    ]

    # Collect results and update DataFrame
    for future in as_completed(futures):
        idx, best_match_idx, image_url = future.result()
        # Accept only a real, non-empty string: a NaN donor value is truthy and
        # would otherwise be written back as if it were a URL.
        if isinstance(image_url, str) and image_url:
            df.at[idx, 'Images'] = image_url
            print(f'Row {idx} best match found at {best_match_idx}')
        else:
            print(f"Failed to assign image for row {idx}")

# Save the complete updated frame, plus its first 100 rows as a small sample,
# so that the files on disk match the messages printed below.
df.to_csv('resource/recipes_updated.csv', index=False)
df.head(100).to_csv('resource/recipes_first_100.csv', index=False)

print("Missing images replaced and saved to 'resource/recipes_updated.csv'")
print("First 100 rows saved to 'resource/recipes_first_100.csv'")


Row 131 best match found at 226477
Row 201 best match found at 3863
Row 130 best match found at 98332
Row 200 best match found at 123349
Row 252 best match found at 204992
Row 31 best match found at 196226
Row 129 best match found at 144413
Row 199 best match found at 341653
Row 249 best match found at 4497
Row 128 best match found at 41494
Row 13 best match found at 264186
Row 195 best match found at 505720
Row 248 best match found at 262
Row 127 best match found at 520141
Row 194 best match found at 350423
Row 126 best match found at 500261
Row 191 best match found at 60551
Row 246 best match found at 212
Row 125 best match found at 50133
Row 28 best match found at 140276
Row 189 best match found at 371070
Row 68 best match found at 128239
Row 245 best match found at 37548
Row 124 best match found at 491425
Row 59 best match found at 183052
Row 186 best match found at 262
Row 22 best match found at 503022
Row 244 best match found at 212
Row 123 best match found at 391531
Row 185 best

: 

In [4]:
# Re-load the saved CSV to check whether any placeholder images remain in it.
df_new = pd.read_csv('resource/recipes_first_100.csv')

# Find rows where 'Images' is 'character(0)'
rows_with_missing_images = df_new[df_new['Images'] == 'character(0)']

# (row count, column count) of the rows that still hold the placeholder
rows_with_missing_images.shape

(0, 29)

In [7]:
# Scratch arithmetic: 21 * 360000 / 100.
# NOTE(review): presumably a runtime estimate (about 21 s per 100 rows over ~360000 rows,
# giving seconds) — confirm what the constants stand for or remove the cell.
print(21*360000/100) 

75600.0


In [14]:
# Collect and print the index labels of every row whose 'Images' value is the 'character(0)' placeholder.
indices_character_0 = df[df['Images'] == 'character(0)'].index
print(indices_character_0)


Index([     5,      8,     10,     13,     15,     19,     22,     23,     24,
           25,
       ...
       522502, 522504, 522505, 522506, 522507, 522508, 522510, 522512, 522514,
       522516],
      dtype='int64', length=356620)


In [10]:
# Print the number of missing (NaN) values in each column of `df`.
print((df.isna().sum()))

RecipeId                           0
Name                               0
AuthorId                           0
AuthorName                         0
CookTime                       82545
PrepTime                           0
TotalTime                          0
DatePublished                      0
Description                        5
Images                             1
RecipeCategory                   751
Keywords                       17237
RecipeIngredientQuantities         3
RecipeIngredientParts              0
AggregatedRating              253223
ReviewCount                   247489
Calories                           0
FatContent                         0
SaturatedFatContent                0
CholesterolContent                 0
SodiumContent                      0
CarbohydrateContent                0
FiberContent                       0
SugarContent                       0
ProteinContent                     0
RecipeServings                182911
RecipeYield                   348071
R

In [11]:
def handle_nan_values(row):
    """Clean one record in place and return it.

    * Any value that is an ``int``/``float`` and is NaN becomes ``0``.
    * ``RecipeYield`` is reduced to the integer formed by the digits at the very
      start of its string form (e.g. ``'4 kebabs'`` -> ``4``); if the text does
      not begin with a digit it becomes ``0``.

    ``row`` is modified directly; the same object is returned.
    """
    for column in row.index:
        current = row[column]
        if isinstance(current, (int, float)) and pd.isna(current):
            row[column] = 0  # Or any default value you prefer (e.g., "")

        if column != 'RecipeYield':
            continue

        # Re-read the cell: it may just have been replaced by 0 above.
        leading_digits = re.match(r"(\d+)", str(row[column]))
        row[column] = int(leading_digits.group(1)) if leading_digits else 0

    return row

In [12]:
import time
import pickle
from elasticsearch.helpers import bulk
import pandas as pd
from elasticsearch import Elasticsearch
from concurrent.futures import ThreadPoolExecutor

class Indexer:
    """Rebuild the Elasticsearch ``recipes`` index from the recipes CSV file.

    Creating an instance opens the client connection. :meth:`run_indexer`
    recreates the index, bulk-loads every CSV row and pickles the generated
    bulk actions to ``resource/recipes_index.pkl``.
    """

    # Columns copied verbatim into each document's ``_source``.
    _SOURCE_FIELDS = (
        "RecipeId", "Name", "AuthorId", "AuthorName", "CookTime", "PrepTime",
        "TotalTime", "DatePublished", "Description", "Images", "RecipeCategory",
        "Keywords", "RecipeIngredientQuantities", "RecipeIngredientParts",
        "AggregatedRating", "ReviewCount", "Calories", "FatContent",
        "SaturatedFatContent", "CholesterolContent", "SodiumContent",
        "CarbohydrateContent", "FiberContent", "SugarContent", "ProteinContent",
        "RecipeServings", "RecipeYield", "RecipeInstructions",
    )

    def __init__(self):
        """Connect to the local cluster and report how long the setup took.

        The password is read from the ``ELASTIC_PASSWORD`` environment variable;
        if it is unset the user is prompted, so no credential lives in the file.
        """
        # Imported here because this cell's import list does not provide them.
        import getpass
        import os

        self.start_time = time.time()
        self.csv_file_path = "resource/full_recipes.csv"
        password = os.getenv("ELASTIC_PASSWORD") or getpass.getpass("Elasticsearch password: ")
        self.es_client = Elasticsearch("https://localhost:9200",
                                       basic_auth=("elastic", password),
                                       # "~" is not expanded automatically when the path is opened
                                       ca_certs=os.path.expanduser("~/http_ca.crt"))
        self.init_time = time.time() - self.start_time
        print(f"Initialization took {self.init_time:.4f} seconds")

    def _set_refresh_interval(self, interval):
        """Set ``index.refresh_interval`` on the ``recipes`` index (``"-1"`` disables refresh)."""
        self.es_client.indices.put_settings(index="recipes", body={
            "settings": {
                "index": {
                    "refresh_interval": interval
                }
            }
        })

    def _build_action(self, row):
        """Turn one CSV row into a bulk ``index`` action keyed by its RecipeId."""
        row = self.handle_nan_values(row)  # Handle NaN values before creating the document
        return {
            "_op_type": "index",
            "_index": "recipes",
            "_id": row["RecipeId"],
            "_source": {field: row[field] for field in self._SOURCE_FIELDS},
        }

    def run_indexer(self):
        """Recreate the ``recipes`` index and load the CSV into it in batches of 5000.

        Refresh is switched off only after the new index exists (a setting applied
        earlier would be discarded by the delete, and would fail outright when the
        index does not exist yet). It is switched back on even if bulk indexing
        raises. Errors from ``bulk`` propagate to the caller.
        """
        start_time = time.time()

        # Delete the index if exists and create a new one
        self.es_client.options(ignore_status=[400, 404]).indices.delete(index='recipes')
        self.es_client.options(ignore_status=400).indices.create(index='recipes')

        # Load data from CSV
        data = pd.read_csv(self.csv_file_path)

        # Use ThreadPoolExecutor to parallelize the row processing
        with ThreadPoolExecutor() as executor:
            documents = list(executor.map(self._build_action, (row for _, row in data.iterrows())))

        # Disable refresh for bulk indexing
        self._set_refresh_interval("-1")
        try:
            # Perform bulk indexing in batches of 5000 or a suitable number
            for i in range(0, len(documents), 5000):
                batch = documents[i:i + 5000]
                # bulk() returns (number of successes, list of error items)
                success, failed = bulk(self.es_client, batch)
                print(f"Bulk indexed {len(batch)} documents: {success} successful, {len(failed)} failed")
        finally:
            # Enable refresh again after bulk indexing, also on failure
            self._set_refresh_interval("1s")

        # Pickle the indexed documents to a file
        with open('resource/recipes_index.pkl', 'wb') as f:
            pickle.dump(documents, f)
        print("Indexed documents pickled successfully!")

        end_time = time.time() - start_time
        print(f"run_indexer method took {end_time:.4f} seconds")

    def handle_nan_values(self, row):
        """Clean one row in place and return it.

        NaN values held as ``int``/``float`` become ``0``. ``RecipeYield`` is
        replaced by the integer formed by the digits at the start of its string
        form (e.g. ``'4 kebabs'`` -> ``4``), or ``0`` when it does not start with
        a digit.
        """
        for column in row.index:
            if isinstance(row[column], (int, float)) and pd.isna(row[column]):
                row[column] = 0  # Or any default value you prefer (e.g., "")

            # Special case for the RecipeYield column
            if column == 'RecipeYield':
                match = re.match(r"(\d+)", str(row[column]))
                row[column] = int(match.group(1)) if match else 0

        return row


# Run the indexing process: constructing Indexer connects to Elasticsearch (and prints the
# setup time); run_indexer() then rebuilds the 'recipes' index from the CSV.
indexer = Indexer()
indexer.run_indexer()


Initialization took 0.2157 seconds
Bulk indexed 5000 documents: 5000 successful, [] failed
Bulk indexed 5000 documents: 5000 successful, [] failed
Bulk indexed 5000 documents: 5000 successful, [] failed
Bulk indexed 5000 documents: 5000 successful, [] failed
Bulk indexed 5000 documents: 5000 successful, [] failed
Bulk indexed 5000 documents: 5000 successful, [] failed
Bulk indexed 5000 documents: 5000 successful, [] failed
Bulk indexed 5000 documents: 5000 successful, [] failed
Bulk indexed 5000 documents: 5000 successful, [] failed
Bulk indexed 5000 documents: 5000 successful, [] failed
Bulk indexed 5000 documents: 5000 successful, [] failed
Bulk indexed 5000 documents: 5000 successful, [] failed
Bulk indexed 5000 documents: 5000 successful, [] failed
Bulk indexed 5000 documents: 5000 successful, [] failed
Bulk indexed 5000 documents: 5000 successful, [] failed
Bulk indexed 5000 documents: 5000 successful, [] failed
Bulk indexed 5000 documents: 5000 successful, [] failed
Bulk indexed 

In [23]:
from flask import Flask, render_template, request, jsonify
from elasticsearch import Elasticsearch
import os

app = Flask(__name__)

# Initialize Elasticsearch client
# The password is taken from the ELASTIC_PASSWORD environment variable; when it is
# unset the user is prompted, so no credential is stored in the notebook.
import getpass

es = Elasticsearch(
    "https://localhost:9200",
    basic_auth=("elastic", os.getenv('ELASTIC_PASSWORD') or getpass.getpass("Elasticsearch password: ")),
    # "~" is not expanded automatically when the file is opened, so expand it here.
    ca_certs=os.path.expanduser("~/http_ca.crt"),
    # verify_certs is deliberately left at its default (enabled): passing
    # verify_certs=False would skip checking the server certificate and make
    # the CA file above pointless.
)

@app.route('/')
def index():
    """Serve the landing page by rendering the ``index.html`` template."""
    return render_template('index.html')

@app.route('/search', methods=['GET'])
def search():
    """Full-text recipe search over Name, Description and RecipeInstructions.

    Reads the ``q`` query-string parameter and asks Elasticsearch for up to 1000
    hits from the ``recipes`` index. Hits are de-duplicated by ``RecipeId``
    (first occurrence wins, order preserved).

    Responses:
        200 with a JSON list of recipe documents,
        404 with a message when nothing matched,
        400 with a message when ``q`` is missing or empty.
    """
    query = request.args.get('q', '')
    if not query:
        return jsonify({"message": "No query provided"}), 400  # Return message if no query is provided

    # Perform Elasticsearch search
    search_body = {
        "query": {
            "multi_match": {
                "query": query,
                "fields": ["Name", "Description", "RecipeInstructions"],
            }
        },
        "size": 1000  # Increase the size to get a large enough batch of results
    }
    response = es.search(index="recipes", body=search_body)

    # Keep the first document seen for each RecipeId; dict order preserves hit order
    first_by_recipe_id = {}
    for hit in response['hits']['hits']:
        source = hit['_source']
        first_by_recipe_id.setdefault(source.get('RecipeId'), source)
    results = list(first_by_recipe_id.values())

    if not results:
        return jsonify({"message": "No results found"}), 404  # Return message if no results found
    return jsonify(results)



In [None]:
# Start Flask's built-in server with debug mode off; the cell keeps running (and the kernel
# stays busy) until the server is interrupted.
app.run(debug=False)

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [25/Feb/2025 21:31:25] "GET /search?q=hello%20there HTTP/1.1" 200 -
127.0.0.1 - - [25/Feb/2025 21:31:26] "GET /search?q=hello%20there HTTP/1.1" 200 -
127.0.0.1 - - [25/Feb/2025 21:31:26] "GET /search?q=hello%20there HTTP/1.1" 200 -
127.0.0.1 - - [25/Feb/2025 21:31:27] "GET /search?q=hello%20there HTTP/1.1" 200 -
127.0.0.1 - - [25/Feb/2025 21:31:31] "GET /search?q=hello HTTP/1.1" 200 -
127.0.0.1 - - [25/Feb/2025 21:31:33] "GET /search?q=hello HTTP/1.1" 200 -
127.0.0.1 - - [25/Feb/2025 21:31:39] "GET /search?q=panna%20cotta HTTP/1.1" 200 -
127.0.0.1 - - [25/Feb/2025 21:45:51] "GET /search?q=Velvet%20Mousse HTTP/1.1" 200 -
