In [2]:
import json
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
from nltk.stem import PorterStemmer
import re
from nltk.tokenize import word_tokenize
import numpy as np
from flask import Flask, request
from elasticsearch import Elasticsearch, helpers
import time

In [3]:
df = pd.read_parquet('./resource/recipes.parquet')

In [4]:
cleaned_df = df
cleaned_df['CookTime'] = cleaned_df['CookTime'].fillna('') 
cleaned_df['AggregatedRating'] = cleaned_df['AggregatedRating'].fillna(0)
cleaned_df['ReviewCount'] = cleaned_df['ReviewCount'].fillna(0)
cleaned_df['Description'] = cleaned_df['Description'].fillna('')
cleaned_df['RecipeCategory'] = cleaned_df['RecipeCategory'].fillna('')
cleaned_df['RecipeServings'] = cleaned_df['RecipeServings'].fillna(0)
cleaned_df['RecipeYield'] = cleaned_df['RecipeYield'].fillna('0')
for row in cleaned_df.loc[cleaned_df['Images'].isna()].index:
    cleaned_df.at[row, 'Images'] = []
cleaned_df.isna().sum()


RecipeId                      0
Name                          0
AuthorId                      0
AuthorName                    0
CookTime                      0
PrepTime                      0
TotalTime                     0
DatePublished                 0
Description                   0
Images                        0
RecipeCategory                0
Keywords                      0
RecipeIngredientQuantities    0
RecipeIngredientParts         0
AggregatedRating              0
ReviewCount                   0
Calories                      0
FatContent                    0
SaturatedFatContent           0
CholesterolContent            0
SodiumContent                 0
CarbohydrateContent           0
FiberContent                  0
SugarContent                  0
ProteinContent                0
RecipeServings                0
RecipeYield                   0
RecipeInstructions            0
dtype: int64

In [5]:
cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 522517 entries, 0 to 522516
Data columns (total 28 columns):
 #   Column                      Non-Null Count   Dtype              
---  ------                      --------------   -----              
 0   RecipeId                    522517 non-null  float64            
 1   Name                        522517 non-null  object             
 2   AuthorId                    522517 non-null  int32              
 3   AuthorName                  522517 non-null  object             
 4   CookTime                    522517 non-null  object             
 5   PrepTime                    522517 non-null  object             
 6   TotalTime                   522517 non-null  object             
 7   DatePublished               522517 non-null  datetime64[ns, UTC]
 8   Description                 522517 non-null  object             
 9   Images                      522517 non-null  object             
 10  RecipeCategory              522517 non-null 

In [39]:
cleaned_df.to_parquet('recipes_parq.parquet')

In [43]:
class Indexer:
    def __init__(self):
        self.df = pd.read_parquet('recipes_parq.parquet')
        self.es_client = Elasticsearch("https://localhost:9200", basic_auth=("elastic","2C2POmj1a75XqoUOs1-h"), ca_certs="~/http_ca.crt")

    def run_indexer(self):
        self.es_client.indices.create(index='foodir', ignore=400)
        self.es_client.indices.delete(index='foodir', ignore=[400, 404])
        self.df['_index'] = 'foodir'
        # j = json.loads(self.df[['_index','RecipeId','Name','RecipeIngredientParts','RecipeInstructions']].to_json(orient='records'))
        j = json.loads(self.df.to_json(orient='records'))
        helpers.bulk(self.es_client, j)

In [44]:
s = Indexer()
s.run_indexer()

  self.es_client.indices.create(index='foodir', ignore=400)
  self.es_client.indices.delete(index='foodir', ignore=[400, 404])


In [45]:
s.es_client.search(index='foodir', suggest_field='Name', suggest_text='very blur', suggest_mode='missing')['suggest']

{'Name': [{'text': 'very', 'offset': 0, 'length': 4, 'options': []},
  {'text': 'blur',
   'offset': 5,
   'length': 4,
   'options': [{'text': 'blue', 'score': 0.75, 'freq': 2003},
    {'text': 'bluer', 'score': 0.75, 'freq': 1},
    {'text': 'blui', 'score': 0.75, 'freq': 1},
    {'text': 'bour', 'score': 0.75, 'freq': 1},
    {'text': 'blu', 'score': 0.6666666, 'freq': 4}]}]}

In [46]:
s.es_client.search(index='foodir', query={
            "script_score":{
                "query":{
                    "combined_fields": {
                        "query": 'chicken',
                        "fields": ["Name","RecipeIngredientParts","RecipeInstructions"],
                    },
                },
                "script":{
                    "source": "_score * (doc['ReviewCount'].value + 1)"
                }
            }
        }, suggest_field='Name', suggest_text='bourbon chicken', suggest_mode='missing')['hits']['hits']

[{'_index': 'foodir',
  '_id': 'oT42UI4BzlRi80rRIk3O',
  '_score': 12138.582,
  '_ignored': ['Description.keyword'],
  '_source': {'RecipeId': 45809.0,
   'Name': 'Bourbon Chicken',
   'AuthorId': 58278,
   'AuthorName': 'LinMarie',
   'CookTime': 'PT20M',
   'PrepTime': 'PT15M',
   'TotalTime': 'PT35M',
   'DatePublished': 1037131980000,
   'Description': "I searched and finally found this recipe on the internet. It is a copycat of the Bourbon Chicken sold in Chinese carry-outs in my hometown.  This recipe is so good that my sons gobble it up leaving me just a spoonful. Their excuse was they thought I had eaten.  Editor's Note:  Named Bourbon Chicken because it was supposedly created by a Chinese cook who worked in a restaurant on Bourbon Street.",
   'Images': ['https://img.sndimg.com/food/image/upload/w_555,h_416,c_fit,fl_progressive,q_95/v1/img/recipes/45/80/9/MwuCd6HpQ5mDvn4OLRkA_0S9A9886.jpg',
    'https://img.sndimg.com/food/image/upload/w_555,h_416,c_fit,fl_progressive,q_95/v1/

In [47]:
def queryEslPr(query):
    results = s.es_client.search(index='foodir', query={
            "combined_fields": {
                "query": query,
                "fields": ["Name","RecipeIngredientParts","RecipeInstructions"]
            }
        }, suggest_field='Name', suggest_text=query, suggest_mode='missing')
    print("Got %d Hits:" % results['hits']['total']['value'])
    for hit in results['hits']['hits']:
        print("The recipe is '{0}' '{1}' '{2}'.".format(hit['_source']["RecipeId"],hit['_source']["Name"],hit['_score']))
    for op in results['suggest']['Name']:
        print(op['options'])

In [48]:
queryEslPr("ginger gingerbread")

Got 10000 Hits:
The recipe is '323457.0' 'Caramel Gingerbread Cupcakes' '15.352105'.
The recipe is '537189.0' 'Gingerbread Men in a Bubble Bath' '14.838849'.
The recipe is '271246.0' 'Gingerbread Man Cookies' '14.809714'.
The recipe is '352209.0' 'Pumpkin Ginger-Gingerbread' '14.727834'.
The recipe is '271513.0' 'Spiced Christmas Gingerbread Coffee With Cognac Chantilly Cream' '14.554615'.
The recipe is '139766.0' 'Gingerbread Tres Leches Cake' '14.4893055'.
The recipe is '200439.0' 'Gingerbread Martini' '14.427593'.
The recipe is '106063.0' 'Raisin Gingerbread Loaf With Ginger Icing' '14.410472'.
The recipe is '148588.0' 'Gingerbread' '14.402438'.
The recipe is '143485.0' 'Warm Gingerbread With Lemon Glaze' '14.399534'.
[]
[]


In [49]:
def get_recipes(query: str):
    return {
        "function_score": {
            "query": {
                "dis_max": {
                    "queries": [
                        {"match": {"Name": query}},
                        {"match": {"RecipeIngredientParts": query}},
                        {"match": {"RecipeInstructions": query}},
                        {"match": {"Keywords": query}},
                    ],
                    "tie_breaker": 0.3,
                }
            },
            "functions": [
                {
                    "script_score": {
                        "script": {
                            "source": "(doc['AggregatedRating'].value * doc['ReviewCount'].value + 4.632013709922984 * 100) / (doc['AggregatedRating'].value + 100)"
                        },
                    },
                    "weight": 1,
                },
                {
                    "script_score": {
                        "script": {"source": "_score"},
                    },
                    "weight": 1,
                },
            ],
            "score_mode": "multiply",
     }
    }
    
def get_query(query: str):
    return {
        "script_score":{
                "query":{
                    "combined_fields": {
                        "query": query,
                        "fields": ["Name","RecipeIngredientParts","RecipeInstructions"],
                    },
                },
                "script":{
                    "source": "_score * (doc['ReviewCount'].value + 1)"
                }
            }
    }