# Data Preparation

In [None]:
from google.colab import drive
import os

# Mount Google Drive at /content/drive.
drive.mount('/content/drive')
# Switch the working directory to the shared project folder so that the relative
# paths used in the following cells resolve against it.
path = '/content/drive/Shared drives/650 final project'
os.chdir(path)
# os.listdir(path)  # uncomment to list the contents of the project folder

Mounted at /content/drive


In [None]:
# import io

# from google.colab import files as colab_files
# uploaded = colab_files.upload()

In [None]:
import pandas as pd
import numpy as np
import re

### Build inverted index of recipes



#### Skip these code blocks if the inverted index has already been built

In [None]:
# Read the raw recipe dump as tab-separated text into a single 'recipe_text' column.
raw_recipes = pd.read_csv(
    'original_data/all_recipes.txt',
    sep='\t',
    names=['recipe_text'],
)
# Discard the first 10 rows and renumber the remaining rows from 0.
recipe = raw_recipes.iloc[10:].reset_index(drop=True)
recipe.head()

Unnamed: 0,recipe_text
0,"2 onions , chopped 2 cloves garlic , minced 1 ..."
1,1 tablespoon olive oil 1 (3 pound) roasting ch...
2,"1 sweet potato , peeled and cubed 1 medium egg..."
3,"1 onion , sliced 2 cloves garlic , minced (opt..."
4,"2 tablespoons olive oil 2 medium onions , chop..."


In [None]:
recipe.duplicated().unique()  # both False and True show up, i.e. there are duplicate recipes

array([False,  True])

In [None]:
# Drop duplicate recipes.
# NOTE: dropping rows leaves gaps in the index, so ignore_index=True is required to renumber it as a
# contiguous, increasing index; otherwise the doc_id values will not match up later on.
recipe.drop_duplicates(inplace=True,ignore_index=True)

In [None]:
recipe.tail()  # show the last rows after de-duplication (index was renumbered via ignore_index=True)

Unnamed: 0,recipe_text
1405,1 cup lukewarm water 1/4 cup whole wheat flour...
1406,"1 clove garlic , minced 1/2 cup mayonnaise 2 t..."
1407,"16 cups plain yogurt 1 teaspoon salt , or to t..."
1408,1/2 cup water 1 teaspoon cornstarch 1/3 cup le...
1409,"4 cloves garlic , peeled 1/4 cup vegetable oil..."


In [None]:
# Export the recipe text only (no id column, no header row) for the BM25 corpus.
# The exported file still needs to be saved as a .dat file and moved into the recipe_ir folder.
recipe.to_csv('recipe_ir.txt',index=False,header=False)

In [None]:
# recipe_n = pd.read_csv('recipe_ir.txt',sep='\t',names=['recipe'])
# recipe_n.head()
# recipe_n.shape

In [None]:
# Give every recipe (document) an id derived from its row index; ids start at 1.
recipe = recipe.assign(recipe_id=recipe.index + 1)
recipe.tail()

Unnamed: 0,recipe_text,recipe_id
1405,1 cup lukewarm water 1/4 cup whole wheat flour...,1406
1406,"1 clove garlic , minced 1/2 cup mayonnaise 2 t...",1407
1407,"16 cups plain yogurt 1 teaspoon salt , or to t...",1408
1408,1/2 cup water 1 teaspoon cornstarch 1/3 cup le...,1409
1409,"4 cloves garlic , peeled 1/4 cup vegetable oil...",1410


In [None]:
# Save the recipes as a csv file containing both the recipe text and the recipe id (no index column).
recipe.to_csv('recipe.csv',index=False)

#### Run the code from here if the inverted index has already been built

In [None]:
# Reload the recipes (text + id) from the saved csv file.
# If no new documents have been added to the recipes, start from this cell and skip the cells above.
recipe_df = pd.read_csv('recipe.csv')
recipe_df.tail()

Unnamed: 0,recipe_text,recipe_id
1405,1 cup lukewarm water 1/4 cup whole wheat flour...,1406
1406,"1 clove garlic , minced 1/2 cup mayonnaise 2 t...",1407
1407,"16 cups plain yogurt 1 teaspoon salt , or to t...",1408
1408,1/2 cup water 1 teaspoon cornstarch 1/3 cup le...,1409
1409,"4 cloves garlic , peeled 1/4 cup vegetable oil...",1410


In [None]:
# recipe_df.duplicated(subset='recipe_text').unique()  # check if there is duplicate rows - no duplicates

# BM25 ranking model

In [None]:
# Read the ground-truth annotation (columns: query_id, recipe_id, rating).
annotation = pd.read_csv('update annotation/ground_truth_annotation_v2.csv')
annotation.head()

Unnamed: 0,query_id,recipe_id,rating
0,1,274,2
1,1,294,2
2,1,295,2
3,1,314,2
4,1,318,2


In [None]:
# NDCG@10
def NDCG(query_id, k=10, ranking=None, judgments=None):
  """Compute NDCG@k for a single query.

  The gain at rank 1 is not discounted and the gain at rank i > 1 is divided
  by log2(i). Lists shorter than ``k`` are padded with zero ratings.

  Parameters
  ----------
  query_id : int
      Id of the query to evaluate.
  k : int, default 10
      Cut-off rank.
  ranking : DataFrame, optional
      Retrieval results with columns ``query_id`` and ``rating``. The rows of
      one query must be in retrieval (rank) order. Defaults to the global
      ``bm25_ranking``.
  judgments : DataFrame, optional
      Ground-truth ratings with columns ``query_id`` and ``rating``.
      Defaults to the global ``annotation``.

  Returns
  -------
  float
      DCG@k / IDCG@k, or 0.0 when the query has no positive ideal gain
      (instead of the NaN produced by 0/0).

  Raises
  ------
  ValueError
      If ``k`` is smaller than 1.
  """
  if k < 1:
    raise ValueError("k must be at least 1")
  if ranking is None:
    ranking = bm25_ranking
  if judgments is None:
    judgments = annotation

  # Keep the retrieved ratings in rank order: sorting them (as was done before)
  # makes the score independent of where the relevant documents were ranked.
  retrieved = list(ranking.loc[ranking['query_id'] == query_id, 'rating'])[:k]
  # The ideal ranking is the annotated ratings in descending order.
  ideal = sorted(judgments.loc[judgments['query_id'] == query_id, 'rating'], reverse=True)[:k]

  # Pad with zero gain when fewer than k results / judgments exist.
  retrieved = retrieved + [0] * (k - len(retrieved))
  ideal = ideal + [0] * (k - len(ideal))

  discounter = np.log2(np.arange(1, k + 1))
  discounter[0] = 1  # log2(1) == 0, so rank 1 is left undiscounted

  dcg = np.sum(np.asarray(retrieved, dtype=float) / discounter)
  idcg = np.sum(np.asarray(ideal, dtype=float) / discounter)
  if idcg == 0:
    return 0.0
  return float(dcg / idcg)

## BM25 via metapy

In [None]:
!pip install metapy
import metapy

Collecting metapy
[?25l  Downloading https://files.pythonhosted.org/packages/81/a4/92dae084446597d6bbf355e7eaff3e83dcb51e33d434f43ecdea4c0c4b0a/metapy-0.2.13-cp36-cp36m-manylinux1_x86_64.whl (14.3MB)
[K     |████████████████████████████████| 14.3MB 309kB/s 
[?25hInstalling collected packages: metapy
Successfully installed metapy-0.2.13


In [None]:
# Corpus descriptor read by metapy: a line corpus whose full text is stored in the index.
corpus_settings = (
    'type = "line-corpus"\n'
    'store-full-text = true\n'
)
with open('recipe_ir/tutorial.toml', 'w') as corpus_file:
    corpus_file.write(corpus_settings)

# Main metapy configuration: dataset location, index folder, stop words and analyzer chain.
config = """prefix = "." # tells MeTA where to search for datasets

dataset = "recipe_ir" # the subfolder under the prefix directory
corpus = "tutorial.toml" # a configuration file for the corpus specifying its format & additional args

index = "recipe_ir-idx" # subfolder of the current working directory to place index files

query-judgements = "recipe_ir/recipe_ir-qrels.txt" # file containing the relevance judgments for this dataset, not used here

stop-words = "lemur-stopwords.txt"

[[analyzers]]
method = "ngram-word"
ngram = 1
filter = "default-unigram-chain"
"""
with open('recipe_ir-config.toml', 'w') as config_file:
    config_file.write(config)

In [None]:
# Build the inverted index with metapy from the configuration file written above.
# NOTE: after changing recipe_ir.dat or recipe_ir-queries.txt, delete the existing index folder
# (index = "recipe_ir-idx" in the config) and run this cell again.
inv_idx = metapy.index.make_inverted_index('recipe_ir-config.toml')

In [None]:
# BM25 ranker and the number of hits requested per query.
ranker = metapy.index.OkapiBM25(k1 = 1.2, b = 0.5, k3 = 500)
num_results = 10

# Collects one (query_id, recipe_id) pair per hit, in the order returned by ranker.score.
retrieval_results = []
with open('recipe_ir/recipe_ir-queries.txt') as query_file:
    # Each line of the file is one query; query_num is its 0-based line number.
    for query_num, line in enumerate(query_file):
        print(query_num,line)
        query = metapy.index.Document()
        query.content(line.strip())
        results = ranker.score(inv_idx, query, num_results)
        # Each hit is a (doc_id, score) pair with doc_id starting from 0; both ids are
        # shifted by one to align with the ids used in the ground truth annotation.
        retrieval_results.extend((query_num + 1, doc_id + 1) for doc_id, _ in results)

        # To inspect the hits of a query, uncomment:
        # print("Query: ", query.content())
        # for num, (d_id, _) in enumerate(results):
        #     print(str(num + 1), d_id + 1, inv_idx.metadata(d_id).get('content'))
        # break

0 How can I cook Lettuce?

1 What part of the green onion can be use?

2 How to make stir-fried pork and cabbage?

3 pork and pepper stir fry

4 How to make broccoli salad

5 How to cook spinach and meat

6 Fried noodles with green peas

7 How to make oven-Roasted Asparagus

8 How to make cucumber salad

9 stir fry meat and Bok Choy

10 smached potato

11 minced tomato

12 minced garlic

13 minced ginger

14 ginger powder

15 garlic powder

16 chopped green bell pepper

17 chopped red bell pepper

18 chopped yellow bell pepper

19 sliced onion

20 chopped bay leaf

21 sliced carrots

22 chopped pepper


In [None]:
# Save the BM25 retrieval results as (query_id, recipe_id) rows.
# The annotated ratings are joined onto these pairs after the file is read back.
import csv
# newline='' is the mode the csv module asks for when a file is handed to csv.writer;
# it prevents extra blank lines on platforms that translate line endings.
with open("bm25_ranking.csv","w",newline='') as f:
    csv_writer = csv.writer(f)
    csv_writer.writerow(["query_id", "recipe_id"])
    csv_writer.writerows(retrieval_results)
# No explicit close: the with-block closes the file on exit.

In [None]:
# Load the saved BM25 results (query_id, recipe_id). The cells above do not need to be
# re-run if the recipes and queries have not changed.
bm25_ranking = pd.read_csv('bm25_ranking.csv')
bm25_ranking.head()

Unnamed: 0,query_id,recipe_id
0,1,295
1,1,687
2,1,320
3,1,1281
4,1,1134


In [None]:
# Attach the annotated rating to every retrieved (query_id, recipe_id) pair.
# Only the two key columns are taken from bm25_ranking so the cell can be re-run safely:
# merging a frame that already carries 'rating' would create rating_x / rating_y and the
# fillna below would fail with a KeyError.
bm25_ranking = bm25_ranking[['query_id','recipe_id']].merge(annotation,how='left',on=['query_id','recipe_id'])
# Retrieved recipes without an annotation count as not relevant.
bm25_ranking['rating'] = bm25_ranking['rating'].fillna(0)

In [None]:
# compute NDCG@10 for every query
# The queries are loaded here so that this cell also works on a fresh kernel; query_df
# is otherwise only created further down in the notebook. Query ids are 1-based line
# numbers, matching the ids produced by the metapy retrieval loop.
with open('recipe_ir/recipe_ir-queries.txt') as query_file:
    query_df = pd.DataFrame({'query': [line.strip() for line in query_file]})
query_df['query_id'] = query_df.index+1

# Evaluate exactly the queries present in query_df instead of a hard-coded id range.
NDCG_list = [NDCG(i) for i in query_df['query_id']]
query_df['NDCG@10'] = NDCG_list
print(np.mean(NDCG_list))
query_df

0.940287725426275


Unnamed: 0,query,query_id,NDCG@10
0,How can I cook Lettuce?,1,1.0
1,What part of the green onion can be use?,2,1.0
2,How to make stir-fried pork and cabbage?,3,1.0
3,pork and pepper stir fry,4,0.448261
4,How to make broccoli salad,5,0.935648
5,How to cook spinach and meat,6,0.751444
6,Fried noodles with green peas,7,1.0
7,How to make oven-Roasted Asparagus,8,1.0
8,How to make cucumber salad,9,0.751444
9,stir fry meat and Bok Choy,10,1.0


## BM25 via rank_bm25 instead of metapy

In [None]:
!pip install rank_bm25
from rank_bm25 import BM25Okapi



In [None]:
# Load the queries, one per line. The file is read as plain text rather than parsed as
# CSV, so commas or quotes inside a query are kept as-is and no line is skipped; the
# 1-based query_id therefore always equals the line number, as in the metapy loop.
with open('recipe_ir/recipe_ir-queries.txt') as query_file:
    query_df = pd.DataFrame({'query': [line.strip() for line in query_file]})
query_df['query_id'] = query_df.index+1

In [None]:
# Stop words (assumed to be one word per line). The file is read as plain text because
# pd.read_fwf treats the first line as a header by default, which silently dropped the
# first stop word.
with open('lemur-stopwords.txt') as sw_file:
  sw_list = [word.strip() for word in sw_file if word.strip()]
sw = pd.DataFrame({'stop_word': sw_list})  # DataFrame form kept for backward compatibility
sw_set = set(sw_list)  # set lookup instead of scanning the list for every token


def _tokenize(text):
  """Lower-case ``text``, split on whitespace, strip surrounding punctuation and drop stop words.

  Stripping punctuation lets a query token such as "Lettuce?" match the corpus token
  "lettuce"; tokens that consist of punctuation only are discarded.
  """
  stripped = (re.sub(r'^\W+|\W+$', '', word) for word in text.lower().split())
  return [token for token in stripped if token and token not in sw_set]


recipe_df = pd.read_csv('recipe.csv')

# The same tokenizer is applied to documents and queries so their vocabularies agree.
tokenized_corpus = [_tokenize(document) for document in recipe_df.recipe_text]

bm25 = BM25Okapi(tokenized_corpus)
top_k = 10  # number of recipes kept per query
per_query_frames = []
for query_id, query in zip(query_df['query_id'], query_df['query']):
  print(query)
  tokenized_query = _tokenize(query)
  doc_scores = bm25.get_scores(tokenized_query)   # one BM25 score per document
  # positions of the top_k highest-scoring documents, best first
  recipe_temp = recipe_df.iloc[np.argsort(doc_scores)[::-1][:top_k],:].copy()
  # scalar assignment broadcasts, so this also works when fewer than top_k recipes exist
  recipe_temp['query_id'] = query_id
  recipe_temp['query'] = query
  per_query_frames.append(recipe_temp)

# Concatenate once at the end instead of growing the frame inside the loop.
bm25_df = pd.concat(per_query_frames,axis=0) if per_query_frames else pd.DataFrame()

How can I cook Lettuce?
What part of the green onion can be use?
How to make stir-fried pork and cabbage?
pork and pepper stir fry
How to make broccoli salad
How to cook spinach and meat
Fried noodles with green peas
How to make oven-Roasted Asparagus
How to make cucumber salad
stir fry meat and Bok Choy
smached potato
minced tomato
minced garlic
minced ginger
ginger powder
garlic powder
chopped green bell pepper
chopped red bell pepper
chopped yellow bell pepper
sliced onion
chopped bay leaf
sliced carrots
chopped pepper


In [None]:
# Attach the annotated rating to every retrieved (query_id, recipe_id) pair from rank_bm25.
bm25_ranking = bm25_df[['query_id','recipe_id']].merge(annotation,how='left',on=['query_id','recipe_id'])
# Retrieved recipes without an annotation get rating 0.
bm25_ranking['rating'] = bm25_ranking['rating'].fillna(0)

In [None]:
# Compute NDCG@10 for every query in query_df. Taking the ids from query_df instead of a
# hard-coded range keeps NDCG_list the same length as query_df when queries are added or removed.
NDCG_list = [NDCG(i) for i in query_df['query_id']]
query_df['NDCG@10'] = NDCG_list
print(np.mean(NDCG_list))
query_df

0.8525421059041405


Unnamed: 0,query,query_id,NDCG@10
0,How can I cook Lettuce?,1,0.0
1,What part of the green onion can be use?,2,0.666318
2,How to make stir-fried pork and cabbage?,3,0.484454
3,pork and pepper stir fry,4,0.786228
4,How to make broccoli salad,5,0.935648
5,How to cook spinach and meat,6,0.677821
6,Fried noodles with green peas,7,1.0
7,How to make oven-Roasted Asparagus,8,1.0
8,How to make cucumber salad,9,0.500701
9,stir fry meat and Bok Choy,10,0.760188


## Simple interaction with users and test on other queries

In [None]:
# Read a free-text query from the console and print the top hits of the metapy BM25 ranker.
num_results = 10 # top10 results
line = input('input your query:')

query = metapy.index.Document()
query.content(line.strip())
results = ranker.score(inv_idx, query, num_results)

print("Query: ", query.content())
print("Retrieved Results")
# Each hit is a (doc id, score) pair; only the 'content' metadata of the document is printed.
for position, hit in enumerate(results, start=1):
    print(str(position), inv_idx.metadata(hit[0]).get('content'))

input your query:how to make tomatoes noodles
Query:  how to make tomatoes noodles
Retrieved Results
1 "1 pound sweet Italian sausage 3/4 pound lean ground beef 1/2 cup minced onion 2 cloves garlic , crushed 1 (28 ounce) can crushed tomatoes 2 (6 ounce) cans tomato paste 2 (6.5 ounce) cans canned tomato sauce 1/2 cup water 2 tablespoons white sugar 1 1/2 teaspoons dried basil leaves 1/2 teaspoon fennel seeds 1 teaspoon Italian seasoning 1 tablespoon salt 1/4 teaspoon ground black pepper 4 tablespoons chopped fresh parsley 12 lasagna noodles 16 ounces ricotta cheese 1 egg 1/2 teaspoon salt 3/4 pound mozzarella cheese , sliced 3/4 cup grated Parmesan cheese  | In a Dutch oven , cook sausage , ground beef , onion , and garlic over medium heat until well browned . Stir in crushed tomatoes , tomato paste , tomato sauce , and water . Season with sugar , basil , fennel seeds , Italian seasoning , 1 tablespoon salt , pepper , and 2 tablespoons parsley . Simmer , covered , for about 1 1/2 hours