Make Embeddings
===

Create embeddings from the following sources:
 - Rori micro-lessons
 - OpenStax textbook
 - RecipeNLG

RecipeNLG data: https://paperswithcode.com/paper/recipenlg-a-cooking-recipes-dataset-for-semi

(Note: RecipeNLG is licensed for non-commercial use only.)

In [26]:
import json
import os
import time
from pathlib import Path

import dotenv
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import openai
import pandas as pd
import pyserini
import scipy
import sklearn.metrics
import tiktoken
from tqdm import tqdm

from llm_math_education import retrieval

In [27]:
# Load environment variables (including the OpenAI key read below) from the
# .env file one directory above this notebook.
dotenv.load_dotenv("../.env")

True

In [28]:
# Sanity check that the key is set: display only its first three characters so
# the secret itself never ends up in the saved notebook output.
os.environ["OPENAI_API_KEY"][:3]

'sk-'

In [29]:
# Hand the key from the environment to the openai client module.
openai.api_key = os.environ["OPENAI_API_KEY"]

In [30]:
# Project data directory, relative to this notebook; fail fast if it is missing.
data_dir = Path("../data")
assert data_dir.exists()

# Read the derived Rori micro-lesson table; the trailing expression displays
# its (rows, columns) shape.
rori_lessons_path = data_dir / "derived" / "rori_lessons.json"
with rori_lessons_path.open() as lessons_file:
    df = pd.read_json(lessons_file)
df.shape

(67, 18)

In [31]:
# Output directory for the Rori and OpenStax databases created below.
# exist_ok=True makes this cell safe to re-run.
embedding_dir = data_dir / "app_data"
embedding_dir.mkdir(exist_ok=True)

In [8]:
# Tokeniser for the embedding model; used here only to count tokens.
ada_tokeniser = tiktoken.encoding_for_model("text-embedding-ada-002")

# One record per lesson. The text to embed is the short description followed by
# every lesson part, each separated by a blank line.
ds = []
for row in df.itertuples():
    db_string = row.short_description + "\n\n" + "\n\n".join(row.lesson_parts)
    ds.append(
        dict(
            lesson_code=row.lesson_code,
            grade=row.grade,
            db_string=db_string,
            db_string_token_count=len(ada_tokeniser.encode(db_string)),
        )
    )
embed_df = pd.DataFrame(ds)

In [9]:
# Inspect the ten lessons with the fewest tokens in their combined text.
embed_df.sort_values(by="db_string_token_count").head(10)

Unnamed: 0,lesson_code,grade,db_string,db_string_token_count
28,G6.N1.3.3.5,6,In today's lesson we are going to solve word p...,47
49,G6.N3.1.1.4,6,Even decimal numbers have place values . Let u...,54
27,G6.N1.3.3.4,6,Let us multiply a 3 and 4 digit number with re...,55
26,G6.N1.3.3.3,6,In today's lesson let us learn to multiply a 3...,65
31,G6.N1.3.3.13,6,In today's lesson we are going to try out word...,65
29,G6.N1.3.3.8,6,In today's lesson we are going to learn and pr...,66
57,G7.N3.3.2.2,7,"Today, we are going to learn how to divide a d...",70
30,G6.N1.3.3.9,6,In today's lesson we are going to practice per...,71
46,G6.N2.2.3.10,6,Can you find the reciprocal of a fraction? Tur...,82
15,G6.N1.1.1.1,6,"Today we will read and write numbers upto 1,00...",83


In [10]:
# Build the Rori micro-lesson retrieval database from embed_df.
# NOTE(review): judging by the files listed by the `du` cell later in this
# notebook, "rori_microlesson" is used as the file-name prefix inside
# embedding_dir and "db_string" names the column to embed — confirm against
# RetrievalDb.
db = retrieval.RetrievalDb(embedding_dir, "rori_microlesson", "db_string", embed_df)
db

<llm_math_education.retrieval.RetrievalDb at 0x29814bd30>

In [11]:
# Compute the embeddings for the Rori database.
# NOTE(review): presumably this calls the OpenAI embedding API (the key is set
# above), so re-running may incur cost — confirm in RetrievalDb.
db.create_embeddings()

In [12]:
# Persist the Rori database.
# NOTE(review): presumably writes under embedding_dir — confirm in RetrievalDb.
db.save_df()

### Create OpenStax embeddings

In [13]:
# Read the derived OpenStax Prealgebra subsection table; the trailing
# expression displays its (rows, columns) shape.
derived_openstax_dir = data_dir / "derived" / "openstax"
openstax_subsection_path = derived_openstax_dir / "openstax_prealgebra_subsection_df.parquet"
openstax_subsection_df = pd.read_parquet(openstax_subsection_path)
openstax_subsection_df.shape

(247, 6)

In [14]:
# Text to embed for each subsection: its title, a colon and newline, then its content.
subsection_db_strings = []
for title, content in zip(openstax_subsection_df["title"], openstax_subsection_df["content"]):
    subsection_db_strings.append(title + ":\n" + content)
openstax_subsection_df["db_string"] = subsection_db_strings

In [15]:
# Eyeball three random subsections (unseeded, so the rows shown change on each run).
openstax_subsection_df.sample(n=3)

Unnamed: 0,title,content,index,n_tokens,chapter,section,db_string
100,Add and Subtract Mixed Numbers with Different ...,To add or subtract mixed numbers with differen...,4,54,4,6,Add and Subtract Mixed Numbers with Different ...
141,Find Percent Increase and Percent Decrease,People in the media often talk about how much ...,2,128,6,2,Find Percent Increase and Percent Decrease:\nP...
19,Translate Word Phrases to Math Notation,"Earlier in this section, we translated math no...",3,50,1,4,Translate Word Phrases to Math Notation:\nEarl...


In [16]:
# Keep only the locator columns and the text to embed. The copy keeps the
# database frame independent of openstax_subsection_df.
openstax_db_columns = ["chapter", "section", "index", "db_string"]
openstax_df = openstax_subsection_df[openstax_db_columns].copy()
openstax_df.shape

(247, 4)

In [17]:
# Build the OpenStax subsection retrieval database from openstax_df.
# Note that this rebinds `db`, which previously held the Rori database.
db = retrieval.RetrievalDb(embedding_dir, "openstax_subsection", "db_string", openstax_df)

In [18]:
# Compute the embeddings for the OpenStax database.
# NOTE(review): presumably calls the OpenAI embedding API — confirm in RetrievalDb.
db.create_embeddings()

In [19]:
# Persist the OpenStax database.
# NOTE(review): presumably writes under embedding_dir — confirm in RetrievalDb.
db.save_df()

In [20]:
# List the files in the embedding directory with human-readable sizes, largest first.
!du -h {embedding_dir}/* | sort -rh

2.9M	../data/app_data/openstax_subsection_embed.npy
808K	../data/app_data/rori_microlesson_embed.npy
128K	../data/app_data/openstax_subsection_df.parquet
 32K	../data/app_data/rori_microlesson_df.parquet


### Verify loading

In [21]:
# Re-open the OpenStax database without passing a DataFrame.
# NOTE(review): presumably this loads the previously saved files from
# embedding_dir — confirm in RetrievalDb.
db = retrieval.RetrievalDb(embedding_dir, "openstax_subsection", "db_string")

In [22]:
# Score a sample query against the database; the displayed shape should have
# one entry per stored row.
distances = db.compute_string_distances("How to simplify fractions")
distances.shape

(247,)

In [23]:
# Select the k rows with the smallest distance to the query. np.argsort sorts
# ascending, so its first k positions are the k smallest distances.
# (A rank computation that was never used has been dropped; it also reached
# scipy.stats through a bare `import scipy`.)
k = 5
sort_inds = np.argsort(distances)
top_k_indices = sort_inds[:k]
top_k_scores = distances[top_k_indices]
assert top_k_indices.shape == top_k_scores.shape

In [24]:
# Positional indices of the k best matches, best first.
top_k_indices

array([94, 78, 84, 93, 95])

In [25]:
# Look up the matching rows by position to check that the results are sensible.
db.df.iloc[top_k_indices]

Unnamed: 0,chapter,section,index,db_string,n_tokens
94,4,5,4,Use the Order of Operations to Simplify Comple...,99
78,4,2,0,Simplify Fractions: In working with equivalent...,443
84,4,3,2,Simplify Complex Fractions: Our work with frac...,87
93,4,5,3,Identify and Use Fraction Operations: By now i...,56
95,4,5,5,Evaluate Variable Expressions with Fractions: ...,45


### RecipeNLG embeddings

Create a retrieval database from 1,000 randomly sampled RecipeNLG recipes, restricted to recipes whose `source` is "Gathered".

In [32]:
# Output directory for the RecipeNLG database. Create it up front (as is done
# for embedding_dir) so a fresh checkout does not depend on the directory
# already existing; parents/exist_ok make the cell safe to re-run.
recipe_embedding_dir = data_dir / "derived" / "embeddings"
recipe_embedding_dir.mkdir(parents=True, exist_ok=True)

In [49]:
# Read the full RecipeNLG CSV, using its first column as the row index; the
# trailing expression displays the (rows, columns) shape.
recipe_csv_path = data_dir / "raw" / "recipenlg" / "recipeNLG_full_dataset.csv"
recipe_df = pd.read_csv(recipe_csv_path, index_col=0)
recipe_df.shape

(2231142, 6)

In [50]:
# Eyeball three random recipes (unseeded, so the rows shown change on each run).
recipe_df.sample(n=3)

Unnamed: 0,title,ingredients,directions,link,source,NER
1040602,California French Bread (Better Than Garlic Br...,"[""1 loaf French bread, 1 1/2 inch slices"", ""1 ...","[""Preheat oven to 400."", ""Mix mayo, parm , and...",www.food.com/recipe/california-french-bread-be...,Gathered,"[""bread"", ""parmesan cheese"", ""mayonnaise"", ""gr..."
242998,Prune Cake,"[""3 eggs"", ""1 1/2 c. sugar"", ""1 c. cooking oil...","[""Beat eggs; add sugar and oil."", ""Mix dry ing...",www.cookbooks.com/Recipe-Details.aspx?id=163450,Gathered,"[""eggs"", ""sugar"", ""cooking oil"", ""flour"", ""sod..."
465115,Chicken Casserole,"[""1 whole chicken, boiled and deboned (reserve...","[""Mix all ingredients together (it will be sou...",www.cookbooks.com/Recipe-Details.aspx?id=53975,Gathered,"[""chicken"", ""cream of mushroom soup"", ""cream o..."


In [51]:
# Keep only the recipes whose `source` column is "Gathered"; the trailing
# expression displays how many remain.
is_gathered = recipe_df["source"] == "Gathered"
recipe_df = recipe_df.loc[is_gathered]
len(recipe_df)

1643098

In [52]:
# Draw the 1000-recipe subset with a fixed seed so that Restart & Run All
# selects the same recipes, and therefore saves the same database, every time.
sdf = recipe_df.sample(n=1000, random_state=0).copy()
sdf.shape

(1000, 6)

In [55]:
# Eyeball two of the sampled recipes (unseeded, so the rows shown change on each run).
sdf.sample(n=2)

Unnamed: 0,title,ingredients,directions,link,source,NER
1104245,Cucumber Scallion Salad,"[""1/2 English cucumber, chopped"", ""2 tablespoo...","[""Combine cucumber, scallions, vinegar, lime j...",www.food.com/recipe/cucumber-scallion-salad-29...,Gathered,"[""cucumber"", ""scallions"", ""red wine vinegar"", ..."
1517241,Homemade Cincinnati Chili,"[""3 pounds ground beef"", ""1-1/2 cups chopped o...","[""In a Dutch oven, cook beef and onions over m...",www.tasteofhome.com/recipes/homemade-cincinnat...,Gathered,"[""ground beef"", ""onions"", ""garlic"", ""kidney be..."


In [59]:
# Build the text to embed for each sampled recipe: the title followed by a
# bulleted list of its direction steps.
db_strings = []
for row in sdf.itertuples():
    title = row.title
    # The `directions` column holds a JSON-encoded list of strings. Parse it
    # with json.loads rather than eval so that text from a downloaded dataset
    # is never executed as Python code.
    directions = json.loads(row.directions)
    db_string = f"{title} recipe:\n - " + "\n - ".join(directions)
    db_strings.append(db_string)

# Keep the original dataset row id (exposed as the "index" column by
# reset_index) alongside the text so each entry can be traced back to its recipe.
db_df = sdf.reset_index()[["index"]].copy()
db_df["db_string"] = db_strings
db_df

Unnamed: 0,index,db_string
0,1437400,Grilled Caesar Salad recipe:\n - Make dressing...
1,712159,Wilted Spinach Salad recipe:\n - Cook bacon un...
2,1065822,Citrus-Beef Stir-Fry With Carrots (Ww) recipe:...
3,52150,Ultimate Chocolate Chip Cookies recipe:\n - Pr...
4,1548527,Raspberry White Chocolate Muffins recipe:\n - ...
...,...,...
995,941393,Family Favorite Chili Mac recipe:\n - Cook and...
996,168296,Sausage Apple Balls recipe:\n - Combine all in...
997,716230,Chocolate Crunch Pretzels recipe:\n - Crush ce...
998,320829,Zucchini Casserole recipe:\n - Mix beaten eggs...


In [60]:
# Build the RecipeNLG retrieval database from db_df. It goes in
# recipe_embedding_dir rather than embedding_dir, keeping it apart from the
# Rori and OpenStax databases.
db = retrieval.RetrievalDb(recipe_embedding_dir, "recipenlg", "db_string", db_df)

In [61]:
# Compute the embeddings for the RecipeNLG database.
# NOTE(review): presumably calls the OpenAI embedding API — confirm in RetrievalDb.
db.create_embeddings()

In [62]:
# Persist the RecipeNLG database.
# NOTE(review): presumably writes under recipe_embedding_dir — confirm in RetrievalDb.
db.save_df()

In [64]:
# Show the human-readable sizes of the RecipeNLG files that were written.
!du -h {recipe_embedding_dir}/*recipenlg*

276K	../data/derived/embeddings/recipenlg_df.parquet
 12M	../data/derived/embeddings/recipenlg_embed.npy
