Make Embeddings
===

Make embeddings from two sources:
 - Rori micro-lessons
 - OpenStax textbook

In [1]:
import json
import os
import time
from pathlib import Path

import dotenv
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import openai
import pandas as pd
import pyserini
import scipy
import sklearn.metrics
import tiktoken
from tqdm import tqdm

from llm_math_education import retrieval

In [2]:
# Load environment variables from the relative path "../.env".
# The `True` displayed below is load_dotenv's return value.
dotenv.load_dotenv("../.env")

True

In [3]:
# Sanity check that OPENAI_API_KEY is set: display only its first three
# characters so the full secret never lands in the saved notebook output.
# Raises KeyError if the variable is missing.
os.environ["OPENAI_API_KEY"][:3]

'sk-'

In [4]:
# Hand the key from the environment to the openai module.
openai.api_key = os.environ["OPENAI_API_KEY"]

In [5]:
# Root of the project's data directory; fail early if the notebook is run
# from a location where the relative path does not resolve.
data_dir = Path("../data")
assert data_dir.exists()
# JSON is UTF-8 by specification. Pass the encoding explicitly so the read does
# not depend on the platform's locale default (e.g. cp1252 on Windows).
with open(data_dir / "derived" / "rori_lessons.json", encoding="utf-8") as infile:
    df = pd.read_json(infile)
df.shape

(67, 18)

In [7]:
# Directory handed to RetrievalDb for its on-disk artifacts. Created if
# missing; an existing directory is left as it is.
embedding_dir = data_dir.joinpath("app_data")
embedding_dir.mkdir(exist_ok=True)

In [8]:
# Tokeniser for the embedding model; used here only to measure document length.
ada_tokeniser = tiktoken.encoding_for_model("text-embedding-ada-002")


def _lesson_record(lesson):
    """Build one retrieval record (a dict) from a row tuple of `df`.

    The text to embed is the lesson's short description followed by each of
    its lesson parts, separated by blank lines.
    """
    text = lesson.short_description + "\n\n" + "\n\n".join(lesson.lesson_parts)
    return {
        "lesson_code": lesson.lesson_code,
        "grade": lesson.grade,
        "db_string": text,
        "db_string_token_count": len(ada_tokeniser.encode(text)),
    }


# One record per lesson, in the row order of `df`.
ds = [_lesson_record(lesson) for lesson in df.itertuples()]
embed_df = pd.DataFrame(ds)

In [9]:
# Show the ten documents with the fewest tokens, shortest first.
embed_df.sort_values(by="db_string_token_count").head(10)

Unnamed: 0,lesson_code,grade,db_string,db_string_token_count
28,G6.N1.3.3.5,6,In today's lesson we are going to solve word p...,47
49,G6.N3.1.1.4,6,Even decimal numbers have place values . Let u...,54
27,G6.N1.3.3.4,6,Let us multiply a 3 and 4 digit number with re...,55
26,G6.N1.3.3.3,6,In today's lesson let us learn to multiply a 3...,65
31,G6.N1.3.3.13,6,In today's lesson we are going to try out word...,65
29,G6.N1.3.3.8,6,In today's lesson we are going to learn and pr...,66
57,G7.N3.3.2.2,7,"Today, we are going to learn how to divide a d...",70
30,G6.N1.3.3.9,6,In today's lesson we are going to practice per...,71
46,G6.N2.2.3.10,6,Can you find the reciprocal of a fraction? Tur...,82
15,G6.N1.1.1.1,6,"Today we will read and write numbers upto 1,00...",83


In [10]:
# Build the retrieval database for the Rori micro-lessons from embed_df.
# NOTE(review): the positional arguments look like (storage directory, database
# name, text column, DataFrame) — inferred from this call and from the
# "rori_microlesson_*" file names in the `du` listing further down; confirm
# against llm_math_education.retrieval.RetrievalDb.
db = retrieval.RetrievalDb(embedding_dir, "rori_microlesson", "db_string", embed_df)
db

<llm_math_education.retrieval.RetrievalDb at 0x29814bd30>

In [11]:
# NOTE(review): presumably embeds every `db_string` through the OpenAI API
# (network calls, billed per token) — confirm in RetrievalDb.create_embeddings.
db.create_embeddings()

In [12]:
# Persist the micro-lesson database.
# NOTE(review): presumably writes the rori_microlesson_* files that appear in
# the `du` listing further down — confirm in RetrievalDb.save_df.
db.save_df()

### Create OpenStax embeddings

In [13]:
# Read the derived OpenStax Prealgebra subsection table from parquet.
derived_openstax_dir = data_dir / "derived" / "openstax"
openstax_parquet_path = derived_openstax_dir / "openstax_prealgebra_subsection_df.parquet"
openstax_subsection_df = pd.read_parquet(openstax_parquet_path)
openstax_subsection_df.shape

(247, 6)

In [14]:
# Text to embed for each subsection: the title, then ":" and a newline, then
# the content. Adds (or overwrites) the "db_string" column in place.
openstax_subsection_df["db_string"] = [
    ":\n".join((title, content))
    for title, content in zip(openstax_subsection_df["title"], openstax_subsection_df["content"])
]

In [15]:
# Spot-check three random rows. No random_state is passed, so a different
# sample is shown on every run.
openstax_subsection_df.sample(n=3)

Unnamed: 0,title,content,index,n_tokens,chapter,section,db_string
100,Add and Subtract Mixed Numbers with Different ...,To add or subtract mixed numbers with differen...,4,54,4,6,Add and Subtract Mixed Numbers with Different ...
141,Find Percent Increase and Percent Decrease,People in the media often talk about how much ...,2,128,6,2,Find Percent Increase and Percent Decrease:\nP...
19,Translate Word Phrases to Math Notation,"Earlier in this section, we translated math no...",3,50,1,4,Translate Word Phrases to Math Notation:\nEarl...


In [16]:
# Keep only the chapter/section/index columns plus the text to embed.
# .copy() gives an independent frame rather than a view of openstax_subsection_df.
openstax_columns = ["chapter", "section", "index", "db_string"]
openstax_df = openstax_subsection_df.loc[:, openstax_columns].copy()
openstax_df.shape

(247, 4)

In [17]:
# Build the retrieval database for the OpenStax subsections from openstax_df.
# Rebinds `db`: from here on it no longer refers to the micro-lesson database.
db = retrieval.RetrievalDb(embedding_dir, "openstax_subsection", "db_string", openstax_df)

In [18]:
# NOTE(review): presumably embeds every OpenStax `db_string` through the OpenAI
# API (network calls, billed per token) — confirm in RetrievalDb.create_embeddings.
db.create_embeddings()

In [19]:
# Persist the OpenStax database.
# NOTE(review): presumably writes the openstax_subsection_* files that appear
# in the `du` listing below — confirm in RetrievalDb.save_df.
db.save_df()

In [20]:
# IPython shell escape: list everything in embedding_dir with human-readable
# sizes, largest first. Requires a Unix-like shell with `du` and `sort -h`.
!du -h {embedding_dir}/* | sort -rh

2.9M	../data/app_data/openstax_subsection_embed.npy
808K	../data/app_data/rori_microlesson_embed.npy
128K	../data/app_data/openstax_subsection_df.parquet
 32K	../data/app_data/rori_microlesson_df.parquet


### Verify loading

In [21]:
# Re-open the OpenStax database without passing a DataFrame.
# NOTE(review): presumably this makes RetrievalDb load the saved DataFrame and
# embeddings from embedding_dir instead of building them — confirm.
db = retrieval.RetrievalDb(embedding_dir, "openstax_subsection", "db_string")

In [22]:
# One distance per stored document for the query string; the (247,) shape
# shown below matches the 247 OpenStax rows loaded earlier.
# NOTE(review): presumably embeds the query through the OpenAI API — confirm
# in RetrievalDb.compute_string_distances.
distances = db.compute_string_distances("How to simplify fractions")
distances.shape

(247,)

In [23]:
# Pick the k documents with the smallest distance to the query.
k = 5
sort_inds = np.argsort(distances)  # positions in ascending order of distance
top_k_indices = sort_inds[:k]
top_k_scores = distances[top_k_indices]
# Rank of every document's distance; method="max" gives tied values the
# highest rank of their tie group.
# NOTE(review): `ranks` is not used by any cell visible here. `scipy.stats` is
# reached through the bare `import scipy` in the import cell — confirm the
# submodule is loaded in the SciPy version in use.
ranks = scipy.stats.rankdata(distances, method="max")
assert top_k_scores.shape == top_k_indices.shape

In [24]:
# Positions of the k smallest distances, smallest first.
top_k_indices

array([94, 78, 84, 93, 95])

In [25]:
# Look up the corresponding rows by position (iloc), in ranked order.
db.df.iloc[top_k_indices]

Unnamed: 0,chapter,section,index,db_string,n_tokens
94,4,5,4,Use the Order of Operations to Simplify Comple...,99
78,4,2,0,Simplify Fractions: In working with equivalent...,443
84,4,3,2,Simplify Complex Fractions: Our work with frac...,87
93,4,5,3,Identify and Use Fraction Operations: By now i...,56
95,4,5,5,Evaluate Variable Expressions with Fractions: ...,45
