In [1]:
import urllib.request

import numpy as np

import util

In [19]:
# Load one OpenStax book through the project helper; the preview further down
# shows `question` and `learning_goal` columns.
dataset = util.load_openstax_course('University Physics Volume 1')

In [23]:
dataset.head()

Unnamed: 0,question,learning_goal
0,Find the order of magnitude of the following p...,Describe the scope of physics.
1,Find the order of magnitude of the following p...,Calculate the order of magnitude of a quantity.
2,Find the order of magnitude of the following p...,"Compare measurable length, mass, and timescale..."
3,Find the order of magnitude of the following p...,"Describe the relationships among models, theor..."
4,Use the orders of magnitude you found in the p...,Describe the scope of physics.


In [24]:
# Number of distinct learning goals (one value_counts entry per unique value).
dataset['learning_goal'].value_counts().shape

(327,)

In [25]:
# Number of distinct question texts.
dataset['question'].value_counts().shape

(1036,)

In [26]:
# Number of distinct learning-goal combinations attached to a question.
# Goals are collected into tuples rather than lists: tuples are hashable, so
# value_counts can count them without depending on how a given pandas version
# treats unhashable list objects.
dataset.groupby('question')['learning_goal'].agg(tuple).value_counts().shape

(98,)

1. Define what a task is 
2. (small) Debug code so that it loads Chemistry 2e
3. (ambitious) Try a simple finetuning baseline

4. Preprocessing data
 - (for Principles of Chemistry) stem the verb of the learning goal
 - Unicode characters:
    - delta --> "delta"
    - exponents?

In [7]:
def k_shot_sample(data, learning_goal, match=True, k=5, random_state=None):
  """Draw up to ``k`` random rows whose learning goal does (or does not) match.

  Args:
    data: DataFrame with a 'learning_goal' column.
    learning_goal: Value compared against the 'learning_goal' column.
    match: If truthy, keep rows equal to ``learning_goal``; otherwise keep
      rows that differ from it.
    k: Maximum number of rows to return.
    random_state: Optional seed or generator forwarded to
      ``DataFrame.sample`` so a draw can be reproduced. The default ``None``
      keeps the previous behaviour (global NumPy random state).

  Returns:
    DataFrame holding ``min(k, number of candidate rows)`` rows sampled
    without replacement.
  """
  if match:
    keep = data['learning_goal'] == learning_goal
  else:
    keep = data['learning_goal'] != learning_goal
  candidates = data[keep]
  # Cap n at the pool size: sample() raises when asked for more rows than exist.
  return candidates.sample(n=min(k, len(candidates)), random_state=random_state)
  
  
def meta_task(data, k=5):
  """Build one k-shot task: a query question plus positive/negative examples.

  Args:
    data: DataFrame with 'question' and 'learning_goal' columns.
    k: Number of examples requested for each of the two support sets.

  Returns:
    Tuple ``(k_shot_true, k_shot_false, query, learning_goal)`` where the two
    frames hold up to ``k`` rows that do / do not carry ``learning_goal``,
    ``query`` is the held-out question text and ``learning_goal`` is one of
    the goals attached to it.
  """
  # Only consider learning goals with at least k rows. The count includes the
  # query's own row, so the positive set can end up with fewer than k rows
  # once the query is held out.
  goal_counts = data['learning_goal'].value_counts()
  data = data[data['learning_goal'].isin(goal_counts[goal_counts >= k].index)]
  query = np.random.choice(data['question'].unique())
  # A question may be listed under several goals; pick one of them at random.
  learning_goal = data[data['question'] == query]['learning_goal'].sample().values[0]
  # Hold the query out of both support sets.
  support_pool = data[data['question'] != query]
  k_shot_true = k_shot_sample(support_pool, learning_goal, match=True, k=k)
  # Because a question can appear once per goal, a row with a different goal
  # may still belong to a question that IS tagged with `learning_goal`. Drop
  # those questions so the negative set contains no mislabelled positives.
  tagged_questions = support_pool.loc[
      support_pool['learning_goal'] == learning_goal, 'question'
  ]
  negative_pool = support_pool[~support_pool['question'].isin(tagged_questions)]
  k_shot_false = k_shot_sample(negative_pool, learning_goal, match=False, k=k)
  return k_shot_true, k_shot_false, query, learning_goal


In [3]:
import re

def parse_table_of_contents(filename):
    """Return the numbered section titles found in a table-of-contents file.

    Args:
        filename: Path to a text file with one table-of-contents entry per line.

    Returns:
        List of stripped lines that start with a ``<digits>.<digits>`` section
        number (e.g. ``'2.3 Carbon'``), in file order.
    """
    # Raw string: '\.' in a normal literal is an invalid escape sequence that
    # newer Python versions warn about.
    section_number = re.compile(r'[0-9]+\.[0-9]+')
    with open(filename) as f:
        lines = [line.strip() for line in f]
    return [line for line in lines if section_number.match(line)]

In [20]:
import urllib

def scrape_learning_goals(url):
    """Fetch a page and return the learning objectives listed on it.

    The objectives are taken from the first ``<ul id="list-00001">`` element
    in the page source; every ``<li>...</li>`` inside it yields one entry.

    Args:
        url: Address of the page to download.

    Returns:
        List of objective strings with surrounding whitespace removed.

    Raises:
        IndexError: If the page contains no ``<ul id="list-00001">`` element.
    """
    # Close the HTTP response deterministically instead of leaking it.
    with urllib.request.urlopen(url) as response:
        html = response.read().decode('utf8')
    objectives_list = re.search(r'<ul id="list-00001">[\s\S]*?</ul>', html)
    if objectives_list is None:
        # Same exception type as the former bare `[0]`, but says which page failed.
        raise IndexError(f'no <ul id="list-00001"> element found at {url}')
    # Non-greedy capture: several <li> items on one line stay separate, and an
    # item that spans lines is still picked up.
    items = re.findall(r'<li>([\s\S]*?)</li>', objectives_list.group(0))
    return [item.strip() for item in items]

def clean_url_extension(chapter_name):
    """Turn a section title into the lowercase, hyphenated form used in page URLs.

    Commas and colons are removed; spaces and periods each become a hyphen.
    """
    slug_table = str.maketrans({',': None, ':': None, ' ': '-', '.': '-'})
    lowered = chapter_name.lower()
    return lowered.translate(slug_table)
    

def read_learning_goals(chapter_names, base_url):
    """Scrape the learning goals of every listed section.

    For each title the page URL is ``base_url`` followed by the cleaned
    title; the URL is printed before the page is scraped.

    Returns:
        Dict mapping each original title to the list returned by
        ``scrape_learning_goals`` for its page.
    """
    goals_by_chapter = {}
    for title in chapter_names:
        page_url = base_url + clean_url_extension(title)
        print(page_url)
        scraped = scrape_learning_goals(page_url)
        goals_by_chapter[title] = scraped
    return goals_by_chapter


In [21]:
# Scrape learning goals for every numbered section of Biology 2e.
# read_learning_goals fetches one page per section and prints each URL.
# Both names are reassigned by the later per-book cells, so this result is
# lost unless it is saved before they run.
chapter_names = parse_table_of_contents('biology2e_table_of_contents.txt')
base_url = 'https://openstax.org/books/biology-2e/pages/'

learning_goals = read_learning_goals(chapter_names, base_url)

https://openstax.org/books/biology-2e/pages/1-1-the-science-of-biology
https://openstax.org/books/biology-2e/pages/1-2-themes-and-concepts-of-biology
https://openstax.org/books/biology-2e/pages/2-1-atoms-isotopes-ions-and-molecules-the-building-blocks
https://openstax.org/books/biology-2e/pages/2-2-water
https://openstax.org/books/biology-2e/pages/2-3-carbon
https://openstax.org/books/biology-2e/pages/3-1-synthesis-of-biological-macromolecules
https://openstax.org/books/biology-2e/pages/3-2-carbohydrates
https://openstax.org/books/biology-2e/pages/3-3-lipids
https://openstax.org/books/biology-2e/pages/3-4-proteins
https://openstax.org/books/biology-2e/pages/3-5-nucleic-acids
https://openstax.org/books/biology-2e/pages/4-1-studying-cells
https://openstax.org/books/biology-2e/pages/4-2-prokaryotic-cells
https://openstax.org/books/biology-2e/pages/4-3-eukaryotic-cells
https://openstax.org/books/biology-2e/pages/4-4-the-endomembrane-system-and-proteins
https://openstax.org/books/biology-2e

In [27]:
# Same scrape for University Physics Volume 2 (one page fetch per section).
# Overwrites `chapter_names`, `base_url` and `learning_goals` from the previous book.
chapter_names = parse_table_of_contents('universityphysicsvolume2_table_of_contents.txt')
base_url = 'https://openstax.org/books/university-physics-volume-2/pages/'

learning_goals = read_learning_goals(chapter_names, base_url)

https://openstax.org/books/university-physics-volume-2/pages/1-1-temperature-and-thermal-equilibrium
https://openstax.org/books/university-physics-volume-2/pages/1-2-thermometers-and-temperature-scales
https://openstax.org/books/university-physics-volume-2/pages/1-3-thermal-expansion
https://openstax.org/books/university-physics-volume-2/pages/1-4-heat-transfer-specific-heat-and-calorimetry
https://openstax.org/books/university-physics-volume-2/pages/1-5-phase-changes
https://openstax.org/books/university-physics-volume-2/pages/1-6-mechanisms-of-heat-transfer
https://openstax.org/books/university-physics-volume-2/pages/2-1-molecular-model-of-an-ideal-gas
https://openstax.org/books/university-physics-volume-2/pages/2-2-pressure-temperature-and-rms-speed
https://openstax.org/books/university-physics-volume-2/pages/2-3-heat-capacity-and-equipartition-of-energy
https://openstax.org/books/university-physics-volume-2/pages/2-4-distribution-of-molecular-speeds
https://openstax.org/books/unive

In [30]:
# Same scrape for University Physics Volume 3 (one page fetch per section).
# This is the `learning_goals` value that the JSON-writing cell below saves.
chapter_names = parse_table_of_contents('universityphysicsvolume3_table_of_contents.txt')
base_url = 'https://openstax.org/books/university-physics-volume-3/pages/'

learning_goals = read_learning_goals(chapter_names, base_url)

https://openstax.org/books/university-physics-volume-3/pages/1-1-the-propagation-of-light
https://openstax.org/books/university-physics-volume-3/pages/1-2-the-law-of-reflection
https://openstax.org/books/university-physics-volume-3/pages/1-3-refraction
https://openstax.org/books/university-physics-volume-3/pages/1-4-total-internal-reflection
https://openstax.org/books/university-physics-volume-3/pages/1-5-dispersion
https://openstax.org/books/university-physics-volume-3/pages/1-6-huygenss-principle
https://openstax.org/books/university-physics-volume-3/pages/1-7-polarization
https://openstax.org/books/university-physics-volume-3/pages/2-1-images-formed-by-plane-mirrors
https://openstax.org/books/university-physics-volume-3/pages/2-2-spherical-mirrors
https://openstax.org/books/university-physics-volume-3/pages/2-3-images-formed-by-refraction
https://openstax.org/books/university-physics-volume-3/pages/2-4-thin-lenses
https://openstax.org/books/university-physics-volume-3/pages/2-5-the-

In [31]:
import json

# Persist the section-title -> learning-goals mapping currently held in
# `learning_goals` as JSON.
with open('universityphysicsvolume3_subchapter_to_learning_goal.json', 'w+') as out_file:
    json.dump(learning_goals, out_file)

In [27]:
import urllib

In [127]:
# Scratch exploration: download one section page to work out how the
# learning objectives are marked up in its HTML.
url = 'https://openstax.org/books/biology-2e/pages/2-3-carbon'

# NOTE(review): `import urllib` on its own does not load the `urllib.request`
# submodule; it must already have been imported before this line runs.
response = urllib.request.urlopen(url)

# Decode the raw response bytes as UTF-8 text.
html = response.read().decode('utf8')

In [128]:
import re

In [129]:
# Grab every <ul id="list-00001"> ... </ul> element (non-greedy, may span lines).
# Raw string so that \s and \S reach the regex engine without being treated as
# invalid string escapes.
matches = re.findall(r'<ul id="list-00001">[\s\S]*?</ul>', html)

In [130]:
'Learning Objectives' in html

True

In [131]:
matches

['<ul id="list-00001">\n<li>Explain why carbon is important for life</li>\n<li>Describe the role of functional groups in biological molecules</li>\n</ul>']

In [134]:
# One match per <li>...</li> that sits on a single line ('.' does not cross
# newlines). Raw string avoids the invalid \s / \S string escapes.
extracted = re.findall(r'<li>[\s\S].*</li>', matches[0])
# Strip the 4-character '<li>' prefix and the 5-character '</li>' suffix.
[item[4:-5] for item in extracted]

['Explain why carbon is important for life',
 'Describe the role of functional groups in biological molecules']

In [117]:
type(matches)

list

In [118]:
raw_str = matches[0]


In [119]:
# Alternative, regex-free attempt: cut the list markup at every '>\n<li>'
# boundary, which leaves the closing tag fragments attached to each piece.
raw_lgs = raw_str.split('>\n<li>')
print(raw_lgs)

['<ul id="list-00001"', 'Explain why carbon is important for life</li', 'Describe the role of functional groups in biological molecules</li>\n</ul>']


In [120]:
# Split each piece at '<' so the text before the first tag fragment ends up at index 0.
# NOTE: this rewrites raw_lgs in place (strings become lists), so the cell
# cannot be run twice without first re-running the cell that builds raw_lgs.
for i in range(len(raw_lgs)):
    raw_lgs[i] = raw_lgs[i].split('<')

In [121]:
print(raw_lgs)

[['', 'ul id="list-00001"'], ['Explain why carbon is important for life', '/li'], ['Describe the role of functional groups in biological molecules', '/li>\n', '/ul>']]


In [122]:
# Skip the first entry (the opening <ul ...> fragment) and keep the leading
# text of every remaining piece.
processed_lgs = [pieces[0] for pieces in raw_lgs[1:]]


In [123]:
print(processed_lgs)

['Explain why carbon is important for life', 'Describe the role of functional groups in biological molecules']
