In [1]:
import util

In [19]:
# Load the question / learning-goal table for one OpenStax course through the
# project-local util module.
dataset = util.load_openstax_course('University Physics Volume 1')

In [23]:
# Preview the first rows of the loaded table.
dataset.head()

Unnamed: 0,question,learning_goal
0,Find the order of magnitude of the following p...,Describe the scope of physics.
1,Find the order of magnitude of the following p...,Calculate the order of magnitude of a quantity.
2,Find the order of magnitude of the following p...,"Compare measurable length, mass, and timescale..."
3,Find the order of magnitude of the following p...,"Describe the relationships among models, theor..."
4,Use the orders of magnitude you found in the p...,Describe the scope of physics.


In [24]:
# Shape of the per-value counts, i.e. how many distinct learning goals appear.
dataset['learning_goal'].value_counts().shape

(327,)

In [25]:
# Shape of the per-value counts, i.e. how many distinct question texts appear.
dataset['question'].value_counts().shape

(1036,)

In [26]:
# Collect each question's learning goals into a list, then count how many
# distinct lists occur across questions.
dataset.groupby('question')['learning_goal'].agg(list).value_counts().shape

(98,)

1. Define what a task is 
2. (small) Debug code so that it loads Chemistry 2e
3. (ambitious) Try a simple finetuning baseline

4. Preprocessing data
 - (for Principles of Chemistry) stem the verb of the learning goal
 - Unicode characters:
    - delta --> "delta"
    - exponents?

In [7]:
def k_shot_sample(data, learning_goal, match=True, k=5):
  """Randomly draw up to ``k`` rows selected by their learning goal.

  Args:
    data: DataFrame with a ``'learning_goal'`` column.
    learning_goal: Goal value to compare each row against.
    match: When truthy keep rows whose goal equals ``learning_goal``;
      otherwise keep rows whose goal differs from it.
    k: Upper bound on the number of rows returned.

  Returns:
    A random sample of the selected rows; it holds fewer than ``k`` rows
    when fewer are available.
  """
  goal_column = data['learning_goal']
  row_mask = (goal_column == learning_goal) if match else (goal_column != learning_goal)
  candidates = data[row_mask]
  # Cap the request at the pool size so sampling without replacement cannot fail.
  n_rows = min(k, len(candidates))
  return candidates.sample(n=n_rows)
  
  
def meta_task(data, k=5):
  """Sample one few-shot episode: a query question plus positive/negative support rows.

  Only rows whose learning goal occurs at least ``k`` times in ``data`` are
  considered. A query question is drawn at random, one of its learning goals
  is drawn at random, and up to ``k`` matching and up to ``k`` non-matching
  rows are sampled from the rows belonging to other questions.

  Returns:
    ``(k_shot_true, k_shot_false, query, learning_goal)``.
  """
  # Frequency filter: the counts are taken before the query is removed, so a
  # goal may end up with fewer than k support rows (k_shot_sample caps at the pool size).
  goal_counts = data['learning_goal'].value_counts()
  frequent_goals = goal_counts[goal_counts >= k].index
  data = data[data['learning_goal'].isin(frequent_goals)]

  # Draw the query first, then one of the goals attached to it.
  query = np.random.choice(data['question'].unique())
  query_rows = data[data['question'] == query]
  learning_goal = query_rows['learning_goal'].sample().values[0]

  # Support examples never include the query question itself.
  support_pool = data[data['question'] != query]
  k_shot_true = k_shot_sample(support_pool, learning_goal, match=True, k=k)
  k_shot_false = k_shot_sample(support_pool, learning_goal, match=False, k=k)
  return k_shot_true, k_shot_false, query, learning_goal


In [2]:
import re

def parse_table_of_contents(filename):
    """Return the table-of-contents lines that start with a subchapter number.

    Args:
        filename: Path to a text file with one table-of-contents entry per line.

    Returns:
        list[str]: The whitespace-stripped lines that begin with
        ``<digits>.<digits>`` (for example ``'1.2 Phases ...'``), in file order.
    """
    # Raw string: '\.' in a normal string literal is an invalid escape sequence.
    subchapter_pattern = re.compile(r'[0-9]+\.[0-9]+')
    # Explicit encoding so the result does not depend on the platform locale
    # (matches how the question files are read elsewhere in this notebook).
    with open(filename, encoding='utf-8') as f:
        stripped_lines = (line.strip() for line in f)
        return [line for line in stripped_lines if subchapter_pattern.match(line)]

In [11]:
import urllib

def scrape_learning_goals(url):
    """Fetch a page and return the text of its learning-objective list items.

    The objectives are taken from the first ``<ul id="list-00001">`` element
    found in the page source.

    Args:
        url: Address of the page to download.

    Returns:
        list[str]: Inner text of every ``<li>`` in that list, in page order.
        When the page has no such list, ``'faulty:'`` and the URL are printed
        and an empty list is returned.
    """
    # `import urllib` alone does not load the `request` submodule, so import it
    # explicitly here instead of relying on another module having done so.
    import urllib.request

    # Context manager closes the connection even if reading/decoding fails.
    with urllib.request.urlopen(url) as response:
        html = response.read().decode('utf8')

    objectives_list = re.findall(r'<ul id="list-00001">[\s\S]*?</ul>', html)
    if len(objectives_list) == 0:
        print('faulty:', url)
        return []
    # Non-greedy capture: several <li> items on one line stay separate, and an
    # item may span lines. At least one character is still required per item.
    return re.findall(r'<li>([\s\S]+?)</li>', objectives_list[0])

def clean_url_extension(chapter_name):
    """Convert a chapter title into the slug appended to the book's base URL.

    The title is lower-cased, commas and colons are removed, and spaces and
    periods each become a hyphen.
    """
    slug = chapter_name.lower()
    substitutions = ((',', ''), (':', ''), (' ', '-'), ('.', '-'))
    for old, new in substitutions:
        slug = slug.replace(old, new)
    return slug
    

def read_learning_goals(chapter_names, base_url):
    """Scrape the learning goals of every chapter page.

    Args:
        chapter_names: Iterable of chapter titles; each one is turned into a
            URL slug with ``clean_url_extension``.
        base_url: Prefix that the slug is appended to.

    Returns:
        dict: Maps each chapter title to the list returned by
        ``scrape_learning_goals`` for its page. Every URL is printed before it
        is requested.
    """
    goals_by_chapter = {}
    for title in chapter_names:
        slug = clean_url_extension(title)
        page_url = base_url + slug
        print(page_url)
        goals_by_chapter[title] = scrape_learning_goals(page_url)
    return goals_by_chapter


In [12]:
# Numbered section titles from the Chemistry 2e table-of-contents file, and the
# URL prefix their slugs are appended to.
chapter_names = parse_table_of_contents('chemistry2e_table_of_contents.txt')
base_url = 'https://openstax.org/books/chemistry-2e/pages/'

# Scrape every section page; each URL is printed as it is requested.
learning_goals = read_learning_goals(chapter_names, base_url)

https://openstax.org/books/chemistry-2e/pages/1-1-chemistry-in-context
https://openstax.org/books/chemistry-2e/pages/1-2-phases-and-classification-of-matter
https://openstax.org/books/chemistry-2e/pages/1-3-physical-and-chemical-properties
https://openstax.org/books/chemistry-2e/pages/1-4-measurements
https://openstax.org/books/chemistry-2e/pages/1-5-measurement-uncertainty-accuracy-and-precision
https://openstax.org/books/chemistry-2e/pages/1-6-mathematical-treatment-of-measurement-results
https://openstax.org/books/chemistry-2e/pages/2-1-early-ideas-in-atomic-theory
https://openstax.org/books/chemistry-2e/pages/2-2-evolution-of-atomic-theory
https://openstax.org/books/chemistry-2e/pages/2-3-atomic-structure-and-symbolism
https://openstax.org/books/chemistry-2e/pages/2-4-chemical-formulas
https://openstax.org/books/chemistry-2e/pages/2-5-the-periodic-table
https://openstax.org/books/chemistry-2e/pages/2-6-ionic-and-molecular-compounds
https://openstax.org/books/chemistry-2e/pages/2-7-

In [13]:
import json

# Persist the scraped mapping (section title -> list of learning goals) as JSON.
# load_openstax_course reads files following this naming pattern.
with open('chemistry2e_subchapter_to_learning_goal.json', 'w+') as f:
    json.dump(learning_goals, f)

In [41]:
import os
import re

def parse_openstax_questions_file(filename, folder_path):
    """Parse one exercise text file into questions grouped by subchapter.

    Lines are classified after stripping surrounding whitespace:

    * a line starting with ``<digits>.<digits>`` is a subchapter heading;
    * a line starting with ``<digits>. `` starts a new question (the numeric
      prefix is removed from the stored text);
    * any other line is appended, after a newline, to the most recent question
      of the current subchapter.

    Args:
        filename: Name of the file inside ``folder_path``.
        folder_path: Directory that contains the file.

    Returns:
        tuple: ``(questions, question_nums)`` where ``questions`` maps a
        subchapter number such as ``'11.1'`` to its list of question texts and
        ``question_nums`` lists the stripped prefixes (e.g. ``'12. '``) in
        file order. Questions that appear before any heading are stored under
        the key ``''``.
    """
    # Raw strings: '\.' in a normal string literal is an invalid escape sequence.
    subchapter_pattern = re.compile(r'[0-9]+\.[0-9]+')
    question_pattern = re.compile(r'[0-9]+\. ')

    with open(os.path.join(folder_path, filename), encoding='utf-8') as f:
        lines = [l.strip() for l in f]

    questions = {}
    question_nums = []
    current_subchapter = ''
    for line in lines:
        # Subchapter heading: switch the bucket new questions are filed under.
        heading = subchapter_pattern.match(line)
        if heading:
            current_subchapter = heading.group(0)
            # setdefault, not assignment: a heading that shows up twice must
            # not discard the questions already collected for it.
            questions.setdefault(current_subchapter, [])
            continue

        # Numbered line: start of a new question.
        numbered = question_pattern.match(line)
        if numbered:
            prefix = numbered.group(0)
            questions.setdefault(current_subchapter, []).append(line[len(prefix):])
            question_nums.append(prefix)
            continue

        # Continuation of the previous question. Lines with no question to
        # attach to (file title, blank line right after a heading) are skipped
        # instead of raising KeyError/IndexError.
        current_questions = questions.get(current_subchapter)
        if not current_questions:
            continue
        current_questions[-1] += '\n' + line

    return questions, question_nums

In [22]:
# Try the parser on a single chapter file.
questions, question_nums = parse_openstax_questions_file('Chemistry2e_11.txt', 'OpenStax Dataset/Chemistry 2e')

In [10]:
# Turn each captured prefix such as '12. ' into the integer before its first '.'.
# Going through str() and split() keeps the cell idempotent: slicing with
# q.find('.') raised AttributeError on a second run, because question_nums
# then already held ints.
question_nums = [int(str(q).split('.')[0]) for q in question_nums]

In [65]:
def parse_openstax_questions_folder(folder_path):
    """Parse every ``.txt`` exercise file in a folder and merge the results.

    Args:
        folder_path: Directory containing the exercise text files.

    Returns:
        tuple: ``(questions, question_nums)`` in the same format as
        ``parse_openstax_questions_file``, combined over all files. Files are
        processed in sorted filename order.
    """
    questions, question_nums = {}, []
    # os.listdir returns names in arbitrary order; sorting makes the question
    # order reproducible across runs and machines.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.txt'):
            continue
        file_questions, file_nums = parse_openstax_questions_file(filename, folder_path)
        for subchapter, question_list in file_questions.items():
            # Extend rather than dict.update so a subchapter that appears in
            # two files keeps the questions from both.
            questions.setdefault(subchapter, []).extend(question_list)
        question_nums.extend(file_nums)
    return questions, question_nums

In [47]:
import json
import pandas as pd

# Root folder that holds one sub-folder of exercise text files per course.
OPENSTAX_DIR = 'OpenStax Dataset'

def load_openstax_course(course_name):
    """Build the question / learning-goal table for one course.

    Reads ``<course code>_subchapter_to_learning_goal.json`` (the course name
    lower-cased with spaces removed) and the exercise files under
    ``OPENSTAX_DIR/<course_name>``, then pairs every question with every
    learning goal of its subchapter.

    Args:
        course_name: Course title, e.g. ``'University Physics Volume 1'``.

    Returns:
        pandas.DataFrame: Columns ``'question'``, ``'learning_goal'`` and
        ``'course'``; one row per (question, learning goal) pair.

    Raises:
        KeyError: If a parsed subchapter has no entry in the JSON mapping.
        IndexError: If a JSON key contains no ``<digits>.<digits>`` number.
    """
    course_code = course_name.replace(' ', '').lower()
    # Explicit encoding so decoding does not depend on the platform locale.
    with open(f'{course_code}_subchapter_to_learning_goal.json', encoding='utf-8') as f:
        subchapter_to_lgs = json.load(f)

    # Re-key the mapping by the subchapter number found in each title.
    # Raw string: '\.' in a normal string literal is an invalid escape sequence.
    subchapter_pattern = re.compile(r'[0-9]+\.[0-9]+')
    subchapter_to_lgs = {
        subchapter_pattern.findall(title)[0]: goals
        for title, goals in subchapter_to_lgs.items()
    }

    # The list of question-number prefixes is not needed here.
    questions, _ = parse_openstax_questions_folder(
        os.path.join(OPENSTAX_DIR, course_name)
    )

    # One row per (question, learning goal of that question's subchapter).
    rows = [
        [question, learning_goal]
        for subchapter, question_list in questions.items()
        for question in question_list
        for learning_goal in subchapter_to_lgs[subchapter]
    ]

    dataset = pd.DataFrame(data=rows, columns=['question', 'learning_goal'])
    dataset['course'] = course_name
    return dataset

In [69]:
# Load a second course to check the loader on it.
d = load_openstax_course('University Physics Volume 3')

In [70]:
# Number of distinct question texts in the loaded course.
len(d['question'].unique())

617

In [71]:
# Number of distinct learning goals in the loaded course.
len(d['learning_goal'].unique())

209

In [5]:
import os

# Strip the 'Chemistry2e_' prefix from the chapter files in the course folder.
base = 'OpenStax Dataset/Chemistry 2e'
prefix = 'Chemistry2e_'
for filename in os.listdir(base):
    # Only rename files that still carry the prefix. Without this check a
    # second run would chop len(prefix) characters off already-renamed files.
    if filename.endswith('txt') and filename.startswith(prefix):
        os.rename(os.path.join(base, filename), os.path.join(base, filename[len(prefix):]))

In [5]:
import util
import pandas as pd

# Courses to combine; each name is passed to util.load_openstax_course.
COURSES = [
    'Chemistry 2e', 
    'University Physics Volume 1', 
    'University Physics Volume 2', 
    'University Physics Volume 3'
]

# Stack the per-course tables into one DataFrame (each course keeps its own
# row index, since ignore_index is not set).
data = pd.concat([
    util.load_openstax_course(course) for course in COURSES
])

In [13]:
# Collect every column into lists per question, then take the learning goals of
# the first question in groupby order.
lgs = data.groupby('question').agg(list).iloc[0]['learning_goal']
lgs

['Evaluate the net force on a current loop in an external magnetic field',
 'Evaluate the net torque on a current loop in an external magnetic field',
 'Define the magnetic dipole moment of a current loop']

In [1]:
import openstax_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import AutoTokenizer

# Tokenizer for the 'bert-base-uncased' checkpoint.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [3]:
from models.protobert import ProtoBert
# Initialise the project-local ProtoBert from the 'bert-base-uncased' checkpoint.
model = ProtoBert.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing ProtoBert: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing ProtoBert from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ProtoBert from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
# Build the training and validation dataloaders with identical settings apart
# from the split name.
# NOTE(review): the meaning of the positional 4, 2, 2, 100 is not visible in
# this notebook -- confirm against get_openstax_dataloader and consider passing
# them by keyword.
train_dataloader = openstax_dataset.get_openstax_dataloader(
    'train',
    4,
    2,
    2,
    100,
    tokenizer,
)

eval_dataloader = openstax_dataset.get_openstax_dataloader(
    'val',
    4,
    2,
    2,
    100,
    tokenizer,
)

In [5]:
# Build the dataset without a tokenizer and report its length.
# NOTE(review): this rebinds `data`, which another cell uses for the
# concatenated course DataFrame.
data = openstax_dataset.OpenstaxDataset(num_support=2, num_query=2, tokenizer=None)
len(data)

1088

In [5]:
from trainer import Trainer

# Project-local Trainer (from the trainer module) wired to the model and both
# dataloaders.
trainer = Trainer(model, train_dataloader, eval_dataloader)

In [6]:
# Start training.
# NOTE(review): the saved output of this cell is a CUDA out-of-memory
# RuntimeError on a 4 GiB GPU, so the run did not complete.
trainer.fit()

0it [00:05, ?it/s] 0%|          | 0/1000 [00:00<?, ?it/s]
Epoch 1 of 1000:   0%|          | 0/1000 [00:31<?, ?it/s]


RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 4.00 GiB total capacity; 2.55 GiB already allocated; 3.05 MiB free; 2.56 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [8]:
from transformers import TrainingArguments

# Hugging Face training configuration: results go under 'outputs' and
# evaluation_strategy is set to 'epoch'.
training_args = TrainingArguments(output_dir='outputs', evaluation_strategy='epoch')

In [9]:
import numpy as np
from sklearn.metrics import accuracy_score

def compute_metrics(eval_pred):
    """Compute classification accuracy for a Hugging Face ``Trainer`` evaluation.

    Args:
        eval_pred: Pair ``(logits, labels)``. The predicted class of each
            example is the argmax of ``logits`` over the last axis.

    Returns:
        dict: ``{'accuracy': <float>}``. ``Trainer`` expects a mapping from
        metric name to value; a bare float cannot be consumed by it.

    Raises:
        ValueError: If predictions and labels do not have the same shape.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)
    # Guard against silent broadcasting in the element-wise comparison below.
    if predictions.shape != labels.shape:
        raise ValueError(
            f'predictions {predictions.shape} and labels {labels.shape} differ in shape'
        )
    # Fraction of exact matches, i.e. plain accuracy for class-index labels.
    return {'accuracy': float(np.mean(predictions == labels))}

In [10]:
from transformers import Trainer

# Hugging Face Trainer set up with the model, the training arguments and the
# accuracy metric.
# NOTE(review): `Trainer` here is the transformers class imported just above;
# it shadows the project-local `trainer.Trainer` imported in another cell, and
# `trainer` is rebound as well.
# NOTE(review): train_dataset and eval_dataset are constructed with identical
# arguments -- confirm that evaluation is meant to use the same data.
trainer = Trainer(
    model,
    training_args,
    train_dataset=openstax_dataset.OpenstaxDataset(tokenizer, 5, 5, True, 128),
    eval_dataset=openstax_dataset.OpenstaxDataset(tokenizer, 5, 5, True, 128),
    compute_metrics=compute_metrics
)

In [2]:
import util
import pandas as pd

# Courses to combine; each name is passed to util.load_openstax_course.
# NOTE(review): this repeats the COURSES / data cell found elsewhere in the
# notebook under a lowercase name.
courses = [
    'Chemistry 2e', 
    'University Physics Volume 1', 
    'University Physics Volume 2', 
    'University Physics Volume 3'
]

# Stack the per-course tables into one DataFrame.
data = pd.concat([
    util.load_openstax_course(course) for course in courses
])

In [49]:
# Two grouped views of the combined table: the remaining columns collected into
# lists per question and per learning goal.
data_by_q = data.groupby('question').agg(list)
data_by_lg = data.groupby('learning_goal').agg(list)

In [52]:
# Inspect the second learning-goal group (positional index 1).
lg_group = data_by_lg.iloc[1]
lg_group

question    [In Figure 14.12, ε=12V, L=20mH, and R=5.0Ω. D...
course      [University Physics Volume 2, University Physi...
Name: Analyze circuits that have an inductor and resistor in series, dtype: object

In [63]:
import openstax_dataset

# Build the dataset with tokenization switched off and no tokenizer supplied.
# NOTE(review): this rebinds `dataset`, which another cell uses for a course
# DataFrame.
dataset = openstax_dataset.OpenstaxDataset(
    tokenizer=None,
    num_support=5,
    num_query=2,
    tokenize=False,
)

  from .autonotebook import tqdm as notebook_tqdm


In [68]:
# Two small integer ranges used to try out pairing every element of one with
# every element of the other.
a = range(10)
b = range(5)

In [80]:
import itertools

# Every (i, j) pair from a x b as a 2-column array; .shape shows the pair count.
indices = np.array(list(itertools.product(a, b)))
indices.shape

(50, 2)

In [89]:
# Pick one random row of `indices` (a single draw, so replace=False has no
# effect here).
indices[np.random.choice(indices.shape[0], replace=False)]

array([7, 4])