Skip to content

Commit

Permalink
Common Core State Standards (#132)
Browse files Browse the repository at this point in the history
Include package for common core state standards

Full commit list before squash:
* Common Core State Standards

* Sentence Transformers Search

* PEP8, slightly improved interfaces

* Updated README
  • Loading branch information
pmitros committed May 28, 2024
1 parent 9a38d93 commit 978378a
Show file tree
Hide file tree
Showing 9 changed files with 1,932 additions and 0 deletions.
24 changes: 24 additions & 0 deletions modules/ccss/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Common Core State Standards for Python

This is a small package which allows the use of Common Core State Standards from Python.

import ccss
ccss.standards
ccss.standards.math()
ccss.standards.math().grade(5)
ccss.standards.ela().subdomain('CCRA')
ccss.standards.ela().subdomain('LF').grade([5,6])

These will all return dictionary-like objects mapping CCSS tags to their text. Queries can be chained in arbitrary order.

It's possible to see options available. For example:

ccss.standards.grades()
ccss.standards.subdomains()
ccss.standards.subdomain('CCRA').grades()

You should be mindful of [licensing issues with Common Core](ccss_public_license). This code is open-source. The standards are not.

The text was scraped from the web, so there are occasional bugs. We are missing a few tags, and a few have partial text. Feel free to submit a PR to fix them!

This package is in development. If you use it in your project, we recommend pinning versions, as the API may change (but it's very usable in the current version, and we don't anticipate specific reasons to upgrade just because a newer version exists).
2 changes: 2 additions & 0 deletions modules/ccss/ccss/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
from ccss import settings
from ccss import ELA, MATH
1,526 changes: 1,526 additions & 0 deletions modules/ccss/ccss/ccss.json

Large diffs are not rendered by default.

88 changes: 88 additions & 0 deletions modules/ccss/ccss/ccss.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
'''
This is a simple interface to navigate Common Core State Standards.
'''

from pkg_resources import resource_filename
import json

ELA = 'ELA-Literacy'
MATH = 'Math'


class Standard:
    '''
    One CCSS tag, split into its dot-separated fields.

    The `Math.Content` prefix is collapsed to `Math` before splitting.
    The second field is the subject. ELA tags then carry the subdomain
    followed by the grade; math tags carry the grade followed by the
    subdomain. Everything after the fourth field is re-joined with
    dots and stored as `id`.

    Raises `AttributeError` for any subject other than `ELA` or `MATH`.
    '''
    def __init__(self, standard_str):
        normalized = standard_str.replace("Math.Content", "Math")
        fields = normalized.split('.')
        self.standard_str = normalized
        self.subject = fields[1]

        if self.subject not in (ELA, MATH):
            raise AttributeError("Unknown subject")

        # ELA and math list subdomain and grade in opposite order.
        if self.subject == ELA:
            subdomain_at, grade_at = 2, 3
        else:
            subdomain_at, grade_at = 3, 2
        self.subdomain = fields[subdomain_at]
        self.grade = fields[grade_at]
        self.id = ".".join(fields[4:])

    def __str__(self):
        '''Return the normalized tag string.'''
        return self.standard_str


class Standards(dict):
    '''
    Dictionary mapping CCSS tags to their text, with chainable filters.

    Every filter returns a new `Standards`, so calls can be chained in
    any order, e.g. `standards.ela().subdomain('LF').grade([5, 6])`.
    '''

    @staticmethod
    def _as_list(values):
        '''
        Normalize a filter argument to a list.

        A string, or any non-iterable value (e.g. an int grade), counts
        as a single value. Any other iterable (list, tuple, set, range,
        ...) is expanded, so `grade((5, 6))` behaves like
        `grade([5, 6])` instead of silently matching nothing.
        '''
        if isinstance(values, (str, bytes)):
            return [values]
        try:
            return list(values)
        except TypeError:
            return [values]

    def query(self, func):
        '''
        Return a new `Standards` holding the items that satisfy `func`.

        `func` is called with the `Standard` parsed from each key and
        should return a truthy value to keep that item.
        '''
        return Standards(
            {
                key: value
                for key, value in self.items()
                if func(Standard(key))
            }
        )

    def math(self):
        '''Return a new `Standards` with just the math items.'''
        return self.query(lambda standard: standard.subject == MATH)

    def ela(self):
        '''Return a new `Standards` with just the ELA items.'''
        return self.query(lambda standard: standard.subject == ELA)

    def subdomain(self, subdomains):
        '''
        Return a new `Standards` restricted to the given subdomain(s).

        `subdomains` may be one value or an iterable of values.
        '''
        wanted = self._as_list(subdomains)
        return self.query(lambda standard: standard.subdomain in wanted)

    def id(self, ids):
        '''
        Return a new `Standards` restricted to the given id(s).

        `ids` may be one value or an iterable of values.
        '''
        wanted = self._as_list(ids)
        return self.query(lambda standard: standard.id in wanted)

    def grade(self, grade_levels):
        '''
        Return a new `Standards` restricted to the given grade(s).

        `grade_levels` may be one value or an iterable of values.
        Integers are accepted and compared as strings, because grades
        are parsed out of the tag text.
        '''
        wanted = [str(level) for level in self._as_list(grade_levels)]
        return self.query(lambda standard: standard.grade in wanted)

    def subdomains(self):
        '''Return the sorted list of distinct subdomains present.'''
        all_subdomains = {Standard(key).subdomain for key in self}
        return sorted(all_subdomains)

    def ids(self):
        '''Return the sorted list of distinct ids present.'''
        all_ids = {Standard(key).id for key in self}
        return sorted(all_ids)

    def grades(self):
        '''Return the sorted list of distinct grades present.'''
        all_grades = {Standard(key).grade for key in self}
        return sorted(all_grades)


json_file_path = resource_filename(__name__, 'ccss.json')


def _load_standards(path):
    '''
    Read the JSON file at `path` and wrap its contents in `Standards`.

    The file is opened in a `with` block so the handle is closed as
    soon as parsing finishes, rather than being left open.
    '''
    with open(path, encoding='utf-8') as standards_file:
        return Standards(json.load(standards_file))


# Every standard from ccss.json, loaded once at import time.
standards = _load_standards(json_file_path)
70 changes: 70 additions & 0 deletions modules/ccss/ccss/download.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
'''This is a script which downloads CCSS standards.
This script is a one-off, since it will break if the page layout ever
changes. It was half-generated by GPT. The core of this package is
the set of extracted JSON files and the scripts that make use of them.
'''

from bs4 import BeautifulSoup
import requests
import json

# Subject path segments; each one is appended to the site root to
# build that subject's landing-page URL.
subjects = ["ELA-Literacy", "Math"]


def fetch_urls(url, timeout=30):
    '''
    Return the relative page links found in the sidebar of `url`.

    Links containing `pdf` or `http` are dropped, and every `../`
    is stripped from the remaining ones.

    `timeout` is the number of seconds passed to `requests.get`, so a
    stalled server cannot hang the script indefinitely.

    Raises `requests.HTTPError` on a 4xx/5xx response, and `ValueError`
    if the page has no `<div id="sidebar">`.
    '''
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    nav = soup.find('div', {'id': 'sidebar'})
    if nav is None:
        raise ValueError(f"No sidebar navigation found at {url}")

    # href=True skips anchors that have no href attribute, which
    # would otherwise raise KeyError on a['href'].
    hrefs = [a['href'] for a in nav.find_all('a', href=True)]
    return [
        href.replace('../', '')
        for href in hrefs
        if 'pdf' not in href and 'http' not in href
    ]


def _extract_entries(soup, css_class):
    '''
    Map identifier -> description for each `<div class=css_class>`.

    An entry that lacks the identifier anchor or a `<br>` raises
    `AttributeError`; it is not skipped.
    '''
    entries = {}
    for entry in soup.find_all('div', {'class': css_class}):
        identifier = entry.find('a', {'class': 'identifier'}).text
        # The description is the node right after the first <br>.
        description = entry.find('br').next_sibling
        entries[identifier] = description.strip()
    return entries


def fetch_standards(url, timeout=30):
    '''
    Scrape the standards and substandards listed on the page at `url`.

    Returns a `(standards, substandards)` pair of dictionaries, each
    mapping an identifier to its description text.

    `timeout` is the number of seconds passed to `requests.get`, so a
    stalled server cannot hang the script indefinitely.

    Raises `requests.HTTPError` on a 4xx/5xx response instead of
    parsing the error page as if it held no standards.
    '''
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # Parse the HTML with BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    return (
        _extract_entries(soup, 'standard'),
        _extract_entries(soup, 'substandard'),
    )


# Identifier -> description for every standard and substandard found.
all_standards = {}

for subject in subjects:
    base_url = f"https://www.thecorestandards.org/{subject}"
    for u in fetch_urls(base_url):
        try:
            standards, substandards = fetch_standards(
                f"https://www.thecorestandards.org/{u}")
        except Exception as error:
            # Best-effort scrape: a page that fails to download or
            # parse is reported, with the reason, and skipped.
            print(f"Skipping {u}: {error!r}")
            continue
        all_standards.update(standards)
        all_standards.update(substandards)

print(json.dumps(all_standards, indent=2))

# Use a context manager so the output is flushed and closed reliably.
with open("ccss.json", "w", encoding="utf-8") as output_file:
    json.dump(all_standards, output_file, indent=2)
62 changes: 62 additions & 0 deletions modules/ccss/ccss/st_search.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
'''
This implements a search for a particular standard using a
relatively small, fast neural network.
Load time is a little bit annoying (3-4 seconds), but queries
run quickly.
It can be used directly, or to return a set of all possible
results, with the final results then pruned by an LLM.
Perhaps this ought to be teased out into its own library. This should
definitely not be placed in anything imported in __init__.py or
ccss.py, since most uses of ccss probably won't use this, and it adds
to startup time.
'''

import json
import os

from sentence_transformers import SentenceTransformer, util
import torch

# Load the model. This is a relatively small model (80MB)
model = SentenceTransformer('all-MiniLM-L6-v2')

# Resolve ccss.json next to this module rather than in the current
# working directory, so the bundled file is found no matter where the
# importing process was started. The `with` block closes the handle.
_standards_path = os.path.join(
    os.path.dirname(os.path.abspath(__file__)), "ccss.json")
with open(_standards_path, encoding="utf-8") as _standards_file:
    standard_keys, standard_texts = zip(*json.load(_standards_file).items())

# Encode all standard_texts to get their embeddings
embeddings = model.encode(standard_texts, convert_to_tensor=True)


def search(query, *args, max_result_count=5):
    '''
    Fast (imperfect) semantic similarity search.

    Fast enough to work in realtime (e.g. for autocomplete).

    Returns an iterator of `(key, text)` pairs, ordered from best to
    worst match.

    `max_result_count` caps the number of results; set it to `None`
    to return all standards items. It is keyword-only: extra
    positional arguments are still accepted, but they are ignored.
    '''
    # Encode the query
    query_embedding = model.encode(query, convert_to_tensor=True)

    # Cosine similarity between the query and every standard's text
    cos_similarities = util.pytorch_cos_sim(query_embedding, embeddings)[0]

    # Indices of the standards, most similar first, as plain ints so
    # they can index the key/text tuples directly.
    ranked_indices = cos_similarities.argsort(descending=True).tolist()

    # Compare against None so that 0 yields no results instead of
    # falling through to "return everything".
    if max_result_count is not None:
        ranked_indices = ranked_indices[:max_result_count]

    return zip(
        [standard_keys[i] for i in ranked_indices],
        [standard_texts[i] for i in ranked_indices],
    )


if __name__ == '__main__':
    # Manual smoke test: print the ten closest standards for a sample
    # query.
    for standard_key, standard_text in search("division", max_result_count=10):
        print(f"{standard_key}: {standard_text}")
50 changes: 50 additions & 0 deletions modules/ccss/ccss/test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
'''
For now, this just runs all the code paths and checks return
types. We should do something more robust later.
'''

import unittest

import ccss


class TestStandards(unittest.TestCase):
    '''
    Smoke tests: call each `Standards` method and check its return type.
    '''

    def test_math(self):
        math_standards = ccss.standards.math()
        self.assertIsInstance(math_standards, ccss.Standards)

    def test_ela(self):
        ela_standards = ccss.standards.ela()
        self.assertIsInstance(ela_standards, ccss.Standards)

    def test_query(self):
        everything = ccss.standards.query(lambda standard: True)
        self.assertIsInstance(everything, ccss.Standards)

    def test_subdomain_with_list(self):
        sub_standards = ccss.standards.subdomain(['Math'])
        self.assertIsInstance(sub_standards, ccss.Standards)

    def test_subdomain_with_str(self):
        sub_standards = ccss.standards.subdomain('Math')
        self.assertIsInstance(sub_standards, ccss.Standards)

    def test_id_with_list(self):
        sub_standards = ccss.standards.id(['1', '2'])
        self.assertIsInstance(sub_standards, ccss.Standards)

    def test_id_with_str(self):
        sub_standards = ccss.standards.id('1')
        self.assertIsInstance(sub_standards, ccss.Standards)

    def test_grade_with_list(self):
        sub_standards = ccss.standards.grade(['1', '2'])
        self.assertIsInstance(sub_standards, ccss.Standards)

    def test_grade_with_str(self):
        sub_standards = ccss.standards.grade('1')
        self.assertIsInstance(sub_standards, ccss.Standards)

    def test_grade_with_int(self):
        sub_standards = ccss.standards.grade(1)
        self.assertIsInstance(sub_standards, ccss.Standards)

    # The listing methods return sorted lists rather than `Standards`.
    def test_subdomains(self):
        self.assertIsInstance(ccss.standards.subdomains(), list)

    def test_ids(self):
        self.assertIsInstance(ccss.standards.ids(), list)

    def test_grades(self):
        self.assertIsInstance(ccss.standards.grades(), list)


if __name__ == '__main__':
    unittest.main()
Loading

0 comments on commit 978378a

Please sign in to comment.