-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Include package for common core state standards Full commit list before squash: * Common Core State Standards * Sentence Transformers Search * PEP8, slightly improved interfaces * Updated README
- Loading branch information
Showing
9 changed files
with
1,932 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
# Common Core State Standards for Python | ||
|
||
This is a small package which allows the use of Common Core State Standards from Python. | ||
|
||
import ccss | ||
ccss.standards | ||
ccss.standards.math() | ||
ccss.standards.math().grade(5) | ||
ccss.standards.ela().subdomain('CCRA') | ||
ccss.standards.ela().subdomain('LF').grade([5,6]) | ||
|
||
These will all return dictionary-like objects mapping CCSS tags to their text. Queries can be chained in arbitrary order. | ||
|
||
It's possible to see options available. For example: | ||
|
||
ccss.standards.grades() | ||
ccss.standards.subdomains() | ||
ccss.standards.subdomain('CCRA').grades() | ||
|
||
You should be mindful of [licensing issues with Common Core](ccss_public_license). This code is open-source. The standards are not. | ||
|
||
The text is also scraped, and there are occasional bugs. We are missing a few tags, and a few have partial text. Feel free to submit a PR to fix it! | ||
|
||
This package is in development. If you use it in your project, we recommend pinning versions, as the API may change (but it's very usable in the current version, and we don't anticipate specific reasons to upgrade just because a newer version exists). |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
from ccss import settings | ||
from ccss import ELA, MATH |
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
''' | ||
This is a simple interface to navigate Common Core State Standards. | ||
''' | ||
|
||
from pkg_resources import resource_filename | ||
import json | ||
|
||
# Subject names as they appear in the second dot-separated field of a tag.
ELA = 'ELA-Literacy'
MATH = 'Math'


class Standard:
    """A parsed Common Core tag.

    The tag is split on ``.``; the second field is the subject. ELA tags
    carry the subdomain in the third field and the grade in the fourth,
    while Math tags carry them the other way round. Everything from the
    fifth field onwards is re-joined with ``.`` to form ``id``. Any
    ``Math.Content`` in the tag is shortened to ``Math`` before parsing.

    Raises:
        AttributeError: if the subject is neither ``ELA`` nor ``MATH``.
        IndexError: if the tag has too few dot-separated fields.
    """

    def __init__(self, standard_str):
        normalized = standard_str.replace("Math.Content", "Math")
        self.standard_str = normalized

        fields = normalized.split('.')
        subject = fields[1]
        self.subject = subject
        if subject not in (ELA, MATH):
            raise AttributeError("Unknown subject")

        # The two subjects store subdomain and grade in swapped positions.
        if subject == ELA:
            subdomain_at, grade_at = 2, 3
        else:
            subdomain_at, grade_at = 3, 2

        self.subdomain = fields[subdomain_at]
        self.grade = fields[grade_at]
        self.id = ".".join(fields[4:])

    def __str__(self):
        """Return the normalized tag string."""
        return self.standard_str
|
||
|
||
class Standards(dict):
    """Dictionary of standard tags to their text, with chainable filters.

    Each filter method returns a new ``Standards`` containing only the
    matching entries, so filters can be chained in any order, e.g.
    ``standards.ela().subdomain('CCRA').grades()``. Keys are parsed with
    ``Standard`` each time a filter or listing method runs.
    """

    # Container types treated as "several allowed values" by the filters.
    # Strings are deliberately absent: a string is always one value.
    _MULTI_VALUE_TYPES = (list, tuple, set, frozenset, range)

    @classmethod
    def _as_choices(cls, values):
        """Return ``values`` as a tuple of allowed values.

        A single value (including any string) becomes a one-element tuple;
        a list, tuple, set, frozenset or range is expanded to its members.
        """
        if isinstance(values, cls._MULTI_VALUE_TYPES):
            return tuple(values)
        return (values,)

    def query(self, func):
        """Return the entries whose parsed key satisfies ``func``.

        Args:
            func: callable taking a ``Standard`` and returning a truthy
                value for entries that should be kept.

        Returns:
            A new ``Standards`` with the matching tag/text pairs.
        """
        return Standards(
            {
                key: value
                for key, value in self.items()
                if func(Standard(key))
            }
        )

    def math(self):
        """Return a new ``Standards`` with just the Math items."""
        return self.query(lambda key: key.subject == MATH)

    def ela(self):
        """Return a new ``Standards`` with just the ELA items."""
        return self.query(lambda key: key.subject == ELA)

    def subdomain(self, subdomains):
        """Return the items belonging to the given subdomain(s).

        Args:
            subdomains: one subdomain string, or a list/tuple/set of them.
        """
        choices = self._as_choices(subdomains)
        return self.query(lambda key: key.subdomain in choices)

    def id(self, ids):
        """Return the items whose id is one of the given id(s).

        Args:
            ids: one id string, or a list/tuple/set of them.
        """
        choices = self._as_choices(ids)
        return self.query(lambda key: key.id in choices)

    def grade(self, grade_levels):
        """Return the items for the given grade level(s).

        Args:
            grade_levels: one grade, or a list/tuple/set/range of grades.
                Each grade is converted with ``str`` before comparison, so
                integers such as ``5`` work as well as ``'5'``.
        """
        choices = tuple(
            str(level) for level in self._as_choices(grade_levels)
        )
        return self.query(lambda key: key.grade in choices)

    def subdomains(self):
        """Return the sorted list of distinct subdomains present."""
        return sorted({Standard(key).subdomain for key in self})

    def ids(self):
        """Return the sorted list of distinct ids present."""
        return sorted({Standard(key).id for key in self})

    def grades(self):
        """Return the sorted list of distinct grades present.

        Grades are strings, so the ordering is lexicographic.
        """
        return sorted({Standard(key).grade for key in self})
|
||
|
||
# Path of the ccss.json data file that sits next to this module.
json_file_path = resource_filename(__name__, 'ccss.json')

# Module-level collection of every bundled standard. The file is opened in
# a ``with`` block so the handle is closed as soon as the data is loaded.
with open(json_file_path, encoding='utf-8') as _json_file:
    standards = Standards(json.load(_json_file))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
'''This is a script which downloads CCSS standards. | ||
This script is a one-off, since it will break if the page layout ever | ||
changes. It was half-generated by GPT. The core of this package are | ||
the JSON files extracted, and the scripts to make use of them. | ||
''' | ||
|
||
from bs4 import BeautifulSoup | ||
import requests | ||
import json | ||
|
||
# Subjects to scrape; each name is used as the path segment of that
# subject's landing page.
subjects = [
    "ELA-Literacy",
    "Math",
]
|
||
|
||
def fetch_urls(url, timeout=30):
    """Return the site-relative links found in a page's sidebar.

    Args:
        url: address of the page whose ``<div id="sidebar">`` is scanned.
        timeout: seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script.

    Returns:
        A list of hrefs with every ``../`` removed. Links containing
        ``pdf`` or ``http`` are left out.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        ValueError: if the page has no sidebar element.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    nav = soup.find('div', {'id': 'sidebar'})
    if nav is None:
        raise ValueError(f"No sidebar navigation found at {url}")

    # href=True skips anchors that have no href attribute at all.
    hrefs = (anchor['href'] for anchor in nav.find_all('a', href=True))
    return [
        href.replace('../', '')
        for href in hrefs
        if 'pdf' not in href and 'http' not in href
    ]
|
||
|
||
def _collect_entries(soup, css_class):
    """Map identifier text to description for each ``<div class=css_class>``.

    The description is the node immediately following the first ``<br>``
    inside the div, stripped of surrounding whitespace. Only that single
    node is captured.
    NOTE(review): text after inline markup in a description is presumably
    lost here - confirm against the live pages.
    """
    entries = {}
    for node in soup.find_all('div', {'class': css_class}):
        identifier = node.find('a', {'class': 'identifier'}).text
        description = node.find('br').next_sibling
        entries[identifier] = description.strip()
    return entries


def fetch_standards(url, timeout=30):
    """Download one standards page and extract its entries.

    Args:
        url: address of the page to scrape.
        timeout: seconds to wait for the server before giving up.

    Returns:
        A ``(standards, substandards)`` pair of dicts, each mapping an
        identifier to its description text.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        AttributeError: if an entry lacks the expected identifier link or
            ``<br>`` element.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # Parse the HTML with BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    standards_json = _collect_entries(soup, 'standard')
    substandards_json = _collect_entries(soup, 'substandard')
    return standards_json, substandards_json
|
||
|
||
# Accumulates identifier -> text for every page that could be scraped.
all_standards = {}

for subject in subjects:
    base_url = f"https://www.thecorestandards.org/{subject}"
    for u in fetch_urls(base_url):
        try:
            standards, substandards = fetch_standards(
                f"https://www.thecorestandards.org/{u}")
        except Exception as exc:
            # Best effort: one broken page must not abort the whole run,
            # but report why it was skipped so the gap can be investigated.
            print(f"Skipping {u}: {exc}")
        else:
            all_standards.update(standards)
            all_standards.update(substandards)

print(json.dumps(all_standards, indent=2))

# Write through a context manager so the file is flushed and closed.
with open("ccss.json", "w", encoding="utf-8") as output_file:
    json.dump(all_standards, output_file, indent=2)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
''' | ||
This implements a search for a particular standard using a | ||
relatively small, fast neural network. | ||
Load time is a little bit annoying (3-4 seconds), but queries | ||
run quickly. | ||
It can be used directly, or to return a set of all possible | ||
results, with the final results then pruned by an LLM | ||
Perhaps this ought to be teased out into its own library. This should | ||
definitely not be placed in anything imported in __init__.py or | ||
ccss.py, since most uses of ccss probably won't use this, and it adds | ||
to startup time. | ||
''' | ||
|
||
import json | ||
|
||
from sentence_transformers import SentenceTransformer, util | ||
import torch | ||
|
||
# Load the model. This is a relatively small model (80MB)
model = SentenceTransformer('all-MiniLM-L6-v2')

# Split the tag -> text mapping into two parallel tuples. The file is read
# inside a ``with`` block so the handle is closed once parsing is done.
# NOTE: the path is relative to the current working directory.
with open("ccss.json", encoding="utf-8") as standards_file:
    standard_keys, standard_texts = zip(*json.load(standards_file).items())

# Encode all standard_texts to get their embeddings
embeddings = model.encode(standard_texts, convert_to_tensor=True)
|
||
|
||
def search(query, *args, max_result_count=5):
    '''
    Fast (imperfect) semantic similarity search.

    Fast enough to work in realtime (e.g. for autocomplete)

    Args:
        query: text to look for.
        *args: accepted and ignored; `max_result_count` must be passed
            by keyword.
        max_result_count: maximum number of results. Set to `None` to
            return all standards items; `0` returns no results.

    Returns:
        An iterator of `(key, text)` pairs, from best to worst match.
    '''
    # Encode the query
    query_embedding = model.encode(query, convert_to_tensor=True)

    # Use cosine similarity to find the most similar
    # standard_texts to the query
    cos_similarities = util.pytorch_cos_sim(query_embedding, embeddings)[0]

    # Rank every standard from most to least similar
    ranked_indices = cos_similarities.argsort(descending=True)
    # Compare against None explicitly so that 0 means "no results"
    # instead of being treated like "no limit".
    if max_result_count is not None:
        ranked_indices = ranked_indices[:max_result_count]

    # Plain ints are simpler to index the tuples with than 0-dim tensors
    ranked_indices = ranked_indices.tolist()

    result_standard_keys = [standard_keys[i] for i in ranked_indices]
    result_standard_texts = [standard_texts[i] for i in ranked_indices]
    return zip(result_standard_keys, result_standard_texts)
|
||
|
||
if __name__ == '__main__':
    # Demo when run as a script: print the ten closest standards for a
    # sample query, one "key: text" line each.
    matches = search("division", max_result_count=10)
    for tag, description in matches:
        print(f"{tag}: {description}")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
''' | ||
For now, this just runs all the code paths and checks return | ||
types. We should do something more robust later. | ||
''' | ||
|
||
import unittest | ||
|
||
import ccss | ||
|
||
|
||
class TestStandards(unittest.TestCase):
    """Smoke tests: each query method must return a ``ccss.Standards``."""

    def _assert_is_standards(self, result):
        """Check that a query result is a ``ccss.Standards`` instance."""
        self.assertIsInstance(result, ccss.Standards)

    def test_math(self):
        self._assert_is_standards(ccss.standards.math())

    def test_ela(self):
        self._assert_is_standards(ccss.standards.ela())

    def test_subdomain_with_list(self):
        self._assert_is_standards(ccss.standards.subdomain(['Math']))

    def test_subdomain_with_str(self):
        self._assert_is_standards(ccss.standards.subdomain('Math'))

    def test_id_with_list(self):
        self._assert_is_standards(ccss.standards.id(['1', '2']))

    def test_id_with_str(self):
        self._assert_is_standards(ccss.standards.id('1'))

    def test_grade_with_list(self):
        self._assert_is_standards(ccss.standards.grade(['1', '2']))

    def test_grade_with_str(self):
        self._assert_is_standards(ccss.standards.grade('1'))

    def test_grade_with_int(self):
        self._assert_is_standards(ccss.standards.grade(1))


# Run the suite when this file is executed directly.
if __name__ == '__main__':
    unittest.main()
Oops, something went wrong.