Common Core State Standards (#132)

Include package for common core state standards Full commit list before squash: * Common Core State Standards * Sentence Transformers Search * PEP8, slightly improved interfaces * Updated README
ETS-Next-Gen · May 28, 2024 · 978378a · 978378a
1 parent 9a38d93
commit 978378a
Show file tree

Hide file tree

Showing 9 changed files with 1,932 additions and 0 deletions.
diff --git a/modules/ccss/README.md b/modules/ccss/README.md
@@ -0,0 +1,24 @@
+# Common Core State Standards for Python
+
+This is a small package which allows the use of Common Core State Standards from Python.
+
+     import ccss
+     ccss.standards
+     ccss.standards.math()
+     ccss.standards.math().grade(5)
+     ccss.standards.ela().subdomain('CCRA')
+     ccss.standards.ela().subdomain('LF').grade([5,6])
+
+These will all return dictionary-like objects mapping CCSS tags to their text. Queries can be chained in arbitrary order.
+
+It's also possible to list the options available at any point in a query. For example:
+
+     ccss.standards.grades()
+     ccss.standards.subdomains()
+     ccss.standards.subdomain('CCRA').grades()
+
+You should be mindful of [licensing issues with Common Core](ccss_public_license). This code is open-source. The standards are not.
+
+The text is scraped, and there are occasional bugs: we are missing a few tags, and a few have only partial text. Feel free to submit a PR to fix them!
+
+This package is in development. If you use it in your project, we recommend pinning versions, as the API may change (but it's very usable in the current version, and we don't anticipate specific reasons to upgrade just because a newer version exists).
diff --git a/modules/ccss/ccss/__init__.py b/modules/ccss/ccss/__init__.py
@@ -0,0 +1,2 @@
+from ccss import settings
+from ccss import ELA, MATH
diff --git a/modules/ccss/ccss/ccss.json b/modules/ccss/ccss/ccss.json
diff --git a/modules/ccss/ccss/ccss.py b/modules/ccss/ccss/ccss.py
@@ -0,0 +1,88 @@
+'''
+This is a simple interface to navigate Common Core State Standards.
+'''
+
+from pkg_resources import resource_filename
+import json
+
+# Subject names as they appear in the second dot-separated field of a
+# standard tag (after "Math.Content" has been collapsed to "Math").
+# `Standard` compares that field against these constants.
+ELA = 'ELA-Literacy'
+MATH = 'Math'
+
+
+class Standard:
+    def __init__(self, standard_str):
+        self.standard_str = standard_str.replace("Math.Content", "Math")
+        parts = self.standard_str.split('.')
+        self.subject = parts[1]
+        if self.subject == ELA:
+            self.subdomain = parts[2]
+            self.grade = parts[3]
+            self.id = ".".join(parts[4:])
+        elif self.subject == MATH:
+            self.subdomain = parts[3]
+            self.grade = parts[2]
+            self.id = ".".join(parts[4:])
+        else:
+            raise AttributeError("Unknown subject")
+
+    def __str__(self):
+        return self.standard_str
+
+
+class Standards(dict):
+    def query(self, func):
+        return Standards(
+            {
+                key: value
+                for key, value in self.items()
+                if func(Standard(key))
+            }
+        )
+
+    def math(self):
+        # Return a new Standards object with just math items
+        return self.query(lambda key: key.subject == MATH)
+
+    def ela(self):
+        # Return a new Standards object with just ELA items
+        return self.query(lambda key: key.subject == ELA)
+
+    def subdomain(self, subdomains):
+        # Handle lists or individual values
+        if not isinstance(subdomains, list):
+            subdomains = [subdomains]
+        # Return a new Standards object with specified subdomain items
+        return self.query(lambda key: key.subdomain in subdomains)
+
+    def id(self, ids):
+        # Handle lists or individual values
+        if not isinstance(ids, list):
+            ids = [ids]
+        # Return a new Standards object with specified id items
+        return self.query(lambda key: key.id in ids)
+
+    def grade(self, grade_levels):
+        # Handle lists or individual values
+        if not isinstance(grade_levels, list):
+            grade_levels = [grade_levels]
+
+        # Handle integers
+        grade_levels = list(map(str, grade_levels))
+        return self.query(lambda key: key.grade in grade_levels)
+
+    def subdomains(self):
+        all_subdomains = {Standard(key).subdomain for key in self}
+        return sorted(all_subdomains)
+
+    def ids(self):
+        all_ids = {Standard(key).id for key in self}
+        return sorted(all_ids)
+
+    def grades(self):
+        all_grades = {Standard(key).grade for key in self}
+        return sorted(all_grades)
+
+
+json_file_path = resource_filename(__name__, 'ccss.json')
+
+standards = Standards(json.load(open(json_file_path)))
diff --git a/modules/ccss/ccss/download.py b/modules/ccss/ccss/download.py
@@ -0,0 +1,70 @@
+'''This is a script which downloads CCSS standards.
+
+This script is a one-off, since it will break if the page layout ever
+changes. It was half-generated by GPT. The core of this package are
+the JSON files extracted, and the scripts to make use of them.
+'''
+
+from bs4 import BeautifulSoup
+import requests
+import json
+
+# Subject path segments; the scraping loop further down appends each
+# one to the site root to build that subject's index page URL.
+subjects = ["ELA-Literacy", "Math"]
+
+
+def fetch_urls(url):
+    response = requests.get(url)
+    soup = BeautifulSoup(response.text, 'html.parser')
+
+    nav = soup.find('div', {'id': 'sidebar'})
+    all_urls = [a['href'] for a in nav.find_all('a')]
+    return [
+        u.replace('../', '')
+        for u in all_urls
+        if 'pdf' not in u and 'http' not in u
+    ]
+
+
+def fetch_standards(url):
+    response = requests.get(url)
+
+    # Parse the HTML with BeautifulSoup
+    soup = BeautifulSoup(response.text, 'html.parser')
+
+    # Find the standards and sub-standards
+    standards = soup.find_all('div', {'class': 'standard'})
+    standards_json = {}
+    for standard in standards:
+        identifier = standard.find('a', {'class': 'identifier'}).text
+        description = standard.find('br').next_sibling
+        standards_json[identifier] = description.strip()
+
+    # Find the substandards
+    substandards = soup.find_all('div', {'class': 'substandard'})
+    substandards_json = {}
+    for substandard in substandards:
+        identifier = substandard.find('a', {'class': 'identifier'}).text
+        description = substandard.find('br').next_sibling
+        substandards_json[identifier] = description.strip()
+
+    # Output the JSON document
+    return standards_json, substandards_json
+
+
+all_standards = {}
+
+for subject in subjects:
+    base_url = f"https://www.thecorestandards.org/{subject}"
+    for u in fetch_urls(base_url):
+        try:
+            standards, substandards = fetch_standards(
+                f"https://www.thecorestandards.org/{u}")
+            all_standards.update(standards)
+            all_standards.update(substandards)
+        except Exception:
+            print(f"Skipping {u}")
+
+print(json.dumps(all_standards, indent=2))
+
+json.dump(all_standards, open("ccss.json", "w"), indent=2)
diff --git a/modules/ccss/ccss/st_search.py b/modules/ccss/ccss/st_search.py
@@ -0,0 +1,62 @@
+'''
+This implements a search for a particular standard using a
+relatively small, fast neural network.
+
+Load time is a little bit annoying (3-4 seconds), but queries
+run quickly.
+
+It can be used directly, or to return a set of all possible
+results, with the final results then pruned by an LLM
+
+Perhaps this ought to be teased out into its own library. This should
+definitely not be placed in anything imported in __init__.py or
+ccss.py, since most uses of ccss probably won't use this, and it adds
+to startup time.
+'''
+
+import json
+
+from sentence_transformers import SentenceTransformer, util
+import torch
+
+# Load the model. This is a relatively small model (80MB)
+model = SentenceTransformer('all-MiniLM-L6-v2')
+
+standard_keys, standard_texts = zip(*json.load(open("ccss.json")).items())
+
+# Encode all standard_texts to get their embeddings
+embeddings = model.encode(standard_texts, convert_to_tensor=True)
+
+
+def search(query, *args, max_result_count=5):
+    '''
+    Fast (imperfect) semantic similarity search.
+
+    Fast enough to work in realtime (e.g. for autocomplete)
+
+    From best to worst.
+
+    `max_result_count` can be set to `None` to return all standards
+    items
+    '''
+    # Encode the query
+    query_embedding = model.encode(query, convert_to_tensor=True)
+
+    # Use cosine similarity to find the most similar
+    # standard_texts to the query
+    cos_similarities = util.pytorch_cos_sim(query_embedding, embeddings)[0]
+
+    # Get the index of the most similar sentence
+    top_match_index = torch.argmax(cos_similarities).item()
+    top_match_indices = cos_similarities.argsort(descending=True)
+    if max_result_count:
+        top_match_indices = top_match_indices[:max_result_count]
+
+    result_standard_texts = [standard_texts[i] for i in top_match_indices]
+    result_standard_keys = [standard_keys[i] for i in top_match_indices]
+    return zip(result_standard_keys, result_standard_texts)
+
+
+if __name__ == '__main__':
+    for key, text in search("division", max_result_count=10):
+        print(f"{key}: {text}")
diff --git a/modules/ccss/ccss/test.py b/modules/ccss/ccss/test.py
@@ -0,0 +1,50 @@
+'''
+For now, this just runs all the code paths and checks return
+types. We should do something more robust later.
+'''
+
+import unittest
+
+import ccss
+
+
+class TestStandards(unittest.TestCase):
+    def test_math(self):
+        math_standards = ccss.standards.math()
+        self.assertIsInstance(math_standards, ccss.Standards)
+
+    def test_ela(self):
+        ela_standards = ccss.standards.ela()
+        self.assertIsInstance(ela_standards, ccss.Standards)
+
+    def test_subdomain_with_list(self):
+        sub_standards = ccss.standards.subdomain(['Math'])
+        self.assertIsInstance(sub_standards, ccss.Standards)
+
+    def test_subdomain_with_str(self):
+        sub_standards = ccss.standards.subdomain('Math')
+        self.assertIsInstance(sub_standards, ccss.Standards)
+
+    def test_id_with_list(self):
+        sub_standards = ccss.standards.id(['1', '2'])
+        self.assertIsInstance(sub_standards, ccss.Standards)
+
+    def test_id_with_str(self):
+        sub_standards = ccss.standards.id('1')
+        self.assertIsInstance(sub_standards, ccss.Standards)
+
+    def test_grade_with_list(self):
+        sub_standards = ccss.standards.grade(['1', '2'])
+        self.assertIsInstance(sub_standards, ccss.Standards)
+
+    def test_grade_with_str(self):
+        sub_standards = ccss.standards.grade('1')
+        self.assertIsInstance(sub_standards, ccss.Standards)
+
+    def test_grade_with_int(self):
+        sub_standards = ccss.standards.grade(1)
+        self.assertIsInstance(sub_standards, ccss.Standards)
+
+
+# Run the test suite when this file is executed directly as a script
+if __name__ == '__main__':
+    unittest.main()