From 08cf3a60a8b8507ac3febd1292cbc160026b5723 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o?= Date: Fri, 13 Jun 2025 15:17:19 +0200 Subject: [PATCH 1/4] add table --- welearn_datastack/data/db_models.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/welearn_datastack/data/db_models.py b/welearn_datastack/data/db_models.py index 1fa5a7e..d664acb 100644 --- a/welearn_datastack/data/db_models.py +++ b/welearn_datastack/data/db_models.py @@ -38,6 +38,20 @@ class Corpus(Base): is_fix: Mapped[bool] binary_treshold: Mapped[float] = mapped_column(nullable=False, default=0.5) is_active: Mapped[bool] + # category_id: Mapped[UUID] = mapped_column( + # types.Uuid, + # ForeignKey(f"{DbSchemaEnum.CORPUS_RELATED.value}.category.id"), + # ) + + +class Category(Base): + __tablename__ = "category" + __table_args__ = {"schema": DbSchemaEnum.CORPUS_RELATED.value} + + id: Mapped[UUID] = mapped_column( + types.Uuid, primary_key=True, nullable=False, server_default="gen_random_uuid()" + ) + title: Mapped[str] class WeLearnDocument(Base): From 99943a5bdb449c4d289a0320bf94bc856310cd90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o?= Date: Fri, 13 Jun 2025 15:20:45 +0200 Subject: [PATCH 2/4] add table --- alembic/versions/89920abb7ff8_add_category.py | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 alembic/versions/89920abb7ff8_add_category.py diff --git a/alembic/versions/89920abb7ff8_add_category.py b/alembic/versions/89920abb7ff8_add_category.py new file mode 100644 index 0000000..bd28450 --- /dev/null +++ b/alembic/versions/89920abb7ff8_add_category.py @@ -0,0 +1,35 @@ +"""Add category + +Revision ID: 89920abb7ff8 +Revises: 16ff997426d3 +Create Date: 2025-06-13 15:06:08.092501 + +""" + +from typing import Sequence, Union + +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +from alembic import op + +# revision identifiers, used by Alembic. 
+revision: str = "89920abb7ff8" +down_revision: Union[str, None] = "16ff997426d3" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.create_table( + "category", + sa.Column("id", sa.Uuid(), server_default="gen_random_uuid()", nullable=False), + sa.Column("title", sa.String(), nullable=False), + sa.PrimaryKeyConstraint("id"), + schema="corpus_related", + ) + + +def downgrade() -> None: + op.drop_table("category", schema="corpus_related") From b52daa421787e925f8dbde01b06171a7b6e3c62d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o?= Date: Fri, 13 Jun 2025 15:23:48 +0200 Subject: [PATCH 3/4] add table and relation --- alembic/versions/89920abb7ff8_add_category.py | 23 +++++++++++++- .../test_document_classifier.py | 10 +++++++ .../test_nodes/test_extract_n_collect_docs.py | 16 +++++++++- .../test_document_vectorizer.py | 10 +++++++ .../test_qdrant_syncronizer.py | 10 +++++++ tests/test_retrieve_data_from_database.py | 30 +++++++++++++++++++ .../test_url_sanitary_crawler.py | 17 ++++++++++- welearn_datastack/data/db_models.py | 8 ++--- 8 files changed, 117 insertions(+), 7 deletions(-) diff --git a/alembic/versions/89920abb7ff8_add_category.py b/alembic/versions/89920abb7ff8_add_category.py index bd28450..64a4aa7 100644 --- a/alembic/versions/89920abb7ff8_add_category.py +++ b/alembic/versions/89920abb7ff8_add_category.py @@ -8,6 +8,7 @@ from typing import Sequence, Union +import sqlalchemy import sqlalchemy as sa from sqlalchemy.dialects import postgresql @@ -24,12 +25,32 @@ def upgrade() -> None: # ### commands auto generated by Alembic - please adjust!
### op.create_table( "category", - sa.Column("id", sa.Uuid(), server_default="gen_random_uuid()", nullable=False), + sa.Column( + "id", + sa.Uuid(), + server_default=sqlalchemy.func.gen_random_uuid(), + nullable=False, + ), sa.Column("title", sa.String(), nullable=False), sa.PrimaryKeyConstraint("id"), schema="corpus_related", ) + op.add_column( + "corpus", + sa.Column("category_id", sa.Uuid()), + schema="corpus_related", + ) + op.create_foreign_key( + None, + "corpus", + "category", + ["category_id"], + ["id"], + source_schema="corpus_related", + referent_schema="corpus_related", + ) def downgrade() -> None: + op.drop_column("corpus", "category_id", schema="corpus_related") op.drop_table("category", schema="corpus_related") diff --git a/tests/document_classifier/test_document_classifier.py b/tests/document_classifier/test_document_classifier.py index 1576cdb..bb06372 100644 --- a/tests/document_classifier/test_document_classifier.py +++ b/tests/document_classifier/test_document_classifier.py @@ -8,11 +8,12 @@ import numpy from sqlalchemy import create_engine from sqlalchemy.orm import sessionmaker from tests.database_test_utils import handle_schema_with_sqlite from welearn_datastack.data.db_models import ( Base, BiClassifierModel, + Category, Corpus, DocumentSlice, NClassifierModel, @@ -40,6 +41,11 @@ def setUp(self): self.test_session = s_maker() Base.metadata.create_all(self.test_session.get_bind()) + self.category_name = "categroy_test0" + self.category_id = uuid4() + + self.category = Category(id=self.category_id, title=self.category_name) + corpus_source_name = "test_corpus" self.corpus_test = Corpus( @@ -47,6 +53,7 @@ def setUp(self): source_name=corpus_source_name, is_fix=True, is_active=True, + category_id=self.category_id, ) self.doc_test_id = uuid.uuid4() @@ -83,6 +90,7 @@ def setUp(self): n_classifier_model_id=uuid4(), ) + self.test_session.add(self.category) self.test_session.add(self.corpus_test)
self.test_session.add(self.doc_test) self.test_session.add(self.slice_test) @@ -293,6 +301,7 @@ def test_main_externally_classified( source_name=corpus_source_name, is_fix=True, is_active=True, + category_id=self.category_id, ) doc_test = WeLearnDocument( id=doc_test_id, @@ -391,6 +400,7 @@ def test_main_externally_classified_but_without_sdg( source_name=corpus_source_name, is_fix=True, is_active=True, + category_id=self.category_id, ) doc_test = WeLearnDocument( id=doc_test_id, diff --git a/tests/document_collector_hub/test_nodes/test_extract_n_collect_docs.py b/tests/document_collector_hub/test_nodes/test_extract_n_collect_docs.py index 9ff03da..c516cfd 100644 --- a/tests/document_collector_hub/test_nodes/test_extract_n_collect_docs.py +++ b/tests/document_collector_hub/test_nodes/test_extract_n_collect_docs.py @@ -12,9 +12,15 @@ from sqlalchemy import create_engine from sqlalchemy.exc import DatabaseError from sqlalchemy.orm import sessionmaker from tests.database_test_utils import handle_schema_with_sqlite -from welearn_datastack.data.db_models import Base, Corpus, ProcessState, WeLearnDocument +from welearn_datastack.data.db_models import ( + Base, + Category, + Corpus, + ProcessState, + WeLearnDocument, +) from welearn_datastack.data.scraped_welearn_document import ScrapedWeLearnDocument from welearn_datastack.modules import collector_selector from welearn_datastack.nodes_workflow.DocumentHubCollector import document_collector @@ -101,12 +107,20 @@ def setUp(self) -> None: self.path_test_input.mkdir(parents=True, exist_ok=True) os.environ["ARTIFACT_ROOT"] = self.path_test_input.parent.as_posix() + self.category_name = "categroy_test0" + + self.category_id = uuid.uuid4() + + self.category = Category(id=self.category_id, title=self.category_name) + + self.test_session.add(self.category) self.corpus_test = Corpus( id=uuid.uuid4(), source_name=corpus_source_name, is_fix=True, is_active=True, +
category_id=self.category_id, ) self.test_session.add(self.corpus_test) diff --git a/tests/document_vectorizer/test_document_vectorizer.py b/tests/document_vectorizer/test_document_vectorizer.py index c1605ec..4637dfd 100644 --- a/tests/document_vectorizer/test_document_vectorizer.py +++ b/tests/document_vectorizer/test_document_vectorizer.py @@ -13,6 +13,7 @@ from tests.database_test_utils import handle_schema_with_sqlite from welearn_datastack.data.db_models import ( Base, + Category, Corpus, CorpusEmbeddingModel, DocumentSlice, @@ -48,6 +49,14 @@ def setUp(self): self.test_session = s_maker() Base.metadata.create_all(self.test_session.get_bind()) + self.category_name = "category_test0" + + self.category_id = uuid.uuid4() + + self.category = Category(id=self.category_id, title=self.category_name) + + self.test_session.add(self.category) + corpus_source_name = "test_corpus" self.embedding_model = EmbeddingModel( @@ -63,6 +72,7 @@ def setUp(self): source_name=corpus_source_name, is_fix=True, is_active=True, + category_id=self.category_id, ) self.test_session.add(self.corpus_test) diff --git a/tests/qdrant_syncronizer/test_qdrant_syncronizer.py b/tests/qdrant_syncronizer/test_qdrant_syncronizer.py index 90b065a..6b841f3 100644 --- a/tests/qdrant_syncronizer/test_qdrant_syncronizer.py +++ b/tests/qdrant_syncronizer/test_qdrant_syncronizer.py @@ -16,6 +16,7 @@ from tests.database_test_utils import handle_schema_with_sqlite from welearn_datastack.data.db_models import ( Base, + Category, Corpus, DocumentSlice, ProcessState, @@ -61,6 +62,14 @@ def setUp(self): Base.metadata.create_all(self.test_session.get_bind()) os.environ["ARTIFACT_ROOT"] = self.path_test_input.parent.as_posix() + self.category_name = "category_test0" + + self.category_id = uuid.uuid4() + + self.category = Category(id=self.category_id, title=self.category_name) + + self.test_session.add(self.category) + corpus_source_name = "corpus" self.corpus_test = Corpus( @@ -68,6 +77,7 @@ def setUp(self): 
source_name=corpus_source_name, is_fix=True, is_active=True, + category_id=self.category_id, ) doc_id = uuid.uuid4() diff --git a/tests/test_retrieve_data_from_database.py b/tests/test_retrieve_data_from_database.py index 537fc62..3f319e7 100644 --- a/tests/test_retrieve_data_from_database.py +++ b/tests/test_retrieve_data_from_database.py @@ -11,6 +11,7 @@ from welearn_datastack.data.db_models import ( Base, BiClassifierModel, + Category, Corpus, CorpusBiClassifierModel, CorpusEmbeddingModel, @@ -82,6 +83,11 @@ def connect(conn, rec): test_session = s_maker() Base.metadata.create_all(test_session.get_bind()) + category_id = uuid.uuid4() + category_name = "test" + category = Category(id=category_id, title=category_name) + + test_session.add(category) corpus_source_name0 = "corpus0" corpus_source_name1 = "corpus1" @@ -90,11 +96,13 @@ def connect(conn, rec): source_name=corpus_source_name0, is_fix=True, is_active=True, + category_id=category_id, ) corpus_test1 = Corpus( id=uuid.uuid4(), source_name=corpus_source_name1, is_fix=True, + category_id=category_id, is_active=True, ) @@ -199,6 +207,14 @@ def test_retrieve_bi_models(self): test_session = s_maker() Base.metadata.create_all(test_session.get_bind()) + self.category_name = "category_test0" + + self.category_id = uuid.uuid4() + + self.category = Category(id=self.category_id, title=self.category_name) + + test_session.add(self.category) + corpus_source_name = "test_corpus" corpus_test = Corpus( @@ -206,6 +222,7 @@ def test_retrieve_bi_models(self): source_name=corpus_source_name, is_fix=True, is_active=True, + category_id=self.category_id, ) test_session.add(corpus_test) test_session.commit() @@ -316,12 +333,18 @@ def test_retrieve_n_models(self): Base.metadata.create_all(test_session.get_bind()) corpus_source_name = "test_corpus" + category_id = uuid.uuid4() + category_name = "test" + category = Category(id=category_id, title=category_name) + + test_session.add(category) corpus_test = Corpus( id=uuid.uuid4(), 
source_name=corpus_source_name, is_fix=True, is_active=True, + category_id=category_id, ) test_session.add(corpus_test) test_session.commit() @@ -430,6 +453,12 @@ def test_retrieve_embedding_models(self): test_session = s_maker() Base.metadata.create_all(test_session.get_bind()) + category_id = uuid.uuid4() + category_name = "test" + category = Category(id=category_id, title=category_name) + + test_session.add(category) + corpus_source_name = "test_corpus" corpus_test = Corpus( @@ -437,6 +466,7 @@ def test_retrieve_embedding_models(self): source_name=corpus_source_name, is_fix=True, is_active=True, + category_id=category_id, ) test_session.add(corpus_test) test_session.commit() diff --git a/tests/url_sanitary_crawler/test_url_sanitary_crawler.py b/tests/url_sanitary_crawler/test_url_sanitary_crawler.py index f572dd1..79ba6db 100644 --- a/tests/url_sanitary_crawler/test_url_sanitary_crawler.py +++ b/tests/url_sanitary_crawler/test_url_sanitary_crawler.py @@ -8,7 +8,13 @@ from sqlalchemy.orm import sessionmaker from tests.database_test_utils import handle_schema_with_sqlite -from welearn_datastack.data.db_models import Base, Corpus, ProcessState, WeLearnDocument +from welearn_datastack.data.db_models import ( + Base, + Category, + Corpus, + ProcessState, + WeLearnDocument, +) from welearn_datastack.data.enumerations import Step, URLStatus from welearn_datastack.nodes_workflow.URLSanitaryCrawler.url_sanitary_crawler import ( main, @@ -33,6 +39,13 @@ def setUp(self) -> None: self.test_session = s_maker() Base.metadata.create_all(self.test_session.get_bind()) + self.category_name = "categroy_test0" + self.category_id = uuid.uuid4() + + self.category = Category(id=self.category_id, title=self.category_name) + + self.test_session.add(self.category) + corpus_source_name0 = "corpus0" corpus_source_name1 = "corpus1" corpus_test = Corpus( @@ -40,11 +53,13 @@ def setUp(self) -> None: source_name=corpus_source_name0, is_fix=True, is_active=True, + category_id=self.category_id,
) corpus_test1 = Corpus( id=uuid.uuid4(), source_name=corpus_source_name1, is_fix=True, + category_id=self.category_id, is_active=True, ) diff --git a/welearn_datastack/data/db_models.py b/welearn_datastack/data/db_models.py index d664acb..7b4a1ea 100644 --- a/welearn_datastack/data/db_models.py +++ b/welearn_datastack/data/db_models.py @@ -38,10 +38,10 @@ class Corpus(Base): is_fix: Mapped[bool] binary_treshold: Mapped[float] = mapped_column(nullable=False, default=0.5) is_active: Mapped[bool] - # category_id: Mapped[UUID] = mapped_column( - # types.Uuid, - # ForeignKey(f"{DbSchemaEnum.CORPUS_RELATED.value}.category.id"), - # ) + category_id: Mapped[UUID] = mapped_column( + types.Uuid, + ForeignKey(f"{DbSchemaEnum.CORPUS_RELATED.value}.category.id"), + ) class Category(Base): From 96d05bd5d6223c6b98d17de03d4b817f78e5448c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o?= Date: Fri, 13 Jun 2025 15:47:00 +0200 Subject: [PATCH 4/4] populate sql script --- alembic/versions/89920abb7ff8_add_category.py | 2 - sql/89920abb7ff8_populate_corpus_category.sql | 91 +++++++++++++++++++ 2 files changed, 91 insertions(+), 2 deletions(-) create mode 100644 sql/89920abb7ff8_populate_corpus_category.sql diff --git a/alembic/versions/89920abb7ff8_add_category.py b/alembic/versions/89920abb7ff8_add_category.py index 64a4aa7..d72d9d7 100644 --- a/alembic/versions/89920abb7ff8_add_category.py +++ b/alembic/versions/89920abb7ff8_add_category.py @@ -10,8 +10,6 @@ import sqlalchemy import sqlalchemy as sa -from sqlalchemy.dialects import postgresql - from alembic import op # revision identifiers, used by Alembic. 
diff --git a/sql/89920abb7ff8_populate_corpus_category.sql b/sql/89920abb7ff8_populate_corpus_category.sql new file mode 100644 index 0000000..6ce5ef9 --- /dev/null +++ b/sql/89920abb7ff8_populate_corpus_category.sql @@ -0,0 +1,91 @@ +-- Academic scientific publications +WITH ids AS ( +INSERT + INTO + corpus_related.category(title) + VALUES + ('Academic scientific publications') + RETURNING id AS catid +) +UPDATE + corpus_related.corpus +SET + category_id = ids.catid +FROM + ids +WHERE + corpus.source_name IN ('hal', 'plos', 'peerj', 'oapen', 'openalex', 'open-edition-books'); + +-- Teaching resources +WITH ids AS ( +INSERT + INTO + corpus_related.category(title) + VALUES + ('teaching resources') + RETURNING id AS catid +) +UPDATE + corpus_related.corpus +SET + category_id = ids.catid +FROM + ids +WHERE + corpus.source_name IN ('uved'); + +-- expert reports +WITH ids AS ( +INSERT + INTO + corpus_related.category(title) + VALUES + ('expert reports') + RETURNING id AS catid +) +UPDATE + corpus_related.corpus +SET + category_id = ids.catid +FROM + ids +WHERE + corpus.source_name IN ('ipcc', 'ipbes'); + +-- science communication and outreach +WITH ids AS ( +INSERT + INTO + corpus_related.category(title) + VALUES + ('science communication and outreach') + RETURNING id AS catid +) +UPDATE + corpus_related.corpus +SET + category_id = ids.catid +FROM + ids +WHERE + corpus.source_name IN ('conversation', 'ted'); + + +-- collaborative and encyclopedic knowledge +WITH ids AS ( +INSERT + INTO + corpus_related.category(title) + VALUES + ('collaborative and encyclopedic knowledge') + RETURNING id AS catid +) +UPDATE + corpus_related.corpus +SET + category_id = ids.catid +FROM + ids +WHERE + corpus.source_name IN ('wikipedia'); +