Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 54 additions & 0 deletions alembic/versions/89920abb7ff8_add_category.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
"""Add category

Revision ID: 89920abb7ff8
Revises: 16ff997426d3
Create Date: 2025-06-13 15:06:08.092501

"""

from typing import Sequence, Union

import sqlalchemy
import sqlalchemy as sa
from alembic import op

# revision identifiers, used by Alembic.
revision: str = "89920abb7ff8"
down_revision: Union[str, None] = "16ff997426d3"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    """Create the ``category`` table and reference it from ``corpus``.

    Three operations are issued, all within the ``corpus_related`` schema:

    1. create ``category`` with a UUID ``id`` primary key (server default
       ``gen_random_uuid()``) and a required ``title``;
    2. add a ``category_id`` UUID column to ``corpus``;
    3. add a foreign key from ``corpus.category_id`` to ``category.id``.
    """
    # Originally auto generated by Alembic; every object lives in this schema.
    schema = "corpus_related"

    category_columns = (
        sa.Column(
            "id",
            sa.Uuid(),
            server_default=sa.func.gen_random_uuid(),
            nullable=False,
        ),
        sa.Column("title", sa.String(), nullable=False),
    )
    op.create_table(
        "category",
        *category_columns,
        sa.PrimaryKeyConstraint("id"),
        schema=schema,
    )

    op.add_column(
        "corpus",
        sa.Column("category_id", sa.Uuid()),
        schema=schema,
    )

    # The constraint name is deliberately left as None (no explicit name given).
    op.create_foreign_key(
        None,
        "corpus",
        "category",
        ["category_id"],
        ["id"],
        source_schema=schema,
        referent_schema=schema,
    )


def downgrade() -> None:
    """Undo the migration by removing ``corpus.category_id`` and ``category``.

    The referencing column is dropped before the table it points to.
    """
    schema = "corpus_related"

    op.drop_column("corpus", "category_id", schema=schema)
    op.drop_table("category", schema=schema)
91 changes: 91 additions & 0 deletions sql/89920abb7ff8_populate_corpus_category.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
-- Category: academic scientific publications.
-- Insert the category row, then point every matching corpus at its new id.
WITH new_category AS (
    INSERT INTO corpus_related.category (title)
    VALUES ('Academic scientific publications')
    RETURNING id
)
UPDATE corpus_related.corpus AS c
SET category_id = new_category.id
FROM new_category
WHERE c.source_name IN (
    'hal',
    'plos',
    'peerj',
    'oapen',
    'openalex',
    'open-edition-books'
);

-- Category: teaching resources.
-- Insert the category row, then point every matching corpus at its new id.
WITH new_category AS (
    INSERT INTO corpus_related.category (title)
    VALUES ('teaching resources')
    RETURNING id
)
UPDATE corpus_related.corpus AS c
SET category_id = new_category.id
FROM new_category
WHERE c.source_name = 'uved';

-- Category: expert reports.
-- Insert the category row, then point every matching corpus at its new id.
WITH new_category AS (
    INSERT INTO corpus_related.category (title)
    VALUES ('expert reports')
    RETURNING id
)
UPDATE corpus_related.corpus AS c
SET category_id = new_category.id
FROM new_category
WHERE c.source_name IN ('ipcc', 'ipbes');

-- Category: science communication and outreach.
-- Insert the category row, then point every matching corpus at its new id.
WITH new_category AS (
    INSERT INTO corpus_related.category (title)
    VALUES ('science communication and outreach')
    RETURNING id
)
UPDATE corpus_related.corpus AS c
SET category_id = new_category.id
FROM new_category
WHERE c.source_name IN ('conversation', 'ted');


-- Category: collaborative and encyclopedic knowledge.
-- Insert the category row, then point every matching corpus at its new id.
WITH new_category AS (
    INSERT INTO corpus_related.category (title)
    VALUES ('collaborative and encyclopedic knowledge')
    RETURNING id
)
UPDATE corpus_related.corpus AS c
SET category_id = new_category.id
FROM new_category
WHERE c.source_name = 'wikipedia';

11 changes: 11 additions & 0 deletions tests/document_classifier/test_document_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,13 @@
import numpy
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sympy.integrals.meijerint_doc import category

from tests.database_test_utils import handle_schema_with_sqlite
from welearn_datastack.data.db_models import (
Base,
BiClassifierModel,
Category,
Corpus,
DocumentSlice,
NClassifierModel,
Expand Down Expand Up @@ -40,13 +42,19 @@ def setUp(self):
self.test_session = s_maker()
Base.metadata.create_all(self.test_session.get_bind())

self.category_name = "categroy_test0"
self.category_id = uuid4()

self.category = Category(id=self.category_id, title=self.category_name)

corpus_source_name = "test_corpus"

self.corpus_test = Corpus(
id=uuid.uuid4(),
source_name=corpus_source_name,
is_fix=True,
is_active=True,
category_id=self.category_id,
)

self.doc_test_id = uuid.uuid4()
Expand Down Expand Up @@ -83,6 +91,7 @@ def setUp(self):
n_classifier_model_id=uuid4(),
)

self.test_session.add(self.category)
self.test_session.add(self.corpus_test)
self.test_session.add(self.doc_test)
self.test_session.add(self.slice_test)
Expand Down Expand Up @@ -293,6 +302,7 @@ def test_main_externally_classified(
source_name=corpus_source_name,
is_fix=True,
is_active=True,
category_id=self.category_id,
)
doc_test = WeLearnDocument(
id=doc_test_id,
Expand Down Expand Up @@ -391,6 +401,7 @@ def test_main_externally_classified_but_without_sdg(
source_name=corpus_source_name,
is_fix=True,
is_active=True,
category_id=self.category_id,
)
doc_test = WeLearnDocument(
id=doc_test_id,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,16 @@
from sqlalchemy import create_engine
from sqlalchemy.exc import DatabaseError
from sqlalchemy.orm import sessionmaker
from sympy.integrals.meijerint_doc import category

from tests.database_test_utils import handle_schema_with_sqlite
from welearn_datastack.data.db_models import Base, Corpus, ProcessState, WeLearnDocument
from welearn_datastack.data.db_models import (
Base,
Corpus,
ProcessState,
WeLearnDocument,
Category,
)
from welearn_datastack.data.scraped_welearn_document import ScrapedWeLearnDocument
from welearn_datastack.modules import collector_selector
from welearn_datastack.nodes_workflow.DocumentHubCollector import document_collector
Expand Down Expand Up @@ -101,12 +108,20 @@ def setUp(self) -> None:
self.path_test_input.mkdir(parents=True, exist_ok=True)

os.environ["ARTIFACT_ROOT"] = self.path_test_input.parent.as_posix()
self.category_name = "categroy_test0"

self.category_id = uuid.uuid4()

self.category = Category(id=self.category_id, title=self.category_name)

self.test_session.add(self.category)

self.corpus_test = Corpus(
id=uuid.uuid4(),
source_name=corpus_source_name,
is_fix=True,
is_active=True,
category_id=self.category_id,
)

self.test_session.add(self.corpus_test)
Expand Down
10 changes: 10 additions & 0 deletions tests/document_vectorizer/test_document_vectorizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from tests.database_test_utils import handle_schema_with_sqlite
from welearn_datastack.data.db_models import (
Base,
Category,
Corpus,
CorpusEmbeddingModel,
DocumentSlice,
Expand Down Expand Up @@ -48,6 +49,14 @@ def setUp(self):
self.test_session = s_maker()
Base.metadata.create_all(self.test_session.get_bind())

self.category_name = "category_test0"

self.category_id = uuid.uuid4()

self.category = Category(id=self.category_id, title=self.category_name)

self.test_session.add(self.category)

corpus_source_name = "test_corpus"

self.embedding_model = EmbeddingModel(
Expand All @@ -63,6 +72,7 @@ def setUp(self):
source_name=corpus_source_name,
is_fix=True,
is_active=True,
category_id=self.category_id,
)

self.test_session.add(self.corpus_test)
Expand Down
10 changes: 10 additions & 0 deletions tests/qdrant_syncronizer/test_qdrant_syncronizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from tests.database_test_utils import handle_schema_with_sqlite
from welearn_datastack.data.db_models import (
Base,
Category,
Corpus,
DocumentSlice,
ProcessState,
Expand Down Expand Up @@ -61,13 +62,22 @@ def setUp(self):
Base.metadata.create_all(self.test_session.get_bind())
os.environ["ARTIFACT_ROOT"] = self.path_test_input.parent.as_posix()

self.category_name = "category_test0"

self.category_id = uuid.uuid4()

self.category = Category(id=self.category_id, title=self.category_name)

self.test_session.add(self.category)

corpus_source_name = "corpus"

self.corpus_test = Corpus(
id=uuid.uuid4(),
source_name=corpus_source_name,
is_fix=True,
is_active=True,
category_id=self.category_id,
)

doc_id = uuid.uuid4()
Expand Down
Loading