Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion backend/app/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,10 +87,10 @@ class MachineTranslationSettings(BaseModel):

class XliffProcessingSettings(BaseModel):
substitute_numbers: bool
use_machine_translation: bool
machine_translation_settings: Optional[MachineTranslationSettings]
tmx_file_ids: list[int]
tmx_usage: TmxUsage
similarity_threshold: float = Field(default=1.0, ge=0.0, le=1.0)


class StatusMessage(BaseModel):
Expand Down
21 changes: 2 additions & 19 deletions backend/app/routers/xliff.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,12 @@

from fastapi import APIRouter, Depends, File, HTTPException, Query, UploadFile, status
from fastapi.responses import StreamingResponse
from sqlalchemy import select, func, text
from sqlalchemy.orm import Session

from app import models, schema
from app.auth import get_current_user_id, has_user_role
from app.db import get_db
from app.translation_memory.utils import get_substitutions
from app.xliff import SegmentState, extract_xliff_content

# TODO: add XLIFF segments statuses according to the specification
Expand Down Expand Up @@ -128,24 +128,7 @@ def get_segment_substitutions(
if not tmx_ids:
return []

similarity_func = func.similarity(schema.TmxRecord.source, original_segment.source)
db.execute(
text("SET pg_trgm.similarity_threshold TO :threshold"), {"threshold": 0.7}
)
records = db.execute(
select(schema.TmxRecord.source, schema.TmxRecord.target, similarity_func)
.filter(
schema.TmxRecord.source.op("%")(original_segment.source),
schema.TmxRecord.document_id.in_(tmx_ids),
)
.order_by(similarity_func.desc())
.limit(10),
).all()

return [
models.XliffSubstitution(source=source, target=target, similarity=similarity)
for (source, target, similarity) in records
]
return get_substitutions(original_segment.source, tmx_ids, db)


@router.put("/{doc_id}/record/{record_id}")
Expand Down
31 changes: 31 additions & 0 deletions backend/app/translation_memory/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from sqlalchemy import func, select, text
from sqlalchemy.orm import Session

from app import models, schema


def get_substitutions(
    source: str,
    tmx_ids: list[int],
    db: Session,
    threshold: float = 0.7,
    count: int = 10,
) -> list[models.XliffSubstitution]:
    """Return the TMX records whose source text is most similar to ``source``.

    Candidates are restricted to records belonging to the TMX documents in
    ``tmx_ids`` and are ordered by descending ``similarity()`` score.

    Args:
        source: Segment text to find substitutions for.
        tmx_ids: IDs of the TMX documents to search in.
        db: Database session used to run the queries.
        threshold: Value assigned to ``pg_trgm.similarity_threshold`` before
            the search is executed.
        count: Maximum number of substitutions to return.

    Returns:
        At most ``count`` substitutions, best match first. An empty list when
        ``tmx_ids`` is empty or nothing matches.
    """
    # Without any translation memory selected nothing can match, so skip both
    # database round trips instead of issuing a query with an empty IN list.
    if not tmx_ids:
        return []

    similarity_func = func.similarity(schema.TmxRecord.source, source)
    # The threshold is set on the session before the ``%`` filter below runs.
    db.execute(
        text("SET pg_trgm.similarity_threshold TO :threshold"),
        {"threshold": threshold},
    )
    records = db.execute(
        select(schema.TmxRecord.source, schema.TmxRecord.target, similarity_func)
        .filter(
            schema.TmxRecord.source.op("%")(source),
            schema.TmxRecord.document_id.in_(tmx_ids),
        )
        .order_by(similarity_func.desc())
        .limit(count),
    ).all()

    # Distinct names keep the ``source`` argument from being shadowed.
    return [
        models.XliffSubstitution(
            source=record_source,
            target=record_target,
            similarity=similarity,
        )
        for (record_source, record_target, similarity) in records
    ]
6 changes: 1 addition & 5 deletions backend/tests/test_routes_xliff.py
Original file line number Diff line number Diff line change
Expand Up @@ -392,7 +392,6 @@ def test_process_sets_document_in_pending_stage_and_creates_task(
"/xliff/1/process",
json={
"substitute_numbers": False,
"use_machine_translation": False,
"machine_translation_settings": None,
"tmx_file_ids": [],
"tmx_usage": "newest",
Expand All @@ -417,7 +416,6 @@ def test_process_creates_task(user_logged_client: TestClient, session: Session):
"/xliff/1/process",
json={
"substitute_numbers": False,
"use_machine_translation": False,
"machine_translation_settings": None,
"tmx_file_ids": [1],
"tmx_usage": "newest",
Expand All @@ -436,10 +434,10 @@ def test_process_creates_task(user_logged_client: TestClient, session: Session):
"doc_id": 1,
"settings": {
"substitute_numbers": False,
"use_machine_translation": False,
"machine_translation_settings": None,
"tmx_file_ids": [1],
"tmx_usage": "newest",
"similarity_threshold": 1.0,
},
}

Expand All @@ -459,7 +457,6 @@ def test_process_creates_xliff_tmx_link(
"/xliff/1/process",
json={
"substitute_numbers": False,
"use_machine_translation": False,
"machine_translation_settings": None,
"tmx_file_ids": [1, 2],
"tmx_usage": "newest",
Expand All @@ -482,7 +479,6 @@ def test_returns_404_when_processing_nonexistent_xliff_doc(
"/xliff/1/process",
json={
"substitute_numbers": False,
"use_machine_translation": False,
"machine_translation_settings": None,
"tmx_file_ids": [],
"tmx_usage": "newest",
Expand Down
60 changes: 34 additions & 26 deletions backend/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,37 +10,45 @@
from sqlalchemy.orm import Session

from app import db, models, schema
from app.translation_memory.utils import get_substitutions
from app.translators import yandex
from app.xliff import XliffSegment, extract_xliff_content
from app.xliff import extract_xliff_content


def get_segment_translation(
segment: XliffSegment,
source: str,
settings: models.XliffProcessingSettings,
session: Session,
):
# TODO: this is slow, it needs to be optimized
selector = (
select(schema.TmxRecord.source, schema.TmxRecord.target)
.where(schema.TmxRecord.source == segment.original)
.where(schema.TmxRecord.document_id.in_(settings.tmx_file_ids))
)
match settings.tmx_usage:
case models.TmxUsage.NEWEST:
selector = selector.order_by(schema.TmxRecord.change_date.desc())
case models.TmxUsage.OLDEST:
selector = selector.order_by(schema.TmxRecord.change_date.asc())
case _:
logging.error("Unknown TMX usage option")
return None

tmx_data = session.execute(selector.limit(1)).first()

if tmx_data:
return tmx_data.target

if settings.substitute_numbers and segment.original.isdigit():
return segment.original
# TODO: batch the lookups for all segments to reduce the number of DB requests
if settings.substitute_numbers and source.isdigit():
return source

if settings.similarity_threshold < 1.0:
substitutions = get_substitutions(
source, settings.tmx_file_ids, session, settings.similarity_threshold, 1
)
if substitutions:
return substitutions[0].target
else:
selector = (
select(schema.TmxRecord.source, schema.TmxRecord.target)
.where(schema.TmxRecord.source == source)
.where(schema.TmxRecord.document_id.in_(settings.tmx_file_ids))
)
match settings.tmx_usage:
case models.TmxUsage.NEWEST:
selector = selector.order_by(schema.TmxRecord.change_date.desc())
case models.TmxUsage.OLDEST:
selector = selector.order_by(schema.TmxRecord.change_date.asc())
case _:
logging.error("Unknown TMX usage option")
return None

tmx_data = session.execute(selector.limit(1)).first()

if tmx_data:
return tmx_data.target

return None

Expand All @@ -54,7 +62,7 @@ def process_xliff(
to_translate: list[int] = []
for i, segment in enumerate(xliff_data.segments):
if not segment.approved:
translation = get_segment_translation(segment, settings, session)
translation = get_segment_translation(segment.original, settings, session)
if not translation:
# we cannot find translation for this segment
# save it to translate by Yandex
Expand All @@ -66,7 +74,7 @@ def process_xliff(
# translate by Yandex if there is a setting to do so enabled
# TODO: it is better to make solution more translation service agnostic
machine_translation_failed = False
if settings.use_machine_translation and len(to_translate) > 0:
if settings.machine_translation_settings and len(to_translate) > 0:
if (
not settings.machine_translation_settings
or not settings.machine_translation_settings.folder_id
Expand Down
2 changes: 1 addition & 1 deletion frontend/src/client/schemas/XliffProcessingSettings.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ import {TmxUsage} from './TmxUsage'

export interface XliffProcessingSettings {
substitute_numbers: boolean
use_machine_translation: boolean
machine_translation_settings: MachineTranslationSettings | null
tmx_file_ids: number[]
tmx_usage: TmxUsage
similarity_threshold?: number
}
27 changes: 21 additions & 6 deletions frontend/src/components/XliffUploadingDialog.vue
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ const machineTranslationSettings = ref<MachineTranslationSettings>({
folder_id: '',
oauth_token: '',
})
const similarityThreshold = ref<number>(1.0)

const processingAvailable = computed(() => uploadedFile.value != null)
const tmxStore = useTmxStore()
Expand Down Expand Up @@ -70,12 +71,12 @@ const startProcessing = async () => {
status.value = 'Processing...'
await processXliff(uploadedFile.value!.id, {
substitute_numbers: substituteNumbers.value,
use_machine_translation: useMachineTranslation.value,
machine_translation_settings: useMachineTranslation.value
? machineTranslationSettings.value
: null,
tmx_file_ids: tmxStore.selectedIds,
tmx_usage: tmxStore.tmxMode,
similarity_threshold: similarityThreshold.value,
})
uploading.value = false
status.value = 'Done!'
Expand Down Expand Up @@ -128,6 +129,22 @@ onMounted(async () => {
option-value="value"
/>
</div>
<div class="flex flex-col gap-2 mb-4 max-w-96 mt-2">
<label>Substitution similarity threshold:</label>
<Select
v-model="similarityThreshold"
:options="[
{name: '100%', value: 1.0},
{name: '95%', value: 0.95},
{name: '90%', value: 0.9},
{name: '85%', value: 0.85},
{name: '80%', value: 0.8},
{name: '75%', value: 0.75},
]"
option-label="name"
option-value="value"
/>
</div>
<div class="flex items-center mt-2">
<Checkbox
id="sn"
Expand All @@ -139,7 +156,7 @@ onMounted(async () => {
class="ml-2"
@click="substituteNumbers = !substituteNumbers"
>
Substitute segments with numbers only
Substitute segments containing only digits
</label>
</div>
<div class="flex items-center">
Expand Down Expand Up @@ -200,10 +217,8 @@ onMounted(async () => {
</div>
<div v-else>{{ status }}</div>
</template>
<template #empty
><span v-if="!status">
Choose XLIFF file to upload.
</span>
<template #empty>
<span v-if="!status">Choose XLIFF file to upload.</span>
</template>
</FileUpload>
</div>
Expand Down