In [1]:
from config import init_polish_perplexity_model, init_spacy_polish_nlp_model, init_language_tool_pl

from typing import List
from tqdm import tqdm

from dao.lab_report import DAOLabReport
from dao.attribute import DAOAttributePL

from models.lab_report import LabReportInDB
from models.attribute import AttributePL, AttributePLInDB

from analysis.attribute_retriving import perform_full_analysis
from analysis.nlp_transformations import remove_report_tags, replace_whitespaces, replace_links_with_text
from services.utils import suppress_stdout

[nltk_data] Downloading package stopwords to /home/pawel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/pawel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package pl196x to /home/pawel/nltk_data...
[nltk_data]   Package pl196x is already up-to-date!
[nltk_data] Downloading package wordnet to /home/pawel/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Init models

In [2]:
# NOTE(review): the perplexity-model initialisation below is commented out, so
# this notebook never calls it. Confirm that perform_full_analysis does not
# depend on that model, then either re-enable the call or delete this cell.
#init_polish_perplexity_model()

In [2]:
# Run the spaCy Polish NLP model initialiser from `config`.
# Presumably this must happen before perform_full_analysis is called — TODO confirm.
init_spacy_polish_nlp_model()

In [3]:
# Run the Polish LanguageTool initialiser from `config`.
# Presumably this must happen before perform_full_analysis is called — TODO confirm.
init_language_tool_pl()

# Load data

In [4]:
# Data-access objects used throughout the notebook:
# - dao_lab_reports is queried for the lab reports to analyse,
# - dao_attributes is queried for existing attributes and receives the new ones.
dao_lab_reports = DAOLabReport()
dao_attributes = DAOAttributePL()

In [5]:
# Fetch the lab reports in two groups, split on the `is_generated` flag:
# reports stored with is_generated=False ("real") and is_generated=True ("generated").
real_lab_reports: List[LabReportInDB] = dao_lab_reports.find_many_by_query({'is_generated': False})
generated_lab_reports: List[LabReportInDB] = dao_lab_reports.find_many_by_query({'is_generated': True})

In [6]:
# Skip lab reports whose attributes already exist, so re-running the notebook
# only analyses what is still missing.
#
# Despite its name, `alreadyprocessed_lab_reports` holds the stored *attribute*
# documents; each one points at its source document through `referenced_doc_id`.
# NOTE(review): the empty query returns every stored attribute, not only those
# with referenced_db_name == 'lab_reports' — confirm ids cannot collide across
# referenced collections.
alreadyprocessed_lab_reports = dao_attributes.find_many_by_query({})
alreadyprocessed_lab_reports_ids = [report.referenced_doc_id for report in alreadyprocessed_lab_reports]

# Membership is tested once per lab report; a set makes each lookup O(1)
# instead of scanning the whole id list (assumes the ids are hashable).
# The list above is kept unchanged in case later cells use it.
processed_doc_ids = set(alreadyprocessed_lab_reports_ids)

real_lab_reports = [report for report in real_lab_reports if report.id not in processed_doc_ids]
generated_lab_reports = [report for report in generated_lab_reports if report.id not in processed_doc_ids]

# Calculate and store attributes

In [7]:
def store_lab_report_attributes(lab_reports: List[LabReportInDB], is_generated: bool) -> None:
    """Analyse every lab report in ``lab_reports`` and store its attributes.

    For each report the plaintext content is passed through
    ``remove_report_tags``, ``replace_whitespaces`` and
    ``replace_links_with_text`` (links replaced by an empty string), analysed
    with ``perform_full_analysis(text, 'pl')`` and the result is inserted via
    ``dao_attributes`` as an ``AttributePL`` that references the report.

    Each attribute is inserted right after it is computed, so results obtained
    before an interruption are already stored.

    Args:
        lab_reports: Lab reports to process.
        is_generated: Value written to the attribute's ``is_generated`` field;
            it also selects the progress-bar label ("generated" or "real").
    """
    report_kind = 'generated' if is_generated else 'real'
    progress = tqdm(
        lab_reports,
        total=len(lab_reports),
        desc=f'Calculating {report_kind} lab reports statistics',
        unit='Lab reports',
        miniters=1,
    )
    for lab_report in progress:
        text_to_analyse = remove_report_tags(lab_report.plaintext_content)
        text_to_analyse = replace_whitespaces(text_to_analyse)
        text_to_analyse = replace_links_with_text(text_to_analyse, replacement="")
        # Anything the analysis prints to stdout is suppressed here.
        with suppress_stdout():
            analysis_result = perform_full_analysis(text_to_analyse, 'pl')
        attribute_to_insert = AttributePL(
            referenced_db_name='lab_reports',
            referenced_doc_id=lab_report.id,
            language="pl",
            is_generated=is_generated,
            is_personal=None,
            **analysis_result.dict()
        )
        dao_attributes.insert_one(attribute_to_insert)


store_lab_report_attributes(real_lab_reports, is_generated=False)
store_lab_report_attributes(generated_lab_reports, is_generated=True)

Calculating real lab reports statistics:   0%|          | 0/41 [00:00<?, ?Lab reports/s]Token indices sequence length is longer than the specified maximum sequence length for this model (525 > 512). Running this sequence through the model will result in indexing errors
Calculating real lab reports statistics:   2%|▏         | 1/41 [43:16<28:51:00, 2596.51s/Lab reports]Token indices sequence length is longer than the specified maximum sequence length for this model (684 > 512). Running this sequence through the model will result in indexing errors
Calculating real lab reports statistics:  32%|███▏      | 13/41 [51:15<13:10, 28.24s/Lab reports]    Token indices sequence length is longer than the specified maximum sequence length for this model (889 > 512). Running this sequence through the model will result in indexing errors
Calculating real lab reports statistics:  41%|████▏     | 17/41 [57:09<24:54, 62.26s/Lab reports]Token indices sequence length is longer than the specified maximum 