In [1]:

import pandas as pd
import spacy
from fuzzywuzzy import fuzz
from sklearn.model_selection import train_test_split

from data.dictionary import general_symptom_categories
from data.help_functions import clean_text, normalize_phrase, compute_scor_medical_diabet, \
    compute_scor_medical_cardio

# === 1. Initialisation ===
# Romanian spaCy pipeline, passed to normalize_phrase below.
nlp = spacy.load("ro_core_news_sm")

# === 2. Load and preprocess the text ===
df = pd.read_csv("processed/date_noi.csv", delimiter=';')
# Strip surrounding whitespace and literal '*' characters from the column headers.
df.columns = df.columns.str.strip().str.replace('*', '', regex=False)

# Free-text survey column that holds the self-reported symptoms/conditions.
col1 = "Ce alte simptome sau boli prezinți?"

# Missing answers become empty strings before the project helper clean_text runs.
df[col1] = df[col1].fillna("").apply(clean_text)

df["text_lemmatized"] = (df[col1]).apply(lambda x: normalize_phrase(x, nlp))

# === Sanity check: report empty phrases in the category dictionary ===
for cat, phrases in general_symptom_categories.items():
    for phrase in phrases:
        if phrase.strip() == "":
            print(f"Fraza goală găsită în categoria '{cat}': '{phrase}'")


def text_to_ngrams(text, n):
    """Return every run of ``n`` consecutive whitespace-separated words of ``text``.

    Args:
        text: Input string; it is tokenised with ``str.split()``.
        n: Number of words per n-gram.

    Returns:
        list[str]: The n-grams in order of appearance, each joined with a
        single space. Empty when the text has fewer than ``n`` words.
    """
    tokens = text.split()
    ngrams = []
    for start in range(len(tokens) - n + 1):
        window = tokens[start:start + n]
        ngrams.append(' '.join(window))
    return ngrams


# Neutral wording ("treatment", "treated", "normal value(s)", "ok", "good",
# "resolved", "monitored", "under control", "no symptoms"). label_row consults
# this set before it accepts a dictionary phrase as a match.
NEUTRAL_WORDS = {
    # single words
    "tratament",
    "tratat",
    "ok",
    "bine",
    "bun",
    "rezolvat",
    "monitorizat",
    # multi-word expressions
    "valoare normala",
    "valori normale",
    "in regula",
    "sub control",
    "fara simptome",
}


def label_row(text_lemma, threshold_short=93, threshold_long=85):
    """Assign symptom categories to one lemmatised free-text answer.

    Every phrase of every category in ``general_symptom_categories`` is
    lower-cased, stripped and compared with the lower-cased, stripped text:

    1. literal substring containment, otherwise
    2. fuzzy similarity: the larger of ``fuzz.ratio`` and
       ``fuzz.partial_ratio`` (both computed against the whole text) must
       reach the threshold chosen by the phrase length.

    A matching phrase is discarded when the text contains an entry of
    ``NEUTRAL_WORDS`` and the phrase itself contains one as well. A phrase that
    has already produced a label is not evaluated again for later categories.

    The former third step (looking a single-word phrase up among the text's
    whitespace tokens) was removed: a phrase equal to one of the tokens is
    always a substring of the text, so step 1 handled it first and the branch
    could never run. Results are unchanged.

    Args:
        text_lemma: Text to label; must support ``.lower()`` and ``.strip()``.
        threshold_short: Minimum fuzzy score for phrases of one or two words.
        threshold_long: Minimum fuzzy score for phrases of three or more words.

    Returns:
        list: The matched category names without duplicates. The order is
        unspecified because the names are collected in a set.
    """
    labels = set()
    detected_phrases = set()

    text_lower = text_lemma.lower().strip()

    # The whole text is NOT rejected when it contains a neutral word; only
    # phrases that themselves contain a neutral word are ignored in that case.
    # NOTE(review): both containment checks are substring checks, so short
    # entries such as "ok" or "bun" also match inside longer words — confirm
    # this is intended.
    neutral_found = any(word in text_lower for word in NEUTRAL_WORDS)

    for category, phrases in general_symptom_categories.items():
        for phrase in phrases:
            phrase_clean = phrase.lower().strip()
            if not phrase_clean or phrase_clean in detected_phrases:
                continue

            # 1. Exact (substring) match; 2. otherwise fall back to fuzzy matching.
            # NOTE(review): the substring test also hits inside longer words;
            # confirm whether whole-word matching was wanted for short phrases.
            if phrase_clean not in text_lower:
                n_words = len(phrase_clean.split())
                threshold = threshold_long if n_words > 2 else threshold_short
                score = max(
                    fuzz.ratio(phrase_clean, text_lower),
                    fuzz.partial_ratio(phrase_clean, text_lower),
                )
                if score < threshold:
                    continue

            # Neutral guard shared by the exact and the fuzzy path.
            if neutral_found and any(neg in phrase_clean for neg in NEUTRAL_WORDS):
                continue

            labels.add(category)
            detected_phrases.add(phrase_clean)

    return list(labels)


# Label every lemmatised answer; each cell of "labels" holds the list returned by label_row.
df["labels"] = df["text_lemmatized"].apply(label_row)

# === Score computation ===
# Row-wise score produced by the project helper compute_scor_medical_diabet.
df["scor_medical"] = df.apply(compute_scor_medical_diabet, axis=1)


# === Creare coloana 'diagnostic' numerică pentru clasificare multiclasă ===
def encode_diagnostic(row):
    """Collapse the three diagnosis flags of a row into one class code.

    The columns are inspected from most to least severe and the first one
    equal to 1 decides the result; later columns are not read once a match
    is found.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with the flag columns.

    Returns:
        int: 3 for type 2 diabetes, 2 for prediabetes, 1 for insulin
        resistance, 0 when none of the flags is set.
    """
    severity_order = (
        ("diabet zaharat tip 2", 3),    # diabetes
        ("prediabet", 2),               # prediabetes
        ("rezistenta la insulina", 1),  # insulin resistance
    )
    for column, code in severity_order:
        if row[column] == 1:
            return code
    return 0  # no condition flagged


# Multiclass target: one integer code per row, as returned by encode_diagnostic.
df["diagnostic"] = df.apply(encode_diagnostic, axis=1)


# === Risc cardiovascular ===
def check_presence(text, phrase_list):
    """Tell whether any phrase occurs in ``text`` as a run of whole tokens.

    Text and phrases are tokenised with ``str.split()``; a phrase matches when
    its tokens appear consecutively and exactly (case-sensitive) among the
    text's tokens. Phrases that are empty or whitespace-only are skipped:
    they split to an empty token list, which would otherwise compare equal to
    an empty slice and flag every text.

    Args:
        text: String to search in.
        phrase_list: Iterable of phrase strings to look for.

    Returns:
        bool: True as soon as one phrase is found, False otherwise.
    """
    tokens = text.split()
    for phrase in phrase_list:
        phrase_tokens = phrase.split()
        if not phrase_tokens:
            # Nothing to look for; never treat an empty phrase as a hit.
            continue
        width = len(phrase_tokens)
        for i in range(len(tokens) - width + 1):
            if tokens[i:i + width] == phrase_tokens:
                return True
    return False


# Symptom and lifestyle phrases treated as cardiovascular risk signals.
simptome_risc_cardio = [
    "durere piept", "presiune toracic", "nu pot respira", "sufocare",
    "tensiune mare", "hipertensiune", "puls mare", "batai rapide inima",
    "inima bate tare", "oboseala la efort", "edem", "picioare umflat",
    "retentie apa", "lesin", "vedere intunecat", "cap greu", "sforai puternic",
    "apnee somn", "colesterol mare", "trigliceride mari", "fumez zilnic",
    "stil viata sedentar", "nu fac miscare", "ma misc putin"
]
# The phrases go through normalize_phrase, the helper also used for df["text_lemmatized"].
simptome_risc_cardio_norm = [normalize_phrase(p, nlp) for p in simptome_risc_cardio]

# Serious cardiovascular conditions: output column name -> phrases that flag it.
# NOTE(review): these phrases carry diacritics (and one has capital letters)
# while simptome_risc_cardio does not; check_presence compares tokens exactly,
# so hits depend on normalize_phrase bringing phrases and text to the same
# form — confirm.
afectiuni_cardio_grave = {
    "infarct": [
        "infarct", "infarct miocardic", "atac de cord", "atac cardiac", "atac cord",
        "necroză miocardică", "ischemie miocardică", "angina instabilă", "angina pectorală",
        "stop cardiac", "infarct transmural", "infarct non-transmural",
        "ischemie coronariană", "boala coronariană", "sindrom coronarian acut"
    ],
    "avc": [
        "avc", "accident vascular cerebral", "accident vascular", "hemoragie cerebrală",
        "trombembolism cerebral", "ischemie cerebrală", "pareză", "hemipareză",
        "accident ischemic tranzitor", "tia", "accident vascular ischemic",
        "accident hemoragic", "hemoragie intracerebrală", "accident vascular lacunar",
        "embolism cerebral", "ischemie ischemică"
    ],
    "stent_sau_bypass": [
        "stent", "angioplastie", "bypass", "intervenție pe cord", "operație pe inimă",
        "revascularizare miocardică", "chirurgie coronariană", "by-pass coronarian",
        "angioplastie coronariană", "implantare stent coronarian", "intervenție coronariană percutană"
    ],
    "fibrilatie_sau_ritm": [
        "fibrilație atrială", "aritmie", "tulburări de ritm", "bătăi neregulate inimă",
        "palpitații severe", "tahicardie", "bradicardie", "extrasistole",
        "tulburări de conducere", "bloc atrioventricular", "fibrilație ventriculară",
        "flutter atrial", "tahiaritmie", "sindrom Wolff-Parkinson-White"
    ],
    "embolie_sau_tromboza": [
        "embolie", "embolie pulmonară", "tromboză", "cheag de sânge", "coagulare excesivă",
        "tromboembolism", "tromboflebită", "embolie arterială", "embolie venoasă profundă",
        "tromboză venoasă profundă", "tromboză arterială", "trombus", "trombocitopenie",
        "coagulopatie", "sindrom antifosfolipidic"
    ]
}
afectiuni_cardio_grave_normalizate = {
    col: [normalize_phrase(p, nlp) for p in expresii]
    for col, expresii in afectiuni_cardio_grave.items()
}

# One 0/1 column per condition group. The lambda is applied inside the same
# loop iteration, so it always sees the current phrase_list.
for col_name, phrase_list in afectiuni_cardio_grave_normalizate.items():
    df[col_name] = df["text_lemmatized"].apply(lambda text: int(check_presence(text, phrase_list)))

# Symptom-based flag; it is one of the inputs of the combined flag below.
df["risc_cardiovascular"] = df["text_lemmatized"].apply(
    lambda text: int(check_presence(text, simptome_risc_cardio_norm)))

# Combined flag: 1 when any condition column, the symptom flag, one of the two
# survey columns or the "cardio_vascular" label is present. Columns missing
# from the row default to 0 (or to an empty label list).
df["risc_cardiovascular"] = df.apply(
    lambda row: 1 if (
            row.get("infarct", 0) == 1 or
            row.get("avc", 0) == 1 or
            row.get("stent_sau_bypass", 0) == 1 or
            row.get("fibrilatie_sau_ritm", 0) == 1 or
            row.get("embolie_sau_tromboza", 0) == 1 or
            row.get("risc_cardiovascular", 0) == 1 or
            row.get("hipertensiune arteriala", 0) == 1 or
            row.get("dislipidemie (grăsimi crescute in sânge)", 0) == 1 or
            ("cardio_vascular" in row.get("labels", []))
    ) else 0,
    axis=1
)

df['scor_medical_cardio'] = df.apply(compute_scor_medical_cardio, axis=1)

# Score criterion of the combined flag. It used to be evaluated inside the
# lambda above, i.e. before 'scor_medical_cardio' existed, so row.get() fell
# back to 0 and the criterion could never fire. It is applied here, once the
# score is available; the score itself is computed exactly as before.
CARDIO_SCORE_RISK_THRESHOLD = 55
df.loc[df['scor_medical_cardio'] > CARDIO_SCORE_RISK_THRESHOLD, "risc_cardiovascular"] = 1

mediana_scor = df['scor_medical_cardio'].median()
print("Mediana scorului medical cardio este:", mediana_scor)

# === Shuffle ===
# Reorder all rows with a fixed seed and renumber the index from 0.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
# === Separate copy for the NLP task ===
df_nlp = df.copy()

# === Split: 60% train, 20% validation, 20% test ===
train_size = 0.6
val_size = 0.2  # share of the total; not referenced by the code visible in this cell

# First split: train vs. the rest
df_train, df_tmp = train_test_split(df, train_size=train_size, random_state=42)

# Second split: validation vs. test (half of the remaining 40% each)
df_val, df_test = train_test_split(df_tmp, test_size=0.5, random_state=42)

# Save as CSV
df_train.to_csv('datasets/train/train.csv', sep=';', index=False)
df_val.to_csv('datasets/validation/val.csv', sep=';', index=False)
df_test.to_csv('datasets/test/test.csv', sep=';', index=False)

# === NLP task datasets ===
# NOTE(review): df_nlp is an unmodified copy of df and the same random_state is
# used, so this split presumably selects the same rows as the one above —
# confirm whether a separate set of files is needed.

# 1. Split: 60% train, 40% rest
train_df, temp_df = train_test_split(df_nlp, test_size=0.4, random_state=42, shuffle=True)

# 2. Split the remaining 40% in half => validation and test are 20% of the total each
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42, shuffle=True)

# Save as CSV files
train_df.to_csv("datasets/train/train_nlp.csv", sep=';', index=False)
val_df.to_csv("datasets/validation/val_nlp.csv", sep=';', index=False)
test_df.to_csv("datasets/test/test_nlp.csv", sep=';', index=False)

# === 1. Dedicated copy for model C (cardiovascular risk classification)
df_cardio = df.copy()

# === 2. Separate features and target
# NOTE(review): risc_cardiovascular is derived earlier in this cell from columns
# that stay in X (e.g. infarct, avc, labels) — confirm this is acceptable for
# the model trained on these files.
X = df_cardio.drop(columns=["risc_cardiovascular"])
y = df_cardio["risc_cardiovascular"]

# === 3. Split without stratification: 60% train, 20% validation, 20% test
X_train, X_tmp, y_train, y_tmp = train_test_split(
    X, y, test_size=0.4, random_state=42  # no stratify
)

X_val, X_test, y_val, y_test = train_test_split(
    X_tmp, y_tmp, test_size=0.5, random_state=42  # no stratify
)

# === 4. Rebuild the DataFrames
# The target is re-attached as a new column, so it ends up as the last column
# of each frame.
train_cardio = X_train.copy()
train_cardio["risc_cardiovascular"] = y_train

val_cardio = X_val.copy()
val_cardio["risc_cardiovascular"] = y_val

test_cardio = X_test.copy()
test_cardio["risc_cardiovascular"] = y_test

# === 5. Save as CSV
train_cardio.to_csv("datasets/train/train_cardio.csv", sep=";", index=False)
val_cardio.to_csv("datasets/validation/val_cardio.csv", sep=";", index=False)
test_cardio.to_csv("datasets/test/test_cardio.csv", sep=";", index=False)

# Print the sorted set of distinct category labels found across all rows.
print("Etichete extrase:", sorted(set(label for labels in df["labels"] for label in labels)))

Mediana scorului medical cardio este: 21.0
Etichete extrase: ['cardio_vascular', 'gastro_hepato_renal', 'ginecologic_hormonal', 'inflamator_autoimun', 'metabolic_endocrin', 'neuro_psiho_energie']


In [2]:
# Preview of the training split; requests up to 7000 rows.
df_train.head(7000)

Unnamed: 0,Vârstă,Ești,Care este înălțimea ta?,Care este greutatea ta actuala?,"Care este circumferința taliei tale, măsurata deasupra de ombilicului?",obezitate abdominala,rezistenta la insulina,prediabet,diabet zaharat tip 2,sindromul ovarelor polichistice,...,labels,scor_medical,diagnostic,infarct,avc,stent_sau_bypass,fibrilatie_sau_ritm,embolie_sau_tromboza,risc_cardiovascular,scor_medical_cardio
2262,25,0,163,104,106,1,0,0,0,1,...,[ginecologic_hormonal],18,0,0,0,0,0,0,0,9
7105,47,0,168,99,150,1,0,1,0,0,...,[],23,2,0,0,0,0,0,0,19
3991,70,1,161,75,123,1,1,0,1,0,...,"[cardio_vascular, metabolic_endocrin]",32,3,1,0,0,0,0,1,47
6061,36,1,166,114,126,1,1,0,1,0,...,"[cardio_vascular, inflamator_autoimun, ginecol...",29,3,0,0,0,0,0,1,26
3992,39,0,170,100,103,1,1,0,0,0,...,[metabolic_endocrin],28,1,0,0,0,0,0,0,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5734,51,0,169,90,90,1,0,0,0,0,...,[],14,0,0,0,0,0,0,0,8
5191,47,1,190,126,122,1,0,0,0,0,...,"[cardio_vascular, metabolic_endocrin]",19,0,0,0,0,1,0,1,27
5390,35,0,156,90,115,1,1,0,0,0,...,"[cardio_vascular, inflamator_autoimun, ginecol...",25,1,0,0,0,0,0,1,15
860,49,1,160,119,112,1,1,1,0,0,...,"[cardio_vascular, metabolic_endocrin]",25,2,0,1,0,1,0,1,51


In [3]:
# Preview of the validation split; requests up to 3000 rows.
df_val.head(3000)


Unnamed: 0,Vârstă,Ești,Care este înălțimea ta?,Care este greutatea ta actuala?,"Care este circumferința taliei tale, măsurata deasupra de ombilicului?",obezitate abdominala,rezistenta la insulina,prediabet,diabet zaharat tip 2,sindromul ovarelor polichistice,...,labels,scor_medical,diagnostic,infarct,avc,stent_sau_bypass,fibrilatie_sau_ritm,embolie_sau_tromboza,risc_cardiovascular,scor_medical_cardio
4098,37,0,167,106,108,0,0,0,0,0,...,[metabolic_endocrin],13,0,0,0,0,0,0,0,3
2638,76,0,179,63,117,1,0,0,0,0,...,"[inflamator_autoimun, metabolic_endocrin]",19,0,0,0,0,0,0,0,7
9253,33,0,170,127,108,1,0,0,0,1,...,[neuro_psiho_energie],18,0,0,0,0,0,0,0,10
1507,20,1,174,65,86,0,0,0,0,0,...,[],5,0,0,0,0,0,0,0,0
6689,41,0,166,63,79,0,0,0,0,1,...,[],11,0,0,0,0,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4442,64,1,172,91,56,0,0,0,0,0,...,[],12,0,0,0,0,0,0,1,12
9313,62,1,166,124,104,1,1,0,1,0,...,"[cardio_vascular, metabolic_endocrin]",31,3,0,0,0,0,0,1,32
6088,53,0,160,66,68,1,0,0,0,0,...,[metabolic_endocrin],20,0,0,0,0,0,0,0,9
6890,81,0,184,97,111,1,1,0,1,0,...,"[cardio_vascular, metabolic_endocrin]",28,3,0,1,0,0,0,1,43


In [4]:
# Preview of the test split; requests up to 2500 rows.
df_test.head(2500)


Unnamed: 0,Vârstă,Ești,Care este înălțimea ta?,Care este greutatea ta actuala?,"Care este circumferința taliei tale, măsurata deasupra de ombilicului?",obezitate abdominala,rezistenta la insulina,prediabet,diabet zaharat tip 2,sindromul ovarelor polichistice,...,labels,scor_medical,diagnostic,infarct,avc,stent_sau_bypass,fibrilatie_sau_ritm,embolie_sau_tromboza,risc_cardiovascular,scor_medical_cardio
2960,72,1,169,126,120,1,1,0,1,0,...,"[cardio_vascular, metabolic_endocrin]",33,3,0,0,0,1,0,1,36
7182,48,0,155,71,87,0,1,0,0,0,...,"[metabolic_endocrin, neuro_psiho_energie]",24,1,0,0,0,0,0,0,13
3207,54,1,163,91,130,1,0,0,0,0,...,"[cardio_vascular, metabolic_endocrin]",21,0,0,1,0,0,0,1,33
6377,72,1,173,122,137,1,0,0,0,0,...,"[cardio_vascular, metabolic_endocrin]",20,0,0,1,0,0,0,1,29
6227,57,1,165,132,120,1,0,0,0,0,...,"[cardio_vascular, metabolic_endocrin]",22,0,1,0,0,0,0,1,25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4705,40,0,167,83,106,1,1,0,0,1,...,[],26,1,0,0,0,0,0,0,9
3019,30,0,157,59,108,0,1,0,0,1,...,[metabolic_endocrin],16,1,0,0,0,0,0,0,5
3077,46,0,167,82,93,1,1,0,0,0,...,[],17,1,0,0,0,0,0,0,10
8145,40,0,172,91,108,0,0,1,0,1,...,[],20,2,0,0,0,0,0,0,12


In [5]:
# Export the complete frame `df` (not one of the splits) as a ';'-separated CSV without the index.
df.to_csv("datasets/date_preprocesate.csv", sep=';', index=False)