In [1]:
import pandas as pd
import spacy
from rapidfuzz import fuzz
from sklearn.model_selection import train_test_split

from data.dictionary import general_symptom_categories
from data.help_functions import clean_text, normalize_phrase, compute_scor_medical_diabet, compute_scor_medical_cardio, \
    infer_diagnosis_from_scor

# === 1. Inițializare ===
nlp = spacy.load("ro_core_news_sm")

# === 4. Încărcare și preprocesare text ===
df = pd.read_csv("processed/date_noi.csv", delimiter=';')
df.columns = df.columns.str.strip().str.replace('*', '', regex=False)

col1 = "Ce alte simptome sau boli prezinți?"
col2 = "In prezent, care este cea mai mare provocare a ta? Ce crezi ca te împiedica sa slăbești si sa ai o stare buna de sănătate? "

df[col1] = df[col1].fillna("").apply(clean_text)
df[col2] = df[col2].fillna("").apply(clean_text)

df["text_lemmatized"] = (df[col1] + " " + df[col2]).apply(lambda x: normalize_phrase(x, nlp))

# === Verificare fraze goale în categorii ===
for cat, phrases in general_symptom_categories.items():
    for phrase in phrases:
        if phrase.strip() == "":
            print(f"Fraza goală găsită în categoria '{cat}': '{phrase}'")


# === Etichetare fuzzy ===
def label_row(text_lemma, threshold=75):
    labels = set()
    words = set(text_lemma.lower().split())
    for category, phrases in general_symptom_categories.items():
        for phrase in phrases:
            phrase = phrase.strip().lower()
            if phrase in words:
                labels.add(category)
                break
            for w in words:
                if fuzz.ratio(phrase, w) >= threshold:
                    labels.add(category)
                    break
    return list(labels)


df["labels"] = df["text_lemmatized"].apply(label_row)

# === Calcul scor și etichete ===
df["scor_medical"] = df.apply(compute_scor_medical_diabet, axis=1)
df = df.apply(infer_diagnosis_from_scor, axis=1)


# === Creare coloana 'diagnostic' numerică pentru clasificare multiclasă ===
def encode_diagnostic(row):
    if row["diabet zaharat tip 2"] == 1:
        return 3  # Diabet
    elif row["prediabet"] == 1:
        return 2  # Prediabet
    elif row["rezistenta la insulina"] == 1:
        return 1  # Rezistență
    else:
        return 0  # Nu are nimic


df["diagnostic"] = df.apply(encode_diagnostic, axis=1)


# === Risc cardiovascular ===
def check_presence(text, phrase_list):
    tokens = text.split()
    for phrase in phrase_list:
        phrase_tokens = phrase.split()
        for i in range(len(tokens) - len(phrase_tokens) + 1):
            if tokens[i:i + len(phrase_tokens)] == phrase_tokens:
                return True
    return False


simptome_risc_cardio = [
    "durere piept", "presiune toracic", "nu pot respira", "sufocare",
    "tensiune mare", "hipertensiune", "puls mare", "batai rapide inima",
    "inima bate tare", "oboseala la efort", "edem", "picioare umflat",
    "retentie apa", "lesin", "vedere intunecat", "cap greu", "sforai puternic",
    "apnee somn", "colesterol mare", "trigliceride mari", "fumez zilnic",
    "stil viata sedentar", "nu fac miscare", "ma misc putin"
]
simptome_risc_cardio_norm = [normalize_phrase(p, nlp) for p in simptome_risc_cardio]

afectiuni_cardio_grave = {
    "infarct": [
        "infarct", "infarct miocardic", "atac de cord", "atac cardiac", "atac cord",
        "necroză miocardică", "ischemie miocardică", "angina instabilă", "angina pectorală",
        "stop cardiac", "infarct transmural", "infarct non-transmural",
        "ischemie coronariană", "boala coronariană", "sindrom coronarian acut"
    ],
    "avc": [
        "avc", "accident vascular cerebral", "accident vascular", "hemoragie cerebrală",
        "trombembolism cerebral", "ischemie cerebrală", "pareză", "hemipareză",
        "accident ischemic tranzitor", "tia", "accident vascular ischemic",
        "accident hemoragic", "hemoragie intracerebrală", "accident vascular lacunar",
        "embolism cerebral", "ischemie ischemică"
    ],
    "stent_sau_bypass": [
        "stent", "angioplastie", "bypass", "intervenție pe cord", "operație pe inimă",
        "revascularizare miocardică", "chirurgie coronariană", "by-pass coronarian",
        "angioplastie coronariană", "implantare stent coronarian", "intervenție coronariană percutană"
    ],
    "fibrilatie_sau_ritm": [
        "fibrilație atrială", "aritmie", "tulburări de ritm", "bătăi neregulate inimă",
        "palpitații severe", "tahicardie", "bradicardie", "extrasistole",
        "tulburări de conducere", "bloc atrioventricular", "fibrilație ventriculară",
        "flutter atrial", "tahiaritmie", "sindrom Wolff-Parkinson-White"
    ],
    "embolie_sau_tromboza": [
        "embolie", "embolie pulmonară", "tromboză", "cheag de sânge", "coagulare excesivă",
        "tromboembolism", "tromboflebită", "embolie arterială", "embolie venoasă profundă",
        "tromboză venoasă profundă", "tromboză arterială", "trombus", "trombocitopenie",
        "coagulopatie", "sindrom antifosfolipidic"
    ]
}
afectiuni_cardio_grave_normalizate = {
    col: [normalize_phrase(p, nlp) for p in expresii]
    for col, expresii in afectiuni_cardio_grave.items()
}

for col_name, phrase_list in afectiuni_cardio_grave_normalizate.items():
    df[col_name] = df["text_lemmatized"].apply(lambda text: int(check_presence(text, phrase_list)))

df["risc_cardiovascular"] = df["text_lemmatized"].apply(
    lambda text: int(check_presence(text, simptome_risc_cardio_norm)))

df["risc_cardiovascular"] = df.apply(
    lambda row: 1 if (
            row.get("infarct", 0) == 1 or
            row.get("avc", 0) == 1 or
            row.get("stent_sau_bypass", 0) == 1 or
            row.get("fibrilatie_sau_ritm", 0) == 1 or
            row.get("embolie_sau_tromboza", 0) == 1 or
            row.get("risc_cardiovascular", 0) == 1 or
            row.get("hipertensiune arteriala", 0) == 1 or
            row.get("dislipidemie (grăsimi crescute in sânge)", 0) == 1 or
            row.get("scor_medical_cardio", 0) > 55 or
            ("cardio_vascular" in row.get("labels", []))
    ) else 0,
    axis=1
)

df['scor_medical_cardio'] = df.apply(compute_scor_medical_cardio, axis=1)
mediana_scor = df['scor_medical_cardio'].median()
print("Mediana scorului medical cardio este:", mediana_scor)

# === Shuffle + stratified split ===
X = df.drop(columns=['diagnostic'])
y = df['diagnostic']

X_train, X_tmp, y_train, y_tmp = train_test_split(
    X, y, test_size=0.4, random_state=42, stratify=y)

X_val, X_test, y_val, y_test = train_test_split(
    X_tmp, y_tmp, test_size=0.5, random_state=42, stratify=y_tmp)

train = X_train.copy()
train['diagnostic'] = y_train

val = X_val.copy()
val['diagnostic'] = y_val

test = X_test.copy()
test['diagnostic'] = y_test

# === Salvare ===
train.to_csv('datasets/train/train.csv', sep=';', index=False)
val.to_csv('datasets/validation/val.csv', sep=';', index=False)
test.to_csv('datasets/test/test.csv', sep=';', index=False)

print("Etichete extrase:", sorted(set(label for labels in df["labels"] for label in labels)))

Mediana scorului medical cardio este: 22.0
Etichete extrase: ['cardio_vascular', 'gastro_hepato_renal', 'ginecologic_hormonal', 'inflamator_autoimun', 'metabolic_endocrin', 'neuro_psiho_energie']


In [2]:
train.head(7000)


Unnamed: 0,Vârstă,Ești,Care este înălțimea ta?,Care este greutatea ta actuala?,"Care este circumferința taliei tale, măsurata deasupra de ombilicului?",obezitate abdominala,rezistenta la insulina,prediabet,diabet zaharat tip 2,sindromul ovarelor polichistice,...,labels,scor_medical,infarct,avc,stent_sau_bypass,fibrilatie_sau_ritm,embolie_sau_tromboza,risc_cardiovascular,scor_medical_cardio,diagnostic
9201,53,0,168,87,112,1,1,1,0,0,...,"[neuro_psiho_energie, inflamator_autoimun, car...",32,0,0,0,0,0,1,30,2
1742,39,0,167,103,101,1,0,0,0,0,...,[],14,0,0,0,0,0,1,21,0
9308,84,1,153,132,117,1,1,1,0,0,...,"[cardio_vascular, metabolic_endocrin]",25,0,0,0,1,0,1,43,2
4773,71,1,159,81,75,0,0,0,0,0,...,[neuro_psiho_energie],13,0,0,0,0,0,0,9,0
9640,45,0,157,101,92,1,1,1,0,0,...,"[neuro_psiho_energie, cardio_vascular, metabol...",29,0,0,0,0,0,1,27,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
401,56,0,167,78,108,1,0,1,0,0,...,"[neuro_psiho_energie, cardio_vascular, metabol...",24,0,0,0,1,0,1,28,2
4231,47,0,151,91,108,0,0,0,0,0,...,[neuro_psiho_energie],10,0,0,0,0,0,1,19,0
3144,33,0,158,81,104,0,0,0,0,0,...,[],9,0,0,0,0,0,1,13,0
3441,33,0,168,74,86,1,1,0,0,1,...,"[neuro_psiho_energie, inflamator_autoimun, car...",22,0,0,0,0,0,1,15,1


In [3]:
val.head(3000)


Unnamed: 0,Vârstă,Ești,Care este înălțimea ta?,Care este greutatea ta actuala?,"Care este circumferința taliei tale, măsurata deasupra de ombilicului?",obezitate abdominala,rezistenta la insulina,prediabet,diabet zaharat tip 2,sindromul ovarelor polichistice,...,labels,scor_medical,infarct,avc,stent_sau_bypass,fibrilatie_sau_ritm,embolie_sau_tromboza,risc_cardiovascular,scor_medical_cardio,diagnostic
4671,41,0,170,70,93,0,0,0,0,0,...,"[cardio_vascular, metabolic_endocrin]",10,0,0,0,0,0,1,17,0
6199,40,1,159,133,129,1,0,0,0,0,...,"[neuro_psiho_energie, inflamator_autoimun, met...",11,0,0,0,0,0,1,14,0
4540,57,0,156,77,78,0,1,0,0,0,...,[neuro_psiho_energie],13,0,0,0,0,0,1,22,1
7224,38,0,182,94,105,1,1,1,0,0,...,"[cardio_vascular, ginecologic_hormonal]",18,1,0,0,1,0,1,43,2
9965,42,1,182,83,114,1,1,1,0,0,...,"[neuro_psiho_energie, cardio_vascular, metabol...",20,0,0,0,1,0,1,38,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7322,49,0,153,124,99,1,1,0,1,1,...,"[inflamator_autoimun, cardio_vascular, metabol...",33,0,0,0,1,0,1,44,3
724,22,0,167,95,108,1,1,0,0,1,...,[],18,0,0,0,0,0,0,14,1
6232,61,1,189,125,123,1,0,0,0,0,...,"[inflamator_autoimun, cardio_vascular, metabol...",14,0,0,0,1,0,1,31,0
4843,61,0,177,108,100,1,0,0,0,0,...,[],9,0,0,0,0,0,1,24,0


In [4]:
test.head(2500)

Unnamed: 0,Vârstă,Ești,Care este înălțimea ta?,Care este greutatea ta actuala?,"Care este circumferința taliei tale, măsurata deasupra de ombilicului?",obezitate abdominala,rezistenta la insulina,prediabet,diabet zaharat tip 2,sindromul ovarelor polichistice,...,labels,scor_medical,infarct,avc,stent_sau_bypass,fibrilatie_sau_ritm,embolie_sau_tromboza,risc_cardiovascular,scor_medical_cardio,diagnostic
7025,75,1,168,117,116,1,1,0,0,0,...,[neuro_psiho_energie],21,0,1,0,0,0,1,40,1
201,49,0,160,68,91,1,0,0,0,0,...,[metabolic_endocrin],14,0,0,0,0,0,1,13,0
6773,68,1,158,114,115,1,1,0,0,0,...,"[neuro_psiho_energie, inflamator_autoimun, car...",24,1,0,0,0,0,1,39,1
5944,75,0,173,77,129,1,1,0,1,0,...,"[neuro_psiho_energie, cardio_vascular, metabol...",26,1,1,0,0,0,1,52,3
9193,78,0,182,89,121,1,1,0,1,0,...,"[inflamator_autoimun, cardio_vascular, metabol...",32,0,0,0,0,0,1,37,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6974,54,0,157,125,106,1,1,0,1,0,...,[metabolic_endocrin],32,0,0,0,0,0,1,37,3
2196,58,0,179,94,100,1,1,1,0,0,...,[],27,0,0,0,0,0,1,27,2
6584,57,0,154,90,89,1,1,0,1,0,...,"[neuro_psiho_energie, cardio_vascular, metabol...",30,0,0,0,0,0,1,35,3
2180,54,0,160,98,90,1,1,0,0,0,...,[],15,0,0,0,0,0,0,15,1
