In [None]:
from transformers import pipeline

# Shared translation pipeline, loaded once at cell execution and reused by
# translate_fr_to_en below.
translator = pipeline(
    task="translation",
    model="Helsinki-NLP/opus-mt-fr-en",
)

def translate_fr_to_en(text_fr):
    """Translate French text to English with the module-level ``translator``.

    Args:
        text_fr: Text handed to the translation pipeline.

    Returns:
        The ``'translation_text'`` field of the first item the pipeline
        returns. Any further items in the pipeline output are ignored.
    """
    # max_length=512 is forwarded to the pipeline call unchanged.
    outputs = translator(text_fr, max_length=512)
    first_output = outputs[0]
    return first_output['translation_text']

In [None]:
import pandas as pd
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util

# 1. Translation
# Shared translation pipeline, loaded once and reused by translate_fr_to_en.
translator = pipeline(
    task="translation",
    model="Helsinki-NLP/opus-mt-fr-en",
)
def translate_fr_to_en(text_fr):
    """Translate French text to English with the module-level ``translator``.

    Args:
        text_fr: Text handed to the translation pipeline.

    Returns:
        The ``'translation_text'`` field of the first item the pipeline
        returns. Any further items in the pipeline output are ignored.
    """
    # max_length=512 is forwarded to the pipeline call unchanged.
    outputs = translator(text_fr, max_length=512)
    first_output = outputs[0]
    return first_output['translation_text']

# 2. Load the Synthea CSV exports
# Stock Synthea CSV exports use the headers on the left; the rest of this
# script looks columns up by the names on the right.
_SYNTHEA_COLUMN_ALIASES = {
    'Id': 'id',
    'PATIENT': 'patient_id',
    'DESCRIPTION': 'description',
    'START': 'start_date',
    'STOP': 'end_date',
}


def _normalize_synthea_columns(frame):
    """Return `frame` with stock Synthea headers renamed to the names used here.

    A header is renamed only when it is present and its target name is not
    already a column, so files that were renamed beforehand pass through
    unchanged and no duplicate column names are created.
    """
    renames = {
        source: target
        for source, target in _SYNTHEA_COLUMN_ALIASES.items()
        if source in frame.columns and target not in frame.columns
    }
    return frame.rename(columns=renames)


patients = _normalize_synthea_columns(pd.read_csv("synthea_output/csv/patients.csv"))
conditions = _normalize_synthea_columns(pd.read_csv("synthea_output/csv/conditions.csv"))
encounters = _normalize_synthea_columns(pd.read_csv("synthea_output/csv/encounters.csv"))

# 3. Build one descriptive text per patient (conditions + encounters concatenated)
def get_patient_text(pid):
    """Return the concatenated condition and encounter descriptions for a patient.

    Rows of the module-level ``conditions`` and ``encounters`` frames whose
    ``patient_id`` equals `pid` are selected. Their ``description`` values
    (missing ones replaced by an empty string) are joined with single spaces,
    conditions first, then encounters.

    Args:
        pid: Value compared against the ``patient_id`` column of both frames.

    Returns:
        ``"<condition text> <encounter text>"``. The separating space is
        always present, so a patient with no matching rows yields ``" "``.
    """
    condition_rows = conditions['patient_id'] == pid
    condition_descriptions = conditions.loc[condition_rows, 'description'].fillna('')
    condition_text = condition_descriptions.str.cat(sep=' ')

    encounter_rows = encounters['patient_id'] == pid
    encounter_descriptions = encounters.loc[encounter_rows, 'description'].fillna('')
    encounter_text = encounter_descriptions.str.cat(sep=' ')

    return f"{condition_text} {encounter_text}"

# Attach the per-patient text produced by get_patient_text to the patients table.
patients['text_data'] = patients['id'].map(get_patient_text)

# 4. Embeddings
# Encode every patient text once, up front; find_top_patients compares each
# query against this precomputed tensor.
model = SentenceTransformer('all-MiniLM-L6-v2')
patient_embeddings = model.encode(
    list(patients['text_data']),
    convert_to_tensor=True,
)

def find_top_patients(text_en, top_k=3):
    """Return the patients whose text is most similar to `text_en`.

    The query is encoded with the module-level ``model`` and compared by
    cosine similarity against the precomputed ``patient_embeddings``.

    Args:
        text_en: Query text to encode.
        top_k: Maximum number of patients to return. Values larger than the
            number of embedded patients are clamped; values <= 0 give ``[]``.

    Returns:
        A list of ``(patient_id, score)`` tuples in the order produced by
        ``topk`` on the similarity scores, where ``patient_id`` comes from
        the ``id`` column of ``patients`` and ``score`` is a Python float.
    """
    query_emb = model.encode(text_en, convert_to_tensor=True)
    cos_scores = util.cos_sim(query_emb, patient_embeddings)[0]

    # topk raises when k exceeds the number of candidates, so clamp it.
    k = max(0, min(top_k, cos_scores.shape[0]))
    if k == 0:
        return []

    top_results = cos_scores.topk(k=k)
    # Convert to plain Python ints/floats: pandas positional indexing expects
    # real integers, not 0-d tensors.
    positions = top_results.indices.tolist()
    scores = top_results.values.tolist()
    return [
        (patients.iloc[position]['id'], score)
        for position, score in zip(positions, scores)
    ]

# 5. Usage: translate a French symptom description, then show the closest patients
text_fr = "Fièvre élevée, toux sèche, difficulté à respirer."
text_en = translate_fr_to_en(text_fr)

top_patients = find_top_patients(text_en)

for pid, score in top_patients:
    print(f"Patient {pid} - Score similarité: {score:.3f}")
    # Select this patient's condition rows, keeping only the columns to display.
    belongs_to_patient = conditions['patient_id'] == pid
    conds = conditions.loc[belongs_to_patient, ['description', 'start_date', 'end_date']]
    print("Conditions associées :")
    # head() limits the printout to the first few conditions.
    print(conds.head())