In [2]:
import pandas as pd
import os

import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime

from spacy import displacy
import edsnlp, edsnlp.pipes as eds

# Directory holding the cleaned, pickled tables; the cells below read their
# inputs from here and the enriched risk-factor table is written back here.
input_dir = "../data_clean/"
# Output directory for figures (not referenced by the cells visible in this section).
visuals_dir = "../visuals/" 

In [3]:
def _read_clean_pickle(filename):
    """Unpickle one file located in ``input_dir`` and return the loaded object.

    NOTE(review): unpickling executes arbitrary code; these files are assumed
    to be produced by this project's own cleaning step -- confirm.
    """
    return pd.read_pickle(os.path.join(input_dir, filename))


# One table per pickle; the names mirror the file names on disk.
df_person = _read_clean_pickle('df_person.pkl')
df_bio = _read_clean_pickle('df_bio.pkl')
df_note = _read_clean_pickle('df_note.pkl')
df_visit = _read_clean_pickle('df_visit.pkl')
df_condition = _read_clean_pickle('df_condition.pkl')
df_facteur_risque = _read_clean_pickle('df_facteur_risque.pkl')

In [9]:
# Tobacco pipeline. The entity extractor is followed by the negation and
# family qualifiers, whose `ent._.negation` / `ent._.family` flags are read
# when the notes are scanned further down.
nlp_smoker = edsnlp.blank("eds")
for component in (
    eds.sentences(),
    eds.normalizer(),
    eds.tobacco(),
    eds.negation(),
    eds.family(),
):
    nlp_smoker.add_pipe(component)

# Alcohol pipeline: same layout as a standard EDS-NLP extraction pipeline,
# with the alcohol extractor followed by the negation and family qualifiers.
alcohol_components = (
    eds.sentences(),
    eds.normalizer(),
    eds.alcohol(),
    eds.negation(),
    eds.family(),
)
nlp_alcool = edsnlp.blank("eds")
for component in alcohol_components:
    nlp_alcool.add_pipe(component)

# Vocabulary handed to the generic matcher; every hit is labelled "cancer_sein".
# The list is de-duplicated ("tumorectomie" used to be listed twice).
# NOTE(review): "carcinome", "adénocarcinome", "néoplasie" and "mammaire" are
# not specific to breast cancer, so any carcinoma/neoplasia mention is tagged
# as cancer_sein -- confirm that this recall/precision trade-off is intended.
terms = dict(
    cancer_sein=[
        "cancer du sein",
        "cancer mammaire",
        "carcinome mammaire",
        "tumeur maligne du sein",
        "carcinome canalaire invasif",
        "carcinome lobulaire infiltrant",
        "néoplasie mammaire",
        "adénocarcinome mammaire",
        "antécédent de cancer du sein",
        "histoire de cancer du sein",
        "mastectomie",
        "tumorectomie",
        "carcinome",
        "adénocarcinome",
        "néoplasie",
        "mastectomie bilatérale",
        "traitement pour cancer du sein",
        "radiothérapie mammaire",
        "chimiothérapie pour cancer du sein",
        "HER2 positif",
        "mammaire",
        "récepteur hormonal positif",
        "infiltration mammaire",
        "cancer du sein métastatique",
        "cancer infiltrant du sein",
        "dysplasie mammaire sévère",
    ]
)

# Breast-cancer mention pipeline: term matching on the NORM attribute, then
# the family and negation qualifiers that the family-history scan relies on.
nlp_genetique = edsnlp.blank("eds")
nlp_genetique.add_pipe(eds.sentences())
nlp_genetique.add_pipe(eds.normalizer())
nlp_genetique.add_pipe(eds.matcher(
    terms=terms,
    attr="NORM",
))
nlp_genetique.add_pipe(eds.family())
# Trailing ";" keeps the component repr from being echoed as the cell output.
nlp_genetique.add_pipe(eds.negation());




<edsnlp.pipes.qualifiers.negation.negation.NegationQualifier at 0x164ab2990>

In [67]:
# Flag tobacco use per patient.
# A note contributes when it contains at least one tobacco entity that is
# neither attributed to a family member nor negated. The note is tied to a
# patient through its visit (note -> visit_occurrence_id -> person_id).
# Raises ValueError if a note references a visit absent from df_visit.
df_facteur_risque['fumeur'] = False  # reset so re-running the cell is idempotent

# NOTE: `smokers` counts notes with a retained mention, not distinct patients;
# a patient with several such notes is counted once per note.
smokers, total = 0, 0
for _, note in df_note.iterrows():
    text = note['note_text']
    doc = nlp_smoker(text)

    # Keep only mentions about the patient themself that are affirmed.
    patient_ents = [
        ent for ent in doc.ents
        if not ent._.family and not ent._.negation
    ]

    visit_id = note['visit_occurrence_id']
    matching_visit_rows = df_visit[df_visit['visit_occurrence_id'] == visit_id]

    # The emptiness check must come before `.iloc[0]`; otherwise an orphan
    # note fails with a bare IndexError and this message is never shown.
    if matching_visit_rows.empty:
        raise ValueError(f"No matching visit found for visit_occurrence_id: {visit_id}")
    person_id = matching_visit_rows['person_id'].iloc[0]

    total += 1
    if patient_ents:
        smokers += 1
        df_facteur_risque.loc[df_facteur_risque['person_id'] == person_id, 'fumeur'] = True

print(f"Total notes processed: {total}")
print(f"Total smokers identified: {smokers}")

Total notes processed: 996
Total smokers identified: 90


In [None]:
# Flag alcohol use per patient.
# A note contributes when it contains at least one alcohol entity that is
# neither attributed to a family member nor negated. The note is tied to a
# patient through its visit (note -> visit_occurrence_id -> person_id).
# Raises ValueError if a note references a visit absent from df_visit.
df_facteur_risque['alcool'] = False  # reset so re-running the cell is idempotent

# NOTE: `alcohol` counts notes with a retained mention, not distinct patients.
alcohol, total = 0, 0

for _, note in df_note.iterrows():
    text = note['note_text']
    doc = nlp_alcool(text)

    # Keep only mentions about the patient themself that are affirmed.
    patient_ents = [
        ent for ent in doc.ents
        if not ent._.family and not ent._.negation
    ]

    visit_id = note['visit_occurrence_id']
    matching_visit_rows = df_visit[df_visit['visit_occurrence_id'] == visit_id]

    # Fail with an explicit message instead of the IndexError that
    # `.iloc[0]` would raise on an empty selection.
    if matching_visit_rows.empty:
        raise ValueError(f"No matching visit found for visit_occurrence_id: {visit_id}")
    person_id = matching_visit_rows['person_id'].iloc[0]

    total += 1
    if patient_ents:
        alcohol += 1
        df_facteur_risque.loc[df_facteur_risque['person_id'] == person_id, 'alcool'] = True

print(f"Total notes processed: {total}")
print(f"Total alcohol consumers identified: {alcohol}")



Total notes processed: 996
Total alcohol consumers identified: 22


In [None]:
# Flag a family history of breast cancer per patient.
# Unlike a patient-level risk factor, a mention is retained here only when it
# IS attributed to a family member (`ent._.family`) and is not negated. The
# note is tied to a patient through its visit
# (note -> visit_occurrence_id -> person_id).
# Raises ValueError if a note references a visit absent from df_visit.
df_facteur_risque['antecedents_familiaux'] = False  # reset so re-running is idempotent

# NOTE: `genetique` counts notes with a retained mention, not distinct patients.
genetique, total = 0, 0

for _, note in df_note.iterrows():
    text = note['note_text']
    doc = nlp_genetique(text)

    # Family-attributed, affirmed mentions only.
    family_ents = [
        ent for ent in doc.ents
        if ent._.family and not ent._.negation
    ]

    visit_id = note['visit_occurrence_id']
    matching_visit_rows = df_visit[df_visit['visit_occurrence_id'] == visit_id]

    # Fail with an explicit message instead of the IndexError that
    # `.iloc[0]` would raise on an empty selection.
    if matching_visit_rows.empty:
        raise ValueError(f"No matching visit found for visit_occurrence_id: {visit_id}")
    person_id = matching_visit_rows['person_id'].iloc[0]

    total += 1
    if family_ents:
        genetique += 1
        df_facteur_risque.loc[df_facteur_risque['person_id'] == person_id, 'antecedents_familiaux'] = True

print(f"Total notes processed: {total}")
print(f"Total genetique identified: {genetique}")

# Persist the enriched table. This overwrites 'df_facteur_risque.pkl' in
# input_dir, i.e. the same file the table was loaded from, with whatever
# columns the frame carries at this point in the kernel session.
pd.to_pickle(df_facteur_risque, os.path.join(input_dir, 'df_facteur_risque.pkl'))

Total notes processed: 996
Total genetique  identified: 37
