We install and import the necessary libraries.

In [None]:
!pip install pandas

import pandas as pd
import os
import glob
import re
import string

We import the notes, which are originally in txt format. After changing the name of one of the columns, we get two columns: name and text.

In [None]:

# Read every .txt note in the folder into a list of {'name', 'text'} records.
notes_data = []

folder_path = r'C:/Users/Cristina/Documents/CARMEN-I/txt/replaced/IA/'
# glob returns matches in arbitrary order; sorting keeps the row order of the
# resulting dataframe identical between runs.
file_paths = sorted(glob.glob(folder_path + '*.txt'))

for file_path in file_paths:
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
    except FileNotFoundError:
        print("File not found:", file_path)
    else:
        # Strip only the trailing extension. The previous
        # str.replace('.txt', '') could be interpreted as a regular
        # expression (where '.' matches any character) and replaced every
        # occurrence, not just the suffix.
        filename = os.path.splitext(os.path.basename(file_path))[0]
        notes_data.append({'name': filename, 'text': content})

# Naming the columns explicitly keeps 'name' and 'text' present even when no
# file could be read, so later cells do not fail with a KeyError.
df = pd.DataFrame(notes_data, columns=['name', 'text'])

df

We load the language mappings, which are originally stored in a TSV file.

In [None]:
# Load the tab-separated mappings file and rename its 'filename' column to
# 'name', the key used by the notes dataframe.
languages = pd.read_csv(
    'C:/Users/Documents/CARMEN-I/mappings.tsv',
    sep='\t',
).rename(columns={'filename': 'name'})

We merge df and languages dataframes. Now df dataframe has 3 columns: name, text and language.

In [None]:
# Left-join the language label onto the notes by 'name'; notes without a
# matching row in the mappings keep a missing language.
df = df.merge(languages[['name', 'language']], how='left', on='name')
df

**Preparing Manual Labelling file**

We create a dataframe for bilingual and Catalan text notes using the language identifier from the mappings.

In [None]:
# Keep the notes whose language label is 'bi' or 'cat'.
catbi = ['bi', 'cat']
cat_bi = df.loc[df['language'].isin(catbi)]
cat_bi

We create another one for Spanish text notes.

In [None]:
# Draw a random selection of Spanish notes.
es = df[df['language'] == 'es']
# sample() raises ValueError when n exceeds the number of rows, so the sample
# size is capped at the rows available. The fixed random_state makes the
# selection reproducible: re-running the notebook yields the same notes.
es = es.sample(n=min(46, len(es)), random_state=42)
# Restore the original row order after sampling.
es = es.sort_index()
es

We merge both dataframes and export them as an Excel file, which will be used for Comparison of the methods step (2).

In [None]:
# Stack the bilingual/Catalan notes and the Spanish sample into one frame
# with a fresh 0..n-1 index, then write it to Excel without the index column.
manual_analysis = pd.concat((cat_bi, es), ignore_index=True)
# NOTE(review): the file name is spelled 'munual'; it is left unchanged here
# because later steps may open the file under this exact name.
manual = 'munual_labelling.xlsx'
manual_analysis.to_excel(excel_writer=manual, index=False)

**Preparing Protected variable AGE file**

We compare results looking for terms directly in the notes vs using the NER terms.

**1.** Reading "años", "anys" in notes to find age: Get the rows with references to age, and check the number of instances.

In [None]:
# Flag notes whose text contains the whole word "años" or "anys"
# (a missing text counts as no match), then count the distinct note names.
has_age_term = df['text'].str.contains(r'\baños\b|\banys\b', na=False)
age_rows = df.loc[has_age_term]
unique_name = age_rows['name'].nunique()
unique_name

**2.** Using NER:

We load the anonymised terms.

In [None]:
# Load the tab-separated table of anonymised terms.
text_anon = pd.read_csv('C:/Users/Cristina/Documents/MÀSTER/Applied Data Science Utrecht/Courses/THESIS/DATA CARMEN/carmen/CARMEN-I/tsv/replaced/CARMEN~1.TSV', sep='\t')

# Only keep the rows tagged 'EDAD_SUJETO_ASISTENCIA'.
ages = text_anon[text_anon['tag'] == 'EDAD_SUJETO_ASISTENCIA']

# Keep the rows whose name contains "_IA_" and drop the 'tag' and 'span'
# columns. drop() returns a new dataframe here instead of using inplace=True:
# modifying a filtered selection in place can raise SettingWithCopyWarning in
# pandas versions without copy-on-write.
agess = ages[ages['name'].str.contains("_IA_")].drop(columns=['tag', 'span'])

# Merge with df on 'name'. Columns present in both frames (such as 'text')
# get pandas' default suffixes: '_x' for the NER side, '_y' for df.
ages_df = pd.merge(agess, df, on='name', how='inner')
ages_df

We work with option 2), as it identifies the ages better. We create a new column with the value 1 if the patient is 60 or older, and 0 otherwise.

In [None]:
#Function to extract age
def extract_age(text):
    """Return the first run of digits found in *text* as an int.

    Returns None when *text* contains no digits. Non-string input (for
    example the NaN pandas uses for an empty cell) also yields None instead
    of making ``re.search`` raise TypeError.
    """
    if not isinstance(text, str):
        return None
    match = re.search(r'\d+', text)
    return int(match.group(0)) if match else None

# Create the column 'age' with the age extracted from the NER term text.
# to_numeric keeps the column numeric, with NaN where no age was found.
ages_df['age'] = pd.to_numeric(ages_df['text_x'].apply(extract_age), errors='coerce')

# Create the column 'older': 1 for patients aged 60 or more, 0 for younger
# patients. Rows without an extracted age are left missing; previously they
# were silently labelled 0, i.e. counted as younger than 60.
ages_df['older'] = (ages_df['age'] >= 60).astype('Int64').mask(ages_df['age'].isna())
ages_df

We export the Excel file, which will be used for the analysis of Protected variables Age.

In [None]:
# Write the age dataframe to an Excel file, leaving out the index column.
ages_ex = 'ex_age.xlsx'
ages_df.to_excel(excel_writer=ages_ex, index=False)

**Preparing protected variable GENDER file**

Again, we compare results looking for terms directly in the notes (imputation) vs using the NER terms. Firstly, we make a copy of df named gender.

In [None]:
# Work on an independent (deep) copy so that df itself is left untouched.
gender = df.copy(deep=True)

**1.** Imputation: We create regular expression patterns to look for masculine and feminine instances, and use them in a function to determine gender. The function is then applied to each text, and a column 'gender' is created with 1 for women and 0 for men.

In [None]:
# Regular expressions for masculine and feminine mentions (whole words only).
pattern_masculine = r'\b(varón|varon|el\s+paciente|home)\b'
pattern_feminine = r'\b(mujer|la\s+paciente)\b'


def determine_gender(text):
    """Classify *text* by the gendered terms it contains.

    Returns 0 when a masculine term is found, 1 when only a feminine term is
    found, and None when neither pattern matches. Matching ignores case.
    """
    # The masculine pattern is tried first, so a text containing both kinds
    # of mention is labelled 0.
    for label, pattern in ((0, pattern_masculine), (1, pattern_feminine)):
        if re.search(pattern, text, flags=re.IGNORECASE):
            return label
    return None

# Label every note with determine_gender, then discard the notes for which
# no gender could be determined.
gender = gender.assign(gender=gender['text'].apply(determine_gender))
gender = gender.dropna(subset=['gender'])
gender

**2.** Using NER terms.

In [None]:
# Only keep the rows tagged 'SEXO_SUJETO_ASISTENCIA'.
gender_ner = text_anon[text_anon['tag'] == 'SEXO_SUJETO_ASISTENCIA']

# Keep the rows whose name contains "_IA_" and drop the 'tag' and 'span'
# columns. drop() returns a new dataframe here instead of using inplace=True:
# modifying a filtered selection in place can raise SettingWithCopyWarning in
# pandas versions without copy-on-write.
genderr = gender_ner[gender_ner['name'].str.contains("_IA_")].drop(columns=['tag', 'span'])

# Merge with df on 'name', keeping only the names present in both frames.
gender_df = pd.merge(genderr, df, on='name', how='inner')
gender_df

We use imputation as a method, as we get more instances. The excel file is exported, which is going to be used to assess bias to the protected variable gender.

In [None]:
# Write the imputed-gender dataframe to an Excel file, leaving out the index
# column.
gender_ex = 'ex_gender.xlsx'
gender.to_excel(excel_writer=gender_ex, index=False)

**Preparing protected variable PERCEIVED SOCIOECONOMIC STATUS file**

We use the NER terms tagged PROFESION for this task. We will associate each profession with jobs that require high or low qualifications. From there, the socioeconomic status perceived by the medical staff can be inferred.

In [None]:
# Only keep the rows tagged 'PROFESION'.
prof_ner = text_anon[text_anon['tag'] == 'PROFESION']

# Keep the rows whose name contains "_IA_" and drop the 'tag' and 'span'
# columns. drop() returns a new dataframe here instead of using inplace=True:
# modifying a filtered selection in place can raise SettingWithCopyWarning in
# pandas versions without copy-on-write.
profesion = prof_ner[prof_ner['name'].str.contains("_IA_")].drop(columns=['tag', 'span'])

# Merge with df on 'name', keeping only the names present in both frames.
profesion_df = pd.merge(profesion, df, on='name', how='inner')
profesion_df

We export the Excel file, in which we will assess whether each profession requires high or low qualification. Later, this file will be used to assess bias regarding perceived socioeconomic status.

In [None]:
# Write the profession dataframe to an Excel file, leaving out the index
# column.
profesions = 'ex_profesions.xlsx'
profesion_df.to_excel(excel_writer=profesions, index=False)