In [25]:
import pandas as pd

# Read data from IMTC

In [26]:
data1 = pd.read_csv('/home/imtc/Documenti/tmp/ID_DB_BRUGADA.csv', sep='\t')
# Drop incremental integer
data1.drop(['Unnamed: 23','Unnamed: 24'], axis=1, inplace=True)
# Drop row if missing 'Cognome' or 'Nome'
data1.dropna(subset=['Cognome', 'Nome'], inplace=True)
# Standardize column names
data1.columns = data1.columns.str.lower().str.replace(" ", "_")
# Drop completely empty columns (not almost empty ones)
data1 = data1.dropna(axis=1, how="all")
# Standardize columns: Convert to title case
data1["cognome"] = data1["cognome"].str.title().str.strip()
data1["nome"] = data1["nome"].str.title().str.strip()
# Recreate 'sex' column based on 'm' and 'f' values
data1["sex"] = data1.apply(lambda row: "M" if row["m"] == "X" else ("F" if row["f"] == "X" else None), axis=1)
data1.drop(columns=["m", "f"], inplace=True)  # Remove old columns
# Standardize date format (Data di Nascita -> date_of_birth)
data1["data_di_nascita"] = pd.to_datetime(data1["data_di_nascita"], errors="coerce", dayfirst=True)

# Read data from EF

In [27]:
data2 = pd.read_csv('/home/imtc/Documenti/tmp/DATABASE_Clinica_pazienti_Brugada.csv', sep='\t')
data2.drop(['ID','Unnamed: 232', 'Unnamed: 233'], axis=1, inplace=True)
# Standardize column names: Convert to lowercase and replace spaces with underscores
data2.columns = data2.columns.str.lower().str.replace(" ", "_")
# Drop completely empty columns
data2 = data2.dropna(axis=1, how="all")
data2["first_name"] = data2["first_name"].str.title().str.strip()
data2["last_name"] = data2["last_name"].str.title().str.strip()
data2["nazione_di_nascita"] = data2["nazione_di_nascita"].str.title().str.strip()
data2["regione_di_nascita"] = data2["regione_di_nascita"].str.title().str.strip()
data2["provincia_di_nascita"] = data2["provincia_di_nascita"].str.title().str.strip()
# Standardize date format (DATE OF BIRTH -> data_di_nascita, TEST DATE -> test_date)
data2["date_of_birth"] = pd.to_datetime(data2["date_of_birth"], errors="coerce", format="%m/%d/%Y", dayfirst=True)
data2["_test_date"] = pd.to_datetime(data2["_test_date"], errors="coerce", format="%m/%d/%Y", dayfirst=True)
# Ensure numeric columns are properly converted (e.g., age)
data2["age"] = pd.to_numeric(data2["age"], errors="coerce")

  data2 = pd.read_csv('/home/imtc/Documenti/tmp/DATABASE_Clinica_pazienti_Brugada.csv', sep='\t')


# Check Anagrafica

In [28]:
clinic_data1 = data1[['pos1_neg0', '_procedure', 'cognome', 'nome', 'data_di_nascita',
       'primo_ingresso', 'sex']]
clinic_data1.columns = ['ajmaline', 'id', 'last_name', 'first_name', 'birth_date',
       'registered_on', 'sex']

In [29]:
clinic_data2 = data2[['pk_paziente_______id_cardioref','last_name','first_name','sex','date_of_birth',
    '_test_date','nazione_di_nascita','regione_di_nascita',
    'provincia_di_nascita','proband_/_relatives',
    'fin_genetica_progressivo','ajmaline_test_result',
    'brs_baseline_pattern']]
clinic_data2.columns = ['id_cardioref','last_name','first_name','sex','birth_date',
    'test_date','nation','region',
    'province','family_status',
    'fin','ajmaline',
    'brs_baseline_pattern']

In [30]:
clinic_merged = clinic_data1.merge(clinic_data2, on=['first_name', 'last_name', 'birth_date', 'sex'], how='outer', indicator=True, suffixes=['_IMTC','_EF'])
for col in ["birth_date", "test_date", "registered_on"]:
    clinic_merged[col] = pd.to_datetime(clinic_merged[col], errors='coerce').dt.strftime('%Y-%m-%d')

  clinic_merged[col] = pd.to_datetime(clinic_merged[col], errors='coerce').dt.strftime('%Y-%m-%d')


In [31]:
clinic_merged.drop_duplicates(subset=['id_cardioref', 'first_name', 'last_name', 'birth_date' , 'sex'], inplace=True)

In [32]:
clinic_merged[['_merge', 'id', 'id_cardioref', 'fin', 'last_name', 'first_name', 'birth_date', 'sex', 'nation', 'region',
       'province', 'family_status','registered_on', 'test_date', 'ajmaline_IMTC', 'ajmaline_EF',
       'brs_baseline_pattern']].to_csv('/home/imtc/Scaricati/clinical_imtc_ef_merged_TMP.csv', index=False)