In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Colours shared by the figures in this notebook.
palette = ['#647AA3', '#89909F', '#348AA7', '#4d6d9a', '#99ced3', '#edb5bf']

# Use seaborn's dark-grid theme for all following matplotlib figures.
sns.set_style('darkgrid')

In [None]:
# Directory that holds the split CSVs; change it here rather than in each call.
DATA_DIR = '~/Desktop/work/chest_radiography/data'

# One DataFrame per split file.
train = pd.read_csv(DATA_DIR + '/train.csv')
valid = pd.read_csv(DATA_DIR + '/valid.csv')
test = pd.read_csv(DATA_DIR + '/test.csv')

In [None]:
# Stack the three splits into one frame. ignore_index=True gives the result a
# fresh 0..n-1 index; without it each split keeps its own row labels, so the
# combined frame would carry repeated labels.
df = pd.concat([train, valid, test], ignore_index=True)

## Patient Sex

In [None]:
# Bar chart of PatientSex over all rows of df; rows coded 'U' are left out.
known_sex_mask = df['PatientSex'] != 'U'
exam_sex_counts = df.loc[known_sex_mask, 'PatientSex'].value_counts()
ax = exam_sex_counts.plot(kind='bar', color=palette)
ax.set_title('Patient Gender per Examination')
ax.set_xlabel('Gender')
ax.set_ylabel('Count')
# Optional high-resolution rendering: plt.gcf().set_dpi(300)
plt.show()

In [None]:
# Keep a single row per PatientName (the last occurrence in df).
df_no_duplicate_patient = df.drop_duplicates(subset='PatientName', keep='last')

# Bar chart of PatientSex over those rows; rows coded 'U' are left out.
patient_known_sex_mask = df_no_duplicate_patient['PatientSex'] != 'U'
patient_sex_counts = df_no_duplicate_patient.loc[patient_known_sex_mask, 'PatientSex'].value_counts()
ax = patient_sex_counts.plot(kind='bar', color=palette)
ax.set_title('Patient Gender')
ax.set_xlabel('Gender')
ax.set_ylabel('Count')
# Optional high-resolution rendering: plt.gcf().set_dpi(300)
plt.show()

In [None]:
# PatientSex counts over the de-duplicated rows, excluding rows coded 'U'.
df_no_duplicate_patient.loc[df_no_duplicate_patient['PatientSex'] != 'U', 'PatientSex'].value_counts()

In [None]:
# Share of each recorded sex among the de-duplicated patients ('U' excluded).
# Computed from the data rather than from hand-copied counts, so the figure
# stays correct whenever the input CSVs change.
df_no_duplicate_patient.loc[df_no_duplicate_patient['PatientSex'] != 'U', 'PatientSex'].value_counts(normalize=True)

## Number of Patients

In [None]:
# Number of distinct PatientName values in df (a missing name counts once).
df['PatientName'].nunique(dropna=False)

## Patient Age

In [None]:
# Age is taken as the calendar-year difference between the
# 'Untersuchungsdatum' and 'Geburtsdatum' columns, one value per row of df.
exam_year = pd.to_datetime(df['Untersuchungsdatum']).dt.year
birth_year = pd.to_datetime(df['Geburtsdatum']).dt.year
age_per_exam = exam_year - birth_year

ax = age_per_exam.plot.hist(bins=30, color=palette[2])
ax.set_title('Patient Age')
ax.set_xlabel('Age')
ax.set_ylabel('Count')
# Optional high-resolution rendering: plt.gcf().set_dpi(300)
plt.show()

In [None]:
# Year-difference age for every row of df, computed once and reused below.
age_per_exam_years = (
    pd.to_datetime(df['Untersuchungsdatum']).dt.year
    - pd.to_datetime(df['Geburtsdatum']).dt.year
)
print("Mean Age")
print(age_per_exam_years.mean())
print("Std Age")
print(age_per_exam_years.std())

In [None]:
# No duplicate patients: same year-difference age, on the de-duplicated rows.
unique_exam_year = pd.to_datetime(df_no_duplicate_patient['Untersuchungsdatum']).dt.year
unique_birth_year = pd.to_datetime(df_no_duplicate_patient['Geburtsdatum']).dt.year
age_per_patient = unique_exam_year - unique_birth_year

ax = age_per_patient.plot.hist(bins=30, color=palette[2])
ax.set_title('Patient Age')
ax.set_xlabel('Age')
ax.set_ylabel('Count')
# Optional high-resolution rendering: plt.gcf().set_dpi(300)
plt.show()

In [None]:
# No duplicate patients: year-difference age on the de-duplicated rows,
# computed once and reused for both statistics.
age_per_patient_years = (
    pd.to_datetime(df_no_duplicate_patient['Untersuchungsdatum']).dt.year
    - pd.to_datetime(df_no_duplicate_patient['Geburtsdatum']).dt.year
)
print("Mean Age")
print(age_per_patient_years.mean())
print("Std Age")
print(age_per_patient_years.std())

## Examination Date

In [None]:
# Histogram of the year part of 'Untersuchungsdatum' over all rows of df.
pd.to_datetime(df['Untersuchungsdatum']).dt.year.plot.hist(bins=11, color=palette[2])
plt.title('Examination Date')
plt.xlabel('Year')
plt.ylabel('Count')
# Render explicitly, like the other plotting cells; otherwise the cell's
# output starts with the repr of the last plt.ylabel() call.
plt.show()

## Analyze all data splits separately

In [None]:
# Print the same cohort summary for each split and for the combined frame.
for split_name, data_split in zip(["train", "valid", "test", "all"], [train, valid, test, df]):
    print("---------------------------------")
    # Patient-level statistics use one row per PatientName (last occurrence).
    data_split_no_duplicate_patient = data_split.drop_duplicates(subset='PatientName', keep="last")
    #Age
    print(split_name)
    print("Length:", len(data_split))
    # Age is the calendar-year difference between the two date columns.
    age_column_in_years = (pd.to_datetime(data_split_no_duplicate_patient['Untersuchungsdatum']).dt.year - pd.to_datetime(data_split_no_duplicate_patient['Geburtsdatum']).dt.year)
    print("Mean Age:", age_column_in_years.mean())
    print("Std Age:", age_column_in_years.std())
    print("Min Age:", age_column_in_years.min())
    print("Max Age:", age_column_in_years.max())
    #Gender
    patient_value_counts = data_split_no_duplicate_patient['PatientSex'][data_split_no_duplicate_patient['PatientSex']!='U'].value_counts()
    print(patient_value_counts)
    # .get(..., 0) keeps a split that contains no 'M' (or no 'F') rows from
    # raising KeyError.
    male_count = patient_value_counts.get('M', 0)
    female_count = patient_value_counts.get('F', 0)
    known_sex_total = male_count + female_count
    # With neither 'M' nor 'F' present the ratios are undefined; report NaN
    # instead of dividing by zero.
    male_ratio = male_count / known_sex_total if known_sex_total else float('nan')
    female_ratio = female_count / known_sex_total if known_sex_total else float('nan')
    print("Male Ratio: ", male_ratio)
    print("Female Ratio: ", female_ratio)
    # Patients
    print("Unique Patients: ", len(data_split_no_duplicate_patient))

# Number of annotating physicians

In [None]:
# Spreadsheet that carries the physician names; the first 8 lines of the
# sheet are skipped before parsing.
physician_xlsx_path = '~/Desktop/work/chest_radiography/data/csv_with_physician_names/st_Befundtext_THIN_Schulze-Hagen_dok.xlsx'
df_physician = pd.read_excel(physician_xlsx_path, skiprows=8)

In [None]:
# Drop rows without an 'Anforderungsnummer', then rewrite that column as the
# integer part of the number followed by the suffix '01', as a string.
# .assign builds a new frame, so no column is written onto the result of
# dropna (the chained-assignment pattern behind SettingWithCopyWarning).
df_physician_filtered = (
    df_physician
    .dropna(subset=['Anforderungsnummer'])
    .assign(
        Anforderungsnummer=lambda frame: frame['Anforderungsnummer'].map(
            lambda number: str(int(number)) + '01'
        )
    )
)

In [None]:
# Keep only spreadsheet rows whose 'Anforderungsnummer' also occurs in df
# (compared as strings).
dataset_request_numbers = df['Anforderungsnummer'].map(str)
is_in_dataset = df_physician_filtered['Anforderungsnummer'].isin(dataset_request_numbers)
df_physician_filtered_num = df_physician_filtered[is_in_dataset]

In [None]:
# Number of distinct 'Befundarzt Name' values (a missing name counts once).
df_physician_filtered_num['Befundarzt Name'].nunique(dropna=False)

In [None]:
# Preview only the first rows: a later cell sets display.max_rows to None, so
# re-running a bare frame display here would print every row (the frame also
# includes the 'Befundarzt Name' column).
df_physician_filtered_num.head()

In [None]:
# Lift pandas' display truncation for every following output.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Rows per 'Befundarzt Name' value, restricted to names with more than 40 rows.
rows_per_befundarzt = df_physician_filtered_num['Befundarzt Name'].value_counts()
physician_value_counts = rows_per_befundarzt[rows_per_befundarzt > 40]

In [None]:
# Summary statistics of the filtered per-name row counts.
for stat_label, stat_value in (
    ("Mean: ", physician_value_counts.mean()),
    ("Std: ", physician_value_counts.std()),
    ("Max: ", physician_value_counts.max()),
    ("Min: ", physician_value_counts.min()),
):
    print(stat_label, stat_value)

In [None]:
# Number of names that passed the > 40 filter.
physician_value_counts.shape[0]

# Label distribution

In [None]:
def undummify(df, prefix_sep="_"):
    """Collapse groups of dummy (one-hot) columns back into single label columns.

    A column whose name contains ``prefix_sep`` is read as
    ``<prefix><prefix_sep><value>``, split at the LAST separator. All columns
    that share a prefix are merged into one column named ``<prefix>`` which
    holds, for every row, the ``<value>`` part of the column with the largest
    entry in that row. Columns without the separator are passed through
    unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string column names.
    prefix_sep : str, default "_"
        Separator between the prefix and the value in dummy column names.

    Returns
    -------
    pandas.DataFrame
        One column per prefix / pass-through column, on the same index as ``df``.

    Notes
    -----
    ``idxmax`` returns the first column on ties, so a row in which no dummy of
    a group is set (e.g. all zeros) still receives the value of the group's
    first column.
    """
    # prefix -> whether it denotes a dummy group; dict order fixes output order.
    cols2collapse = {
        item.rsplit(prefix_sep, 1)[0]: (prefix_sep in item) for item in df.columns
    }
    series_list = []
    for col, needs_to_collapse in cols2collapse.items():
        if needs_to_collapse:
            # Select the group by exact prefix. A substring match
            # (``df.filter(like=col)``) would also pull in unrelated columns
            # whose name merely contains the prefix and could yield their value.
            member_columns = [
                item
                for item in df.columns
                if prefix_sep in item and item.rsplit(prefix_sep, 1)[0] == col
            ]
            undummified = (
                df[member_columns]
                .idxmax(axis=1)
                .apply(lambda x: x.rsplit(prefix_sep, maxsplit=1)[1])
                .rename(col)
            )
            series_list.append(undummified)
        else:
            series_list.append(df[col])
    if not series_list:
        # pd.concat raises on an empty list; a frame without columns maps to
        # an empty frame on the same index.
        return pd.DataFrame(index=df.index)
    undummified_df = pd.concat(series_list, axis=1)
    return undummified_df

In [None]:
# Collapse the dummy columns of the combined frame into one column per prefix.
undummified_df = undummify(df, prefix_sep="_")

In [None]:
# Code string -> label for the columns that share the standard scale.
standard_code_to_label = {'1.0': 'kein', '2.0': '+', '3.0': '++', '4.0': '+++', '5.0': '(+)'}
# Code string -> label for Herzgröße.
heart_code_to_label = {'1.0': 'normal', '2.0': 'grenzwertig', '3.0': 'vergrößert', '4.0': 'massiv vergrößert', '5.0': 'nicht beurteilbar'}
# Code string -> label for both Pneumothorax columns.
pneumothorax_code_to_label = {'1.0': 'kein', '2.0': 'spitze', '3.0': 'mantel', '4.0': 'basal', '5.0': 'gering', '6.0': 'erheblich', '7.0': 'spannung'}

# Standard labels
standard_label_columns = ['Stauung', 'Pleuraerguss_re', 'Pleuraerguss_li', 'Infiltrate_re', 'Infiltrate_li', 'Belstörungen_re', 'Belstörungen_li']
for standard_label in standard_label_columns:
    undummified_df[standard_label] = undummified_df[standard_label].replace(standard_code_to_label)

# Herzgröße
undummified_df['Herzgröße'] = undummified_df['Herzgröße'].replace(heart_code_to_label)

# Pneumothorax
for pneumothorax_column in ('Pneumothorax_re', 'Pneumothorax_li'):
    undummified_df[pneumothorax_column] = undummified_df[pneumothorax_column].replace(pneumothorax_code_to_label)

In [None]:
# First five rows after decoding the label codes.
undummified_df.head()

In [None]:
# Columns (final English names) whose label distribution is examined later.
columns_of_interest = [
    'Cardiomegaly',
    'Pleural Effusion (left)',
    'Pleural Effusion (right)',
    'Pulmonary Infiltrates (left)',
    'Pulmonary Infiltrates (right)',
    'Pulmonary Congestion',
    'Atelectasis (left)',
    'Atelectasis (right)',
]

# First rename: intermediate column names.
intermediate_column_names = {
    'AcquisitionNumber': 'Anforderungsnummer',
    'Pleuraerguss_li': 'Erguss_li',
    'Pleuraerguss_re': 'Erguss_re',
    'Belstörungen_re': 'Bel.-Störungen_re',
    'Belstörungen_li': 'Bel.-störungen_li',
}
undummified_df = undummified_df.rename(columns=intermediate_column_names)

# Standard labels: only 'kein' changes, to 'none'.
standard_label_columns = ['Stauung', 'Erguss_re', 'Erguss_li', 'Infiltrate_re', 'Infiltrate_li', 'Bel.-Störungen_re', 'Bel.-störungen_li']
for standard_label in standard_label_columns:
    undummified_df[standard_label] = undummified_df[standard_label].replace({'kein': 'none'})

# Herzgröße: mapped onto the same five-step scale as the standard labels.
# NOTE(review): 'nicht beurteilbar' is mapped to '+++' here - confirm that
# this is intended.
heart_label_to_scale = {'normal': 'none', 'grenzwertig': '(+)', 'vergrößert': '+', 'massiv vergrößert': '++', 'nicht beurteilbar': '+++'}
undummified_df['Herzgröße'] = undummified_df['Herzgröße'].replace(heart_label_to_scale)

# Pneumothorax: lowercase first (some annotations are capitalized in the
# radiologist labels), then translate to English.
pneumothorax_to_english = {'kein': 'none', 'spitze': 'apex', 'mantel': 'mantle', 'basal': 'basal', 'gering': 'minor', 'erheblich': 'considerable', 'spannung': 'tension'}
for pneumothorax_column in ('Pneumothorax_re', 'Pneumothorax_li'):
    lowered = undummified_df[pneumothorax_column].str.lower()
    undummified_df[pneumothorax_column] = lowered.replace(pneumothorax_to_english)

# Second rename: final English column names.
english_column_names = {
    'Herzgröße': 'Cardiomegaly',
    'Erguss_li': 'Pleural Effusion (left)',
    'Erguss_re': 'Pleural Effusion (right)',
    'Infiltrate_li': 'Pulmonary Infiltrates (left)',
    'Infiltrate_re': 'Pulmonary Infiltrates (right)',
    'Stauung': 'Pulmonary Congestion',
    'Bel.-störungen_li': 'Atelectasis (left)',
    'Bel.-Störungen_re': 'Atelectasis (right)',
}
undummified_df = undummified_df.rename(columns=english_column_names)


In [None]:
# First five rows after translating labels and column names.
undummified_df.head(n=5)

In [None]:
# Bar order shared by every chart below.
label_order = ['none', '(+)', '+', '++', '+++']

# One bar chart per column of interest.
for column in columns_of_interest:
    print(column)
    # reindex (instead of [label_order]) shows a label that never occurs in a
    # column as a zero-height bar rather than raising KeyError.
    label_counts = undummified_df[column].value_counts().reindex(label_order, fill_value=0)
    label_counts.plot.bar(color=palette[2])
    plt.title(column, fontsize=15, fontweight='bold')
    plt.ylabel('Count')
    plt.xlabel('Label')
    plt.gcf().set_dpi(300)
    plt.show()

In [None]:
# Label counts for 'Atelectasis (right)'. The column is named explicitly
# rather than through the loop variable left over from the plotting cell, so
# this cell does not depend on that loop having just run.
undummified_df['Atelectasis (right)'].value_counts()

In [None]:
# Value counts of the 'Belstörungen_re_1.0' column in the combined frame.
df.loc[:, 'Belstörungen_re_1.0'].value_counts()