# Patient Data Reduction
This notebook explores different ways to reduce the number of similar patients in the dataset.


In [189]:
import pandas as pd
import ast

# Load the PMC-Patients CSV that every later cell works on (the path is relative to the working directory).
df = pd.read_csv('../data/pmc_patients/PMC-Patients-oa-9995.csv')

## Ground Truths

In [190]:
# check if patients reference each other
def check_patients_reference_each_other(_df):
    """Print ``False`` once for every similarity reference that is not reciprocated.

    For each patient and each entry in its ``similar_patients`` mapping, the
    referenced patient is looked up in ``_df``; if it is present but does not
    list the original patient in return, ``False`` is printed. References to
    patients that are not part of ``_df`` are skipped.

    Args:
        _df: DataFrame with a ``patient_uid`` column and a ``similar_patients``
            column holding the string form of a dict keyed by patient uid.

    Returns:
        None. The result is reported on stdout only.
    """
    # Parse every row exactly once instead of re-parsing a row each time it is
    # referenced by another patient.
    parsed_rows = [
        (patient_id, ast.literal_eval(raw))
        for patient_id, raw in zip(_df['patient_uid'], _df['similar_patients'])
    ]

    # uid -> parsed mapping, so each lookup is a dict access rather than a
    # boolean-mask scan over the whole frame. `setdefault` keeps the first row
    # for a duplicated uid.
    similar_by_uid = {}
    for patient_id, similar_patients in parsed_rows:
        similar_by_uid.setdefault(patient_id, similar_patients)

    for patient_id, similar_patients in parsed_rows:
        for similar_patient_id in similar_patients:
            similar_patient_dict = similar_by_uid.get(similar_patient_id)
            if similar_patient_dict is None:
                # referenced patient is not in this frame
                continue

            # Membership test, not truthiness: a stored score of 0 is still a
            # reference back to the patient.
            if patient_id not in similar_patient_dict:
                print(False)
# Run the reciprocity check on the whole dataset; the function only prints when it finds a one-directional reference.
check_patients_reference_each_other(df)

In [191]:
# check if patients that reference each other have same similar patients length
# gives an idea that similar patients don't necessarily fully overlap
def check_similar_patients_length(_df):
    """Print how many references link two patients with differently sized similar-patient sets.

    Every (patient, referenced patient) pair whose referenced patient is
    present in ``_df`` is inspected; the pair is counted when the two
    ``similar_patients`` mappings have a different number of entries. Each
    direction of a reference is counted on its own.

    Args:
        _df: DataFrame with a ``patient_uid`` column and a ``similar_patients``
            column holding the string form of a dict keyed by patient uid.

    Returns:
        None. The count is printed.
    """
    # Parse every row exactly once instead of once per incoming reference.
    parsed_rows = [
        (patient_id, ast.literal_eval(raw))
        for patient_id, raw in zip(_df['patient_uid'], _df['similar_patients'])
    ]

    # uid -> parsed mapping for constant-time lookups; the first row wins when
    # a uid is duplicated.
    similar_by_uid = {}
    for patient_id, similar_patients in parsed_rows:
        similar_by_uid.setdefault(patient_id, similar_patients)

    count = 0
    for _, similar_patients in parsed_rows:
        for similar_patient_id in similar_patients:
            similar_patient_dict = similar_by_uid.get(similar_patient_id)
            if similar_patient_dict is None:
                # referenced patient is not in this frame
                continue

            if len(similar_patients) != len(similar_patient_dict):
                count += 1
    print(count)

# Count the size mismatches over the whole dataset; the function prints the total.
check_similar_patients_length(df)

300


In [192]:
def check_full_overlap(_df):
    """Report pairs of linked patients whose remaining similar patients differ.

    For every reference from a patient to another patient present in ``_df``,
    the two similar-patient key sets are compared after removing the pair
    itself from both sides. When the sets differ, both sets are printed
    followed by ``False``.

    Args:
        _df: DataFrame with a ``patient_uid`` column and a ``similar_patients``
            column holding the string form of a dict keyed by patient uid.

    Returns:
        None. Mismatches are reported on stdout only.
    """
    # Parse every row exactly once instead of once per incoming reference.
    parsed_rows = [
        (patient_id, ast.literal_eval(raw))
        for patient_id, raw in zip(_df['patient_uid'], _df['similar_patients'])
    ]

    # uid -> parsed mapping for constant-time lookups; the first row wins when
    # a uid is duplicated.
    similar_by_uid = {}
    for patient_id, similar_patients in parsed_rows:
        similar_by_uid.setdefault(patient_id, similar_patients)

    for patient_id, similar_patients in parsed_rows:
        for similar_patient_id in similar_patients:
            similar_patient_dict = similar_by_uid.get(similar_patient_id)
            if similar_patient_dict is None:
                # referenced patient is not in this frame
                continue

            # Set difference instead of `del`: the shared parsed dicts stay
            # untouched, and a reference that is not reciprocated no longer
            # raises KeyError.
            own_others = set(similar_patients) - {similar_patient_id}
            their_others = set(similar_patient_dict) - {patient_id}

            if own_others != their_others:
                print(own_others)
                print(their_others)
                print(False)

# Only the first 2000 rows are checked; references to patients outside this slice are skipped by the function.
copy_of_df = df.copy().iloc[:2000]
check_full_overlap(copy_of_df)

{'5189705-1', '6235647-1', '5554405-1', '4387990-1'}
{'7336829-3', '4387990-1', '7336829-1', '7336829-2', '8556118-1'}
False
{'3963201-1', '4015694-1'}
{'7102455-1', '7883589-1', '7456631-1', '8325827-1'}
False
{'8345623-1', '8345623-3', '6421908-1', '7583527-1', '6260389-2', '3341745-1', '7647562-1', '6991144-1', '8345623-2', '6919528-1', '4772568-1', '8345623-4', '6097382-1', '7289613-1', '5824516-1', '6260389-1', '5998940-1', '4983003-1'}
set()
False
{'8293598-1', '5440976-2', '5440976-1'}
{'5583761-1'}
False
{'7102455-1', '7883589-1', '7456631-1', '8325827-1'}
{'3963201-1', '4015694-1'}
False
set()
{'8345623-1', '8345623-3', '6421908-1', '7583527-1', '6260389-2', '3341745-1', '7647562-1', '6991144-1', '8345623-2', '6919528-1', '4772568-1', '8345623-4', '6097382-1', '7289613-1', '5824516-1', '6260389-1', '5998940-1', '4983003-1'}
False
{'7336829-3', '4387990-1', '7336829-1', '7336829-2', '8556118-1'}
{'5189705-1', '6235647-1', '5554405-1', '4387990-1'}
False
{'5583761-1'}
{'8293598-

- Similarity references are always reciprocal: if patient A lists patient B as similar, B also lists A.
- Two patients that reference each other do not necessarily share the same set of other similar patients.

## Remove similar patients

### 1) naive approach
- Remove every patient that has at least one similar patient from `df`, keeping only patients with none.

In [193]:
# get patients with no similar patients
def get_unique_patients(_df):
    """Return the rows of ``_df`` whose ``similar_patients`` entry is empty.

    The entry is parsed rather than compared by string length, so ``'{}'``,
    ``'{ }'`` and an already-parsed empty dict are all treated as "no similar
    patients", and a parsed dict with two entries is not mistaken for one.

    Args:
        _df: DataFrame with a ``similar_patients`` column holding either the
            string form of a dict or a dict itself.

    Returns:
        A new DataFrame (original index preserved) containing only the
        patients without similar patients. ``_df`` is not modified.
    """
    def _has_no_similar_patients(value):
        parsed = ast.literal_eval(value) if isinstance(value, str) else value
        return len(parsed) == 0

    # `astype(bool)` keeps the mask boolean even for an empty frame, where
    # `apply` would otherwise return an object-dtype Series.
    mask = _df['similar_patients'].apply(_has_no_similar_patients).astype(bool)

    # Return an independent copy so callers can modify the result freely.
    return _df[mask].copy()

# Every remaining row has an empty `similar_patients` entry, so the column
# carries no information and is dropped. The chained `.drop` returns a new
# frame rather than mutating in place, which keeps the cell idempotent on re-run.
no_similar_patients = get_unique_patients(df).drop(columns=['similar_patients'])
no_similar_patients

Unnamed: 0,index,patient_uid,PMID,file_path,title,patient,age,gender,relevant_articles
13,13,8674405-1,34956746,comm/PMC008xxxxxx/PMC8674405.xml,Goserelin Ovarian Ablation Failure in Premenop...,A 36-year-old G4P2 premenopausal woman with a ...,"[[36.0, 'year']]",F,"{'29510273': 1, '10953134': 1, '30258937': 1, ..."
14,14,8674458-1,34956749,comm/PMC008xxxxxx/PMC8674458.xml,Treatment of Inferior Vena Cava Thrombosis by ...,The patient was a 62-year-old male with a past...,"[[62.0, 'year']]",M,"{'29396156': 1, '185006': 1, '15696057': 1, '2..."
15,15,8674685-1,34956752,comm/PMC008xxxxxx/PMC8674685.xml,Unusual Cutaneous Location of Langheransian Hi...,"We report a case of a 45-year-old woman, a non...","[[45.0, 'year']]",F,"{'30281871': 1, '9611381': 1, '26966089': 1, '..."
19,19,8675574-1,34956758,comm/PMC008xxxxxx/PMC8675574.xml,An Approach for Diagnosing and Treating Neuros...,A 71-year-old African American female with a m...,"[[71.0, 'year']]",F,"{'31577877': 1, '15477572': 1, '25387188': 1, ..."
20,20,8675577-1,34956771,comm/PMC008xxxxxx/PMC8675577.xml,An Extremely Rare Case of Metastatic Merkel Ca...,Our patient is a 78-year-old male with a past ...,"[[78.0, 'year']]",M,"{'25765179': 1, '32324435': 1, '16896047': 1, ..."
...,...,...,...,...,...,...,...,...,...
9973,9980,6623996-1,31312556,comm/PMC006xxxxxx/PMC6623996.xml,"Prostate Cancer, Gender Identity, and Testoste...","A 58-year-old male with KS, diagnosed at age 1...","[[58.0, 'year']]",M,"{'17183557': 1, '9645824': 1, '12050481': 1, '..."
9976,9983,6624119-1,31328075,comm/PMC006xxxxxx/PMC6624119.xml,Transesophageal Echocardiography During Cytore...,"The patient is a 69-year-old, 55 kg, 153 cm fe...","[[69.0, 'year']]",F,"{'30747965': 1, '24119169': 1, '20553586': 1, ..."
9977,9984,6624151-1,31312570,comm/PMC006xxxxxx/PMC6624151.xml,Submassive Pulmonary Embolism: A Re-evaluation...,A 55-year-old man with a history of type II di...,"[[55.0, 'year']]",M,"{'25433511': 1, '29248101': 1, '28460762': 1, ..."
9986,9993,6624460-1,31245979,comm/PMC006xxxxxx/PMC6624460.xml,A Rare Complication of Oropharyngeal Tularemia...,A 33-year-old man presented to our clinic with...,"[[33.0, 'year']]",M,"{'10879600': 1, '23836859': 1, '3892222': 1, '..."


### 2) remove fully overlapping similar patients
- Remove patients that reference each other and whose sets of similar patients overlap completely.

#### graph approach
- Since similarity forms a graph (patients are nodes, similarity references are edges), find all maximal cliques and keep only one patient from each clique, removing the rest.

In [194]:
# graph approach to solving the problem
import networkx as nx
import matplotlib.pyplot as plt
from collections import defaultdict
# Work on the first 2000 rows only.
copy_of_df = df.copy().iloc[:2000]

# Adjacency lists: each similarity reference is recorded in both directions,
# so a referenced patient becomes a node even when it is not itself a row of
# the slice.
graph = defaultdict(list)
uid_and_similar = zip(copy_of_df['patient_uid'], copy_of_df['similar_patients'])
for patient_id, raw_similar in uid_and_similar:
    for neighbour_id in ast.literal_eval(raw_similar):
        graph[patient_id].append(neighbour_id)
        graph[neighbour_id].append(patient_id)

In [195]:
# `graph` maps every node to a list of its neighbours. The Graph constructor
# turns each key into a node and each (node, neighbour) pair into an edge, so
# no separate edge-adding loop is needed afterwards.
G = nx.Graph(graph)

# Each maximal clique is a list of patient uids that are all pairwise connected.
maximal_cliques = list(nx.find_cliques(G))

# draw the graph (only for small graphs, where the plot and clique list stay readable)
if len(G) < 100:
    print(maximal_cliques)
    nx.draw(G, with_labels=True)
    plt.show()

In [196]:
# remove all maximal cliques except one entry of each such clique
def remove_maximal_cliques(_df, maximal_cliques):
    """Drop all but one patient of every maximal clique from ``_df``.

    Cliques are processed in the given order. For each clique only the members
    still present in the frame are considered, so a clique member that is not
    a row of ``_df`` (or was already dropped by an earlier clique) is never
    chosen as the one to keep. A member already kept for an earlier clique is
    preferred as the representative, so overlapping cliques do not remove each
    other's survivors unnecessarily.

    Args:
        _df: DataFrame with a ``patient_uid`` column.
        maximal_cliques: iterable of cliques, each a sequence of patient uids.

    Returns:
        A new DataFrame (original index preserved) in which no two remaining
        patients belong to the same clique. ``_df`` is not modified.
    """
    remaining = set(_df['patient_uid'])
    representatives = set()

    for clique in maximal_cliques:
        # Members that are actually in the frame and not dropped yet.
        present = [patient for patient in clique if patient in remaining]
        if not present:
            continue

        # Prefer a patient already kept for another clique; otherwise keep the
        # first member that is still available.
        keeper = next((p for p in present if p in representatives), present[0])
        representatives.add(keeper)

        for patient in present:
            if patient != keeper:
                remaining.discard(patient)
                representatives.discard(patient)

    # One vectorised filter instead of re-filtering the frame per removed patient.
    return _df[_df['patient_uid'].isin(remaining)].copy()

In [197]:
# Apply the clique-based reduction to the 2000-row slice; as the last expression of the cell, the reduced frame is displayed.
remove_maximal_cliques(copy_of_df, maximal_cliques)

Unnamed: 0,index,patient_uid,PMID,file_path,title,patient,age,gender,relevant_articles,similar_patients
0,0,7665777-1,33492400,comm/PMC007xxxxxx/PMC7665777.xml,Early Physical Therapist Interventions for Pat...,This 60-year-old male was hospitalized due to ...,"[[60.0, 'year']]",M,"{'32320506': 1, '32293716': 1, '23219649': 1, ...","{'7665777-2': 2, '7665777-3': 2, '7665777-4': ..."
11,11,8674153-1,34956745,comm/PMC008xxxxxx/PMC8674153.xml,Deranged Liver Function Tests and Liver Insult...,A 45-year-old female was brought in by ambulan...,"[[45.0, 'year']]",F,"{'18703853': 1, '22644309': 1, '9510397': 1, '...","{'8674153-2': 2, '4208431-1': 1}"
13,13,8674405-1,34956746,comm/PMC008xxxxxx/PMC8674405.xml,Goserelin Ovarian Ablation Failure in Premenop...,A 36-year-old G4P2 premenopausal woman with a ...,"[[36.0, 'year']]",F,"{'29510273': 1, '10953134': 1, '30258937': 1, ...",{}
14,14,8674458-1,34956749,comm/PMC008xxxxxx/PMC8674458.xml,Treatment of Inferior Vena Cava Thrombosis by ...,The patient was a 62-year-old male with a past...,"[[62.0, 'year']]",M,"{'29396156': 1, '185006': 1, '15696057': 1, '2...",{}
15,15,8674685-1,34956752,comm/PMC008xxxxxx/PMC8674685.xml,Unusual Cutaneous Location of Langheransian Hi...,"We report a case of a 45-year-old woman, a non...","[[45.0, 'year']]",F,"{'30281871': 1, '9611381': 1, '26966089': 1, '...",{}
...,...,...,...,...,...,...,...,...,...,...
1991,1991,6098999-1,30147873,comm/PMC006xxxxxx/PMC6098999.xml,Granulomatous fasciitis followed by morphea pr...,A 24-year-old Caucasian man presented with a s...,"[[24.0, 'year']]",M,"{'7053738': 1, '11402008': 1, '8902092': 1, '3...",{}
1993,1993,6099002-1,30147903,comm/PMC006xxxxxx/PMC6099002.xml,Short-term use of oral amiodarone causing tors...,"A 44-year-old man with a history of HTN, DM ty...","[[44.0, 'year']]",M,"{'10871966': 1, '14594906': 1, '10924311': 1, ...",{}
1996,1996,6099009-1,30147930,comm/PMC006xxxxxx/PMC6099009.xml,Uncommon cause of liver abscess,This 35-year-old male patient had accidentally...,"[[35.0, 'year']]",M,"{'31934467': 2, '27717979': 1, '25662871': 1, ...",{'6942747-1': 1}
1998,1998,6099020-1,30147891,comm/PMC006xxxxxx/PMC6099020.xml,Aortic root ectasia as a phenotypic feature of...,"The patient is 84-year-old Caucasian woman, he...","[[84.0, 'year']]",F,"{'21099168': 1, '20738020': 1, '16467661': 1, ...",{}
