In [1]:
# pandas is imported explicitly: `pd` is used below, and relying on the star
# import to leak it into the namespace hides the dependency.
import pandas as pd

# NOTE(review): the star import is kept because the cells below call its
# helpers by bare name (data_load, convert_medications, get_info_matrix, ...).
# Consider replacing it with an explicit import list.
from analysis_tools.preprocess import *

# Do not truncate long cell contents (e.g. the nested info_mat lists) when
# DataFrames are displayed.
pd.set_option('display.max_colwidth', None)

In [2]:
# Path (relative to the notebook's working directory) of the RData file with
# the medication records for the included patients.
file_path = "data/CS6140_meds_for_included_patients.RData"

# Load the file through the project helper; the result is used below as a
# DataFrame with (at least) a "cls.short" medication-class column.
raw_patient_df = data_load(file_path=file_path)

# Data Pre-Processing

## Categorizing medications

In this step, I will categorize the medications based on their similarity (notes about how similarity is defined will be added in the future).

In [3]:
# Distinct medication class labels present before any recoding.
raw_patient_df["cls.short"].unique()

array(['Insulin', 'MET', 'SGLT2', 'f_INS', 'DPP_4', 'GLP_1', 'SUL', 'TZD',
       'Other'], dtype=object)

In [4]:
# Expert-guided recoding of the "cls.short" class labels (old label -> new label):
#   f_INS    -> Insulin
#   Alpha-GI -> Other
# NOTE(review): "Alpha-GI" does not appear among the labels displayed in the
# previous cell, so this recode may be a no-op on this dataset; confirm the
# spelling against the raw data.
class_recodes = {"f_INS": "Insulin", "Alpha-GI": "Other"}

for old_class, new_class in class_recodes.items():
    raw_patient_df = convert_medications(
        raw_patient_df, "cls.short", old=old_class, new=new_class
    )

In [5]:
# Distinct medication class labels after recoding; "f_INS" should no longer
# be listed.
raw_patient_df["cls.short"].unique()

array(['Insulin', 'MET', 'SGLT2', 'DPP_4', 'GLP_1', 'SUL', 'TZD', 'Other'],
      dtype=object)

## Get Information-Matrix

The information-matrix is the matrix that records which medications a patient received and when they received them. It follows the one-hot encoding idea: the matrix shows a 1 if the patient received a specific medication on a certain day. The matrix has shape $n \times d$, where $n$ is the number of medications and $d$ is the number of bins (dates).

* What is a bin?
    
    "Bin" is the term we use in this analysis for an interval of dates. For example, in this analysis we use half-year bins, each of which holds the information about all the medications the patient received during that half-year.

* How the medications are represented

    All the medications that appear in the data are manually encoded as a one-hot encoded matrix.

In [6]:
# Build the mapping from each medication class to its one-hot vector
# (displayed below as a dict of class label -> list of 0/1).
medications_one_hot_encoding = medication_one_hot_encoding(raw_patient_df)
# Persist the mapping under the given name via the project helper
# (presumably written to disk for reuse by later notebooks; confirm location/format).
save_one_hot_encoded_medications(medications_one_hot_encoding, "medications_one_hot_encoding")
# Display the mapping as the cell output.
medications_one_hot_encoding

{'Insulin': [1, 0, 0, 0, 0, 0, 0, 0],
 'MET': [0, 1, 0, 0, 0, 0, 0, 0],
 'SGLT2': [0, 0, 1, 0, 0, 0, 0, 0],
 'DPP_4': [0, 0, 0, 1, 0, 0, 0, 0],
 'GLP_1': [0, 0, 0, 0, 1, 0, 0, 0],
 'SUL': [0, 0, 0, 0, 0, 1, 0, 0],
 'TZD': [0, 0, 0, 0, 0, 0, 1, 0],
 'Other': [0, 0, 0, 0, 0, 0, 0, 1]}

In [7]:
# Build one information matrix per patient using half-year bins.
# NOTE(review): 2019 and 2023 are presumably the first and last year of the
# binning window; the displayed output has 10 rows per info_mat, which would
# match 5 years x 2 half-years. Confirm against get_info_matrix.
patient_df = get_info_matrix(raw_patient_df, medications_one_hot_encoding, 2019, 2023, "half_year")
# Preview the first 5 rows through the project helper (presumably masks patient
# identifiers before display; confirm).
anonymous_head(patient_df, 5)

Unnamed: 0,patient_id,info_mat,all_medications
0,e599a5ed,"[[2, 0, 0, 0, 0, 0, 0, 0], [4, 4, 10, 0, 0, 0, 0, 0], [0, 1, 5, 0, 0, 0, 0, 0], [0, 1, 5, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0], [0, 2, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0], [0, 6, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0], [0, 2, 0, 0, 0, 0, 0, 0]]","{SGLT2, Insulin, MET}"
1,02de3c3a,"[[0, 2, 0, 0, 0, 0, 0, 0], [0, 5, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0], [0, 3, 0, 0, 0, 0, 0, 0], [0, 2, 0, 0, 0, 0, 0, 0], [0, 2, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0], [0, 2, 0, 0, 0, 0, 0, 0], [0, 2, 0, 0, 0, 0, 0, 0]]",{MET}
2,88c1f41c,"[[35, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0], [0, 2, 0, 0, 0, 0, 0, 0], [0, 2, 0, 0, 0, 0, 0, 0], [0, 2, 0, 8, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 1, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0]]","{DPP_4, Insulin, MET}"
3,8db7c31f,"[[0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 7, 0, 0, 0], [0, 0, 0, 0, 7, 0, 0, 0]]","{GLP_1, MET}"
4,592f64c7,"[[0, 3, 0, 0, 0, 0, 0, 0], [0, 2, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0], [0, 2, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0], [0, 2, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0], [0, 2, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0], [0, 2, 0, 0, 0, 0, 0, 0]]",{MET}


In [8]:
# (rows, columns) of the per-patient information-matrix frame.
patient_df.shape

(10346, 3)

In [9]:
# Path of the RData file with per-patient attributes; the displayed columns
# include patient_id and patient_regional_location.
region = "data/CS6140_included_patients.RData"
region_df = data_load(file_path=region)
# Preview the first 5 rows through the project helper.
anonymous_head(region_df, 5)

Unnamed: 0,patient_id,T2D_onset_date,T2D_onset_half_year,race_ethnicity,patient_regional_location,sex,year_of_birth,generation
0,e599a5ed,2019-06-10,2019-1,White,Midwest,F,1979,Generation X
1,02de3c3a,2019-04-09,2019-1,White,Midwest,M,1972,Generation X
2,88c1f41c,2019-05-08,2019-1,Black,Midwest,F,1971,Generation X
3,8db7c31f,2019-03-22,2019-1,Other Race,Midwest,F,1977,Generation X
4,592f64c7,2019-01-15,2019-1,Black,Midwest,F,1958,Baby Boomer


In [10]:
# Attach each patient's regional location. A left join keeps every row of
# patient_df; patients without a match in region_df get NaN for the region.
patient_df = pd.merge(
    patient_df,
    region_df.loc[:, ["patient_id", "patient_regional_location"]],
    on="patient_id",
    how="left",
)
patient_df.shape

(10346, 4)

In [11]:
# Keep only rows whose region is not "Ex-US". Rows with a missing region
# compare as not-equal and are therefore kept at this step.
patient_df = patient_df.loc[patient_df["patient_regional_location"].ne("Ex-US")]
patient_df.shape

(10346, 4)

In [12]:
# Distinct regional locations remaining after the filter (may include NaN).
patient_df['patient_regional_location'].unique()

array(['Midwest', nan, 'South', 'Northeast', 'West'], dtype=object)

In [13]:
# Drop patients whose regional location is missing.
# Reassign instead of using inplace=True: patient_df is the result of a boolean
# filter at this point, so an in-place mutation can emit SettingWithCopyWarning,
# and reassignment gives the same resulting frame.
patient_df = patient_df.dropna(subset=['patient_regional_location'])
patient_df.shape

(9110, 4)

Let's build two datasets: a random sample of 500 patients and the full set of patients.

I will use the 500-patient dataset to decide on the best clustering method, and then use that method to cluster the full dataset.

In [14]:
# Columns written to CSV for each exported patient dataset.
columns = [
    "patient_id",
    "info_mat",
    "all_medications",
]

In [15]:
# Draw a seeded random sample of 500 patients from the filtered cohort.
p_500_df = get_random_patients(df=patient_df, num_patients=500, random_state=42)
# Export only the selected columns under the name "rnd_500_patients"
# (presumably written as a CSV file; confirm the output directory in the helper).
save_columns_as_csv(df=p_500_df, columns=columns, file_name="rnd_500_patients")

In [16]:
# Build the full-cohort dataset by requesting as many patients as there are
# rows in patient_df. The seed matches the 500-patient sample so that the
# exported file is reproducible across Restart & Run All.
p_full_df = get_random_patients(df=patient_df, num_patients=patient_df.shape[0], random_state=42)
# Export only the selected columns under the name "full_patients".
save_columns_as_csv(df=p_full_df, columns=columns, file_name="full_patients")