# Data Cleaning
### Data from "Estimation of the warfarin dose with clinical and pharmacogenetic data" (PMID: 19228618): https://api.pharmgkb.org/v1/download/submission/553247439
---

## Load Data

In [1]:
from tqdm import tqdm
import pandas as pd
import warnings

warnings.simplefilter(action='ignore', category=pd.errors.SettingWithCopyWarning)

In [2]:
# Read the "Subject Data" sheet of the dataset workbook into the raw frame
# that the rest of the notebook works from.
raw_workbook = '../PS206767-553247439.xlsx'
org_df = pd.read_excel(
    io=raw_workbook,
    sheet_name='Subject Data',
    engine='openpyxl',
)
org_df

Unnamed: 0,PharmGKB Subject ID,PharmGKB Sample ID,Project Site,Gender,Race (Reported),Race (OMB),Ethnicity (Reported),Ethnicity (OMB),Age,Height (cm),...,VKORC1 QC genotype: -4451 C>A (861); Chr16:31018002; rs17880887; A/C,CYP2C9 consensus,VKORC1 -1639 consensus,VKORC1 497 consensus,VKORC1 1173 consensus,VKORC1 1542 consensus,VKORC1 3730 consensus,VKORC1 2255 consensus,VKORC1 -4451 consensus,Comments regarding Project Site Dataset
0,PA135312261,PA135312629,1,male,White,White,not Hispanic or Latino,not Hispanic or Latino,60 - 69,193.040,...,,*1/*1,A/G,G/T,,C/G,A/G,,,Project 1:
1,PA135312262,PA135312630,1,female,White,White,not Hispanic or Latino,not Hispanic or Latino,50 - 59,176.530,...,C/C,*1/*1,A/A,G/T,T/T,C/C,G/G,T/T,C/C,Warfarin Therapeutic Dose Definition:
2,PA135312263,PA135312631,1,female,White,White,not Hispanic or Latino,not Hispanic or Latino,40 - 49,162.560,...,,*1/*1,G/G,T/T,,G/G,A/G,,,The dose (unchanged for 6 days) that yielded a...
3,PA135312264,PA135312632,1,male,White,White,not Hispanic or Latino,not Hispanic or Latino,60 - 69,182.245,...,,*1/*1,A/G,G/T,,C/G,G/G,,,
4,PA135312265,PA135312633,1,male,White,White,not Hispanic or Latino,not Hispanic or Latino,50 - 59,167.640,...,,*1/*3,A/G,T/T,,C/G,A/G,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5695,PA152407681,PA152407969,21,male,White,White,not Hispanic or Latino,not Hispanic or Latino,20 - 29,185.420,...,,*1/*1,,,,,,,,
5696,PA152407682,PA152407970,21,female,White,White,not Hispanic or Latino,not Hispanic or Latino,70 - 79,160.020,...,,*1/*3,,,,,,,,
5697,PA152407683,PA152407971,21,male,White,White,not Hispanic or Latino,not Hispanic or Latino,60 - 69,187.960,...,,*1/*1,,,,,,,,
5698,PA152407684,PA152407972,21,male,White,White,not Hispanic or Latino,not Hispanic or Latino,60 - 69,177.800,...,,,,,,,,,,


---
## Inspection

In [3]:
# Summarise the raw frame: column names, non-null counts and dtypes.
org_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5700 entries, 0 to 5699
Data columns (total 68 columns):
 #   Column                                                                Non-Null Count  Dtype  
---  ------                                                                --------------  -----  
 0   PharmGKB Subject ID                                                   5700 non-null   object 
 1   PharmGKB Sample ID                                                    5700 non-null   object 
 2   Project Site                                                          5700 non-null   int64  
 3   Gender                                                                5696 non-null   object 
 4   Race (Reported)                                                       5194 non-null   object 
 5   Race (OMB)                                                            5700 non-null   object 
 6   Ethnicity (Reported)                                                  4461 non-null   object 
 7

In [4]:
# Columns kept for modelling, grouped by role and concatenated into cols_keeps
# in the same order as the groups appear below.

genetic_markers = [
    # Derived consensus between the original and the combined QC genotypes:
    #   - Original equals QC       -> use the common value
    #   - Original equals NA       -> use QC
    #   - QC equals NA             -> use Original
    #   - Original differs from QC -> NA, unless Original contains an allele
    #                                 not typed in QC (then use Original)
    #   - subjects not included in QC genotyping retain the Original value
    "CYP2C9 consensus",
    # Derived consensus between the original and QC genotypes:
    #   - Original equals QC       -> use the common value
    #   - Original equals NA       -> use QC
    #   - QC equals NA             -> use Original
    #   - Original differs from QC -> NA
    #   - subjects not included in QC genotyping retain the Original value
    "VKORC1 -1639 consensus",
]

demographics_and_body_metrics = [
    "Gender",  # Male, Female or not known = -99
    "Age",
    "Height (cm)",
    "Weight (kg)",
]

response_and_clinical = [
    # International Normalized Ratio on the reported therapeutic warfarin dose
    "INR on Reported Therapeutic Dose of Warfarin",
    "Current Smoker",  # yes = 1, not present = 0 or not known = NA
    "Diabetes",  # yes = 1, not present = 0 or not known = NA
]

# All drug-interaction flags: yes = 1, not present = 0 or not known = NA
drug_interactions = [
    "Amiodarone (Cordarone)",
    "Phenytoin (Dilantin)",
    "Rifampin or Rifampicin",
    "Sulfonamide Antibiotics",
    # Includes ketoconazole, fluconazole, itraconazole, metronidazole, etc.
    # Other drugs ending in "azole" (e.g. omeprazole) are not to be included.
    # NOTE(review): the original note lists metronidazole both as included and
    # as excluded — confirm against the dataset's column description.
    "Anti-fungal Azoles",
]

target_variable = [
    "Therapeutic Dose of Warfarin",  # Dose given in milligrams/week
]

cols_keeps = (
    genetic_markers
    + demographics_and_body_metrics
    + response_and_clinical
    + drug_interactions
    + target_variable
)

In [5]:
# Keep only the modelling columns listed in cols_keeps.
# .copy() makes dirty_df an explicitly independent frame, so the cleaning steps
# that follow never operate on something pandas still treats as derived from
# org_df (the situation SettingWithCopyWarning exists to flag).
dirty_df = org_df[cols_keeps].copy()
dirty_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5700 entries, 0 to 5699
Data columns (total 15 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   CYP2C9 consensus                              5556 non-null   object 
 1   VKORC1 -1639 consensus                        4201 non-null   object 
 2   Gender                                        5696 non-null   object 
 3   Age                                           5658 non-null   object 
 4   Height (cm)                                   4554 non-null   float64
 5   Weight (kg)                                   5413 non-null   float64
 6   INR on Reported Therapeutic Dose of Warfarin  4968 non-null   float64
 7   Current Smoker                                3220 non-null   float64
 8   Diabetes                                      3283 non-null   float64
 9   Amiodarone (Cordarone)                        4182 non-null   f

---
## Missing Data

In [6]:
# Keep only fully observed subjects: a row is dropped if ANY of the selected
# columns is missing (not just the first few), since no imputation is applied
# afterwards. Reassigning instead of inplace=True avoids mutating a frame that
# was derived from org_df.
dirty_df = dirty_df.dropna()
dirty_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1487 entries, 1671 to 5686
Data columns (total 15 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   CYP2C9 consensus                              1487 non-null   object 
 1   VKORC1 -1639 consensus                        1487 non-null   object 
 2   Gender                                        1487 non-null   object 
 3   Age                                           1487 non-null   object 
 4   Height (cm)                                   1487 non-null   float64
 5   Weight (kg)                                   1487 non-null   float64
 6   INR on Reported Therapeutic Dose of Warfarin  1487 non-null   float64
 7   Current Smoker                                1487 non-null   float64
 8   Diabetes                                      1487 non-null   float64
 9   Amiodarone (Cordarone)                        1487 non-null   flo

In [7]:
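# Disabled alternative: for all diseases, assume an unknown (missing) value is 0 (False)
# instead of dropping the row. Left commented out: the dropna() above already removes
# every row with a missing value, so enabling this here would have no effect.
# dirty_df.fillna(0, inplace=True)
# dirty_df.info()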

In [8]:
# Renumber the rows 0..n-1 after the row filtering; drop=True discards the old
# index labels instead of keeping them as a column. Reassignment is used rather
# than inplace=True so the cell does not mutate the frame in place.
dirty_df = dirty_df.reset_index(drop=True)
dirty_df

Unnamed: 0,CYP2C9 consensus,VKORC1 -1639 consensus,Gender,Age,Height (cm),Weight (kg),INR on Reported Therapeutic Dose of Warfarin,Current Smoker,Diabetes,Amiodarone (Cordarone),Phenytoin (Dilantin),Rifampin or Rifampicin,Sulfonamide Antibiotics,Anti-fungal Azoles,Therapeutic Dose of Warfarin
0,*1/*1,A/G,male,80 - 89,173.482,75.50,2.13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,35.000000
1,*1/*3,A/G,male,80 - 89,166.116,70.00,2.33,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.500000
2,*1/*2,A/G,male,70 - 79,176.022,88.60,1.90,0.0,1.0,0.0,0.0,0.0,0.0,0.0,20.000000
3,*1/*2,A/G,male,70 - 79,176.022,92.00,2.83,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30.000000
4,*1/*1,A/G,male,50 - 59,178.562,114.00,2.33,0.0,0.0,0.0,0.0,0.0,0.0,0.0,42.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1482,*1/*1,A/G,male,60 - 69,180.340,77.27,2.30,1.0,0.0,0.0,0.0,0.0,0.0,0.0,39.974286
1483,*1/*2,A/G,male,70 - 79,180.340,84.55,2.30,0.0,1.0,0.0,0.0,0.0,0.0,0.0,28.000000
1484,*1/*2,G/G,male,50 - 59,185.420,90.91,2.90,1.0,0.0,0.0,0.0,0.0,0.0,0.0,49.980000
1485,*1/*1,G/G,female,70 - 79,157.480,86.36,2.50,0.0,0.0,0.0,0.0,0.0,0.0,0.0,42.490000


---
## One-hot Encoder

In [9]:
# Expand the categorical (object-typed) columns into indicator columns.
# With drop_first=True the first level of each encoded column is omitted and
# acts as the implicit baseline.
df_encoded = dirty_df.pipe(pd.get_dummies, drop_first=True)
df_encoded

Unnamed: 0,Height (cm),Weight (kg),INR on Reported Therapeutic Dose of Warfarin,Current Smoker,Diabetes,Amiodarone (Cordarone),Phenytoin (Dilantin),Rifampin or Rifampicin,Sulfonamide Antibiotics,Anti-fungal Azoles,...,VKORC1 -1639 consensus_G/G,Gender_male,Age_20 - 29,Age_30 - 39,Age_40 - 49,Age_50 - 59,Age_60 - 69,Age_70 - 79,Age_80 - 89,Age_90+
0,173.482,75.50,2.13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,True,False,False,False,False,False,False,True,False
1,166.116,70.00,2.33,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,True,False,False,False,False,False,False,True,False
2,176.022,88.60,1.90,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,False,True,False,False,False,False,False,True,False,False
3,176.022,92.00,2.83,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,True,False,False,False,False,False,True,False,False
4,178.562,114.00,2.33,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,True,False,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1482,180.340,77.27,2.30,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,True,False,False,False,False,True,False,False,False
1483,180.340,84.55,2.30,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,False,True,False,False,False,False,False,True,False,False
1484,185.420,90.91,2.90,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,True,True,False,False,False,True,False,False,False,False
1485,157.480,86.36,2.50,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,True,False,False,False,False,False,False,True,False,False


---
## Saving to CSV

In [10]:
# Take an independent snapshot of the encoded frame
# (DataFrame.copy performs a deep copy by default).
clean_df = df_encoded.copy()
clean_df

Unnamed: 0,Height (cm),Weight (kg),INR on Reported Therapeutic Dose of Warfarin,Current Smoker,Diabetes,Amiodarone (Cordarone),Phenytoin (Dilantin),Rifampin or Rifampicin,Sulfonamide Antibiotics,Anti-fungal Azoles,...,VKORC1 -1639 consensus_G/G,Gender_male,Age_20 - 29,Age_30 - 39,Age_40 - 49,Age_50 - 59,Age_60 - 69,Age_70 - 79,Age_80 - 89,Age_90+
0,173.482,75.50,2.13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,True,False,False,False,False,False,False,True,False
1,166.116,70.00,2.33,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,True,False,False,False,False,False,False,True,False
2,176.022,88.60,1.90,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,False,True,False,False,False,False,False,True,False,False
3,176.022,92.00,2.83,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,True,False,False,False,False,False,True,False,False
4,178.562,114.00,2.33,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,True,False,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1482,180.340,77.27,2.30,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,True,False,False,False,False,True,False,False,False
1483,180.340,84.55,2.30,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,False,True,False,False,False,False,False,True,False,False
1484,185.420,90.91,2.90,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,True,True,False,False,False,True,False,False,False,False
1485,157.480,86.36,2.50,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,True,False,False,False,False,False,False,True,False,False


In [12]:
# Final check before export: column names, non-null counts and dtypes of the cleaned frame.
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1487 entries, 0 to 1486
Data columns (total 30 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   Height (cm)                                   1487 non-null   float64
 1   Weight (kg)                                   1487 non-null   float64
 2   INR on Reported Therapeutic Dose of Warfarin  1487 non-null   float64
 3   Current Smoker                                1487 non-null   float64
 4   Diabetes                                      1487 non-null   float64
 5   Amiodarone (Cordarone)                        1487 non-null   float64
 6   Phenytoin (Dilantin)                          1487 non-null   float64
 7   Rifampin or Rifampicin                        1487 non-null   float64
 8   Sulfonamide Antibiotics                       1487 non-null   float64
 9   Anti-fungal Azoles                            1487 non-null   f

In [13]:
# Write the cleaned table to the parent directory, omitting the row index.
cleaned_csv_path = "../cleaned_data.csv"
clean_df.to_csv(cleaned_csv_path, index=False)