# One Hot Encoding
***

#### Creating dummy variables
***

In [3]:
import numpy as np
import pandas as pd

In [4]:
# read the csv file into a pandas dataframe
data = pd.read_csv('diabetes_cleaned.csv')
# look into the first 10 rows of the dataframe
data.head(10)

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),88.0,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),88.0,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),88.0,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),88.0,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),88.0,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
5,35754,82637451,Caucasian,Male,[50-60),88.0,2,1,2,3,...,No,Steady,No,No,No,No,No,No,Yes,>30
6,55842,84259809,Caucasian,Male,[60-70),88.0,3,1,2,4,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
7,63768,114882984,Caucasian,Male,[70-80),88.0,1,1,7,5,...,No,No,No,No,No,No,No,No,Yes,>30
8,12522,48330783,Caucasian,Female,[80-90),88.0,2,1,4,13,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
9,15738,63555939,Caucasian,Female,[90-100),88.0,3,3,4,12,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [5]:
# changing the ranges to integer using the median of each range
age_dict = {'[50-60)': 55, '[80-90)': 85, '[40-50)': 45, '[70-80)': 75, '[60-70)': 65, '[30-40)': 35, '[0-10)': 5, '[90-100)': 95, '[10-20)':15, '[20-30)':25}

# Map age categories to integers
data['age'] = data['age'].replace(age_dict)

# Check weight values after replacing ranges with median value
print(print("Unique values after mapping:", data['age'].unique()))

Unique values after mapping: [ 5 15 25 35 45 55 65 75 85 95]
None


In [6]:
print(data.columns.tolist())

['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id', 'time_in_hospital', 'payer_code', 'medical_specialty', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1', 'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton', 'insulin', 'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted']


In [7]:
# looking into readmitted
print(data['readmitted'].value_counts())

readmitted
NO     54864
>30    35545
<30    11357
Name: count, dtype: int64


In [8]:
# replace NO with 0 and >30; <30 with 1 for patient readmission status
data['readmitted'] = data['readmitted'].replace('NO', 0)

data['readmitted'] = data['readmitted'].replace('<30', 1)

data['readmitted'] = data['readmitted'].replace('>30', 1)

In [9]:
# looking into readmitted
# print(data['readmitted'].value_counts())

In [10]:
# looking into diabetesMed
print(data['diabetesMed'].value_counts())

diabetesMed
Yes    78363
No     23403
Name: count, dtype: int64


In [11]:
data['diabetesMed'] = data['diabetesMed'].replace('Yes', 1)
data['diabetesMed'] = data['diabetesMed'].replace('No', 0)

In [12]:
# looking into diabetesMed
# print(data['diabetesMed'].value_counts())

In [13]:
# looking into change
print(data['change'].value_counts())

change
No    54755
Ch    47011
Name: count, dtype: int64


In [14]:
data['change'] = data['change'].replace('No', 0)
data['change'] = data['change'].replace('Ch', 1)

In [15]:
# looking into change
# print(data['change'].value_counts())

In [16]:
# The following columns can be dropped because they contain a single value for each record
# which does not contribute meaningfully to the data
data = data.drop(columns=['examide', 'acetohexamide', 'citoglipton', 'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone'])

In [17]:
# print(data.dtypes)

In [18]:
# looking into diagnoses
# print(data['diag_1'].unique())

In [19]:
# ICD-9 to category mapping
diag_category = {
    '250': 'Diabetes',
    '401': 'Hypertension',
    '414': 'Cardiovascular',
    '428': 'Heart Failure',
    '786': 'Respiratory',
}
# Function to convert ICD-9 code to integer and map to category
def convert_and_categorize(diag):
    # Remove non-numeric characters
    numeric_part = ''.join(filter(str.isdigit, diag))
    if numeric_part:
        # Convert to integer
        diag_int = int(numeric_part)
        # Map to category
        category = diag_category.get(numeric_part, 'Other')
        return diag_int, category
    else:
        return None, 'Unknown'

In [20]:
# Apply the conversion and categorization to the 'diag_1' column
data[['diag_1_int', 'diag_1_category']] = data['diag_1'].apply(lambda x: pd.Series(convert_and_categorize(x)))

In [21]:
data.rename(columns={'diag_1_category': 'primary_diag'}, inplace=True)
data.rename(columns={'diag_1_int': 'primary_diag_int'}, inplace=True)

In [22]:
# Apply the conversion and categorization to the 'diag_2' column
data[['diag_2_int', 'diag_2_category']] = data['diag_2'].apply(lambda x: pd.Series(convert_and_categorize(x)))

In [23]:
data.rename(columns={'diag_2_category': 'principal_diag'}, inplace=True)
data.rename(columns={'diag_2_int': 'principal_diag_int'}, inplace=True)

In [24]:
# Apply the conversion and categorization to the 'diag_3' column
data[['diag_3_int', 'diag_3_category']] = data['diag_3'].apply(lambda x: pd.Series(convert_and_categorize(x)))

In [25]:
data.rename(columns={'diag_3_category': 'secondary_diag'}, inplace=True)
data.rename(columns={'diag_3_int': 'secondary_diag_int'}, inplace=True)

In [26]:
data.drop(columns = ['diag_1', 'diag_2', 'diag_3'], inplace = True)

In [27]:
data.drop(columns = ['primary_diag', 'principal_diag', 'secondary_diag'], inplace = True)

In [28]:
# print(data.columns.tolist())

In [29]:
print(data.dtypes)

encounter_id                  int64
patient_nbr                   int64
race                         object
gender                       object
age                           int64
weight                      float64
admission_type_id             int64
discharge_disposition_id      int64
admission_source_id           int64
time_in_hospital              int64
payer_code                   object
medical_specialty            object
num_lab_procedures            int64
num_procedures                int64
num_medications               int64
number_outpatient             int64
number_emergency              int64
number_inpatient              int64
number_diagnoses              int64
max_glu_serum               float64
A1Cresult                   float64
metformin                    object
repaglinide                  object
nateglinide                  object
chlorpropamide               object
glimepiride                  object
glipizide                    object
glyburide                   

In [30]:
def onehot_encode(data, columns):
    for column in columns:
        # Generate dummy variables
        dummies = pd.get_dummies(data[column], prefix=column)
        # Concatenate dummies with the original DataFrame
        data = pd.concat([data, dummies], axis=1)
        # Drop the original column
        data.drop(column, axis=1, inplace=True)
    return data

In [31]:
# Get columns with dtype 'object'
object_columns = data.select_dtypes(include='object').columns
object_columns

Index(['race', 'gender', 'payer_code', 'medical_specialty', 'metformin',
       'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
       'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone',
       'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide',
       'insulin', 'glyburide-metformin', 'glipizide-metformin'],
      dtype='object')

In [32]:
data = onehot_encode(data, object_columns)

In [33]:
print(data.columns.tolist())

['encounter_id', 'patient_nbr', 'age', 'weight', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id', 'time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses', 'max_glu_serum', 'A1Cresult', 'change', 'diabetesMed', 'readmitted', 'primary_diag_int', 'principal_diag_int', 'secondary_diag_int', 'race_AfricanAmerican', 'race_Asian', 'race_Caucasian', 'race_Hispanic', 'race_Other', 'gender_Female', 'gender_Male', 'gender_Unknown/Invalid', 'payer_code_BC', 'payer_code_CH', 'payer_code_CM', 'payer_code_CP', 'payer_code_DM', 'payer_code_FR', 'payer_code_HM', 'payer_code_MC', 'payer_code_MD', 'payer_code_MP', 'payer_code_OG', 'payer_code_OT', 'payer_code_PO', 'payer_code_SI', 'payer_code_SP', 'payer_code_UN', 'payer_code_WC', 'medical_specialty_AllergyandImmunology', 'medical_specialty_Anesthesiology', 'medical_specialty_Anesthesiology-Pediatric', 'medical_specialty_Cardiolog

In [34]:
data

Unnamed: 0,encounter_id,patient_nbr,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,...,insulin_Down,insulin_No,insulin_Steady,insulin_Up,glyburide-metformin_Down,glyburide-metformin_No,glyburide-metformin_Steady,glyburide-metformin_Up,glipizide-metformin_No,glipizide-metformin_Steady
0,2278392,8222157,5,88.0,6,25,1,1,41,0,...,False,True,False,False,False,True,False,False,True,False
1,149190,55629189,15,88.0,1,1,7,3,59,0,...,False,False,False,True,False,True,False,False,True,False
2,64410,86047875,25,88.0,1,1,7,2,11,5,...,False,True,False,False,False,True,False,False,True,False
3,500364,82442376,35,88.0,1,1,7,2,44,1,...,False,False,False,True,False,True,False,False,True,False
4,16680,42519267,45,88.0,1,1,7,1,51,0,...,False,False,True,False,False,True,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,443847548,100162476,75,88.0,1,3,7,3,51,0,...,True,False,False,False,False,True,False,False,True,False
101762,443847782,74694222,85,88.0,1,4,5,5,33,3,...,False,False,True,False,False,True,False,False,True,False
101763,443854148,41088789,75,88.0,1,1,7,1,53,0,...,True,False,False,False,False,True,False,False,True,False
101764,443857166,31693671,85,88.0,2,3,7,10,45,2,...,False,False,False,True,False,True,False,False,True,False


In [35]:
# Replace True/False with 1/0
data = data.astype(int)

In [36]:
data

Unnamed: 0,encounter_id,patient_nbr,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,...,insulin_Down,insulin_No,insulin_Steady,insulin_Up,glyburide-metformin_Down,glyburide-metformin_No,glyburide-metformin_Steady,glyburide-metformin_Up,glipizide-metformin_No,glipizide-metformin_Steady
0,2278392,8222157,5,88,6,25,1,1,41,0,...,0,1,0,0,0,1,0,0,1,0
1,149190,55629189,15,88,1,1,7,3,59,0,...,0,0,0,1,0,1,0,0,1,0
2,64410,86047875,25,88,1,1,7,2,11,5,...,0,1,0,0,0,1,0,0,1,0
3,500364,82442376,35,88,1,1,7,2,44,1,...,0,0,0,1,0,1,0,0,1,0
4,16680,42519267,45,88,1,1,7,1,51,0,...,0,0,1,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,443847548,100162476,75,88,1,3,7,3,51,0,...,1,0,0,0,0,1,0,0,1,0
101762,443847782,74694222,85,88,1,4,5,5,33,3,...,0,0,1,0,0,1,0,0,1,0
101763,443854148,41088789,75,88,1,1,7,1,53,0,...,1,0,0,0,0,1,0,0,1,0
101764,443857166,31693671,85,88,2,3,7,10,45,2,...,0,0,0,1,0,1,0,0,1,0


In [37]:
data.dtypes

encounter_id                  int32
patient_nbr                   int32
age                           int32
weight                        int32
admission_type_id             int32
                              ...  
glyburide-metformin_No        int32
glyburide-metformin_Steady    int32
glyburide-metformin_Up        int32
glipizide-metformin_No        int32
glipizide-metformin_Steady    int32
Length: 181, dtype: object

In [38]:
print(data.isna().sum())

encounter_id                  0
patient_nbr                   0
age                           0
weight                        0
admission_type_id             0
                             ..
glyburide-metformin_No        0
glyburide-metformin_Steady    0
glyburide-metformin_Up        0
glipizide-metformin_No        0
glipizide-metformin_Steady    0
Length: 181, dtype: int64


In [39]:
data.to_csv('diabetes_encoded.csv', index = False)