In [55]:
import pandas as pd

In [56]:
# Load the dataset from the CSV file, resolved relative to the working directory.
df_copy = pd.read_csv("diabetic_data.csv")

In [57]:
# Work on a separate copy so df_copy keeps the data exactly as it was loaded.
df = df_copy.copy()

In [58]:
# Select the columns that the rest of the notebook cleans and encodes
# (this includes 'readmitted', which is recoded to 0/1 further down).
# The explicit .copy() makes X an independent frame: later cells drop rows
# in place and overwrite columns, which on a bare selection from df is the
# pattern that raises pandas' SettingWithCopyWarning.
X = df[['race', 'gender', 'age','admission_type_id','admission_source_id', 'time_in_hospital', 'medical_specialty',
        'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3','metformin', 'glipizide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'insulin', 'change', 'readmitted']].copy()

Preprocessing

In [59]:
# Frequency of each medical_specialty value, most common first.
X['medical_specialty'].value_counts()

medical_specialty
?                         49949
InternalMedicine          14635
Emergency/Trauma           7565
Family/GeneralPractice     7440
Cardiology                 5352
                          ...  
Proctology                    1
Speech                        1
SportsMedicine                1
Perinatology                  1
Neurophysiology               1
Name: count, Length: 73, dtype: int64

In [60]:
# medical_specialty is '?' for roughly 49% of the rows, so the whole column is removed.
X = X.drop('medical_specialty', axis=1)

In [61]:
# Frequency of each gender value, most common first.
X['gender'].value_counts()

gender
Female             54708
Male               47055
Unknown/Invalid        3
Name: count, dtype: int64

In [62]:
# Collect the index labels of rows whose race is the '?' placeholder ...
rows_to_drop = X.index[X['race'].eq('?')]
# ... and remove those rows from X in place.
X.drop(index=rows_to_drop, inplace=True)

In [63]:
# Collect the index labels of rows whose gender is 'Unknown/Invalid' ...
rows_to_drop = X.index[X['gender'].eq('Unknown/Invalid')]
# ... and remove those rows from X in place.
X.drop(index=rows_to_drop, inplace=True)

In [64]:
# Number of distinct diag_1 values, shown as a one-element shape tuple.
X['diag_1'].unique().shape

(715,)

In [65]:
# Collect the index labels of rows whose diag_1 is the '?' placeholder ...
rows_to_drop = X.index[X['diag_1'].eq('?')]
# ... and remove those rows from X in place.
X.drop(index=rows_to_drop, inplace=True)

In [66]:
# Collect the index labels of rows whose diag_2 is the '?' placeholder ...
rows_to_drop = X.index[X['diag_2'].eq('?')]
# ... and remove those rows from X in place.
X.drop(index=rows_to_drop, inplace=True)

In [67]:
# Collect the index labels of rows whose diag_3 is the '?' placeholder ...
rows_to_drop = X.index[X['diag_3'].eq('?')]
# ... and remove those rows from X in place.
X.drop(index=rows_to_drop, inplace=True)

In [68]:
import pandas as pd

# Define the mapping for ICD9 codes to group names
def map_icd9(code):
    """Map a raw ICD-9 diagnosis code to a coarse diagnosis group name.

    Parameters
    ----------
    code : str or number
        The diagnosis code as stored in the data. It is converted with
        ``str()`` and stripped of surrounding whitespace before matching.

    Returns
    -------
    str
        One of "Diabetes", "Circulatory", "Respiratory", "Digestive",
        "Injury", "Musculoskeletal", "Genitourinary", "Neoplasms" or
        "Other". Codes that are not numeric (for example ones starting
        with a letter, or the '?' placeholder) map to "Other".

    Notes
    -----
    Classification uses the integer part of the code, so a code carrying
    a decimal subclassification such as "428.0" lands in the same group
    as "428" instead of falling through to "Other".
    """
    code = str(code).strip()

    # Every code beginning with 250 (including 250.xx) is grouped as diabetes.
    if code.startswith("250"):
        return "Diabetes"

    # Parse once. Going through float() accepts decimal codes; int() then
    # truncates to the three-digit category. NaN raises ValueError and
    # infinity raises OverflowError here, so both end up as "Other".
    try:
        category = int(float(code))
    except (ValueError, OverflowError):
        return "Other"

    if 390 <= category <= 459 or category == 785:
        return "Circulatory"
    if 460 <= category <= 519 or category == 786:
        return "Respiratory"
    if 520 <= category <= 579 or category == 787:
        return "Digestive"
    if 800 <= category <= 999:
        return "Injury"
    if 710 <= category <= 739:
        return "Musculoskeletal"
    if 580 <= category <= 629 or category == 788:
        return "Genitourinary"
    if 140 <= category <= 239:
        return "Neoplasms"
    return "Other"

# Replace the raw code in each of the three diagnosis columns with its group name.
for diag_col in ('diag_1', 'diag_2', 'diag_3'):
    X[diag_col] = X[diag_col].apply(map_icd9)


In [69]:
# Recode the target as binary: the '<30' label becomes 1, while both
# '>30' and 'NO' become 0. Any other value is turned into NaN by .map().
X['readmitted'] = X['readmitted'].map(
    {**dict.fromkeys(('>30', 'NO'), 0), '<30': 1}
)

In [70]:
# List the columns that remain in X after the cleaning steps above.
X.columns

Index(['race', 'gender', 'age', 'admission_type_id', 'admission_source_id',
       'time_in_hospital', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'diag_1', 'diag_2', 'diag_3', 'metformin',
       'glipizide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'insulin',
       'change', 'readmitted'],
      dtype='object')

In [71]:
# Class balance of the recoded target (count of 0s and 1s).
X['readmitted'].value_counts()

readmitted
0    86986
1    11066
Name: count, dtype: int64

Encoding

Ordinal encoding for age

In [72]:
from sklearn.preprocessing import OrdinalEncoder

# Age is stored as ten-year bracket labels, '[0-10)' up to '[90-100)'.
# Build them from lowest to highest so the encoder numbers them in that order.
categories_order = [f'[{lower}-{lower + 10})' for lower in range(0, 100, 10)]

# Fit on the single age column and overwrite it with the ordinal codes.
encoder = OrdinalEncoder(categories=[categories_order])
X['age'] = encoder.fit_transform(X[['age']])

In [73]:
# Preview the first rows to check the encoded age column.
X.head()

Unnamed: 0,race,gender,age,admission_type_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,...,diag_2,diag_3,metformin,glipizide,pioglitazone,rosiglitazone,acarbose,insulin,change,readmitted
1,Caucasian,Female,1.0,1,7,3,59,0,18,0,...,Diabetes,Other,No,No,No,No,No,Up,Ch,0
2,AfricanAmerican,Female,2.0,1,7,2,11,5,13,2,...,Diabetes,Other,No,Steady,No,No,No,No,No,0
3,Caucasian,Male,3.0,1,7,2,44,1,16,0,...,Diabetes,Circulatory,No,No,No,No,No,Up,Ch,0
4,Caucasian,Male,4.0,1,7,1,51,0,8,0,...,Neoplasms,Diabetes,No,Steady,No,No,No,Steady,Ch,0
5,Caucasian,Male,5.0,2,2,3,31,6,16,0,...,Circulatory,Diabetes,No,No,No,No,No,Steady,No,0


Ordinal encoding for the medication columns (TODO: check whether these should be treated as nominal categorical variables instead)

In [74]:
# Medication columns that share the same four status labels.
columns_to_encode = [
    'metformin',
    'glipizide',
    'pioglitazone',
    'rosiglitazone',
    'acarbose',
    'insulin',
]

# Label order handed to the encoder: 'No' -> 0, 'Down' -> 1, 'Steady' -> 2, 'Up' -> 3.
category_order = ['No', 'Down', 'Steady', 'Up']

# OrdinalEncoder expects one category list per column, so repeat the same order for each.
encoder = OrdinalEncoder(categories=[category_order for _ in columns_to_encode])

# Encode only the medication columns, then write the codes back over the labels.
encoded_values = encoder.fit_transform(X[columns_to_encode])
X[columns_to_encode] = encoded_values

One-hot encoding for categorical variables

In [75]:
# Expand each nominal column into one indicator column per distinct value;
# every column not listed here is carried over unchanged.
df_encoded = pd.get_dummies(
    data=X,
    columns=[
        'admission_type_id',
        'admission_source_id',
        'race',
        'gender',
        'change',
        'diag_1',
        'diag_2',
        'diag_3',
    ],
)

Checking the dataset after encoding

In [76]:
# Preview the first rows of the one-hot encoded frame.
df_encoded.head()

Unnamed: 0,age,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,metformin,glipizide,...,diag_2_Respiratory,diag_3_Circulatory,diag_3_Diabetes,diag_3_Digestive,diag_3_Genitourinary,diag_3_Injury,diag_3_Musculoskeletal,diag_3_Neoplasms,diag_3_Other,diag_3_Respiratory
1,1.0,3,59,0,18,0,0,0,0.0,0.0,...,False,False,False,False,False,False,False,False,True,False
2,2.0,2,11,5,13,2,0,1,0.0,2.0,...,False,False,False,False,False,False,False,False,True,False
3,3.0,2,44,1,16,0,0,0,0.0,0.0,...,False,True,False,False,False,False,False,False,False,False
4,4.0,1,51,0,8,0,0,0,0.0,2.0,...,False,False,True,False,False,False,False,False,False,False
5,5.0,3,31,6,16,0,0,0,0.0,0.0,...,False,False,True,False,False,False,False,False,False,False


In [77]:
# Number of columns after one-hot encoding, shown as a one-element tuple.
(df_encoded.shape[1],)

(76,)

Use scaling if necessary, and also fix the class imbalance issue

In [80]:
# Names of the integer-typed columns in X.
X.select_dtypes(include=['int']).columns

Index(['admission_type_id', 'admission_source_id', 'time_in_hospital',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient',
       'readmitted'],
      dtype='object')

In [81]:
# Names of the float-typed columns in X.
X.select_dtypes(include=['float']).columns

Index(['age', 'metformin', 'glipizide', 'pioglitazone', 'rosiglitazone',
       'acarbose', 'insulin'],
      dtype='object')

In [82]:
# Names of the object-typed columns in X.
X.select_dtypes(include=['object']).columns

Index(['race', 'gender', 'diag_1', 'diag_2', 'diag_3', 'change'], dtype='object')

In [None]:
# Save the cleaned and encoded frame. `df` still holds the data exactly as
# loaded, so writing it would discard every preprocessing step; df_encoded
# is the frame that carries them. index=False leaves the row labels out.
df_encoded.to_csv('filename.csv', index=False)