In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv('../data/diabetic_data.csv')
df.replace('?', np.nan, inplace=True)

print("Dataset loaded successfully!")
print("Shape:", df.shape)

Dataset loaded successfully!
Shape: (101766, 50)


In [2]:
# Drop columns with too many missing values or not useful for prediction
cols_to_drop = ['weight', 'payer_code', 'medical_specialty', 
                'encounter_id', 'patient_nbr']

df.drop(columns=cols_to_drop, inplace=True)

print("Remaining columns:", df.shape[1])
print(df.columns.tolist())

Remaining columns: 45
['race', 'gender', 'age', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id', 'time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1', 'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton', 'insulin', 'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted']


In [3]:
# Fill missing values in race with mode
df['race'].fillna(df['race'].mode()[0], inplace=True)

# Fill missing diagnosis columns with 'Unknown'
df['diag_1'].fillna('Unknown', inplace=True)
df['diag_2'].fillna('Unknown', inplace=True)
df['diag_3'].fillna('Unknown', inplace=True)

# Verify no missing values remain
print("Missing values remaining:")
print(df.isnull().sum()[df.isnull().sum() > 0])
print("\nIf nothing printed above, all missing values are handled! ✅")

C:\Users\faiza\AppData\Local\Temp\ipykernel_12884\2927058514.py:2: ChainedAssignmentError: A value is being set on a copy of a DataFrame or Series through chained assignment using an inplace method.
Such inplace method never works to update the original DataFrame or Series, because the intermediate object on which we are setting values always behaves as a copy (due to Copy-on-Write).

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' instead, to perform the operation inplace on the original object, or try to avoid an inplace operation using 'df[col] = df[col].method(value)'.

See the documentation for a more detailed explanation: https://pandas.pydata.org/pandas-docs/stable/user_guide/copy_on_write.html
  df['race'].fillna(df['race'].mode()[0], inplace=True)
C:\Users\faiza\AppData\Local\Temp\ipykernel_12884\2927058514.py:5: ChainedAssignmentError: A value is being set on a copy of a DataFrame or Series through chained assig

Missing values remaining:
race              2273
diag_1              21
diag_2             358
diag_3            1423
max_glu_serum    96420
A1Cresult        84748
dtype: int64

If nothing printed above, all missing values are handled! ✅


In [4]:
# Correct way to fill missing values in newer pandas
df['race'] = df['race'].fillna(df['race'].mode()[0])

df['diag_1'] = df['diag_1'].fillna('Unknown')
df['diag_2'] = df['diag_2'].fillna('Unknown')
df['diag_3'] = df['diag_3'].fillna('Unknown')

# max_glu_serum and A1Cresult - fill with 'None' (meaning not tested)
df['max_glu_serum'] = df['max_glu_serum'].fillna('None')
df['A1Cresult'] = df['A1Cresult'].fillna('None')

# Verify
missing_remaining = df.isnull().sum()[df.isnull().sum() > 0]
print("Missing values remaining:", len(missing_remaining))
print("\nIf 0 above, all missing values are handled! ✅")

Missing values remaining: 0

If 0 above, all missing values are handled! ✅


In [5]:
# Convert readmitted to binary — 1 if readmitted within 30 days, 0 otherwise
df['readmitted_30'] = df['readmitted'].apply(lambda x: 1 if x == '<30' else 0)

# Drop original readmitted column
df.drop(columns=['readmitted'], inplace=True)

print("Target variable distribution:")
print(df['readmitted_30'].value_counts())
print("\n0 = Not readmitted within 30 days")
print("1 = Readmitted within 30 days")

Target variable distribution:
readmitted_30
0    90409
1    11357
Name: count, dtype: int64

0 = Not readmitted within 30 days
1 = Readmitted within 30 days


In [6]:
from sklearn.preprocessing import LabelEncoder

# Find all categorical columns
cat_cols = df.select_dtypes(include='object').columns.tolist()

print("Categorical columns to encode:")
print(cat_cols)

# Encode each one
le = LabelEncoder()
for col in cat_cols:
    df[col] = le.fit_transform(df[col].astype(str))

print("\nEncoding done! ✅")
print("Shape:", df.shape)

See https://pandas.pydata.org/docs/user_guide/migration-3-strings.html#string-migration-select-dtypes for details on how to write code that works with pandas 2 and 3.
  cat_cols = df.select_dtypes(include='object').columns.tolist()


Categorical columns to encode:
['race', 'gender', 'age', 'diag_1', 'diag_2', 'diag_3', 'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton', 'insulin', 'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone', 'change', 'diabetesMed']

Encoding done! ✅
Shape: (101766, 45)


In [7]:
# Save cleaned data to outputs folder
df.to_csv('../outputs/cleaned_data.csv', index=False)

print("Cleaned dataset saved successfully! ✅")
print("Location: outputs/cleaned_data.csv")
print("Final shape:", df.shape)

Cleaned dataset saved successfully! ✅
Location: outputs/cleaned_data.csv
Final shape: (101766, 45)
