**<center>===========================================================================================================</center>**
**<center>All Necessary Imports</center>**
**<center>===========================================================================================================</center>**

In [1]:
import pandas as pd
import json
import numpy as np

**<center>===========================================================================================================</center>**
**<center>Data Exploration</center>**
**<center>===========================================================================================================</center>**

In [2]:
df = pd.read_csv('data/raw/diabetic_data.csv')
df = df.drop(['encounter_id', 'patient_nbr'], axis=1) # Drop identifier columns

print(df.shape)

(101766, 48)


**<center>===========================================================================================================</center>**
**<center>Merge IDS Mapping</center>**
**<center>===========================================================================================================</center>**

In [3]:
with open('data/raw/IDS_mapping.csv') as f:
    content = f.read().strip()
sections = content.split(',\n')

admission_type_map = {}
discharge_disposition_map = {}
admission_source_map = {}

for section in sections:
    lines = section.strip().split('\n')
    header = lines[0].strip()
    if header.startswith('admission_type_id'):
        for line in lines[1:]:
            id, desc = line.split(',', 1)
            admission_type_map[int(id)] = desc
    elif header.startswith('discharge_disposition_id'):
        for line in lines[1:]:
            id, desc = line.split(',', 1)
            discharge_disposition_map[int(id)] = desc
    elif header.startswith('admission_source_id'):
        for line in lines[1:]:
            id, desc = line.split(',', 1)
            admission_source_map[int(id)] = desc

print(admission_source_map, discharge_disposition_map, admission_type_map)


def map_ids(df, column_name, mapping):
    df[column_name] = df[column_name].map(mapping)
    return df

df = map_ids(df, 'admission_type_id', admission_type_map)
df = map_ids(df, 'discharge_disposition_id', discharge_disposition_map)
df = map_ids(df, 'admission_source_id', admission_source_map)

{1: ' Physician Referral', 2: 'Clinic Referral', 3: 'HMO Referral', 4: 'Transfer from a hospital', 5: ' Transfer from a Skilled Nursing Facility (SNF)', 6: ' Transfer from another health care facility', 7: ' Emergency Room', 8: ' Court/Law Enforcement', 9: ' Not Available', 10: ' Transfer from critial access hospital', 11: 'Normal Delivery', 12: ' Premature Delivery', 13: ' Sick Baby', 14: ' Extramural Birth', 15: 'Not Available', 17: 'NULL', 18: ' Transfer From Another Home Health Agency', 19: 'Readmission to Same Home Health Agency', 20: ' Not Mapped', 21: 'Unknown/Invalid', 22: ' Transfer from hospital inpt/same fac reslt in a sep claim', 23: ' Born inside this hospital', 24: ' Born outside this hospital', 25: ' Transfer from Ambulatory Surgery Center', 26: 'Transfer from Hospice'} {1: 'Discharged to home', 2: 'Discharged/transferred to another short term hospital', 3: 'Discharged/transferred to SNF', 4: 'Discharged/transferred to ICF', 5: 'Discharged/transferred to another type of 

**<center>===========================================================================================================</center>**
**<center>Data Quality Report</center>**
**<center>===========================================================================================================</center>**

In [4]:
# List of placeholders to consider as missing values
placeholders = ['?', 'Not Available', 'Not Mapped', 'NULL', 'Unknown/Invalid']
data_quality_report = pd.DataFrame(columns=['Feature Name', 'Data Type', 'Completeness (%)', 'Unique Values', 'Consistency'])
report_rows = []

for feature in df.columns:
    data_type = df[feature].dtype
    total_missing = df[feature].isnull().sum() + df[feature].isin(placeholders).sum()
    completeness = 100 * (1 - total_missing / len(df))
    uniqueness = df[feature].nunique()
    consistency = 'Valid' if df[feature].notnull().all() and df[feature].apply(lambda x: isinstance(x, (int, float, str))).all() else 'Invalid'
    
    report_rows.append({
        'Feature Name': feature,
        'Data Type': data_type,
        'Completeness (%)': completeness,
        'Unique Values': uniqueness,
        'Consistency': consistency
    })

data_quality_report = pd.DataFrame(report_rows)
duplicate_records_count = df.duplicated().sum()

# Drop features with more than 30% missing values
threshold = 70
features_to_drop = data_quality_report[data_quality_report['Completeness (%)'] < threshold]
df_preprocessed = df.drop(columns=features_to_drop['Feature Name'])

# Replace placeholders with NaN for imputation
df_preprocessed.replace(placeholders, np.nan, inplace=True)

# Fill missing values with the median for continuous features and mode for categorical features
imputed_features = []
for column in df_preprocessed.columns:
    if df_preprocessed[column].isnull().sum() > 0:
        if df_preprocessed[column].dtype == 'object':
            mode_value = df_preprocessed[column].mode()[0]
            mode_percentage = (df_preprocessed[column] == mode_value).mean() * 100
            df_preprocessed[column] = df_preprocessed[column].fillna(mode_value)
            imputation_method = 'Mode'
            imputed_features.append({'Feature Name': column, 'Imputation Method': imputation_method, 'Imputation Value': mode_value})
        else:
            median_value = df_preprocessed[column].median()
            df_preprocessed[column] = df_preprocessed[column].fillna(median_value)
            imputation_method = 'Median'
            imputed_features.append({'Feature Name': column, 'Imputation Method': imputation_method, 'Imputation Value': median_value})

imputed_features_df = pd.DataFrame(imputed_features)

print("Data Quality Report:")
print(data_quality_report)
print("\nNumber of duplicate records before dropping:", duplicate_records_count)
print("\nDropped Features:")
print(features_to_drop[['Feature Name', 'Completeness (%)']])
print("\nImputed Features:")
print(imputed_features_df)
print("\nNumber of duplicate records after dropping:", df_preprocessed.duplicated().sum())
print(df_preprocessed.shape)

Data Quality Report:
                Feature Name Data Type  Completeness (%)  Unique Values  \
0                       race    object         97.766445              6   
1                     gender    object         99.997052              3   
2                        age    object        100.000000             10   
3                     weight    object          3.141521             10   
4          admission_type_id    object         89.784407              8   
5   discharge_disposition_id    object         95.401215             26   
6        admission_source_id    object         93.336674             17   
7           time_in_hospital     int64        100.000000             14   
8                 payer_code    object         60.442584             18   
9          medical_specialty    object         50.917792             73   
10        num_lab_procedures     int64        100.000000            118   
11            num_procedures     int64        100.000000              7   
12  

**<center>===========================================================================================================</center>**
**<center>Saving the Preprocessed Data and Metadata</center>**
**<center>===========================================================================================================</center>**

In [5]:
target_column = 'readmitted'

# Function to classify feature types
def classify_feature(df, column):
    if df[column].dtype == 'object':
        return 'Categorical'
    else:
        return 'Continuous'

features = {col: classify_feature(df_preprocessed, col) for col in df_preprocessed.columns if col != target_column}
features[target_column] = 'Target'
df_features = pd.DataFrame(list(features.items()), columns=['Feature Name', 'Feature Type'])

df_categorical = df_features[df_features['Feature Type'] == 'Categorical']
df_continuous = df_features[df_features['Feature Type'] == 'Continuous']
df_target = df_features[df_features['Feature Type'] == 'Target']
df_preprocessed.to_csv('data/processed/preprocessed_data.csv', index=False)

print("Categorical Features:")
print(df_categorical)
print("\nContinuous Features:")
print(df_continuous)
print("\nTarget Feature:")
print(df_target)

# Create metadata for later use
metadata = {
    'categorical_features': df_categorical['Feature Name'].tolist(),
    'continuous_features': df_continuous['Feature Name'].tolist(),
    'target': df_target['Feature Name'].tolist()
}

with open('data/processed/feature_categories.json', 'w') as f:
    json.dump(metadata, f)

Categorical Features:
                Feature Name Feature Type
0                       race  Categorical
1                     gender  Categorical
2                        age  Categorical
3          admission_type_id  Categorical
4   discharge_disposition_id  Categorical
5        admission_source_id  Categorical
13                    diag_1  Categorical
14                    diag_2  Categorical
15                    diag_3  Categorical
17                 metformin  Categorical
18               repaglinide  Categorical
19               nateglinide  Categorical
20            chlorpropamide  Categorical
21               glimepiride  Categorical
22             acetohexamide  Categorical
23                 glipizide  Categorical
24                 glyburide  Categorical
25               tolbutamide  Categorical
26              pioglitazone  Categorical
27             rosiglitazone  Categorical
28                  acarbose  Categorical
29                  miglitol  Categorical
30          