# lab3: This is the title of the assignment

In [47]:
%pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


## Imports

Authors:  
AlaaAbdulrazzaq Abdulrazzaq  
Name Author 2  
(use double space to break lines on markdowns)

In [48]:
import numpy as np
import pandas as pd 
import matplotlib as mp
import sklearn as sk
import matplotlib.pyplot as plt
import seaborn as sns

## Load data

This section should load the raw dataset for the task.  
Remember to use relative paths to load any files in the notebook.

In [49]:
# Always use comments in the code to document specific steps
diabetes_Data = pd.read_csv('raw_data/diabetic_data.csv')
diabetes_Data.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [50]:
admission_Data = pd.read_csv('raw_data/IDS_mapping.csv')
admission_Data.head()

Unnamed: 0,admission_type_id,description
0,1,Emergency
1,2,Urgent
2,3,Elective
3,4,Newborn
4,5,Not Available


## Task 1: Cleanup the data

This section handles the cleanup of the dataset and splitting of the data

both data.csv and IDS mapping files are loaded, the different sections of the IDS mappings are sorted and taken into account when reading the file. 
Description of admission are merged into the dataset file 

In [51]:
# Load main dataset
file_path = "raw_data/diabetic_data.csv"
df = pd.read_csv(file_path, dtype=str)
# Load IDS mapping file
mapping_file_path = "raw_data/IDS_mapping.csv"
mapping_df = pd.read_csv(mapping_file_path, dtype=str)

# Replace '?' and other non-standard missing values with NaN
df.replace(['?', 'Unknown/Invalid', 'Not Available', 'NULL'], np.nan, inplace=True)

# Convert age ranges to numeric values (mean of range)
age_map = {
    "[0-10)": 5, "[10-20)": 15, "[20-30)": 25, "[30-40)": 35,
    "[40-50)": 45, "[50-60)": 55, "[60-70)": 65, "[70-80)": 75,
    "[80-90)": 85, "[90-100)": 95
}
df["age"] = df["age"].map(age_map)

# Convert weight ranges to numeric (if available)
weight_map = {
    "[0-25)": 12.5, "[25-50)": 37.5, "[50-75)": 62.5, "[75-100)": 87.5,
    "[100-125)": 112.5, "[125-150)": 137.5, "[150-175)": 162.5, "[175-200)": 187.5
}
df["weight"] = df["weight"].map(weight_map)

# Identify breakpoints where the column names change in the mapping file
section_starts = mapping_df[mapping_df.iloc[:, 0].isna()].index.tolist()

# Extract each section dynamically
admission_type_map = mapping_df.iloc[:section_starts[0]].dropna()
discharge_disposition_map = mapping_df.iloc[section_starts[0]+1:section_starts[1]].dropna()
admission_source_map = mapping_df.iloc[section_starts[1]+1:].dropna()

# Rename columns properly
admission_type_map.columns = ["admission_type_id", "admission_type_description"]
discharge_disposition_map.columns = ["discharge_disposition_id", "discharge_disposition_description"]
admission_source_map.columns = ["admission_source_id", "admission_source_description"]

# Convert ID columns to numeric for merging
id_columns = ["admission_type_id", "discharge_disposition_id", "admission_source_id"]
for col in id_columns:
    df[col] = pd.to_numeric(df[col], errors="coerce")

admission_type_map["admission_type_id"] = pd.to_numeric(admission_type_map["admission_type_id"], errors="coerce")
discharge_disposition_map["discharge_disposition_id"] = pd.to_numeric(discharge_disposition_map["discharge_disposition_id"], errors="coerce")
admission_source_map["admission_source_id"] = pd.to_numeric(admission_source_map["admission_source_id"], errors="coerce")

# Merge descriptions into the main dataset
df = df.merge(admission_type_map, on="admission_type_id", how="left")
df = df.merge(discharge_disposition_map, on="discharge_disposition_id", how="left")
df = df.merge(admission_source_map, on="admission_source_id", how="left")

# Drop original ID columns after merging
df.drop(columns=["admission_type_id", "discharge_disposition_id", "admission_source_id"], inplace=True, errors="ignore")

# Convert numeric columns properly
numeric_cols = [
    "time_in_hospital", "num_lab_procedures", "num_procedures",
    "num_medications", "number_outpatient", "number_emergency", "number_inpatient",
    "number_diagnoses", "weight"
]

for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors="coerce")
    df[col] = df.groupby("age")[col].transform(lambda x: x.fillna(x.mean()))  # Fill by age group average

# Handle missing values for payer_code and medical_specialty
df["payer_code"].fillna("Unknown", inplace=True)
df["medical_specialty"].fillna("Other", inplace=True)

# Fill categorical missing values with the most frequent value
categorical_cols = df.select_dtypes(include=["object"]).columns
for col in categorical_cols:
    df[col].fillna(df[col].mode()[0], inplace=True)

# Display cleaned dataset summary
df.to_csv("cleaned_diabetic_data.csv", index=False)
df.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["payer_code"].fillna("Unknown", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["medical_specialty"].fillna("Other", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setti

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,...,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,admission_type_description,discharge_disposition_description,admission_source_description
0,2278392,8222157,Caucasian,Female,5,22.5,1,Unknown,Pediatrics-Endocrinology,41,...,No,No,No,No,No,No,NO,Emergency,Not Mapped,Physician Referral
1,149190,55629189,Caucasian,Female,15,56.944444,3,Unknown,Other,59,...,No,No,No,No,Ch,Yes,>30,Emergency,Discharged to home,Emergency Room
2,64410,86047875,AfricanAmerican,Female,25,76.630435,2,Unknown,Other,11,...,No,No,No,No,No,Yes,NO,Emergency,Discharged to home,Emergency Room
3,500364,82442376,Caucasian,Male,35,87.202381,2,Unknown,Other,44,...,No,No,No,No,Ch,Yes,NO,Emergency,Discharged to home,Emergency Room
4,16680,42519267,Caucasian,Male,45,95.183983,1,Unknown,Other,51,...,No,No,No,No,Ch,Yes,NO,Emergency,Discharged to home,Emergency Room


Splitting the dataset: 70% training, 15% validation, 15% test

In [52]:
# Correcting the dataset split process

# Ensure required libraries are imported
from sklearn.model_selection import train_test_split
import pandas as pd

# Define features (X) and target variable (y)
X = df.drop(columns=["readmitted"])  # All columns except the target
y = df["readmitted"]  # Target variable

# Split the dataset: 70% Train, 15% Validation, 15% Test
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)  # 30% for validation + test
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)  # 50% of remaining for test

# Display the sizes of each split
split_sizes = {
    "Training set": len(X_train),
    "Validation set": len(X_val),
    "Test set": len(X_test)
}

# Create a DataFrame for visualization
df_splits = pd.DataFrame(split_sizes.items(), columns=["Dataset", "Number of Samples"])



Splitting X_temp and y_temp into validation and test sets

In [53]:
# Check the structure of X_temp to ensure it's not an empty or single-dimensional array
X_temp.shape, y_temp.shape

# Display split sizes again

print("Dataset Split Sizes:")
print(f"Training set: {len(X_train)} samples")
print(f"Validation set: {len(X_val)} samples")
print(f"Test set: {len(X_test)} samples")

# Check the unique values in the target variable to ensure it's numeric
y_train_unique = y_train.unique()
y_val_unique = y_val.unique()
y_test_unique = y_test.unique()

y_train_unique, y_val_unique, y_test_unique

Dataset Split Sizes:
Training set: 71236 samples
Validation set: 15265 samples
Test set: 15265 samples


(array(['NO', '>30', '<30'], dtype=object),
 array(['>30', 'NO', '<30'], dtype=object),
 array(['<30', 'NO', '>30'], dtype=object))

## Training

This section should contain:
- Results.
- Summary of best model performance:
    - Name of best model file as saved in /models.
    - Relevant scores such as: accuracy, precision, recall, F1-score, etc.
- Key discussion points.

In [54]:
# Always use comments in the code to document specific steps