In [42]:
import pandas as pd
# Import necessary libraries
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr, spearmanr


# Data Preprocessing
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Machine Learning Model
from sklearn.model_selection import train_test_split, cross_validate, KFold, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
import xgboost as xgb

# Evaluation Metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

ModuleNotFoundError: No module named 'xgboost'

In [43]:
pip install xgboost

Collecting xgboost
  Using cached xgboost-2.1.1-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting nvidia-nccl-cu12 (from xgboost)
  Downloading nvidia_nccl_cu12-2.23.4-py3-none-manylinux2014_x86_64.whl.metadata (1.8 kB)
Using cached xgboost-2.1.1-py3-none-manylinux_2_28_x86_64.whl (153.9 MB)
Downloading nvidia_nccl_cu12-2.23.4-py3-none-manylinux2014_x86_64.whl (199.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.0/199.0 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: nvidia-nccl-cu12, xgboost
Successfully installed nvidia-nccl-cu12-2.23.4 xgboost-2.1.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [29]:

# Load the dataset from the Excel workbook (the path is relative to the notebook's working directory).
file_path = './data/Data Insights - Synthetic Dataset.xlsx'
# NOTE(review): the sheet name is exactly 31 characters and ends in "Datas" — presumably it was
# truncated by Excel's sheet-name length limit rather than mistyped; confirm before "correcting" it.
df = pd.read_excel(file_path, sheet_name='Data Insights - Synthetic Datas')

In [30]:
# Function to convert object columns to numeric
def convert_to_float(df, columns):
    """
    Coerce the given columns of ``df`` to a numeric dtype, in place.

    Each column is passed through ``pd.to_numeric(errors='coerce')``, so entries
    that cannot be parsed become NaN instead of raising. The resulting dtype is
    whatever ``pd.to_numeric`` infers (an all-integer column stays integer, even
    though the progress message says "float").

    Parameters:
    - df (pd.DataFrame): Frame to modify. It is mutated and also returned.
    - columns (iterable of str): Names of the columns to convert.

    Returns:
    - df (pd.DataFrame): The same frame object that was passed in.

    A column that cannot be processed at all (for example, it does not exist)
    is reported on stdout and skipped; the remaining columns are still converted.
    """
    for col in columns:
        try:
            original = df[col]
            converted = pd.to_numeric(original, errors='coerce')

            # Values that were present before but are NaN now were unparseable.
            # Count them so the data loss is visible instead of silent.
            n_coerced = int((converted.isna() & original.notna()).sum())

            df[col] = converted
            print(f"Successfully converted {col} to float.")
            if n_coerced:
                print(f"Warning: {n_coerced} non-numeric value(s) in {col} were coerced to NaN.")
        except Exception as e:
            # Best-effort by design: report the problem column and keep going.
            print(f"Error converting {col}: {e}")
    return df


# Charge columns of the dataset; this list drives both the numeric conversion and the NaN fill below.
charge_columns = ['AccommodationCharge', 'CCU_Charges', 'ICU_Charge', 'TheatreCharge', 'PharmacyCharge', 'ProsthesisCharge', 'OtherCharges', 'BundledCharges']

In [31]:

# Coerce every charge column to numeric; convert_to_float modifies df in place and uses
# errors='coerce', so unparseable entries become NaN rather than raising.
df = convert_to_float(df, charge_columns)

Successfully converted AccommodationCharge to float.
Successfully converted CCU_Charges to float.
Successfully converted ICU_Charge to float.
Successfully converted TheatreCharge to float.
Successfully converted PharmacyCharge to float.
Successfully converted ProsthesisCharge to float.
Successfully converted OtherCharges to float.
Successfully converted BundledCharges to float.


In [32]:
# Fill NaN values in the charge columns with 0
# NOTE(review): this also zeroes any entry that pd.to_numeric(errors='coerce') could not parse,
# so a malformed charge becomes indistinguishable from "no charge" — confirm that is intended.
df[charge_columns] = df[charge_columns].fillna(0)

### 2. Create Features from the Data Set

#### 2.1 Create Length of Stay Column

In [33]:
# Function to calculate the Length of Stay (LOS)
def calculate_length_of_stay(df, admission_col, separation_col):
    """
    Add a 'LengthOfStay' column: whole days between admission and separation.

    Both date columns are parsed with ``pd.to_datetime(errors='coerce')`` and the
    parsed values are written back, so ``df`` is modified in place (and returned).

    Parameters:
    - df (pd.DataFrame): Frame containing the two date columns.
    - admission_col (str): Name of the admission date column.
    - separation_col (str): Name of the separation (discharge) date column.

    Returns:
    - df (pd.DataFrame): The same frame, with 'LengthOfStay' added or overwritten.

    'LengthOfStay' is NaN when either date is missing/unparseable or when the
    separation precedes the admission. A same-day stay yields 0.
    """
    # Convert admission and separation dates to datetime; bad values become NaT
    df[admission_col] = pd.to_datetime(df[admission_col], errors='coerce')
    df[separation_col] = pd.to_datetime(df[separation_col], errors='coerce')

    # Day component of the timedelta (partial days are not counted)
    stay_days = (df[separation_col] - df[admission_col]).dt.days

    # Vectorised mask instead of a row-wise apply: negative or missing durations
    # (NaN >= 0 is False) become NaN, and the column stays numeric even when
    # every row is invalid.
    df['LengthOfStay'] = stay_days.where(stay_days >= 0)

    return df

# Applying the function: adds df['LengthOfStay'] and converts both date columns to datetime in place
df = calculate_length_of_stay(df, 'AdmissionDate', 'SeparationDate')


#### 2.2 Create Total Charges Column

In [34]:
# Calculate Total Charge without PharmacyCharge
# Row-wise sum of every column in charge_columns except 'PharmacyCharge'.
# skipna=True is a safeguard only: NaNs in these columns were already replaced with 0 above.
df['TotalCharges'] = df[['AccommodationCharge', 'TheatreCharge', 
                         'ProsthesisCharge', 'OtherCharges', 
                         'BundledCharges', 'CCU_Charges', 'ICU_Charge']].sum(axis=1, skipna=True)

In [35]:
#### 2.3 Map Diagnosis Codes to Categories
# Define ICD-10 chapters with their corresponding code ranges.
# Each entry is (first 3-character category, last 3-character category, chapter title);
# both bounds are inclusive and are compared as plain strings.
# NOTE(review): prefixes that fall between two rows (D49, E90-E99, H96-H99, K96-K99,
# P97-P99, T99) match no chapter and map to 'Unknown'. Some ICD-10 editions assign codes
# in those gaps — confirm which edition the data uses before relying on 'Unknown' counts.
icd10_chapters = [
    ('A00', 'B99', 'Certain infectious and parasitic diseases'),
    ('C00', 'D48', 'Neoplasms'),
    ('D50', 'D89', 'Diseases of the blood and blood-forming organs and certain disorders involving the immune mechanism'),
    ('E00', 'E89', 'Endocrine, nutritional and metabolic diseases'),
    ('F00', 'F99', 'Mental and behavioural disorders'),
    ('G00', 'G99', 'Diseases of the nervous system'),
    ('H00', 'H59', 'Diseases of the eye and adnexa'),
    ('H60', 'H95', 'Diseases of the ear and mastoid process'),
    ('I00', 'I99', 'Diseases of the circulatory system'),
    ('J00', 'J99', 'Diseases of the respiratory system'),
    ('K00', 'K95', 'Diseases of the digestive system'),
    ('L00', 'L99', 'Diseases of the skin and subcutaneous tissue'),
    ('M00', 'M99', 'Diseases of the musculoskeletal system and connective tissue'),
    ('N00', 'N99', 'Diseases of the genitourinary system'),
    ('O00', 'O99', 'Pregnancy, childbirth and the puerperium'),
    ('P00', 'P96', 'Certain conditions originating in the perinatal period'),
    ('Q00', 'Q99', 'Congenital malformations, deformations and chromosomal abnormalities'),
    ('R00', 'R99', 'Symptoms, signs and abnormal clinical and laboratory findings, not elsewhere classified'),
    ('S00', 'T98', 'Injury, poisoning and certain other consequences of external causes'),
    ('U00', 'U99', 'Codes for special purposes'),
    ('V00', 'Y99', 'External causes of morbidity and mortality'),
    ('Z00', 'Z99', 'Factors influencing health status and contact with health services')
]

def map_icd10_to_chapter(code: str) -> str:
    """
    Maps an ICD-10 diagnosis code to its corresponding chapter.

    Parameters:
    - code (str): Diagnosis code such as 'I21.9'. Case and surrounding whitespace
      are ignored; only the first three characters are used. Codes shorter than
      three characters are right-padded with '0' before the lookup.

    Returns:
    - chapter (str): Title of the matching entry in ``icd10_chapters``, or
      'Unknown' when nothing matches or when ``code`` is not a string
      (for example NaN/None coming from a missing cell).
    """
    # Missing cells reach this function as float NaN or None when it is used with
    # Series.apply; they have no .upper(), so treat any non-string as unmapped.
    if not isinstance(code, str):
        return 'Unknown'

    code = code.upper().strip()
    if len(code) < 3:
        code_prefix = code.ljust(3, '0')
    else:
        code_prefix = code[:3]

    # Lexicographic comparison works because every bound is letter + two digits.
    for start, end, chapter in icd10_chapters:
        if start <= code_prefix <= end:
            return chapter
    return 'Unknown'

def add_icd10_chapters(df: pd.DataFrame, diagnosis_cols: list) -> pd.DataFrame:
    """
    Append a '<column>_Chapter' column for every listed diagnosis column.

    Each value of a source column is passed through ``map_icd10_to_chapter`` and
    the results are stored under the source name suffixed with '_Chapter'.
    The frame is modified in place and the same object is returned.
    """
    for source_col in diagnosis_cols:
        target_col = source_col + '_Chapter'
        chapter_values = df[source_col].apply(map_icd10_to_chapter)
        df[target_col] = chapter_values
    return df

In [36]:
# Add ICD-10 Chapters: creates 'PrincipalDiagnosis_Chapter' from 'PrincipalDiagnosis'
df = add_icd10_chapters(df, ['PrincipalDiagnosis'])

In [37]:
# Visualization (Optional)
# matplotlib.pyplot and seaborn are also imported in the notebook's first import cell;
# importing them again here is harmless.
import matplotlib.pyplot as plt
import seaborn as sns

# Suppress warnings for cleaner output
# NOTE(review): filterwarnings('ignore') with no category or module filter silences every
# warning from this point on, including deprecation and convergence warnings from
# pandas/scikit-learn — consider narrowing it once the analysis is stable.
import warnings
warnings.filterwarnings('ignore')


In [38]:
def preprocess_data(numerical_cols: list, categorical_cols: list) -> ColumnTransformer:
    """
    Build the (unfitted) preprocessing transformer for the modelling pipeline.

    Numerical columns are median-imputed and then standardised. Categorical
    columns are imputed with the most frequent value and then one-hot encoded,
    with categories unseen during fitting ignored at transform time.

    Parameters:
    - numerical_cols (list): List of numerical column names.
    - categorical_cols (list): List of categorical column names.

    Returns:
    - preprocessor (ColumnTransformer): Transformer with a 'num' and a 'cat'
      branch. ``remainder`` is left at its scikit-learn default, so columns in
      neither list are dropped.
    """
    # Step names are part of the public parameter paths (e.g. 'num__imputer__strategy'),
    # so they are kept stable.
    numeric_steps = [
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
    categorical_steps = [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]

    return ColumnTransformer(
        transformers=[
            ('num', Pipeline(steps=numeric_steps), numerical_cols),
            ('cat', Pipeline(steps=categorical_steps), categorical_cols),
        ]
    )


In [39]:
def create_model_pipeline(preprocessor: ColumnTransformer, model_type: str = 'RandomForest') -> Pipeline:
    """
    Creates a machine learning pipeline with preprocessing and regression model.

    Parameters:
    - preprocessor (ColumnTransformer): Preprocessing pipeline.
    - model_type (str): Type of regression model. One of 'RandomForest',
      'GradientBoosting', 'LinearRegression' or 'XGBoost'.

    Returns:
    - model (Pipeline): Unfitted pipeline with the steps 'preprocessor' and 'regressor'.

    Raises:
    - ValueError: If ``model_type`` is not one of the supported names.
    """
    # Only the selected branch runs, so xgboost is touched only when 'XGBoost' is requested.
    if model_type == 'RandomForest':
        regressor = RandomForestRegressor(n_estimators=100, random_state=42)
    elif model_type == 'GradientBoosting':
        regressor = GradientBoostingRegressor(n_estimators=100, random_state=42)
    elif model_type == 'LinearRegression':
        regressor = LinearRegression()
    elif model_type == 'XGBoost':
        regressor = xgb.XGBRegressor(n_estimators=100, random_state=42, objective='reg:squarederror')
    else:
        # List every supported option (the original message omitted 'XGBoost') and echo the bad value.
        raise ValueError(
            f"Unsupported model_type {model_type!r}. "
            "Choose from 'RandomForest', 'GradientBoosting', 'LinearRegression', 'XGBoost'."
        )

    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', regressor)
    ])

    return model

In [40]:
def cross_validate_model(model: Pipeline, X: pd.DataFrame, y: pd.Series, n_splits: int = 5,
                         random_state: int = 42) -> dict:
    """
    Performs shuffled K-fold cross-validation and returns averaged metrics.

    Parameters:
    - model (Pipeline): Machine learning pipeline.
    - X (pd.DataFrame): Feature set.
    - y (pd.Series): Target variable.
    - n_splits (int): Number of cross-validation folds.
    - random_state (int): Seed for the fold shuffling (default 42).

    Returns:
    - cv_metrics (dict): Mean over the folds of MAE, MSE, RMSE and R². The RMSE
      entry is the mean of the per-fold RMSE values (not the square root of the
      mean MSE) and uses the same key as the test-set report in
      ``train_and_test_model`` so the two dictionaries can be compared directly.

    The metrics are also printed to stdout.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    scoring = [
        'neg_mean_absolute_error',
        'neg_mean_squared_error',
        'neg_root_mean_squared_error',
        'r2',
    ]

    cv_results = cross_validate(model, X, y, cv=kf, scoring=scoring, return_train_score=False)

    # scikit-learn reports error metrics negated (higher is better); flip the sign back
    cv_metrics = {
        'Mean Absolute Error (MAE)': -cv_results['test_neg_mean_absolute_error'].mean(),
        'Mean Squared Error (MSE)': -cv_results['test_neg_mean_squared_error'].mean(),
        'Root Mean Squared Error (RMSE)': -cv_results['test_neg_root_mean_squared_error'].mean(),
        'R-squared (R²)': cv_results['test_r2'].mean()
    }

    print("\nCross-Validation Metrics:")
    for metric, value in cv_metrics.items():
        print(f"{metric}: {value:.4f}")

    return cv_metrics

In [41]:
def train_and_test_model(model: Pipeline, X: pd.DataFrame, y: pd.Series, test_size: float = 0.2,
                         random_state: int = 42) -> "tuple[dict, np.ndarray]":
    """
    Splits the data, trains the model, and evaluates it on the test set.

    Parameters:
    - model (Pipeline): Machine learning pipeline. It is fitted in place on the
      training split, so the caller's object is a trained model afterwards.
    - X (pd.DataFrame): Feature set.
    - y (pd.Series): Target variable.
    - test_size (float): Proportion of the dataset to include in the test split.
    - random_state (int): Seed for the train/test split (default 42).

    Returns:
    - test_metrics (dict): Dictionary containing test set evaluation metrics
      (MAE, RMSE, R²).
    - y_pred (np.ndarray): Predicted values on the test set, in test-split row
      order. The matching true values are not returned; to recover them, call
      ``train_test_split`` again with the same ``test_size`` and ``random_state``.

    The metrics are also printed to stdout.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    # RMSE via sqrt of MSE: same units as the target
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    test_metrics = {
        'Mean Absolute Error (MAE)': mae,
        'Root Mean Squared Error (RMSE)': rmse,
        'R-squared (R²)': r2
    }

    print("\nTest Set Evaluation:")
    for metric, value in test_metrics.items():
        print(f"{metric}: {value:.4f}")

    return test_metrics, y_pred