In [1]:
import pandas as pd

In [2]:
data=pd.read_csv("hospital_readmissions.csv")
data.head()

Unnamed: 0,age,time_in_hospital,n_lab_procedures,n_procedures,n_medications,n_outpatient,n_inpatient,n_emergency,medical_specialty,diag_1,diag_2,diag_3,glucose_test,A1Ctest,change,diabetes_med,readmitted
0,[70-80),8,72,1,18,2,0,0,Missing,Circulatory,Respiratory,Other,no,no,no,yes,no
1,[70-80),3,34,2,13,0,0,0,Other,Other,Other,Other,no,no,no,yes,no
2,[50-60),5,45,0,18,0,0,0,Missing,Circulatory,Circulatory,Circulatory,no,no,yes,yes,yes
3,[70-80),2,36,0,12,1,0,0,Missing,Circulatory,Other,Diabetes,no,no,yes,yes,yes
4,[60-70),1,42,0,7,0,0,0,InternalMedicine,Other,Circulatory,Respiratory,no,no,no,yes,no


In [3]:
# Check for missing values
missing_values = data.isnull().sum()
print(missing_values)

# Example of filling missing values (customize based on your needs)
data['medical_specialty'].fillna('Missing', inplace=True)

age                  0
time_in_hospital     0
n_lab_procedures     0
n_procedures         0
n_medications        0
n_outpatient         0
n_inpatient          0
n_emergency          0
medical_specialty    0
diag_1               0
diag_2               0
diag_3               0
glucose_test         0
A1Ctest              0
change               0
diabetes_med         0
readmitted           0
dtype: int64


In [4]:
# Convert categorical variables to numerical using one-hot encoding or label encoding
categorical_columns = ['age', 'medical_specialty', 'glucose_test', 'A1Ctest', 'change', 'diabetes_med', 'diag_1', 'diag_2', 'diag_3']
data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)

In [5]:
# Convert the target variable 'readmitted' to numerical
data['readmitted'] = data['readmitted'].apply(lambda x: 1 if x == 'yes' else 0)

In [6]:
# Create new features if needed (e.g., interaction terms, derived metrics)
# Example: Total visits
data['total_visits'] = data['n_outpatient'] + data['n_inpatient'] + data['n_emergency']

In [7]:
from sklearn.model_selection import train_test_split

X = data.drop('readmitted', axis=1)  # Features
y = data['readmitted']  # Target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Create a base model
rf = RandomForestClassifier(random_state=42)

# Instantiate the grid search model
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           cv=3, n_jobs=-1, verbose=2, scoring='accuracy')

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best parameters found: ", best_params)

# Train the model with the best parameters
best_rf = grid_search.best_estimator_
best_rf.fit(X_train, y_train)

# Predict on the test set
y_pred = best_rf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')
print(f'ROC AUC Score: {roc_auc}')

Fitting 3 folds for each of 216 candidates, totalling 648 fits




Best parameters found:  {'bootstrap': False, 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 100}
Accuracy: 0.6166
Precision: 0.6088069636456733
Recall: 0.5076857386848848
F1 Score: 0.5536670547147846
ROC AUC Score: 0.6101257888307795


In [10]:
pip install imblearn

Note: you may need to restart the kernel to use updated packages.Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Collecting imbalanced-learn (from imblearn)
  Downloading imbalanced_learn-0.12.3-py3-none-any.whl.metadata (8.3 kB)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Downloading imbalanced_learn-0.12.3-py3-none-any.whl (258 kB)
   ---------------------------------------- 258.3/258.3 kB 2.0 MB/s eta 0:00:00
Installing collected packages: imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.12.3 imblearn-0.0




[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [13]:
from imblearn.over_sampling import SMOTE

# Apply SMOTE to the training data
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_train, y_train)

# Train the model with balanced data
model = GridSearchCV(estimator=rf, param_grid=param_grid,
                           cv=3, n_jobs=-1, verbose=2, scoring='accuracy')
model.fit(X_res, y_res)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')
print(f'ROC AUC Score: {roc_auc}')

Fitting 3 folds for each of 216 candidates, totalling 648 fits
Accuracy: 0.618
Precision: 0.5969479353680431
Recall: 0.5678906917164817
F1 Score: 0.5820568927789934
ROC AUC Score: 0.615021342848459


In [15]:
# Sanitize column names
data.columns = data.columns.str.replace('[^A-Za-z0-9_]', '', regex=True)

In [16]:
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

# Create an XGBoost model
xgb_model = xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')

# Instantiate the grid search model
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid,
                           cv=3, n_jobs=-1, verbose=2, scoring='accuracy')

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best parameters found: ", best_params)

Fitting 3 folds for each of 243 candidates, totalling 729 fits


ValueError: 
All the 729 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
729 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Bhaskar Banerjee\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Bhaskar Banerjee\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\core.py", line 730, in inner_f
    return func(**kwargs)
  File "c:\Users\Bhaskar Banerjee\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\sklearn.py", line 1500, in fit
    train_dmatrix, evals = _wrap_evaluation_matrices(
  File "c:\Users\Bhaskar Banerjee\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\sklearn.py", line 521, in _wrap_evaluation_matrices
    train_dmatrix = create_dmatrix(
  File "c:\Users\Bhaskar Banerjee\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\sklearn.py", line 958, in _create_dmatrix
    return QuantileDMatrix(
  File "c:\Users\Bhaskar Banerjee\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\core.py", line 730, in inner_f
    return func(**kwargs)
  File "c:\Users\Bhaskar Banerjee\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\core.py", line 1529, in __init__
    self._init(
  File "c:\Users\Bhaskar Banerjee\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\core.py", line 1588, in _init
    it.reraise()
  File "c:\Users\Bhaskar Banerjee\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\core.py", line 576, in reraise
    raise exc  # pylint: disable=raising-bad-type
  File "c:\Users\Bhaskar Banerjee\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\core.py", line 557, in _handle_exception
    return fn()
  File "c:\Users\Bhaskar Banerjee\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\core.py", line 641, in <lambda>
    return self._handle_exception(lambda: self.next(input_data), 0)
  File "c:\Users\Bhaskar Banerjee\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\data.py", line 1280, in next
    input_data(**self.kwargs)
  File "c:\Users\Bhaskar Banerjee\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\core.py", line 730, in inner_f
    return func(**kwargs)
  File "c:\Users\Bhaskar Banerjee\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\core.py", line 633, in input_data
    self.proxy.set_info(
  File "c:\Users\Bhaskar Banerjee\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\core.py", line 730, in inner_f
    return func(**kwargs)
  File "c:\Users\Bhaskar Banerjee\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\core.py", line 946, in set_info
    self.feature_names = feature_names
  File "c:\Users\Bhaskar Banerjee\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\core.py", line 1322, in feature_names
    raise ValueError(
ValueError: feature_names must be string, and may not contain [, ] or <


In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Load the dataset
file_path = 'hospital_readmissions.csv'
data = pd.read_csv(file_path)

# Sanitize column names
data.columns = data.columns.str.replace('[^A-Za-z0-9_]', '', regex=True)

# Handle missing values
data['medical_specialty'].fillna('Missing', inplace=True)

# Convert categorical variables to numerical using one-hot encoding or label encoding
categorical_columns = ['age', 'medical_specialty', 'glucose_test', 'A1Ctest', 'change', 'diabetes_med', 'diag_1', 'diag_2', 'diag_3']
data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)

# Convert the target variable 'readmitted' to numerical
data['readmitted'] = data['readmitted'].apply(lambda x: 1 if x == 'yes' else 0)

# Split the data into training and testing sets
X = data.drop('readmitted', axis=1)  # Features
y = data['readmitted']  # Target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the XGBoost model
xgb_model = xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train, y_train)

# Predict on the test set
y_pred = xgb_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')
print(f'ROC AUC Score: {roc_auc}')

ValueError: feature_names must be string, and may not contain [, ] or <

In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Load the dataset
file_path = 'hospital_readmissions.csv'
data = pd.read_csv(file_path)

# Sanitize column names
data.columns = data.columns.str.replace('[^A-Za-z0-9_]', '', regex=True)

# Handle missing values
data['medical_specialty'].fillna('Missing', inplace=True)

# Convert categorical variables to numerical using one-hot encoding or label encoding
categorical_columns = ['age', 'medical_specialty', 'glucose_test', 'A1Ctest', 'change', 'diabetes_med', 'diag_1', 'diag_2', 'diag_3']
data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)

# Convert the target variable 'readmitted' to numerical
data['readmitted'] = data['readmitted'].apply(lambda x: 1 if x == 'yes' else 0)

# Ensure all column names are valid
valid_column_names = {col: col.replace('<', '').replace('>', '').replace('[', '').replace(']', '').replace('{', '').replace('}', '') for col in data.columns}
data.rename(columns=valid_column_names, inplace=True)

# Split the data into training and testing sets
X = data.drop('readmitted', axis=1)  # Features
y = data['readmitted']  # Target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Build the neural network
model = Sequential()
model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2, verbose=1)

# Predict on the test set
y_pred_proba = model.predict(X_test)
y_pred = (y_pred_proba > 0.5).astype(int)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')
print(f'ROC AUC Score: {roc_auc}')


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Accuracy: 0.577
Precision: 0.5574683544303798
Recall: 0.4701110162254483
F1 Score: 0.5100764419735928
ROC AUC Score: 0.5934071706573263


In [19]:
from sklearn.ensemble import AdaBoostClassifier
# Train the AdaBoost model
adaboost_model = AdaBoostClassifier(n_estimators=100, random_state=42)
adaboost_model.fit(X_train, y_train)

# Predict on the test set
y_pred = adaboost_model.predict(X_test)
y_pred_proba = adaboost_model.predict_proba(X_test)[:, 1]

In [20]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')
print(f'ROC AUC Score: {roc_auc}')

Accuracy: 0.6132
Precision: 0.6167048054919908
Recall: 0.46029035012809566
F1 Score: 0.5271393643031785
ROC AUC Score: 0.6491910247587324


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Load the dataset
file_path = 'hospital_readmissions.csv'
data = pd.read_csv(file_path)

# Sanitize column names
data.columns = data.columns.str.replace('[^A-Za-z0-9_]', '', regex=True)

# Handle missing values
data['medical_specialty'].fillna('Missing', inplace=True)

# Convert categorical variables to numerical using one-hot encoding or label encoding
categorical_columns = ['age', 'medical_specialty', 'glucose_test', 'A1Ctest', 'change', 'diabetes_med', 'diag_1', 'diag_2', 'diag_3']
data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)

# Convert the target variable 'readmitted' to numerical
data['readmitted'] = data['readmitted'].apply(lambda x: 1 if x == 'yes' else 0)

# Ensure all column names are valid
valid_column_names = {col: col.replace('<', '').replace('>', '').replace('[', '').replace(']', '').replace('{', '').replace('}', '') for col in data.columns}
data.rename(columns=valid_column_names, inplace=True)

# Split the data into features and target
X = data.drop('readmitted', axis=1)  # Features
y = data['readmitted']  # Target variable

# Scale the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Define the models to compare
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(n_estimators=100),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100),
    'AdaBoost': AdaBoostClassifier(n_estimators=100),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    'SVM': SVC(probability=True)
}

# Function to evaluate the models
def evaluate_model(model, X, y):
    accuracy = cross_val_score(model, X, y, cv=5, scoring='accuracy').mean()
    precision = cross_val_score(model, X, y, cv=5, scoring='precision').mean()
    recall = cross_val_score(model, X, y, cv=5, scoring='recall').mean()
    f1 = cross_val_score(model, X, y, cv=5, scoring='f1').mean()
    roc_auc = cross_val_score(model, X, y, cv=5, scoring='roc_auc').mean()
    return accuracy, precision, recall, f1, roc_auc

# Evaluate each model and store the results
results = {}
for model_name, model in models.items():
    accuracy, precision, recall, f1, roc_auc = evaluate_model(model, X_scaled, y)
    results[model_name] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'ROC AUC': roc_auc
    }

# Display the results
results_df = pd.DataFrame(results).T
print(results_df)


In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

In [19]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')
print(f'ROC AUC Score: {roc_auc}')

Accuracy: 0.609
Precision: 0.5971873430436966
Recall: 0.5076857386848848
F1 Score: 0.5488114470343873
ROC AUC Score: 0.6029775570775816


In [20]:
importances = model.feature_importances_
feature_names = X.columns
feature_importances = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
feature_importances = feature_importances.sort_values(by='Importance', ascending=False)
print(feature_importances)

                                     Feature  Importance
1                           n_lab_procedures    0.150624
3                              n_medications    0.131824
0                           time_in_hospital    0.094116
2                               n_procedures    0.060463
45                              total_visits    0.047161
5                                n_inpatient    0.034965
43                              diag_3_Other    0.025746
36                              diag_2_Other    0.025428
22                                change_yes    0.021304
29                              diag_1_Other    0.020992
15                 medical_specialty_Missing    0.019920
4                               n_outpatient    0.019004
9                                age_[70-80)    0.018845
8                                age_[60-70)    0.018691
38                           diag_3_Diabetes    0.018685
30                        diag_1_Respiratory    0.017319
20                             