In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Load the dataset
data = pd.read_csv('diabetic_data.csv')

# Display basic information about the dataset
print(data.info())
print(data.describe())

# Data preprocessing
# Remove columns with a single unique value
data = data[[col for col in data.columns if data[col].nunique() > 1]]

# Handle missing values if necessary
data.replace('?', np.nan, inplace=True)
data.fillna(method='ffill', inplace=True)

# Encode categorical features
categorical_columns = data.select_dtypes(include=['object']).columns
label_encoders = {}
for column in categorical_columns:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])

# Print the value counts of the 'readmitted' column before transformation
print("Value counts of the 'readmitted' column before transformation:")
print(data['readmitted'].value_counts())

# Define the feature matrix and target vector
X = data.drop(columns=['readmitted'])
# Map readmitted column to binary values: 1 for '<30', 0 otherwise
y = data['readmitted'].map(lambda x: 1 if x == 2 else 0)

# Ensure that the target variable has both classes
print("Value counts of the target variable (y):")
print(y.value_counts())

# Check if the target variable has at least two classes
if y.nunique() < 2:
    raise ValueError("The target variable 'y' must have at least two classes. Please check the preprocessing steps.")

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize the gradient boosting classifier
gbc = GradientBoostingClassifier(random_state=42)

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 4, 5]
}

grid_search = GridSearchCV(estimator=gbc, param_grid=param_grid, cv=5, n_jobs=-1, scoring='roc_auc')
grid_search.fit(X_train_scaled, y_train)

# Best model
best_gbc = grid_search.best_estimator_

# Evaluate the model
y_pred = best_gbc.predict(X_test_scaled)
y_pred_proba = best_gbc.predict_proba(X_test_scaled)[:, 1]

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("ROC AUC Score:")
print(roc_auc_score(y_test, y_pred_proba))

# Model performance
accuracy = np.mean(y_pred == y_test)
random_accuracy = np.mean(y_test)
improvement_factor = accuracy / random_accuracy
print(f"Model Accuracy: {accuracy:.2f}")
print(f"Random Guessing Accuracy: {random_accuracy:.2f}")
print(f"Improvement Factor: {improvement_factor:.2f}")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Data columns (total 50 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   encounter_id              101766 non-null  int64 
 1   patient_nbr               101766 non-null  int64 
 2   race                      101766 non-null  object
 3   gender                    101766 non-null  object
 4   age                       101766 non-null  object
 5   weight                    101766 non-null  object
 6   admission_type_id         101766 non-null  int64 
 7   discharge_disposition_id  101766 non-null  int64 
 8   admission_source_id       101766 non-null  int64 
 9   time_in_hospital          101766 non-null  int64 
 10  payer_code                101766 non-null  object
 11  medical_specialty         101766 non-null  object
 12  num_lab_procedures        101766 non-null  int64 
 13  num_procedures            101766 non-null  int64 
 14  num_

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Load the dataset
data = pd.read_csv('diabetic_data.csv')

# Display basic information about the dataset
print(data.info())
print(data.describe())

# Data preprocessing
# Remove columns with a single unique value
data = data[[col for col in data.columns if data[col].nunique() > 1]]

# Handle missing values
data.replace('?', np.nan, inplace=True)
data.fillna(method='ffill', inplace=True)

# Encode categorical features
categorical_columns = data.select_dtypes(include=['object']).columns
label_encoders = {}
for column in categorical_columns:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])

# Print the value counts of the 'readmitted' column before transformation
print("Value counts of the 'readmitted' column before transformation:")
print(data['readmitted'].value_counts())

# Define the feature matrix and target vector
X = data.drop(columns=['readmitted'])
# Map readmitted column to binary values: 1 for '<30', 0 otherwise
y = data['readmitted'].map(lambda x: 1 if x == 2 else 0)

# Ensure that the target variable has both classes
print("Value counts of the target variable (y):")
print(y.value_counts())

# Check if the target variable has at least two classes
if y.nunique() < 2:
    raise ValueError("The target variable 'y' must have at least two classes. Please check the preprocessing steps.")

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize the gradient boosting classifier
gbc = GradientBoostingClassifier(random_state=42)

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 4, 5]
}

grid_search = GridSearchCV(estimator=gbc, param_grid=param_grid, cv=5, n_jobs=-1, scoring='roc_auc')
grid_search.fit(X_train_scaled, y_train)

# Best model
best_gbc = grid_search.best_estimator_

# Evaluate the model
y_pred = best_gbc.predict(X_test_scaled)
y_pred_proba = best_gbc.predict_proba(X_test_scaled)[:, 1]

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("ROC AUC Score:")
print(roc_auc_score(y_test, y_pred_proba))

# Model performance
accuracy = np.mean(y_pred == y_test)
random_accuracy = np.mean(y_test)
improvement_factor = accuracy / random_accuracy
print(f"Model Accuracy: {accuracy:.2f}")
print(f"Random Guessing Accuracy: {random_accuracy:.2f}")
print(f"Improvement Factor: {improvement_factor:.2f}")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Data columns (total 50 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   encounter_id              101766 non-null  int64 
 1   patient_nbr               101766 non-null  int64 
 2   race                      101766 non-null  object
 3   gender                    101766 non-null  object
 4   age                       101766 non-null  object
 5   weight                    101766 non-null  object
 6   admission_type_id         101766 non-null  int64 
 7   discharge_disposition_id  101766 non-null  int64 
 8   admission_source_id       101766 non-null  int64 
 9   time_in_hospital          101766 non-null  int64 
 10  payer_code                101766 non-null  object
 11  medical_specialty         101766 non-null  object
 12  num_lab_procedures        101766 non-null  int64 
 13  num_procedures            101766 non-null  int64 
 14  num_

In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, VotingClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import xgboost as xgb

# Load the dataset
data = pd.read_csv('diabetic_data.csv')

# Display basic information about the dataset
print(data.info())
print(data.describe())

# Data preprocessing
# Remove columns with a single unique value
data = data[[col for col in data.columns if data[col].nunique() > 1]]

# Handle missing values
data.replace('?', np.nan, inplace=True)
data.fillna(method='ffill', inplace=True)

# Encode categorical features
categorical_columns = data.select_dtypes(include=['object']).columns
label_encoders = {}
for column in categorical_columns:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])

# Print the value counts of the 'readmitted' column before transformation
print("Value counts of the 'readmitted' column before transformation:")
print(data['readmitted'].value_counts())

# Define the feature matrix and target vector
X = data.drop(columns=['readmitted'])
# Map readmitted column to binary values: 1 for '<30', 0 otherwise
y = data['readmitted'].map(lambda x: 1 if x == 2 else 0)

# Ensure that the target variable has both classes
print("Value counts of the target variable (y):")
print(y.value_counts())

# Check if the target variable has at least two classes
if y.nunique() < 2:
    raise ValueError("The target variable 'y' must have at least two classes. Please check the preprocessing steps.")

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Gradient Boosting Classifier
gbc = GradientBoostingClassifier(random_state=42)
param_grid_gbc = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 4, 5]
}
grid_search_gbc = GridSearchCV(estimator=gbc, param_grid=param_grid_gbc, cv=5, n_jobs=-1, scoring='roc_auc')
grid_search_gbc.fit(X_train_scaled, y_train)
best_gbc = grid_search_gbc.best_estimator_

# Random Forest Classifier
rf_clf = RandomForestClassifier(random_state=42)
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 5, 10]
}
grid_search_rf = GridSearchCV(estimator=rf_clf, param_grid=param_grid_rf, cv=5, n_jobs=-1, scoring='roc_auc')
grid_search_rf.fit(X_train_scaled, y_train)
best_rf = grid_search_rf.best_estimator_

# XGBoost Classifier
xgb_clf = xgb.XGBClassifier(random_state=42, eval_metric='logloss')
param_grid_xgb = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 4, 5],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}
grid_search_xgb = GridSearchCV(estimator=xgb_clf, param_grid=param_grid_xgb, cv=5, n_jobs=-1, scoring='roc_auc')
grid_search_xgb.fit(X_train_scaled, y_train)
best_xgb = grid_search_xgb.best_estimator_

# Ensemble Method: Voting Classifier
voting_clf = VotingClassifier(estimators=[
    ('rf', best_rf),
    ('gb', best_gbc),
    ('xgb', best_xgb)
], voting='soft')
voting_clf.fit(X_train_scaled, y_train)

# Evaluate all models
models = {
    "Gradient Boosting": best_gbc,
    "Random Forest": best_rf,
    "XGBoost": best_xgb,
    "Voting Classifier": voting_clf
}

for model_name, model in models.items():
    y_pred = model.predict(X_test_scaled)
    y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]
    print(f"\n{model_name} Classification Report:")
    print(classification_report(y_test, y_pred))
    print(f"{model_name} Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print(f"{model_name} ROC AUC Score:")
    print(roc_auc_score(y_test, y_pred_proba))

# Model performance
    accuracy = np.mean(y_pred == y_test)
    random_accuracy = np.mean(y_test)
    improvement_factor = accuracy / random_accuracy
    print(f"{model_name} Model Accuracy: {accuracy:.2f}")
    print(f"{model_name} Random Guessing Accuracy: {random_accuracy:.2f}")
    print(f"{model_name} Improvement Factor: {improvement_factor:.2f}")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Data columns (total 50 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   encounter_id              101766 non-null  int64 
 1   patient_nbr               101766 non-null  int64 
 2   race                      101766 non-null  object
 3   gender                    101766 non-null  object
 4   age                       101766 non-null  object
 5   weight                    101766 non-null  object
 6   admission_type_id         101766 non-null  int64 
 7   discharge_disposition_id  101766 non-null  int64 
 8   admission_source_id       101766 non-null  int64 
 9   time_in_hospital          101766 non-null  int64 
 10  payer_code                101766 non-null  object
 11  medical_specialty         101766 non-null  object
 12  num_lab_procedures        101766 non-null  int64 
 13  num_procedures            101766 non-null  int64 
 14  num_




Gradient Boosting Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.61      0.63      9381
           1       0.68      0.73      0.70     10973

    accuracy                           0.67     20354
   macro avg       0.67      0.67      0.67     20354
weighted avg       0.67      0.67      0.67     20354

Gradient Boosting Confusion Matrix:
[[5714 3667]
 [3012 7961]]
Gradient Boosting ROC AUC Score:
0.7340286450700532
Gradient Boosting Model Accuracy: 0.67
Gradient Boosting Random Guessing Accuracy: 0.54
Gradient Boosting Improvement Factor: 1.25

Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.66      0.44      0.53      9381
           1       0.63      0.80      0.70     10973

    accuracy                           0.64     20354
   macro avg       0.64      0.62      0.62     20354
weighted avg       0.64      0.64      0.62     20354

Random Forest Confus

In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, VotingClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import xgboost as xgb

# Load the dataset
data = pd.read_csv('diabetic_data.csv')

# Display basic information about the dataset
print(data.info())
print(data.describe())

# Data preprocessing
# Remove columns with a single unique value
data = data[[col for col in data.columns if data[col].nunique() > 1]]

# Handle missing values
data.replace('?', np.nan, inplace=True)
data.fillna(method='ffill', inplace=True)

# Encode categorical features
categorical_columns = data.select_dtypes(include=['object']).columns
label_encoders = {}
for column in categorical_columns:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])

# Print the value counts of the 'readmitted' column before transformation
print("Value counts of the 'readmitted' column before transformation:")
print(data['readmitted'].value_counts())

# Define the feature matrix and target vector
X = data.drop(columns=['readmitted'])
# Map readmitted column to binary values: 1 for '<30', 0 otherwise
y = data['readmitted'].map(lambda x: 1 if x == 2 else 0)

# Ensure that the target variable has both classes
print("Value counts of the target variable (y):")
print(y.value_counts())

# Check if the target variable has at least two classes
if y.nunique() < 2:
    raise ValueError("The target variable 'y' must have at least two classes. Please check the preprocessing steps.")

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Gradient Boosting Classifier
gbc = GradientBoostingClassifier(random_state=42)
param_grid_gbc = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 4, 5]
}
grid_search_gbc = GridSearchCV(estimator=gbc, param_grid=param_grid_gbc, cv=5, n_jobs=2, scoring='roc_auc')
grid_search_gbc.fit(X_train_scaled, y_train)
best_gbc = grid_search_gbc.best_estimator_

# Random Forest Classifier
rf_clf = RandomForestClassifier(random_state=42)
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 5, 10]
}
grid_search_rf = GridSearchCV(estimator=rf_clf, param_grid=param_grid_rf, cv=5, n_jobs=2, scoring='roc_auc')
grid_search_rf.fit(X_train_scaled, y_train)
best_rf = grid_search_rf.best_estimator_

# XGBoost Classifier
xgb_clf = xgb.XGBClassifier(random_state=42, eval_metric='logloss')
param_grid_xgb = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 4, 5],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}
grid_search_xgb = GridSearchCV(estimator=xgb_clf, param_grid=param_grid_xgb, cv=5, n_jobs=2, scoring='roc_auc')
grid_search_xgb.fit(X_train_scaled, y_train)
best_xgb = grid_search_xgb.best_estimator_

# Ensemble Method: Voting Classifier
voting_clf = VotingClassifier(estimators=[
    ('rf', best_rf),
    ('gb', best_gbc),
    ('xgb', best_xgb)
], voting='soft')
voting_clf.fit(X_train_scaled, y_train)

# Evaluate all models
models = {
    "Gradient Boosting": best_gbc,
    "Random Forest": best_rf,
    "XGBoost": best_xgb,
    "Voting Classifier": voting_clf
}

for model_name, model in models.items():
    y_pred = model.predict(X_test_scaled)
    y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]
    print(f"\n{model_name} Classification Report:")
    print(classification_report(y_test, y_pred))
    print(f"{model_name} Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print(f"{model_name} ROC AUC Score:")
    print(roc_auc_score(y_test, y_pred_proba))

    # Model performance
    accuracy = np.mean(y_pred == y_test)
    random_accuracy = np.mean(y_test)
    improvement_factor = accuracy / random_accuracy
    print(f"{model_name} Model Accuracy: {accuracy:.2f}")
    print(f"{model_name} Random Guessing Accuracy: {random_accuracy:.2f}")
    print(f"{model_name} Improvement Factor: {improvement_factor:.2f}")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Data columns (total 50 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   encounter_id              101766 non-null  int64 
 1   patient_nbr               101766 non-null  int64 
 2   race                      101766 non-null  object
 3   gender                    101766 non-null  object
 4   age                       101766 non-null  object
 5   weight                    101766 non-null  object
 6   admission_type_id         101766 non-null  int64 
 7   discharge_disposition_id  101766 non-null  int64 
 8   admission_source_id       101766 non-null  int64 
 9   time_in_hospital          101766 non-null  int64 
 10  payer_code                101766 non-null  object
 11  medical_specialty         101766 non-null  object
 12  num_lab_procedures        101766 non-null  int64 
 13  num_procedures            101766 non-null  int64 
 14  num_




Gradient Boosting Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.61      0.63      9381
           1       0.68      0.73      0.70     10973

    accuracy                           0.67     20354
   macro avg       0.67      0.67      0.67     20354
weighted avg       0.67      0.67      0.67     20354

Gradient Boosting Confusion Matrix:
[[5714 3667]
 [3012 7961]]
Gradient Boosting ROC AUC Score:
0.7340286450700532
Gradient Boosting Model Accuracy: 0.67
Gradient Boosting Random Guessing Accuracy: 0.54
Gradient Boosting Improvement Factor: 1.25

Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.66      0.44      0.53      9381
           1       0.63      0.80      0.70     10973

    accuracy                           0.64     20354
   macro avg       0.64      0.62      0.62     20354
weighted avg       0.64      0.64      0.62     20354

Random Forest Confus

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, VotingClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import xgboost as xgb
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline

# Load the dataset
data = pd.read_csv('diabetic_data.csv')

# Display basic information about the dataset
print(data.info())
print(data.describe())

# Data preprocessing
# Remove columns with a single unique value
data = data[[col for col in data.columns if data[col].nunique() > 1]]

# Handle missing values
data.replace('?', np.nan, inplace=True)
data.fillna(method='ffill', inplace=True)

# Encode categorical features
categorical_columns = data.select_dtypes(include=['object']).columns
label_encoders = {}
for column in categorical_columns:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])

# Print the value counts of the 'readmitted' column before transformation
print("Value counts of the 'readmitted' column before transformation:")
print(data['readmitted'].value_counts())

# Define the feature matrix and target vector
X = data.drop(columns=['readmitted'])
# Map readmitted column to binary values: 1 for '<30', 0 otherwise
y = data['readmitted'].map(lambda x: 1 if x == 2 else 0)

# Ensure that the target variable has both classes
print("Value counts of the target variable (y):")
print(y.value_counts())

# Check if the target variable has at least two classes
if y.nunique() < 2:
    raise ValueError("The target variable 'y' must have at least two classes. Please check the preprocessing steps.")

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the models and their hyperparameters for GridSearchCV
models = {
    'Gradient Boosting': {
        'model': GradientBoostingClassifier(random_state=42),
        'param_grid': {
            'n_estimators': [100, 200, 300],
            'learning_rate': [0.01, 0.1, 0.05],
            'max_depth': [3, 4, 5]
        }
    },
    'Random Forest': {
        'model': RandomForestClassifier(random_state=42),
        'param_grid': {
            'n_estimators': [100, 200, 300],
            'max_depth': [3, 4, 5],
            'min_samples_split': [2, 5, 10]
        }
    },
    'XGBoost': {
        'model': xgb.XGBClassifier(random_state=42, eval_metric='logloss'),
        'param_grid': {
            'n_estimators': [100, 200, 300],
            'learning_rate': [0.01, 0.1, 0.05],
            'max_depth': [3, 4, 5],
            'subsample': [0.8, 1.0],
            'colsample_bytree': [0.8, 1.0]
        }
    }
}

# Perform GridSearchCV and select the best model for each algorithm
best_estimators = {}
for model_name, model_info in models.items():
    grid_search = GridSearchCV(estimator=model_info['model'], param_grid=model_info['param_grid'], cv=5, n_jobs=2, scoring='roc_auc')
    grid_search.fit(X_train_scaled, y_train)
    best_estimators[model_name] = grid_search.best_estimator_
    print(f"Best parameters for {model_name}: {grid_search.best_params_}")

# Ensemble Method: Voting Classifier
voting_clf = VotingClassifier(estimators=[
    ('rf', best_estimators['Random Forest']),
    ('gb', best_estimators['Gradient Boosting']),
    ('xgb', best_estimators['XGBoost'])
], voting='soft')
voting_clf.fit(X_train_scaled, y_train)

# Evaluate all models using cross-validation
for model_name, model in best_estimators.items():
    cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='roc_auc')
    print(f"{model_name} Cross-Validation ROC AUC Score: {cv_scores.mean()}")

# Evaluate all models on the test set
models['Voting Classifier'] = {'model': voting_clf}
for model_name, model_info in models.items():
    model = model_info['model']
    y_pred = model.predict(X_test_scaled)
    y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]
    print(f"\n{model_name} Classification Report:")
    print(classification_report(y_test, y_pred))
    print(f"{model_name} Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print(f"{model_name} ROC AUC Score:")
    print(roc_auc_score(y_test, y_pred_proba))

    # Model performance
    accuracy = np.mean(y_pred == y_test)
    random_accuracy = np.mean(y_test)
    improvement_factor = accuracy / random_accuracy
    print(f"{model_name} Model Accuracy: {accuracy:.2f}")
    print(f"{model_name} Random Guessing Accuracy: {random_accuracy:.2f}")
    print(f"{model_name} Improvement Factor: {improvement_factor:.2f}")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Data columns (total 50 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   encounter_id              101766 non-null  int64 
 1   patient_nbr               101766 non-null  int64 
 2   race                      101766 non-null  object
 3   gender                    101766 non-null  object
 4   age                       101766 non-null  object
 5   weight                    101766 non-null  object
 6   admission_type_id         101766 non-null  int64 
 7   discharge_disposition_id  101766 non-null  int64 
 8   admission_source_id       101766 non-null  int64 
 9   time_in_hospital          101766 non-null  int64 
 10  payer_code                101766 non-null  object
 11  medical_specialty         101766 non-null  object
 12  num_lab_procedures        101766 non-null  int64 
 13  num_procedures            101766 non-null  int64 
 14  num_

KeyboardInterrupt: 

In [None]:
# Data Preprocessing and Model Building in One Cell
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression, LassoCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, confusion_matrix
from sklearn.feature_selection import SelectFromModel
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

# Load the dataset
data = pd.read_csv('diabetic_data.csv')
data.replace('?', np.nan, inplace=True)
data.drop(columns=['weight', 'payer_code', 'medical_specialty'], inplace=True)
data.dropna(subset=['race', 'gender', 'age'], inplace=True)

# Feature Engineering
data['num_medications_age'] = data['num_medications'] * data['age']
data['num_lab_procedures_num_medications'] = data['num_lab_procedures'] * data['num_medications']

# Encode categorical variables
categorical_columns = ['race', 'gender', 'age', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
                       'max_glu_serum', 'A1Cresult', 'change', 'diabetesMed']
data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)

# Encode the target variable
data['readmitted'] = data['readmitted'].apply(lambda x: 1 if x == '<30' else 0)

# Define features and target variable
X = data.drop(columns=['readmitted', 'encounter_id', 'patient_nbr'])
y = data['readmitted']

# Ensure all columns are numeric
non_numeric_columns = X.select_dtypes(include=['object']).columns
print("Non-numeric columns:", non_numeric_columns)

# Encode any remaining non-numeric columns
for col in non_numeric_columns:
    X[col] = pd.Categorical(X[col]).codes

# Ensure there are no non-numeric columns left
print("Columns after encoding:", X.select_dtypes(include=['object']).columns)

# Split the dataset into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)

# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)

# Lasso for variable selection
lasso = LassoCV(cv=5, n_jobs=-1).fit(X_train_smote, y_train_smote)
importance = np.abs(lasso.coef_)
selected_features = X.columns[importance > 0]
print("Selected features:", selected_features)

# Use selected features
X_train_selected = X_train_smote[:, importance > 0]
X_valid_selected = X_valid_scaled[:, importance > 0]

# Decision Tree
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train_selected, y_train_smote)
y_pred_dt = dt.predict(X_valid_selected)
print("Decision Tree:")
print("Accuracy:", accuracy_score(y_valid, y_pred_dt))
print("Classification Report:\n", classification_report(y_valid, y_pred_dt))
print("ROC-AUC:", roc_auc_score(y_valid, y_pred_dt))

# Logistic Regression
log_reg = LogisticRegression(solver='lbfgs', max_iter=500, n_jobs=-1)
log_reg.fit(X_train_selected, y_train_smote)
y_pred_log_reg = log_reg.predict(X_valid_selected)
print("Logistic Regression:")
print("Accuracy:", accuracy_score(y_valid, y_pred_log_reg))
print("Classification Report:\n", classification_report(y_valid, y_pred_log_reg))
print("ROC-AUC:", roc_auc_score(y_valid, y_pred_log_reg))

# Gradient Boosting
gb = GradientBoostingClassifier(random_state=42)
gb.fit(X_train_selected, y_train_smote)
y_pred_gb = gb.predict(X_valid_selected)
print("Gradient Boosting:")
print("Accuracy:", accuracy_score(y_valid, y_pred_gb))
print("Classification Report:\n", classification_report(y_valid, y_pred_gb))
print("ROC-AUC:", roc_auc_score(y_valid, y_pred_gb))

# XGBoost
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42, n_jobs=-1)
xgb_model.fit(X_train_selected, y_train_smote)
y_pred_xgb = xgb_model.predict(X_valid_selected)
print("XGBoost:")
print("Accuracy:", accuracy_score(y_valid, y_pred_xgb))
print("Classification Report:\n", classification_report(y_valid, y_pred_xgb))
print("ROC-AUC:", roc_auc_score(y_valid, y_pred_xgb))

# Support Vector Machine
svm = SVC(kernel='linear', probability=True, random_state=42)
svm.fit(X_train_selected, y_train_smote)
y_pred_svm = svm.predict(X_valid_selected)
print("Support Vector Machine:")
print("Accuracy:", accuracy_score(y_valid, y_pred_svm))
print("Classification Report:\n", classification_report(y_valid, y_pred_svm))
print("ROC-AUC:", roc_auc_score(y_valid, y_pred_svm))

# Model comparison
models = ['Decision Tree', 'Logistic Regression', 'Gradient Boosting', 'XGBoost', 'SVM']
accuracies = [accuracy_score(y_valid, y_pred_dt), accuracy_score(y_valid, y_pred_log_reg), accuracy_score(y_valid, y_pred_gb), accuracy_score(y_valid, y_pred_xgb), accuracy_score(y_valid, y_pred_svm)]
roc_aucs = [roc_auc_score(y_valid, y_pred_dt), roc_auc_score(y_valid, y_pred_log_reg), roc_auc_score(y_valid, y_pred_gb), roc_auc_score(y_valid, y_pred_xgb), roc_auc_score(y_valid, y_pred_svm)]

model_comparison = pd.DataFrame({
    'Model': models,
    'Accuracy': accuracies,
    'ROC-AUC': roc_aucs
})

print(model_comparison)

# Hyperparameter tuning for SVM
param_grid_svm = {
    'C': [0.01, 0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

grid_svm = GridSearchCV(SVC(probability=True, random_state=42), param_grid_svm, cv=5, scoring='roc_auc', n_jobs=-1)
grid_svm.fit(X_train_selected, y_train_smote)

# Best parameters and score for SVM
print("Best parameters for SVM:", grid_svm.best_params_)
print("Best ROC-AUC score for SVM:", grid_svm.best_score_)

# Evaluate on the test set
best_svm = grid_svm.best_estimator_
y_pred_svm_best = best_svm.predict(X_valid_selected)

print("SVM with Best Parameters:")
print("Accuracy:", accuracy_score(y_valid, y_pred_svm_best))
print("Classification Report:\n", classification_report(y_valid, y_pred_svm_best))
print("ROC-AUC:", roc_auc_score(y_valid, y_pred_svm_best))

conf_matrix_svm_best = confusion_matrix(y_valid, y_pred_svm_best)
sns.heatmap(conf_matrix_svm_best, annot=True, fmt='d')
plt.title("SVM with Best Parameters Confusion Matrix")
plt.show()


Non-numeric columns: Index(['diag_1', 'diag_2', 'diag_3', 'metformin', 'repaglinide', 'nateglinide',
       'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide',
       'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose',
       'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton',
       'insulin', 'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'num_medications_age'],
      dtype='object')
Columns after encoding: Index([], dtype='object')
Selected features: Index(['time_in_hospital', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'diag_1', 'diag_2', 'diag_3', 'number_diagnoses',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', '

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LassoCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, precision_score, recall_score, f1_score, confusion_matrix
from imblearn.over_sampling import SMOTE
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt

# Load and preprocess the dataset
data = pd.read_csv('diabetic_data.csv')
data.replace('?', np.nan, inplace=True)
data.drop(columns=['weight', 'payer_code', 'medical_specialty'], inplace=True)
data.dropna(subset=['race', 'gender', 'age'], inplace=True)

# Feature Engineering
data['num_medications_age'] = data['num_medications'] * data['age']
data['num_lab_procedures_num_medications'] = data['num_lab_procedures'] * data['num_medications']

# Encode categorical variables
categorical_columns = ['race', 'gender', 'age', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id', 'max_glu_serum', 'A1Cresult', 'change', 'diabetesMed']
data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)

# Encode the target variable
data['readmitted'] = data['readmitted'].apply(lambda x: 1 if x == '<30' else 0)

# Define features and target variable
X = data.drop(columns=['readmitted', 'encounter_id', 'patient_nbr'])
y = data['readmitted']

# Encode any remaining non-numeric columns
non_numeric_columns = X.select_dtypes(include=['object']).columns
for col in non_numeric_columns:
    X[col] = pd.Categorical(X[col]).codes

# Split the dataset into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)

# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)

# Feature selection with Lasso
lasso = LassoCV(cv=5, n_jobs=-1).fit(X_train_smote, y_train_smote)
importance = np.abs(lasso.coef_)
selected_features = X.columns[importance > 0]

# Use selected features
X_train_selected = X_train_smote[:, importance > 0]
X_valid_selected = X_valid_scaled[:, importance > 0]

# Model evaluation function
def evaluate_model(model, X_valid, y_valid):
    y_pred = model.predict(X_valid)
    accuracy = accuracy_score(y_valid, y_pred)
    roc_auc = roc_auc_score(y_valid, y_pred)
    precision = precision_score(y_valid, y_pred)
    recall = recall_score(y_valid, y_pred)
    f1 = f1_score(y_valid, y_pred)
    return accuracy, roc_auc, precision, recall, f1

# Initialize models
models = {
    #'Decision Tree': DecisionTreeClassifier(random_state=42),
    #'Logistic Regression': LogisticRegression(solver='lbfgs', max_iter=500, n_jobs=-1),
    #'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    #'XGBoost': xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42, n_jobs=-1),
    'SVM': SVC(kernel='linear', probability=True, random_state=42)
}

# Train and evaluate models
results = []
for model_name, model in models.items():
    model.fit(X_train_selected, y_train_smote)
    accuracy, roc_auc, precision, recall, f1 = evaluate_model(model, X_valid_selected, y_valid)
    results.append([model_name, accuracy, roc_auc, precision, recall, f1])

# Display results
results_df = pd.DataFrame(results, columns=['Model', 'Accuracy', 'ROC-AUC', 'Precision', 'Recall', 'F1'])
print(results_df)

# Hyperparameter tuning for SVM
param_grid_svm = {
    'C': [0.01, 0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

grid_svm = GridSearchCV(SVC(probability=True, random_state=42), param_grid_svm, cv=5, scoring='roc_auc', n_jobs=-1)
grid_svm.fit(X_train_selected, y_train_smote)

# Best parameters and score for SVM
print("Best parameters for SVM:", grid_svm.best_params_)
print("Best ROC-AUC score for SVM:", grid_svm.best_score_)

# Evaluate on the validation set
best_svm = grid_svm.best_estimator_
accuracy, roc_auc, precision, recall, f1 = evaluate_model(best_svm, X_valid_selected, y_valid)

print("SVM with Best Parameters:")
print(f"Accuracy: {accuracy}")
print(f"ROC-AUC: {roc_auc}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1: {f1}")

conf_matrix_svm_best = confusion_matrix(y_valid, best_svm.predict(X_valid_selected))
sns.heatmap(conf_matrix_svm_best, annot=True, fmt='d')
plt.title("SVM with Best Parameters Confusion Matrix")
plt.show()


In [1]:
import cudf
import cupy as cp
import pandas as pd
import numpy as np
from cuml.preprocessing.model_selection import train_test_split
from cuml.linear_model import LogisticRegression
from cuml.ensemble import RandomForestClassifier
from cuml.svm import SVC
from cuml.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, confusion_matrix
from imblearn.over_sampling import SMOTE
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt

# Load and preprocess the dataset using cuDF
data = cudf.read_csv('diabetic_data.csv')
data = data.replace('?', np.nan)
data = data.drop(columns=['weight', 'payer_code', 'medical_specialty'])
data = data.dropna(subset=['race', 'gender', 'age'])

# Feature Engineering
data['num_medications_age'] = data['num_medications'] * data['age']
data['num_lab_procedures_num_medications'] = data['num_lab_procedures'] * data['num_medications']

# Encode categorical variables using cuDF
categorical_columns = ['race', 'gender', 'age', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id', 'max_glu_serum', 'A1Cresult', 'change', 'diabetesMed']
data = cudf.get_dummies(data, columns=categorical_columns, drop_first=True)

# Encode the target variable
data['readmitted'] = data['readmitted'].applymap(lambda x: 1 if x == '<30' else 0)

# Define features and target variable
X = data.drop(columns=['readmitted', 'encounter_id', 'patient_nbr'])
y = data['readmitted']

# Encode any remaining non-numeric columns
non_numeric_columns = X.select_dtypes(include=['object']).columns
for col in non_numeric_columns:
    X[col] = X[col].astype('category').cat.codes

# Split the dataset into training and validation sets using cuML
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the data and apply SMOTE
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)

smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(cp.asnumpy(X_train_scaled), cp.asnumpy(y_train))

X_train_smote = cp.array(X_train_smote)
y_train_smote = cp.array(y_train_smote)

# Model evaluation function
def evaluate_model(model, X_valid, y_valid):
    y_pred = model.predict(X_valid)
    y_pred = cp.asnumpy(y_pred)
    y_valid = cp.asnumpy(y_valid)
    accuracy = accuracy_score(y_valid, y_pred)
    roc_auc = roc_auc_score(y_valid, y_pred)
    precision = precision_score(y_valid, y_pred)
    recall = recall_score(y_valid, y_pred)
    f1 = f1_score(y_valid, y_pred)
    return accuracy, roc_auc, precision, recall, f1

# Initialize models
models = {
    #'Logistic Regression': LogisticRegression(max_iter=500),
    #'Random Forest': RandomForestClassifier(random_state=42),
    #'XGBoost': xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42, tree_method='gpu_hist'),
    'SVM': SVC(kernel='linear', probability=True, random_state=42)
}

# Train and evaluate models
results = []
for model_name, model in models.items():
    model.fit(X_train_smote, y_train_smote)
    accuracy, roc_auc, precision, recall, f1 = evaluate_model(model, X_valid_scaled, y_valid)
    results.append([model_name, accuracy, roc_auc, precision, recall, f1])

# Display results
results_df = pd.DataFrame(results, columns=['Model', 'Accuracy', 'ROC-AUC', 'Precision', 'Recall', 'F1'])
print(results_df)

# Hyperparameter tuning for SVM using cuML
param_distributions_svm = {
    'C': [0.01, 0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

# You may need to implement your own RandomizedSearchCV or use the existing one from scikit-learn with n_jobs set to 1
# because cuML doesn't have a direct implementation of RandomizedSearchCV.
from sklearn.model_selection import RandomizedSearchCV

grid_svm = RandomizedSearchCV(SVC(probability=True, random_state=42), param_distributions_svm, n_iter=20, cv=5, scoring='roc_auc', random_state=42)
grid_svm.fit(cp.asnumpy(X_train_smote), cp.asnumpy(y_train_smote))

# Best parameters and score for SVM
print("Best parameters for SVM:", grid_svm.best_params_)
print("Best ROC-AUC score for SVM:", grid_svm.best_score_)

# Evaluate on the validation set
best_svm = grid_svm.best_estimator_
accuracy, roc_auc, precision, recall, f1 = evaluate_model(best_svm, X_valid_scaled, y_valid)

print("SVM with Best Parameters:")
print(f"Accuracy: {accuracy}")
print(f"ROC-AUC: {roc_auc}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1: {f1}")

conf_matrix_svm_best = confusion_matrix(cp.asnumpy(y_valid), cp.asnumpy(best_svm.predict(X_valid_scaled)))
sns.heatmap(conf_matrix_svm_best, annot=True, fmt='d')
plt.title("SVM with Best Parameters Confusion Matrix")
plt.show()


ModuleNotFoundError: No module named 'cudf'