Imports and Data Loading

In [41]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder,LabelEncoder,StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline




In [34]:
# Loading dataset

data_set=pd.read_excel("Healthcare_dataset.xlsx")

In [45]:
# Define the feature matrix and target vector
X = data_set.drop(columns=['Persistency_Flag'])
y = data_set['Persistency_Flag']

# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns

# Preprocessing for numerical data
numerical_transformer = StandardScaler()

# Preprocessing for categorical data
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Define models
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=1000,random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42),
    'SVM': SVC(kernel='linear', random_state=42)
}

# Initialize a dictionary to store results
results_combined = {}

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Evaluate each model
for model_name, model in models.items():
    # Create and evaluate the pipeline
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, pos_label='Persistent')
    recall = recall_score(y_test, y_pred, pos_label='Persistent')
    roc_auc = roc_auc_score(pd.get_dummies(y_test)['Persistent'], pd.get_dummies(y_pred)['Persistent'])

    # Store results
    results_combined[model_name] = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'roc_auc': roc_auc
    }

# Convert results to a DataFrame
metrics_df = pd.DataFrame(results_combined)

metrics_df

Unnamed: 0,Random Forest,Logistic Regression,Gradient Boosting,SVM
accuracy,0.816058,0.808759,0.805839,0.80438
precision,0.793578,0.766234,0.778802,0.763158
recall,0.681102,0.69685,0.665354,0.685039
roc_auc,0.788347,0.78578,0.776993,0.779875
