# Step 1: Prepare a dataset whose missing values are filled with the mean (numerical columns) or the mode (categorical columns)

In [2]:
import pandas as pd
from sklearn.impute import SimpleImputer

# Load the feature-selected dataset (swap in the path to your own data).
df = pd.read_csv('/workspaces/Chronic-Kidney-Disease/Data files/df_features_selected.csv')

# Fill the gaps one column at a time, overwriting each column in place:
# int64/float64 columns are filled with their mean, every other dtype with
# its most frequent value (mode).
numeric_dtypes = ('int64', 'float64')
for column in df.columns:
    fill_strategy = 'mean' if df[column].dtype in numeric_dtypes else 'most_frequent'
    imputer = SimpleImputer(strategy=fill_strategy)
    # The imputer is fed a single-column 2-D frame; flatten its result back
    # into a 1-D array before storing it as the column.
    filled_values = imputer.fit_transform(df[[column]])
    df[column] = filled_values.ravel()

# Label -> numeric code for every categorical column that gets encoded.
encoding_rules = {
    # Binary encoding (yes->1, no->0)
    'hypertension': {'yes': 1, 'no': 0},  # Hypertension
    'diabetes mellitus': {'yes': 1, 'no': 0},    # Diabetes Mellitus
    'coronary artery disease': {'yes': 1, 'no': 0},   # Coronary Artery Disease
    'appetite': {'good': 0, 'poor': 1},
    'pedal edema': {'yes': 1, 'no': 0},    # Pedal Edema
    'anemia': {'yes': 1, 'no': 0},   # Anemia
    'ckd or not ckd': {'ckd': 1, 'notckd': 0}  # Target variable
}


def _encode_label(value, mapping):
    """Translate one raw cell value into its numeric code.

    Parameters
    ----------
    value : object
        Raw cell value taken from the column being encoded.
    mapping : dict
        Label -> code rule for that column.

    Returns
    -------
    The code for ``value``; ``value`` itself when it already is one of the
    codes (so re-running the cell does not wipe the column); NaN otherwise.
    """
    if isinstance(value, str):
        # Tolerate stray whitespace / capitalisation such as ' yes' or 'CKD\t'.
        value = value.strip().lower()
    if value in mapping:
        return mapping[value]
    if value in mapping.values():
        return value
    return float('nan')


# Apply encoding
for column, mapping in encoding_rules.items():
    if column not in df.columns:
        continue  # rule for a column this dataset does not contain
    encoded = df[column].map(lambda value: _encode_label(value, mapping))
    # A value that was present but has no rule would silently become NaN;
    # fail loudly instead so the gap is not carried into modelling.
    unmapped = encoded.isna() & df[column].notna()
    if unmapped.any():
        unknown_labels = sorted({repr(label) for label in df[column][unmapped]})
        raise ValueError(
            f"Column {column!r} contains labels with no encoding rule: "
            f"{', '.join(unknown_labels)}"
        )
    df[column] = encoded.astype('float64')



# Step 2 reads 'df_filled.csv'; uncomment the next line to (re)generate that
# file from the frame prepared above.
#df.to_csv("df_filled.csv", index=False)


In [3]:
# Step 2: Input a sample record and evaluate it using the best model settings found

In [4]:
# Load the training data from 'df_filled.csv' (presumably the imputed and
# encoded frame produced in Step 1 - verify the file is up to date).
df = pd.read_csv('df_filled.csv')

# User data: the single record to diagnose. Categorical fields are given as
# 0/1 codes, 'age' as a raw value that is standardised below.
data = {
    'age': [30],
    'hypertension': [1],
    'diabetes mellitus': [0],
    'coronary artery disease': [0],
    'appetite': [0],
    'anemia': [0],
    'pedal edema': [0]
}
df_user = pd.DataFrame(data)

from sklearn.preprocessing import StandardScaler
# Standardise 'age' using statistics fitted on the training data, then apply
# the very same fitted transformation to the user record.
scaler = StandardScaler()
df['age'] = scaler.fit_transform(df[['age']])
df_user['age'] = scaler.transform(df_user[['age']])

# Split the training frame into features and target.
X = df.drop(['ckd or not ckd'], axis=1)
y = df['ckd or not ckd']

# Present the user record with exactly the training features, in training
# order: the hand-typed dict above need not follow the CSV's column order, and
# scikit-learn checks feature names (including order) against those seen in
# fit. A column missing from the record raises a KeyError here.
df_user = df_user[X.columns]

from sklearn.ensemble import (
    RandomForestClassifier,
    ExtraTreesClassifier,
    GradientBoostingClassifier,
    AdaBoostClassifier
)
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.base import clone

# Candidate classifiers with their chosen hyperparameters, keyed by class
# name. Insertion order is kept as-is because consumers iterate over the dict.
RANDOM_STATE = 42

# RandomForest and ExtraTrees are configured with identical settings.
forest_settings = dict(
    bootstrap=True,
    class_weight='balanced',
    criterion='gini',
    max_depth=None,
    max_features='sqrt',
    min_samples_leaf=2,
    min_samples_split=5,
    n_estimators=100,
    random_state=RANDOM_STATE,
)

models = {
    'RandomForestClassifier': RandomForestClassifier(**forest_settings),
    'ExtraTreesClassifier': ExtraTreesClassifier(**forest_settings),
    'GradientBoostingClassifier': GradientBoostingClassifier(
        criterion='friedman_mse', learning_rate=0.01, max_depth=5,
        max_features='sqrt', min_samples_leaf=1, min_samples_split=2,
        n_estimators=100, subsample=0.8, random_state=RANDOM_STATE,
    ),
    # Boosted depth-2 decision trees.
    'AdaBoostClassifier': AdaBoostClassifier(
        estimator=DecisionTreeClassifier(max_depth=2),
        learning_rate=1.0, n_estimators=100, random_state=RANDOM_STATE,
    ),
    'GaussianNB': GaussianNB(var_smoothing=1e-11),
    'LogisticRegression': LogisticRegression(
        C=1, class_weight='balanced', max_iter=1000,
        penalty='l2', solver='lbfgs', random_state=RANDOM_STATE,
    ),
    'DecisionTreeClassifier': DecisionTreeClassifier(
        ccp_alpha=0.0, class_weight='balanced', criterion='gini',
        max_depth=None, max_features='sqrt', min_samples_leaf=2,
        min_samples_split=2, splitter='best', random_state=RANDOM_STATE,
    ),
    # probability=True so the model exposes predict_proba for soft voting.
    'SVC': SVC(
        C=10, class_weight='balanced', degree=2, gamma='scale',
        kernel='rbf', probability=True, random_state=RANDOM_STATE,
    ),
    'XGBClassifier': XGBClassifier(
        colsample_bytree=0.8, learning_rate=0.01, max_depth=3,
        min_child_weight=1, n_estimators=100, objective='binary:logistic',
        subsample=0.8, tree_method='auto', random_state=RANDOM_STATE,
        eval_metric='logloss',
    ),
    'KNeighborsClassifier': KNeighborsClassifier(
        algorithm='auto', leaf_size=30, n_neighbors=3, weights='uniform',
    ),
}

def _fit_and_report(title, estimators, features, target, sample):
    """Fit a soft-voting ensemble and print its verdict for ``sample``.

    Parameters
    ----------
    title : str
        Heading printed above the result.
    estimators : list of (name, estimator) tuples
        Ensemble members. Each one is cloned, so the ensemble never holds the
        shared instances passed in.
    features, target
        Training features and labels the ensemble is fitted on.
    sample
        Record(s) to diagnose; only the first row's probability is printed.

    Returns
    -------
    tuple
        ``(fitted VotingClassifier, [predicted labels, class probabilities])``.

    Raises
    ------
    ValueError
        If label ``1`` is not among the classes seen during fit.
    """
    ensemble = VotingClassifier(
        estimators=[(name, clone(model)) for name, model in estimators],
        voting='soft'  # average of the members' predicted class probabilities
    )
    ensemble.fit(features, target)
    outcome = [ensemble.predict(sample), ensemble.predict_proba(sample)]
    # Look up which probability column belongs to the positive class (label 1)
    # instead of assuming it is always the second one.
    positive_column = list(ensemble.classes_).index(1)
    print(title + "\nDiagnosis:", (outcome[0] == float(1)), "\nIndex:", outcome[1][0, positive_column])
    return ensemble, outcome


# Tier 1: two-model ensemble (GaussianNB + LogisticRegression).
tier1_clf, result = _fit_and_report(
    "High Sensitivity Voting Classifier",
    [('GaussianNB', models['GaussianNB']), ('LogisticRegression', models['LogisticRegression'])],
    X, y, df_user,
)

# Tier 2: ensemble of every configured model.
tier2_clf, result = _fit_and_report(
    "Full Ensemble Voting Classifier",
    list(models.items()),
    X, y, df_user,
)

High Sensitivity Voting Classifier
Diagnosis: [ True] 
Index: 0.9056489909061978
Full Ensemble Voting Classifier
Diagnosis: [ True] 
Index: 0.836547408235852
