In [None]:
import numpy as np
import pandas as pd

# Input CSV; a relative path, so it is resolved against the current working directory.
FILE_PATH = "Chronic_Kidney_Disease_data.csv"

# Noise scale used during augmentation, as a fraction of each column's standard deviation.
NOISE_FACTOR = 0.05
# Label column in the raw CSV; it is re-encoded into a binary `target` column on load.
TARGET_COLUMN = 'Diagnosis'

# Columns that are coerced to numeric, median-imputed on load, and perturbed with
# Gaussian noise when synthetic rows are generated.
CONTINUOUS_COLS = [
    'Age',
    'BMI',
    'SystolicBP',
    'DiastolicBP',
    'FastingBloodSugar',
    'HbA1c',
    'SerumCreatinine',
    'BUNLevels',
    'GFR',
    'HemoglobinLevels',
    'CholesterolTotal',
    'CholesterolLDL',
    'CholesterolHDL',
    'CholesterolTriglycerides',
    'SerumElectrolytesSodium',
    'SerumElectrolytesPotassium',
    'SerumElectrolytesCalcium',
    'SerumElectrolytesPhosphorus',
]

# Columns that are coerced to numeric, mode-imputed and cast to int on load, and copied
# without noise into synthetic rows.
# NOTE(review): the int cast truncates fractional values -- confirm that every column
# listed here is genuinely integer-coded in the CSV.
# Columns that appear in neither list are not imputed by load_and_preprocess_data.
DISCRETE_COLS = [
    'Gender',
    'Ethnicity',
    'SocioeconomicStatus',
    'EducationLevel',
    'Smoking',
    'AlcoholConsumption',
    'PhysicalActivity',
    'DietQuality',
    'SleepQuality',
    'FamilyHistoryKidneyDisease',
    'FamilyHistoryHypertension',
    'FamilyHistoryDiabetes',
    'PreviousAcuteKidneyInjury',
    'UrinaryTractInfections',
    'ACEInhibitors',
    'Diuretics',
    'NSAIDsUse',
    'Statins',
    'AntidiabeticMedications',
    'Edema',
    'FatigueLevels',
    'NauseaVomiting',
    'MuscleCramps',
    'Itching',
    'ProteinInUrine',
    'ACR',
    'WaterQuality',
    'MedicalCheckupsFrequency',
    'MedicationAdherence',
    'HealthLiteracy',
]

def load_and_preprocess_data(file_path, target_column=None, continuous_cols=None, discrete_cols=None):
    """Load the CSV, derive a binary ``target`` column and impute the feature columns.

    Steps:
      1. Read the CSV and discard rows whose label is missing, ``'?'`` or
         ``'confidential'`` (case-insensitive).
      2. Encode the two most frequent remaining labels as ``0`` (most frequent) and
         ``1`` (second most frequent); rows carrying any other label are dropped.
         NOTE: codes are assigned by frequency rank, not by the original value, so a
         column that already holds 0/1 is re-coded whenever 1 is the more frequent value.
      3. Drop the raw label column and the ``PatientID``, ``DoctorInCharge`` and
         ``QualityOfLifeScore`` columns when present.
      4. Treat ``'?'`` and ``'Confidential'`` cells as missing.
      5. Discrete columns: coerce to numeric, fill gaps with the mode, cast to int.
         A discrete column with no numeric value at all is dropped with a warning
         (it has no mode to impute from).
         NOTE(review): the int cast truncates fractional values -- confirm that the
         columns passed as discrete are integer-coded.
      6. Continuous columns: coerce to numeric, fill gaps with the median.

    Args:
        file_path: Path (or file-like object) accepted by ``pandas.read_csv``.
        target_column: Name of the label column. Defaults to ``TARGET_COLUMN``.
        continuous_cols: Names of continuous features. Defaults to ``CONTINUOUS_COLS``.
        discrete_cols: Names of discrete features. Defaults to ``DISCRETE_COLS``.

    Returns:
        The preprocessed DataFrame with an integer ``target`` column, or ``None``
        (after printing an error) when the file is missing, the label column is
        absent, or fewer than two distinct labels remain.
    """
    # Resolve the module-level defaults lazily so callers may override any of them.
    target_column = TARGET_COLUMN if target_column is None else target_column
    continuous_cols = CONTINUOUS_COLS if continuous_cols is None else continuous_cols
    discrete_cols = DISCRETE_COLS if discrete_cols is None else discrete_cols

    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}. Please ensure the file is available.")
        return None

    if target_column not in df.columns:
        print(f"Error: Target column '{target_column}' not found in the file.")
        return None

    # Rows without a usable label cannot be used for supervised learning.
    label_text = df[target_column].astype(str).str.strip().str.lower()
    usable = df[target_column].notna() & ~label_text.isin(['confidential', '?'])
    df = df.loc[usable]

    # value_counts() already sorts by descending frequency.
    label_counts = df[target_column].value_counts()
    if len(label_counts) < 2:
        print(f"Error: Could not find two distinct, valid labels in the '{target_column}' column for binary classification.")
        return None

    label_map = {label_counts.index[0]: 0, label_counts.index[1]: 1}
    encoded = df[target_column].map(label_map)
    in_top_two = encoded.notna()

    # Build a fresh frame in one chain instead of mutating a filtered slice in place.
    df = (
        df.loc[in_top_two]
        .assign(target=encoded[in_top_two].astype(int))
        .drop(columns=[target_column, 'PatientID', 'DoctorInCharge', 'QualityOfLifeScore'], errors='ignore')
        .replace(['?', 'Confidential'], np.nan)
    )

    for col in discrete_cols:
        if col not in df.columns:
            continue
        numeric = pd.to_numeric(df[col], errors='coerce')
        modes = numeric.mode()
        if modes.empty:
            # Every value is missing/non-numeric: there is no mode to impute with.
            print(f"Warning: discrete column '{col}' has no numeric values; dropping it.")
            df = df.drop(columns=[col])
            continue
        df[col] = numeric.fillna(modes.iloc[0]).astype(int)

    for col in continuous_cols:
        if col not in df.columns:
            continue
        numeric = pd.to_numeric(df[col], errors='coerce')
        # An all-missing column has a NaN median and is therefore left as NaN.
        df[col] = numeric.fillna(numeric.median())

    return df

def augment_continuous_features(X_continuous, noise_factor, rng=None):
    """Return a copy of ``X_continuous`` perturbed with column-scaled Gaussian noise.

    Each entry receives ``N(0, 1) * column_std * noise_factor``. The standard
    deviation is the population std (ddof=0) of the rows passed in, so the noise
    scale reflects the spread of this batch.

    Args:
        X_continuous: Array-like of numeric values, rows = samples, columns = features.
        noise_factor: Multiplier applied to the per-column standard deviation.
        rng: Optional object exposing ``normal(loc, scale, size)`` (for example
            ``numpy.random.default_rng(seed)``). When omitted, NumPy's global
            random state is used, so ``np.random.seed`` still controls the result.

    Returns:
        A float ``numpy.ndarray`` with the same shape as the input.
    """
    X = np.asarray(X_continuous, dtype=float)
    if X.size == 0:
        # Nothing to perturb; also avoids std-of-empty-slice warnings.
        return X.copy()

    missing = np.isnan(X)
    if missing.any():
        # A plain std() would be NaN for any column containing a NaN and would turn the
        # whole column into NaN once noise is added. Ignore missing entries instead;
        # all-missing columns are zero-filled for the computation only so nanstd stays quiet.
        all_missing = missing.all(axis=0)
        std_devs = np.nanstd(np.where(all_missing, 0.0, X), axis=0)
    else:
        std_devs = X.std(axis=0)

    # Constant (or otherwise degenerate) columns get a tiny floor so they still receive noise.
    std_devs = np.where(np.isfinite(std_devs) & (std_devs > 0), std_devs, 1e-6)

    sampler = np.random if rng is None else rng
    noise = sampler.normal(loc=0.0, scale=1.0, size=X.shape)
    return X + noise * std_devs * noise_factor

def perform_oversampling(df_original, continuous_cols=None, discrete_cols=None, noise_factor=None):
    """Balance the classes in ``df_original`` by appending synthetic minority rows.

    Every class with fewer rows than the largest class is topped up to that size.
    Synthetic rows are drawn with replacement from the class's own rows; continuous
    columns are perturbed through ``augment_continuous_features``, discrete columns
    are copied and cast to int, and any remaining feature column is copied unchanged
    (so synthetic rows do not end up with NaN in columns outside both lists).

    Randomness comes from NumPy's global random state; call ``np.random.seed``
    beforehand for a reproducible result.

    Args:
        df_original: DataFrame containing the features and a ``target`` column.
        continuous_cols: Names of continuous features. Defaults to ``CONTINUOUS_COLS``.
        discrete_cols: Names of discrete features. Defaults to ``DISCRETE_COLS``.
        noise_factor: Noise scale handed to ``augment_continuous_features``.
            Defaults to ``NOISE_FACTOR``.

    Returns:
        Tuple ``(df_final, df_augmented, target_class_count)``:
        the original rows followed by the synthetic rows, the synthetic rows alone
        (an empty DataFrame when nothing had to be generated), and the per-class row
        count after balancing.
    """
    # Resolve the module-level defaults lazily so callers may override any of them.
    continuous_cols = CONTINUOUS_COLS if continuous_cols is None else continuous_cols
    discrete_cols = DISCRETE_COLS if discrete_cols is None else discrete_cols
    noise_factor = NOISE_FACTOR if noise_factor is None else noise_factor

    class_counts = df_original['target'].value_counts()
    if class_counts.empty:
        # No labelled rows at all: nothing to balance.
        return df_original.copy(), pd.DataFrame(), 0

    TARGET_CLASS_COUNT = class_counts.max()

    feature_cols = [col for col in df_original.columns if col != 'target']
    current_continuous_cols = [col for col in continuous_cols if col in feature_cols]
    current_discrete_cols = [
        col for col in discrete_cols
        if col in feature_cols and col not in current_continuous_cols
    ]
    listed = set(current_continuous_cols) | set(current_discrete_cols)
    # Feature columns named in neither list are carried over verbatim.
    passthrough_cols = [col for col in feature_cols if col not in listed]

    def generate_augmented_data(df_class, n_augment, target_class):
        """Create ``n_augment`` synthetic rows from the rows of one class."""
        sample_indices = np.random.choice(len(df_class), size=n_augment, replace=True)
        sampled = df_class.iloc[sample_indices].reset_index(drop=True)

        pieces = []
        if current_continuous_cols:
            X_continuous = sampled[current_continuous_cols].to_numpy(dtype=float)
            X_aug_continuous = augment_continuous_features(X_continuous, noise_factor)
            pieces.append(pd.DataFrame(X_aug_continuous, columns=current_continuous_cols))
        if current_discrete_cols:
            pieces.append(sampled[current_discrete_cols].astype(int))
        if passthrough_cols:
            pieces.append(sampled[passthrough_cols])

        if pieces:
            df_aug = pd.concat(pieces, axis=1)
        else:
            df_aug = pd.DataFrame(index=pd.RangeIndex(n_augment))
        df_aug['target'] = target_class
        return df_aug

    # Walk the classes from most to least frequent; only under-represented ones need rows.
    augmented_parts = []
    for target_class, count in class_counts.items():
        n_augment = int(TARGET_CLASS_COUNT - count)
        if n_augment <= 0:
            continue
        df_class = df_original.loc[df_original['target'] == target_class, feature_cols]
        augmented_parts.append(generate_augmented_data(df_class, n_augment, target_class))

    # Only non-empty frames are concatenated, which keeps dtypes stable.
    if augmented_parts:
        df_augmented = pd.concat(augmented_parts, ignore_index=True)
    else:
        df_augmented = pd.DataFrame()

    df_final = pd.concat([df_original] + augmented_parts, ignore_index=True)

    return df_final, df_augmented, TARGET_CLASS_COUNT

# Seed NumPy's global random state so the resampling and the Gaussian noise -- and
# therefore the exported CSV -- are identical on every Restart & Run All.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

initial_df = load_and_preprocess_data(FILE_PATH)

# load_and_preprocess_data returns None (after printing the reason) when the data cannot
# be used. The export lives inside the guard so that case no longer ends in a NameError
# on `final_df`.
if initial_df is not None:
    final_df, augmented_df, target_count = perform_oversampling(initial_df)

    # The exported file names the label column `class` rather than `target`.
    final_df = final_df.rename(columns={'target': 'class'})
    final_df.to_csv('CKD_Augmented.csv', index=False)

In [None]:
# This cell draws with `plt`, which no visible cell imports; importing it here keeps the
# cell runnable on a fresh kernel (re-importing is harmless if it is imported elsewhere).
import matplotlib.pyplot as plt

class_column = 'class'
# NOTE(review): `df_336` is not defined in any cell visible in this notebook. Confirm where
# it comes from (e.g. the frame written to 'CKD_Augmented.csv') so that this cell survives
# Restart Kernel -> Run All.
class_counts = df_336[class_column].value_counts().sort_values(ascending=False)
labels = class_counts.index
sizes = class_counts.values
print("Labels: ", labels.values)
print("Sizes: ", sizes)

fig, ax = plt.subplots(figsize=(3, 3))
ax.pie(
    sizes,
    labels=labels,
    # Format the percentage to one decimal place
    autopct='%1.1f%%',
    startangle=90,  # Start the first slice at the top
    colors=plt.cm.viridis(np.linspace(0, 1, len(labels))),  # One colour per class
    wedgeprops={'edgecolor': 'black', 'linewidth': 1, 'antialiased': True},  # Add borders
)
ax.axis('equal')  # Equal aspect ratio keeps the pie circular
ax.set_title(f"Distribution of '{class_column}'")
plt.show()