In [1]:
import pandas as pd
import numpy as np

In [2]:
from ucimlrepo import fetch_ucirepo

# Fetch dataset id 857 through ucimlrepo, then place its feature table and
# its target table side by side in one working frame.
dataset_857 = fetch_ucirepo(id=857)
df_857 = pd.concat(
    [
        dataset_857.data.features,
        dataset_857.data.targets,
    ],
    axis=1,
)

In [3]:
# Columns kept for the analysis, listed group by group; the target comes last.
col = (
    ["age"]                             # demographics
    + ["htn", "dm", "cad"]              # conditions: hypertension, diabetes mellitus, coronary artery disease
    + ["su", "sg", "al"]                # urine tests: sugar, specific gravity, albumin
    + ["sc", "grf", "hemo", "rbcc"]     # blood tests: serum creatinine, glomerular filtration rate (eGFR),
                                        #              hemoglobin, red blood cell count
    + ["class"]                         # target variable
)

In [4]:
# Remove the cap on displayed columns so wide frames print in full, then show
# the first six rows of the selected columns.
pd.options.display.max_columns = None
print(df_857.loc[:, col].head(n=6))

      age  htn  dm  cad   su             sg     al      sc                grf  \
0    < 12    0   0    0  < 0  1.019 - 1.021  1-Jan  < 3.65          ≥ 227.944   
1    < 12    0   0    0  < 0  1.009 - 1.011    < 0  < 3.65          ≥ 227.944   
2    < 12    0   0    0  < 0  1.009 - 1.011    ≥ 4  < 3.65  127.281 - 152.446   
3    < 12    0   0    0  < 0  1.009 - 1.011  3-Mar  < 3.65  127.281 - 152.446   
4  20-Dec    0   1    0  < 0  1.015 - 1.017    < 0  < 3.65  127.281 - 152.446   
5  20-Dec    0   0    0  < 0        ≥ 1.023    < 0  < 3.65  102.115 - 127.281   

          hemo         rbcc   class  
0  11.3 - 12.6  4.46 - 5.05     ckd  
1  11.3 - 12.6  4.46 - 5.05     ckd  
2     8.7 - 10  4.46 - 5.05     ckd  
3  13.9 - 15.2  4.46 - 5.05     ckd  
4  13.9 - 15.2  5.05 - 5.64     ckd  
5       ≥ 16.5  5.05 - 5.64  notckd  


In [5]:
# For every selected column, list its distinct values and how many there are.
# Note: nunique() does not count NaN, whereas unique() would list it.
for column in col:
    column_values = df_857[column]
    print(f"\nUnique values in '{column}':")
    print(column_values.unique())
    print(f"Count: {column_values.nunique()}")


Unique values in 'age':
['< 12' '20-Dec' '20 - 27' '27 - 35' '35 - 43' '43 - 51' '51 - 59'
 '59 - 66' '66 - 74' '≥ 74']
Count: 10

Unique values in 'htn':
[0 1]
Count: 2

Unique values in 'dm':
[0 1]
Count: 2

Unique values in 'cad':
[0 1]
Count: 2

Unique values in 'su':
['< 0' '4-Apr' '2-Feb' '4-Mar' '2-Jan' '≥ 4']
Count: 6

Unique values in 'sg':
['1.019 - 1.021' '1.009 - 1.011' '1.015 - 1.017' '≥ 1.023' '< 1.007']
Count: 5

Unique values in 'al':
['1-Jan' '< 0' '≥ 4' '3-Mar' '2-Feb']
Count: 5

Unique values in 'sc':
['< 3.65' '3.65 - 6.8' '16.25 - 19.4' '6.8 - 9.95' '13.1 - 16.25'
 '9.95 - 13.1' '≥ 28.85']
Count: 7

Unique values in 'grf':
['≥ 227.944' '127.281 - 152.446' '102.115 - 127.281' '177.612 - 202.778'
 '26.6175 - 51.7832' '51.7832 - 76.949' '76.949 - 102.115'
 '152.446 - 177.612' '202.778 - 227.944' '< 26.6175' ' p ']
Count: 11

Unique values in 'hemo':
['11.3 - 12.6' '8.7 - 10' '13.9 - 15.2' '≥ 16.5' '10 - 11.3' '7.4 - 8.7'
 '12.6 - 13.9' '15.2 - 16.5' '< 6.1' '6.1 - 7.

In [6]:
# Ordinal code for each urine-sugar label. Labels such as '2-Jan' or '4-Mar'
# are ranges that a spreadsheet rewrote as dates on their way into the raw data.
sugar_mapping = {
    '< 0': 0,
    '≥ 4': 4,      # or 5, depending on your scale; as written it shares level 4 with '4-Apr'
    '4-Apr': 4,
    '4-Mar': 3,
    '2-Feb': 2,
    '2-Jan': 1,
    # Also handle possible variations
    '<0': 0,
    '>=4': 4,
    '≥4': 4,
    # Handle if already numeric strings
    '0': 0,
    '1': 1,
    '2': 2,
    '3': 3,
    '4': 4,
    '5': 5,
}

# Compare on stripped strings so stray whitespace cannot defeat the lookup.
su_labels = df_857['su'].astype(str).str.strip()

# Known labels take their mapped code; anything else is parsed as a plain
# number, and whatever still cannot be parsed ends up as NaN.
su_numeric = su_labels.map(sugar_mapping)
su_numeric = su_numeric.fillna(pd.to_numeric(su_labels, errors='coerce'))

# Impute what is still missing: diabetic rows (dm == 1) receive the most common
# sugar level among diabetic rows, every other row receives 0. The mode is
# computed once, and falls back to 0 when no diabetic row has a usable value
# (indexing an empty mode would otherwise raise).
su_missing = su_numeric.isna()
if su_missing.any():
    is_diabetic = df_857['dm'] == 1
    diabetic_modes = su_numeric[is_diabetic].mode()
    diabetic_fill = diabetic_modes.iloc[0] if not diabetic_modes.empty else 0
    su_numeric = su_numeric.mask(su_missing & is_diabetic, diabetic_fill)
    su_numeric = su_numeric.mask(su_missing & ~is_diabetic, 0)

df_857['su'] = su_numeric.astype(float)

In [7]:
# Representative specific-gravity value for each binned label: the midpoint of
# a closed bin, or a value just beyond the bound of an open-ended bin.
sg_mapping = {
    '< 1.007': 1.005,      # Severe impairment
    '1.009 - 1.011': 1.010, # Mild impairment
    '1.015 - 1.017': 1.016, # Normal
    '1.019 - 1.021': 1.020, # Normal/concentrated
    '≥ 1.023': 1.025,       # Concentrated
    # Handle variations
    '<1.007': 1.005,
    '>=1.023': 1.025,
    '≥1.023': 1.025,
}

# Step 1: normalise to stripped strings so the mapping keys match exactly.
sg_labels = df_857['sg'].astype(str).str.strip()

# Step 2: mapped labels first, then plain numeric strings; the rest becomes NaN.
sg_numeric = sg_labels.map(sg_mapping)
sg_numeric = sg_numeric.fillna(pd.to_numeric(sg_labels, errors='coerce'))

# Step 3: fill gaps with the median and assign the result back to the frame.
# (Calling fillna(..., inplace=True) on df_857['sg'] is chained assignment: it
# warns on pandas 2.x and leaves the frame untouched under Copy-on-Write.)
df_857['sg'] = sg_numeric.fillna(sg_numeric.median())

In [8]:
# Representative red-blood-cell-count value for each binned label: the midpoint
# of a closed bin, or a value just beyond the bound of an open-ended bin.
rbcc_mapping = {
    '< 2.69': 2.40,           # Severe anemia
    '2.69 - 3.28': 2.985,     # Moderate anemia
    '3.28 - 3.87': 3.575,     # Mild anemia
    '3.87 - 4.46': 4.165,     # Low normal
    '4.46 - 5.05': 4.755,     # Normal
    '5.05 - 5.64': 5.345,     # Normal
    '5.64 - 6.23': 5.935,     # Normal/high
    '6.23 - 6.82': 6.525,     # High
    '6.82 - 7.41': 7.115,     # Bin between the two neighbours; without it such rows would be median-imputed
    '≥ 7.41': 7.70,           # Polycythemia
    # Handle variations
    '<2.69': 2.40,
    '>=7.41': 7.70,
    '≥7.41': 7.70,
}

# Normalise to stripped strings so the mapping keys match exactly.
rbcc_labels = df_857['rbcc'].astype(str).str.strip()

# Mapped labels first, then plain numeric strings; anything else becomes NaN.
rbcc_numeric = rbcc_labels.map(rbcc_mapping)
rbcc_numeric = rbcc_numeric.fillna(pd.to_numeric(rbcc_labels, errors='coerce'))

# Impute missing values with the median and assign back to the frame.
# (fillna(..., inplace=True) on df_857['rbcc'] is chained assignment and does
# not update the frame under pandas Copy-on-Write.)
df_857['rbcc'] = rbcc_numeric.fillna(rbcc_numeric.median())

In [9]:
# Representative age (in whole years) for each binned label.
age_mapping = {
    '< 12': 10,
    '20-Dec': 16,      # Excel corruption of "12-20"
    '12-Dec': 16,
    '12 - 20': 16,     # the same bin in its uncorrupted spelling
    '20 - 27': 24,
    '27 - 35': 31,
    '35 - 43': 39,
    '43 - 51': 47,
    '51 - 59': 55,
    '59 - 66': 63,
    '66 - 74': 70,
    '≥ 74': 77,
}

# Normalise to stripped strings so the mapping keys match exactly.
age_labels = df_857['age'].astype(str).str.strip()

# Mapped labels first, then plain numeric strings; anything else becomes NaN.
age_numeric = age_labels.map(age_mapping)
age_numeric = age_numeric.fillna(pd.to_numeric(age_labels, errors='coerce'))

# Fill gaps with the median *before* the integer cast: astype(int) raises on
# NaN, and fillna(..., inplace=True) on df_857['age'] (chained assignment) does
# not reliably write back to the frame.
age_numeric = age_numeric.fillna(age_numeric.median())
df_857['age'] = age_numeric.round(0).astype(int)

In [10]:
# Ordinal code for each urine-albumin label. Labels such as '1-Jan' are ranges
# that a spreadsheet rewrote as dates on their way into the raw data.
albumin_mapping = {
    '< 0': 0,
    '1-Jan': 1,
    '2-Feb': 2,
    '3-Mar': 3,
    '4-Apr': 4,
    '≥ 4': 4,
    '5-May': 5,
}

# Normalise to stripped strings so the mapping keys match exactly.
al_labels = df_857['al'].astype(str).str.strip()

# Mapped labels first, then plain numeric strings; anything else becomes NaN.
al_numeric = al_labels.map(albumin_mapping)
al_numeric = al_numeric.fillna(pd.to_numeric(al_labels, errors='coerce'))

# Fill gaps with the median and assign back to the frame.
# (fillna(..., inplace=True) on df_857['al'] is chained assignment and does
# not update the frame under pandas Copy-on-Write.)
df_857['al'] = al_numeric.fillna(al_numeric.median()).astype(float)

In [11]:
# Representative serum-creatinine value per binned label. Closed bins use their
# midpoint; the two open-ended bins use a value just beyond their bound.
sc_mapping = dict([
    ('< 3.65', 3.15),
    ('3.65 - 6.8', 5.225),
    ('6.8 - 9.95', 8.375),
    ('9.95 - 13.1', 11.525),
    ('13.1 - 16.25', 14.675),
    ('16.25 - 19.4', 17.825),
    ('≥ 28.85', 29.85),
    # spacing / ASCII spellings of the open-ended bins
    ('<3.65', 3.15),
    ('>=28.85', 29.85),
    ('≥28.85', 29.85),
])

# Strip the labels, swap them for their numeric stand-ins, and coerce whatever
# is left over to NaN.
sc_numeric = pd.to_numeric(
    df_857['sc'].astype(str).str.strip().replace(sc_mapping),
    errors='coerce',
)

# Median-impute the gaps and store the column as float.
df_857['sc'] = sc_numeric.fillna(sc_numeric.median()).astype(float)

In [12]:
# Representative hemoglobin value per binned label. Closed bins use their
# midpoint; open-ended bins use a value 0.5 beyond their bound.
hemo_mapping = {
    **{
        '< 6.1': 5.6,
        '6.1 - 7.4': 6.75,
        '7.4 - 8.7': 8.05,
        '8.7 - 10': 9.35,
        '10 - 11.3': 10.65,
        '11.3 - 12.6': 11.95,
        '12.6 - 13.9': 13.25,
        '13.9 - 15.2': 14.55,
        '15.2 - 16.5': 15.85,
        '≥ 16.5': 17.0,
    },
    # spacing / ASCII spellings of the open-ended bins
    **{
        '<6.1': 5.6,
        '>=16.5': 17.0,
        '≥16.5': 17.0,
    },
}

# One pass: strip -> replace labels -> numeric (unparseable -> NaN) ->
# median-impute -> float.
df_857['hemo'] = (
    pd.to_numeric(
        df_857['hemo'].astype(str).str.strip().replace(hemo_mapping),
        errors='coerce',
    )
    .pipe(lambda values: values.fillna(values.median()))
    .astype(float)
)

In [13]:
# Representative filtration-rate value per binned label. Closed bins use their
# midpoint rounded to two decimals; open-ended bins use a value beyond their
# bound. The stray 'p' entries are mapped to NaN so they get imputed below.
grf_mapping = dict([
    (' p ', np.nan),   # never matches once the labels are stripped; kept as-is
    ('p', np.nan),
    ('< 26.6175', 21.62),
    ('26.6175 - 51.7832', 39.20),
    ('51.7832 - 76.949', 64.37),
    ('76.949 - 102.115', 89.53),
    ('102.115 - 127.281', 114.70),
    ('127.281 - 152.446', 139.86),
    ('152.446 - 177.612', 165.03),
    ('177.612 - 202.778', 190.20),
    ('202.778 - 227.944', 215.36),
    ('≥ 227.944', 237.94),
    # spacing / ASCII spellings of the open-ended bins
    ('<26.6175', 21.62),
    ('>=227.944', 237.94),
    ('≥227.944', 237.94),
])

# Strip the labels and substitute the numeric stand-ins.
grf_replaced = df_857['grf'].astype(str).str.strip().replace(grf_mapping)

# Anything that is still not a number becomes NaN ...
grf_numeric = pd.to_numeric(grf_replaced, errors='coerce')

# ... and every NaN is filled with the column median.
df_857['grf'] = grf_numeric.fillna(grf_numeric.median()).astype(float)

In [14]:
# Print each selected column's distinct values and their count once more, this
# time reflecting the current (cleaned) state of the frame.
for column in col:
    print(
        f"\nUnique values in '{column}':",
        df_857[column].unique(),
        f"Count: {df_857[column].nunique()}",
        sep="\n",
    )


Unique values in 'age':
[10 16 24 31 39 47 55 63 70 77]
Count: 10

Unique values in 'htn':
[0 1]
Count: 2

Unique values in 'dm':
[0 1]
Count: 2

Unique values in 'cad':
[0 1]
Count: 2

Unique values in 'su':
[0. 4. 2. 3. 1.]
Count: 5

Unique values in 'sg':
[1.02  1.01  1.016 1.025 1.005]
Count: 5

Unique values in 'al':
[1. 0. 4. 3. 2.]
Count: 5

Unique values in 'sc':
[ 3.15   5.225 17.825  8.375 14.675 11.525 29.85 ]
Count: 7

Unique values in 'grf':
[237.94 139.86 114.7  190.2   39.2   64.37  89.53 165.03 215.36  21.62]
Count: 10

Unique values in 'hemo':
[11.95  9.35 14.55 17.   10.65  8.05 13.25 15.85  5.6   6.75]
Count: 10

Unique values in 'rbcc':
[4.755 5.345 3.575 4.165 6.525 5.935 2.985 2.4   7.7  ]
Count: 9

Unique values in 'class':
['ckd' 'notckd']
Count: 2


In [15]:
# Encode the target as 1 (ckd) / 0 (notckd).
# The encoding is skipped when the column is already numeric, so re-running
# this cell no longer fails on `.str` being applied to integers.
class_codes = {'ckd': 1, 'notckd': 0}
if not pd.api.types.is_numeric_dtype(df_857['class']):
    class_labels = df_857['class'].str.strip().str.lower()

    # A label outside the mapping would silently turn into a NaN target, so
    # stop with an explicit error instead. Missing labels are left as NaN.
    unknown_labels = sorted(set(class_labels.dropna()) - set(class_codes))
    if unknown_labels:
        raise ValueError(f"Unexpected class labels: {unknown_labels}")

    df_857['class'] = class_labels.map(class_codes)

# Export only the selected columns; 'grf' is written out under the name 'gfr'.
df_export = df_857[col].rename(columns={'grf': 'gfr'})
df_export.to_csv('UCI_857_Cleaned.csv', index=False)

In [16]:
# Cleaned dataset that the augmentation step reads.
FILE_PATH = "UCI_857_Cleaned.csv"

# Gaussian noise added to continuous features is scaled by
# NOISE_FACTOR * (per-column standard deviation).
NOISE_FACTOR = 0.05
# Each class is topped up with synthetic rows until it has this many rows.
TARGET_CLASS_COUNT = 200
# Seed for NumPy's global generator so the resampling and the noise are
# reproducible when this cell is run.
RANDOM_SEED = 42

# 'sg' is listed as continuous: its values (1.005 .. 1.025) are bin midpoints
# like 'sc' or 'hemo', and the integer cast applied to DISCRETE_COLS would
# collapse every one of them to 1.
CONTINUOUS_COLS = ['age', 'sc', 'gfr', 'hemo', 'rbcc', 'sg']
# Integer-coded columns; these are copied unchanged (no noise) and cast to int.
DISCRETE_COLS = ['htn', 'dm', 'cad', 'su', 'al']

np.random.seed(RANDOM_SEED)

def load_and_preprocess_data(file_path):
    """Read the cleaned CSV and make every feature column numeric and gap-free.

    Columns named in DISCRETE_COLS are parsed as numbers, have missing entries
    replaced by the column mode, and are cast to ``int`` (the cast drops any
    fractional part). Columns named in CONTINUOUS_COLS are parsed as numbers
    and have missing entries replaced by the column median.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The processed DataFrame, or ``None`` (after printing a message) when
        ``file_path`` does not exist.
    """
    try:
        frame = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}. Please ensure the file is available.")
        return None

    for name in DISCRETE_COLS:
        as_number = pd.to_numeric(frame[name], errors='coerce')
        most_common = as_number.mode()[0]
        frame[name] = as_number.fillna(most_common).astype(int)

    for name in CONTINUOUS_COLS:
        as_number = pd.to_numeric(frame[name], errors='coerce')
        frame[name] = as_number.fillna(as_number.median())

    return frame

def augment_continuous_features(X_continuous, noise_factor, rng=None):
    """Return ``X_continuous`` with zero-mean Gaussian noise added per column.

    The noise for each column has standard deviation
    ``noise_factor * std(column)``, where the standard deviation is taken over
    the rows of ``X_continuous`` itself. The input is not modified.

    Args:
        X_continuous: 2-D numeric array, one row per sample.
        noise_factor: Multiplier applied to each column's standard deviation.
        rng: Optional ``numpy.random.Generator`` (or ``RandomState``) used to
            draw the noise. When omitted, NumPy's global generator is used, so
            ``np.random.seed`` still controls the result.

    Returns:
        Array of the same shape as ``X_continuous``.
    """
    std_devs = X_continuous.std(axis=0)

    # A constant column has zero spread; give it a tiny scale so it still
    # receives (negligible) jitter rather than exactly none.
    std_devs[std_devs == 0] = 1e-6

    # Both the numpy.random module and Generator objects expose normal().
    sampler = np.random if rng is None else rng
    noise = sampler.normal(loc=0.0, scale=1.0, size=X_continuous.shape)

    return X_continuous + noise * std_devs * noise_factor

def perform_oversampling(df_original):
    """Top up each class to TARGET_CLASS_COUNT rows with noisy resampled rows.

    For each class (0 and 1) that has fewer than TARGET_CLASS_COUNT rows,
    existing rows of that class are drawn with replacement. Columns in
    CONTINUOUS_COLS get Gaussian noise via ``augment_continuous_features``
    (scaled from the spread of the drawn rows); columns in DISCRETE_COLS are
    copied unchanged with their original dtype, so non-integer values are not
    truncated.

    NOTE(review): the synthetic rows are near-copies of rows in
    ``df_original``; if a hold-out split is taken from the returned frame it
    may contain near-duplicates of training rows - confirm where the split is done.

    Args:
        df_original: Frame holding the CONTINUOUS_COLS, the DISCRETE_COLS and
            a ``class`` column coded 0/1.

    Returns:
        ``(df_final, df_augmented)``: the original rows followed by the
        synthetic rows, and the synthetic rows alone (an empty frame when no
        class needed topping up).

    Raises:
        ValueError: If a class needs synthetic rows but has no rows to draw from.
    """
    df_class_0 = df_original[df_original['class'] == 0].drop(columns=['class'])
    df_class_1 = df_original[df_original['class'] == 1].drop(columns=['class'])

    n_augment_0 = TARGET_CLASS_COUNT - len(df_class_0)
    n_augment_1 = TARGET_CLASS_COUNT - len(df_class_1)

    def generate_augmented_data(df_class, n_augment, target_class):
        # Nothing to add when the class already meets the target.
        if n_augment <= 0:
            return pd.DataFrame()
        if df_class.empty:
            raise ValueError(
                f"Cannot create {n_augment} synthetic rows for class "
                f"{target_class}: there are no rows of that class to resample."
            )

        sample_indices = np.random.choice(len(df_class), size=n_augment, replace=True)
        # Fresh 0..n-1 index so the two halves line up when concatenated below.
        sampled = df_class.iloc[sample_indices].reset_index(drop=True)

        X_aug_continuous = augment_continuous_features(
            sampled[CONTINUOUS_COLS].to_numpy(dtype=float), NOISE_FACTOR
        )
        df_aug_continuous = pd.DataFrame(X_aug_continuous, columns=CONTINUOUS_COLS)

        # Slice the frame (not `.values`) so every discrete column keeps its dtype.
        df_aug_discrete = sampled[DISCRETE_COLS]

        # Column order is CONTINUOUS_COLS followed by DISCRETE_COLS.
        df_aug = pd.concat([df_aug_continuous, df_aug_discrete], axis=1)
        df_aug['class'] = target_class
        return df_aug

    candidate_parts = [
        generate_augmented_data(df_class_0, n_augment_0, 0),
        generate_augmented_data(df_class_1, n_augment_1, 1),
    ]
    # Keep empty frames out of concat (deprecated, and it can disturb dtypes).
    augmented_parts = [part for part in candidate_parts if not part.empty]

    if augmented_parts:
        df_augmented = pd.concat(augmented_parts, ignore_index=True)
        df_final = pd.concat([df_original, df_augmented], ignore_index=True)
    else:
        df_augmented = pd.DataFrame()
        df_final = df_original.reset_index(drop=True)

    return df_final, df_augmented

# Load the cleaned data; load_and_preprocess_data returns None when the file is missing.
initial_df = load_and_preprocess_data(FILE_PATH)

if initial_df is not None:
    final_df, augmented_df = perform_oversampling(initial_df)

    # Write the result only when it was produced in this run. Outside the
    # guard this line raised NameError on a fresh kernel when the input file
    # was missing, or wrote a stale `final_df` left over from an earlier run.
    final_df.to_csv('UCI_857_Augmented.csv', index=False)