In [1]:
import pandas as pd
import numpy as np

In [2]:
from ucimlrepo import fetch_ucirepo
# Download UCI ML Repository dataset id 336 and place its feature and target
# tables side by side in a single frame.
dataset_336 = fetch_ucirepo(id=336)
df_336 = pd.concat(
    objs=(dataset_336.data.features, dataset_336.data.targets),
    axis="columns",
)

In [3]:
# Columns kept for the analysis, in the order they are displayed and exported.
col = (
    ["age"]                        # demographics
    + ["htn", "dm", "cad"]         # conditions: hypertension, diabetes mellitus, coronary artery disease
    + ["su", "sg", "al"]           # urine tests: sugar, specific gravity, albumin
    + ["bp"]                       # vital sign: blood pressure
    + ["sc", "hemo", "rbcc"]       # blood tests: serum creatinine, hemoglobin, red blood cell count
    + ["class"]                    # target variable
)

In [4]:
# Lift the column display limit so wide frames print every column, then show
# the first six rows of the selected columns.
pd.options.display.max_columns = None
print(df_336.loc[:, col].head(6))

    age  htn   dm cad   su     sg   al    bp   sc  hemo  rbcc class
0  48.0  yes  yes  no  0.0  1.020  1.0  80.0  1.2  15.4   5.2   ckd
1   7.0   no   no  no  0.0  1.020  4.0  50.0  0.8  11.3   NaN   ckd
2  62.0   no  yes  no  3.0  1.010  2.0  80.0  1.8   9.6   NaN   ckd
3  48.0  yes   no  no  0.0  1.005  4.0  70.0  3.8  11.2   3.9   ckd
4  51.0   no   no  no  0.0  1.010  2.0  80.0  1.4  11.6   4.6   ckd
5  60.0  yes  yes  no  0.0  1.015  3.0  90.0  1.1  12.2   4.4   ckd


In [5]:
# For every selected column print its distinct values (unique() keeps NaN)
# followed by the number of distinct non-null values (nunique() skips NaN).
for column in col:
    series = df_336[column]
    distinct_values = series.unique()
    distinct_count = series.nunique()
    print(f"\nUnique values in '{column}':")
    print(distinct_values)
    print(f"Count: {distinct_count}")


Unique values in 'age':
[48.  7. 62. 51. 60. 68. 24. 52. 53. 50. 63. 40. 47. 61. 21. 42. 75. 69.
 nan 73. 70. 65. 76. 72. 82. 46. 45. 35. 54. 11. 59. 67. 15. 55. 44. 26.
 64. 56.  5. 74. 38. 58. 71. 34. 17. 12. 43. 41. 57.  8. 39. 66. 81. 14.
 27. 83. 30.  4.  3.  6. 32. 80. 49. 90. 78. 19.  2. 33. 36. 37. 23. 25.
 20. 29. 28. 22. 79.]
Count: 76

Unique values in 'htn':
['yes' 'no' nan]
Count: 2

Unique values in 'dm':
['yes' 'no' '\tno' nan]
Count: 3

Unique values in 'cad':
['no' 'yes' nan]
Count: 2

Unique values in 'su':
[ 0.  3.  4.  1. nan  2.  5.]
Count: 6

Unique values in 'sg':
[1.02  1.01  1.005 1.015   nan 1.025]
Count: 5

Unique values in 'al':
[ 1.  4.  2.  3.  0. nan  5.]
Count: 6

Unique values in 'bp':
[ 80.  50.  70.  90.  nan 100.  60. 110. 140. 180. 120.]
Count: 10

Unique values in 'sc':
[ 1.2   0.8   1.8   3.8   1.4   1.1  24.    1.9   7.2   4.    2.7   2.1
  4.6   4.1   9.6   2.2   5.2   1.3   1.6   3.9  76.    7.7    nan  2.4
  7.3   1.5   2.5   2.    3.4   0.7 

In [6]:
# Count the missing entries of all selected columns in one pass, then report
# them column by column in the order given by `col`.
null_counts = df_336[col].isnull().sum()
for column, n_missing in null_counts.items():
    print(f"\nNull values in '{column}':")
    print(n_missing)


Null values in 'age':
9

Null values in 'htn':
2

Null values in 'dm':
2

Null values in 'cad':
2

Null values in 'su':
49

Null values in 'sg':
47

Null values in 'al':
46

Null values in 'bp':
12

Null values in 'sc':
17

Null values in 'hemo':
52

Null values in 'rbcc':
131

Null values in 'class':
0


In [7]:
# Normalise the text columns first so that whitespace/case variants
# (e.g. '\tno') collapse into a single category before mode() is computed.
for _text_col in ('dm', 'class'):
    df_336[_text_col] = df_336[_text_col].str.strip().str.lower()

# Categorical / ordinal columns: fill gaps with the most frequent value.
# The result is assigned back to the frame. Calling fillna(inplace=True) on
# a column selection is chained assignment: pandas 2.x warns about it and,
# with Copy-on-Write enabled, it leaves df_336 unmodified.
for _mode_col in ('su', 'htn', 'dm', 'cad', 'al'):
    df_336[_mode_col] = df_336[_mode_col].fillna(df_336[_mode_col].mode()[0])

# Measurement columns: fill gaps with the column median.
for _median_col in ('sg', 'rbcc', 'age', 'sc', 'hemo', 'bp'):
    df_336[_median_col] = df_336[_median_col].fillna(df_336[_median_col].median())

# Pin the dtypes: age becomes an integer, every other numeric column a float.
for _float_col in ('su', 'sg', 'rbcc', 'al', 'sc', 'hemo', 'bp'):
    df_336[_float_col] = df_336[_float_col].astype(float)
df_336['age'] = df_336['age'].astype(int)

# Encode the target: 'ckd' -> 1, 'notckd' -> 0 (any other label maps to NaN).
df_336['class'] = df_336['class'].map({'ckd': 1, 'notckd': 0})

In [8]:
# Binary-encode the yes/no condition flags: 'yes' -> 1, 'no' -> 0.
yes_no = ['htn', 'dm', 'cad']
_yes_no_codes = {'yes': 1, 'no': 0}
for col_name in yes_no:
    encoded = df_336[col_name].map(_yes_no_codes)
    df_336[col_name] = encoded

In [9]:
# Keep only the selected columns and write them to disk without the index.
df_export = df_336.loc[:, col]
df_export.to_csv(path_or_buf='UCI_336_Cleaned.csv', index=False)

In [10]:
# --- Augmentation settings -------------------------------------------------
# CSV file read by load_and_preprocess_data.
FILE_PATH = "UCI_336_Cleaned.csv"
# Noise standard deviation, expressed as a fraction of each column's std.
NOISE_FACTOR = 0.05
# Number of rows each class is topped up to.
TARGET_CLASS_COUNT = 400
# Columns that receive Gaussian noise during augmentation.
CONTINUOUS_COLS = [
    'age',
    'bp',
    'sc',
    'hemo',
    'rbcc',
]
# Columns copied unchanged from the sampled rows during augmentation.
DISCRETE_COLS = [
    'htn',
    'dm',
    'cad',
    'su',
    'sg',
    'al',
]

In [11]:
def load_and_preprocess_data(file_path, discrete_cols=None, continuous_cols=None):
    """Read a CSV file and impute missing values in the configured columns.

    Discrete columns are coerced to numeric and their gaps filled with the
    column mode; continuous columns are coerced to numeric and their gaps
    filled with the column median. Values that cannot be parsed as numbers
    are treated as missing.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file to read.
    discrete_cols : sequence of str, optional
        Columns imputed with the mode. Defaults to the module-level
        ``DISCRETE_COLS``.
    continuous_cols : sequence of str, optional
        Columns imputed with the median. Defaults to the module-level
        ``CONTINUOUS_COLS``.

    Returns
    -------
    pandas.DataFrame or None
        The imputed frame, or ``None`` (after printing an error message)
        when ``file_path`` does not exist.
    """
    if discrete_cols is None:
        discrete_cols = DISCRETE_COLS
    if continuous_cols is None:
        continuous_cols = CONTINUOUS_COLS

    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}. Please ensure the file is available.")
        return None

    for name in discrete_cols:
        values = pd.to_numeric(df[name], errors='coerce')
        modes = values.mode()
        # mode() is empty for an all-NaN column; leave such a column as is.
        if not modes.empty:
            values = values.fillna(modes.iloc[0])
        # Cast to int only when that is lossless. A discrete column may hold
        # fractional levels (e.g. 1.005, 1.010, ...); an unconditional
        # astype(int) would truncate all of them to the same integer.
        if values.notna().all() and (values % 1 == 0).all():
            values = values.astype(int)
        df[name] = values

    for name in continuous_cols:
        values = pd.to_numeric(df[name], errors='coerce')
        df[name] = values.fillna(values.median())

    return df

In [12]:
def augment_continuous_features(X_continuous, noise_factor, rng=None):
    """Add zero-mean Gaussian noise to each column of a numeric 2-D array.

    The noise added to a column has standard deviation
    ``noise_factor * std(column)``, where the standard deviation is taken
    over the rows of ``X_continuous`` itself. Columns with zero standard
    deviation use ``1e-6`` instead, so they still receive a tiny perturbation.

    Parameters
    ----------
    X_continuous : array-like of shape (n_rows, n_cols)
        Values to perturb; converted to a float ndarray.
    noise_factor : float
        Fraction of each column's standard deviation used as the noise scale.
    rng : numpy.random.Generator, optional
        Source of randomness. When omitted, the global ``np.random`` state is
        used, so ``np.random.seed`` still controls the result.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as ``X_continuous`` with noise added.
    """
    values = np.asarray(X_continuous, dtype=float)

    # Nothing to perturb; also avoids the NaN / RuntimeWarning that std()
    # produces for a zero-row input.
    if values.shape[0] == 0:
        return values.copy()

    std_devs = values.std(axis=0)
    # Replace zero spreads so constant columns are still jittered slightly.
    std_devs = np.where(std_devs == 0, 1e-6, std_devs)

    if rng is None:
        noise = np.random.normal(loc=0.0, scale=1.0, size=values.shape)
    else:
        noise = rng.normal(loc=0.0, scale=1.0, size=values.shape)

    return values + noise * std_devs * noise_factor

In [13]:
def perform_oversampling(df_original):
    """Top up classes 0 and 1 to ``TARGET_CLASS_COUNT`` rows with synthetic rows.

    For each class, rows are drawn with replacement from the existing rows of
    that class. The ``CONTINUOUS_COLS`` of the drawn rows are perturbed with
    ``augment_continuous_features`` (scale ``NOISE_FACTOR``); the
    ``DISCRETE_COLS`` are copied unchanged, keeping their original dtypes.
    A class that already has ``TARGET_CLASS_COUNT`` rows or more receives no
    synthetic rows and is not down-sampled.

    Parameters
    ----------
    df_original : pandas.DataFrame
        Frame containing ``CONTINUOUS_COLS``, ``DISCRETE_COLS`` and a
        ``class`` column with values 0 and 1.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(df_final, df_augmented)``: the original rows followed by the
        synthetic rows, and the synthetic rows on their own.

    Raises
    ------
    ValueError
        If a class needs synthetic rows but has no rows to sample from.
    """
    all_features = CONTINUOUS_COLS + DISCRETE_COLS

    def generate_augmented_data(df_class, n_augment, target_class):
        """Return ``n_augment`` synthetic rows for one class, or None if none are needed."""
        if n_augment <= 0:
            return None
        if df_class.empty:
            raise ValueError(
                f"Cannot synthesise {n_augment} rows for class {target_class}: "
                "no rows of that class to sample from."
            )

        sample_indices = np.random.choice(len(df_class), size=n_augment, replace=True)
        # Reset the index so the noisy and the copied columns line up row by row.
        sampled = df_class.iloc[sample_indices].reset_index(drop=True)

        X_aug_continuous = augment_continuous_features(
            sampled[CONTINUOUS_COLS].to_numpy(dtype=float), NOISE_FACTOR
        )
        df_aug_continuous = pd.DataFrame(X_aug_continuous, columns=CONTINUOUS_COLS)

        # Take the discrete columns straight from the sampled frame. Going
        # through a NumPy array and casting to int would truncate
        # non-integer levels (e.g. 1.02 -> 1).
        df_aug = pd.concat([df_aug_continuous, sampled[DISCRETE_COLS]], axis=1)
        return df_aug[all_features].assign(**{'class': target_class})

    augmented_parts = []
    for target_class in (0, 1):
        df_class = df_original[df_original['class'] == target_class].drop(columns=['class'])
        # Never request a negative number of rows when a class is already
        # at or above the target size.
        n_augment = max(TARGET_CLASS_COUNT - len(df_class), 0)
        part = generate_augmented_data(df_class, n_augment, target_class)
        if part is not None:
            augmented_parts.append(part)

    if not augmented_parts:
        # Both classes already meet the target: nothing to add.
        df_augmented = pd.DataFrame(columns=all_features + ['class'])
        df_final = df_original.reset_index(drop=True)
        return df_final, df_augmented

    df_augmented = pd.concat(augmented_parts, ignore_index=True)
    df_final = pd.concat([df_original, df_augmented], ignore_index=True)

    return df_final, df_augmented

In [14]:
# Seed NumPy's global generator so the sampled rows and the added noise are
# the same on every run of the notebook.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

initial_df = load_and_preprocess_data(FILE_PATH)
# The loader returns None when the file is missing; stop here with a clear
# error instead of failing later inside the oversampling step.
if initial_df is None:
    raise FileNotFoundError(f"Cannot run augmentation: '{FILE_PATH}' could not be loaded.")

final_df, augmented_df = perform_oversampling(initial_df)
final_df.to_csv('UCI_336_Augmented.csv', index=False)