# Main

## 1) Loading the data

In [17]:
import pandas as pd
import numpy as np

data = pd.read_csv('kidney_disease.csv')

data 

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.020,1.0,0.0,,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.020,4.0,0.0,,normal,notpresent,notpresent,...,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.010,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.010,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,395,55.0,80.0,1.020,0.0,0.0,normal,normal,notpresent,notpresent,...,47,6700,4.9,no,no,no,good,no,no,notckd
396,396,42.0,70.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,...,54,7800,6.2,no,no,no,good,no,no,notckd
397,397,12.0,80.0,1.020,0.0,0.0,normal,normal,notpresent,notpresent,...,49,6600,5.4,no,no,no,good,no,no,notckd
398,398,17.0,60.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,...,51,7200,5.9,no,no,no,good,no,no,notckd


In [18]:
# Decode strings
data.columns = data.columns.str.strip()

# Replace '?' with NaN
data.replace('?', np.nan)

# Convert columns to numeric, forcing errors to NaN
data = data.apply(pd.to_numeric, errors='ignore')

# Print the data types to verify the conversion
# print(data.dtypes)

# Convert nominal data to 'category' data type
categorical_features = ['sg','al','su','rbc', 'pc', 'pcc', 'ba', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane'] # the 14 nominal attributes minus 'class'
for feature in categorical_features:
    data[feature] = data[feature].astype('category')

# Verify the conversion
# print(data.dtypes)

# Remove the "ground-truth" column and store its values aside
ground_truth = data['classification']
data = data.drop(columns=['classification'])

# Verify the removal
# print(data.head())
# print(ground_truth.head())

# Replace missing numeric values with the feature average
numeric_features = data.select_dtypes(exclude='category').columns
for feature in numeric_features:
    data[feature] = pd.to_numeric(data[feature], errors='coerce')  # Convert to numeric to avoid errors
    data[feature].fillna(data[feature].mean(), inplace=True)

# Replace missing categorical values with the most occurring value
for feature in categorical_features:
    most_frequent_value = data[feature].mode().iloc[0]
    data[feature].fillna(most_frequent_value, inplace=True)

# # Verify that there are no more NaN values
# nan_counts = data.isnull().sum()
# # print("Number of NaN values in each column:")
# # print(nan_counts)

# # Check if there are any columns with NaN values
# if nan_counts.sum() == 0:
#     print("There are no more NaN values in the DataFrame.")
# else:
#     print("There are still NaN values in the DataFrame.")

# One-hot encode categorical features
data_encoded = pd.get_dummies(data, columns=categorical_features)

# Verify the encoding
# print(data_encoded.head())

# Convert boolean columns to integers (0 and 1)
boolean_columns = data_encoded.select_dtypes(include=['bool']).columns
data_encoded[boolean_columns] = data_encoded[boolean_columns].astype('int')

# print(data_encoded.head())

data_normalized = (data_encoded - data_encoded.mean()) / data_encoded.std()

# Convert the "ground-truth" data into numeric format
ground_truth_numeric = ground_truth.map(lambda x: 1 if x == 'ckd' else 0).to_numpy()

Y=ground_truth_numeric

X = data_normalized.to_numpy()

# Combine X and Y into a single dataset
data = np.column_stack((X, Y))


  data = data.apply(pd.to_numeric, errors='ignore')
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[feature].fillna(data[feature].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[feature].fillna(most_frequent_value, inplace=True)


In [19]:
data

array([[-1.72556507, -0.20520665,  0.26201023, ...,  0.41955859,
        -0.41955859,  1.        ],
       [-1.71691562, -2.6205281 , -1.96412048, ...,  0.41955859,
        -0.41955859,  1.        ],
       [-1.70826617,  0.61953726,  0.26201023, ..., -2.37749869,
         2.37749869,  1.        ],
       ...,
       [ 1.70826617, -2.3259767 ,  0.26201023, ...,  0.41955859,
        -0.41955859,  0.        ],
       [ 1.71691562, -2.03142531, -1.22207691, ...,  0.41955859,
        -0.41955859,  0.        ],
       [ 1.72556507,  0.38389614,  0.26201023, ...,  0.41955859,
        -0.41955859,  0.        ]])