In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix

In [3]:
# Load the kidney dataset from a CSV file located relative to the working directory.
data = pd.read_csv("kidney.csv")

In [4]:
# Summarise column dtypes and non-null counts to see which columns have missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              400 non-null    int64  
 1   age             391 non-null    float64
 2   bp              388 non-null    float64
 3   sg              353 non-null    float64
 4   al              354 non-null    float64
 5   sc              383 non-null    float64
 6   hemo            348 non-null    float64
 7   htn             398 non-null    float64
 8   dm              398 non-null    float64
 9   appet           399 non-null    float64
 10  ane             399 non-null    float64
 11  classification  400 non-null    int64  
dtypes: float64(10), int64(2)
memory usage: 37.6 KB


In [5]:
# Preview the first rows as a quick sanity check of the loaded values.
data.head()

Unnamed: 0,id,age,bp,sg,al,sc,hemo,htn,dm,appet,ane,classification
0,0,48.0,80.0,1.02,1.0,1.2,15.4,1.0,1.0,1.0,0.0,1
1,1,7.0,50.0,1.02,4.0,0.8,11.3,0.0,0.0,1.0,0.0,1
2,2,62.0,80.0,1.01,2.0,1.8,9.6,0.0,1.0,0.0,1.0,1
3,3,48.0,70.0,1.005,4.0,3.8,11.2,1.0,0.0,0.0,1.0,1
4,4,51.0,80.0,1.01,2.0,1.4,11.6,0.0,0.0,1.0,0.0,1


In [6]:
# Drop the `id` column so it does not end up in the feature matrix.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps
# this cell safe to re-run: a second execution no longer raises KeyError
# because `id` has already been removed.
data = data.drop(columns=["id"], errors="ignore")

In [7]:
# Coerce every column to a numeric dtype (entries that cannot be parsed
# become NaN), then replace each NaN with the mean of its own column.
# NOTE(review): the column means are computed over all rows, before the
# train/test split performed in a later cell, so test rows influence the
# imputed values — consider imputing from training rows only.
data = data.apply(pd.to_numeric, errors="coerce")
data = data.fillna(data.mean())

In [8]:
# Separate the label from the predictors: `classification` becomes y,
# and every other column becomes part of the feature matrix X.
y = data["classification"]
X = data.drop(columns=["classification"])

In [9]:
# Hold out 30% of the rows for testing. Stratifying on y is meant to keep the
# class proportions similar in both partitions; the fixed seed makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [None]:
# Standardise the features with statistics learned from the training rows only.
#
# The raw (unscaled) rows are re-selected from X through the split's index
# labels instead of being read back from X_train / X_test. That keeps the cell
# idempotent: re-running it never re-fits the scaler on already-scaled data,
# which matters because the fitted scaler is pickled in a later cell.
# This relies on X having unique index labels (the default RangeIndex that
# read_csv produces).
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X.loc[y_train.index]),
    columns=X.columns,     # keep feature names
    index=y_train.index,   # keep rows aligned with y_train by label
)
X_test = pd.DataFrame(
    scaler.transform(X.loc[y_test.index]),
    columns=X.columns,
    index=y_test.index,    # keep rows aligned with y_test by label
)

In [11]:
# Candidate hyper-parameter values for the random forest; the randomized
# search in the next cell samples combinations from these lists.
param_dist = dict(
    n_estimators=[10, 30, 50],
    max_depth=[3, 5, 7],
    min_samples_split=[5, 10],
    min_samples_leaf=[3, 5],
    max_features=['sqrt'],
    bootstrap=[True],
)

In [12]:
# Randomized hyper-parameter search over `param_dist`: 5 sampled candidates,
# each scored by 5-fold cross-validated accuracy on the training split.
# n_jobs=-1 asks scikit-learn to parallelise across all available cores.
random_search = RandomizedSearchCV(
    RandomForestClassifier(class_weight='balanced_subsample', random_state=42),
    param_distributions=param_dist,
    n_iter=5,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,
)
random_search.fit(X_train, y_train)

  _data = np.array(data, dtype=dtype, copy=copy,


In [13]:
# Grab the estimator that the randomized search selected as best.
best_model = random_search.best_estimator_

In [14]:
# Final classifier: a 100-tree random forest with otherwise default settings,
# fitted on the scaled training split.
# NOTE(review): the tuned `best_model` from the randomized search is not used
# here (its fit call had been left commented out) — confirm this is intentional.
model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(X_train, y_train)

In [15]:
# Predict class labels for the held-out test rows.
y_pred = model.predict(X_test)


In [16]:
# Score the test-set predictions: the confusion matrix plus overall accuracy.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)


In [19]:
# Report accuracy as a percentage with two decimals, followed by the
# confusion matrix on its own lines.
print(f'Accuracy: {accuracy * 100:.2f}%')
print('Confusion Matrix:\n', conf_matrix)

Accuracy: 100.00%
Confusion Matrix:
 [[45  0]
 [ 0 75]]


In [20]:
import pickle
from pathlib import Path

# Bundle the fitted classifier, the feature order it was trained on, and the
# fitted scaler into a single object so they are saved together.
model_data = {"model": model, "feature_names": X.columns.tolist(), "scaler": scaler}

# Create the output directory first: open(..., "wb") does not create missing
# parent directories and would raise FileNotFoundError if `models/` is absent.
Path("models").mkdir(parents=True, exist_ok=True)

# NOTE: unpickling can execute arbitrary code — only load this artifact from
# a trusted location.
with open("models/kidney.pkl", "wb") as model_file:
    pickle.dump(model_data, model_file)

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

# Load dataset
data = pd.read_csv("kidney.csv")

# Drop the identifier column and coerce every remaining column to numeric
# (values that cannot be parsed become NaN). Missing values are deliberately
# NOT filled here — see the imputation step after the split.
data = data.drop(columns=['id']).apply(pd.to_numeric, errors="coerce")

# Split features and target
X = data.drop(columns=['classification'])
y = data['classification']

# Split into train and test sets with stratification
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Impute missing values AFTER splitting, using training-set means only.
# Computing the means on the full dataset would let held-out rows influence
# preprocessing (data leakage) and make the test scores optimistic.
train_means = X_train.mean()
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)

# Standardization: fit on the training rows, apply the same transform to test.
# Passing index= keeps each frame aligned with its y_* series by label.
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns, index=X_test.index)

# Define multiple models
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Support Vector Machine": SVC(kernel='linear', random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5),
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Naive Bayes": GaussianNB(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, random_state=42)
}

# Train each model on the same training split and report its test accuracy
for name, model in models.items():
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    print(f"{name} Accuracy: {round(accuracy * 100, 2)}%")



Random Forest Accuracy: 100.0%
Support Vector Machine Accuracy: 98.33%
K-Nearest Neighbors Accuracy: 96.67%
Logistic Regression Accuracy: 98.33%
Naive Bayes Accuracy: 95.0%
Decision Tree Accuracy: 99.17%
Gradient Boosting Accuracy: 99.17%
