# LOAD DATASET

In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('Kidney_data.csv')
df = data.copy()
df.head()

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,hemo,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,48.0,80.0,1.02,1.0,0.0,1,1,0,0,121.0,...,15.4,7800.0,5.2,1,1,0,0,0,0,1
1,7.0,50.0,1.02,4.0,0.0,1,1,0,0,121.0,...,11.3,6000.0,5.2,0,0,0,0,0,0,1
2,62.0,80.0,1.01,2.0,3.0,1,1,0,0,423.0,...,9.6,7500.0,5.2,0,1,0,1,0,1,1
3,48.0,70.0,1.005,4.0,0.0,1,0,1,0,117.0,...,11.2,6700.0,3.9,1,0,0,1,1,1,1
4,51.0,80.0,1.01,2.0,0.0,1,1,0,0,106.0,...,11.6,7300.0,4.6,0,0,0,0,0,0,1


# TRAIN vs TEST
First step is separate the dataset in features (X) and the result (y).

In [3]:
X, y = df.drop('classification', axis=1), df.classification

Now, we will use the **MinMaxScaler** from *Sklearn* to put the same weight on each feature.

In [4]:
from sklearn.preprocessing import MinMaxScaler

In [5]:
scaler = MinMaxScaler()
X_new = scaler.fit_transform(X)

To the next step, we will use the **train_test_split** method to separate our dataset between train and test, thus minimizing the chance of bias.

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X_new, y, train_size=2/3)

Now, let's test some classification model and to see which has the best accuracy.

In [8]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [15]:
svc = SVC()
svc.fit(X_train, y_train)
print(f"A precisão foi de {np.round(svc.score(X_test, y_test)*100, 2)}%")

A precisão foi de 96.18%


In [16]:
random_forest = RandomForestClassifier()
random_forest.fit(X_train, y_train)
print(f"The accuracy was {np.round(random_forest.score(X_test, y_test)*100, 2)}%")

The accuracy was 100.0%


In [17]:
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, y_train)
print(f"The accuracy was {np.round(decision_tree.score(X_test, y_test)*100, 2)}%")

The accuracy was 96.95%


In [18]:
neighbors = KNeighborsClassifier()
neighbors.fit(X_train, y_train)
print(f"The accuracy was {np.round(neighbors.score(X_test, y_test)*100, 2)}%")

The accuracy was 95.42%


# Conclusion
As we can see, the RandomForestClassifier model had the best accuracy, so it will be the model we will use.

In [None]:
import joblib

In [90]:
joblib.dump(random_forest, r'..\dist\classifier.joblib')

['..\\dist\\classifier.pkl']