# LOAD DATASET

In [None]:
import pandas as pd
import numpy as np

In [None]:
data = pd.read_csv('Kidney_data.csv')
df = data.copy()
df.head()

# TRAIN vs TEST
First step is separate the dataset in features (X) and the result (y).

In [None]:
X, y = df.drop('classification', axis=1), df.classification

Now, we will use the **MinMaxScaler** from *Sklearn* to put the same weight on each feature.

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
scaler = MinMaxScaler()
X_new = scaler.fit_transform(X)

To the next step, we will use the **train_test_split** method to separate our dataset between train and test, thus minimizing the chance of bias.

In [None]:
from sklearn.model_selection import train_test_split

In [69]:
X_train, X_test, y_train, y_test = train_test_split(X_new, y, train_size=2/3)

Now, let's test some classification model and to see which has the best accuracy.

In [None]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [70]:
svc = SVC()
svc.fit(X_train, y_train)
np.round(svc.score(X_test, y_test)*100, 2)

97.71

In [71]:
random_forest = RandomForestClassifier()
random_forest.fit(X_train, y_train)
np.round(random_forest.score(X_test, y_test)*100, 2)

99.24

In [72]:
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, y_train)
np.round(decision_tree.score(X_test, y_test)*100, 2)

98.47

In [73]:
neighbors = KNeighborsClassifier()
neighbors.fit(X_train, y_train)
np.round(neighbors.score(X_test, y_test)*100, 2)

97.71

# Conclusion
As we can see, the RandomForestClassifier model had the best accuracy, so it will be the model we will use.

In [None]:
import joblib

In [90]:
joblib.dump(random_forest, r'..\dist\classifier.joblib')

['..\\dist\\classifier.pkl']