In [29]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score

In [30]:
# Load the diabetes dataset from the working directory and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='diabetes.csv')
df.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [31]:
# Feature matrix: every column of df except the 'Outcome' target.
X = df.drop(columns='Outcome')
X

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63
764,2,122,70,27,0,36.8,0.340,27
765,5,121,72,23,112,26.2,0.245,30
766,1,126,60,0,0,30.1,0.349,47


In [32]:
# Target vector, shown together with the share of each class in percent.
y = df['Outcome']
(y, y.value_counts(normalize=True).mul(100))

(0      1
 1      0
 2      1
 3      0
 4      1
       ..
 763    0
 764    0
 765    0
 766    1
 767    0
 Name: Outcome, Length: 768, dtype: int64,
 0    65.104167
 1    34.895833
 Name: Outcome, dtype: float64)

In [33]:
def model_training(X, y, model, split, random_state=10):
    """Fit ``model`` on a stratified train split and score it on the held-out rows.

    Parameters
    ----------
    X
        Features; forwarded to ``train_test_split`` and then to
        ``model.fit`` / ``model.predict``.
    y
        Target labels. Also passed as ``stratify`` so the train and test
        parts are split in a stratified fashion on ``y``.
    model
        Estimator exposing ``fit`` and ``predict``. It is fitted in place,
        so the caller's object holds the fit from this call afterwards.
    split
        Forwarded unchanged as ``train_size``.
    random_state
        Seed for the shuffle performed by ``train_test_split``. The default
        of 10 reproduces the value that was previously hard-coded.

    Returns
    -------
    dict
        Keys ``"confusion_matrix"``, ``"Accuracy"``, ``"f1_score"``,
        ``"precision"`` and ``"recall"``, each computed on the test part
        with the metric functions' default arguments.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=split, random_state=random_state, stratify=y
    )
    model.fit(X_train, y_train)
    y_predict = model.predict(X_test)
    # Every metric compares the same held-out labels against the same predictions.
    return {
        "confusion_matrix": confusion_matrix(y_test, y_predict),
        "Accuracy": accuracy_score(y_test, y_predict),
        "f1_score": f1_score(y_test, y_predict),
        "precision": precision_score(y_test, y_predict),
        "recall": recall_score(y_test, y_predict),
    }

In [34]:
# Candidate classifiers, keyed by the label printed in the results.
models = dict([
    ('Decision Tree', DecisionTreeClassifier(class_weight='balanced', random_state=42)),
    ('SVM', SVC(C=1000, kernel='linear', random_state=42)),
    ('KNN', KNeighborsClassifier(n_neighbors=5)),
    ('Naive Bayes', GaussianNB()),
])
# Train fractions to evaluate, keyed by a "train/test" percentage label.
splits = dict(zip(
    ('80/20', '70/30', '60/40'),
    (0.8, 0.7, 0.6),
))

In [36]:
# Evaluate every classifier on every train/test split and print each metric.
for model_name in models:
    model = models[model_name]
    print(f"\n______{model_name}______")
    for split_name in splits:
        split = splits[split_name]
        print(f"\n______{split_name}______")
        result = model_training(X, y, model, split)
        print("\n")
        for name in result:
            res = result[name]
            print(f"{name}:{res}\n")


______Decision Tree______

______80/20______


confusion_matrix:[[74 26]
 [25 29]]

Accuracy:0.6688311688311688

f1_score:0.5321100917431193

precision:0.5272727272727272

recall:0.5370370370370371


______70/30______


confusion_matrix:[[115  35]
 [ 31  50]]

Accuracy:0.7142857142857143

f1_score:0.6024096385542169

precision:0.5882352941176471

recall:0.6172839506172839


______60/40______


confusion_matrix:[[159  42]
 [ 51  56]]

Accuracy:0.698051948051948

f1_score:0.5463414634146342

precision:0.5714285714285714

recall:0.5233644859813084


______SVM______

______80/20______


confusion_matrix:[[80 20]
 [21 33]]

Accuracy:0.7337662337662337

f1_score:0.616822429906542

precision:0.6226415094339622

recall:0.6111111111111112


______70/30______


confusion_matrix:[[127  23]
 [ 26  55]]

Accuracy:0.7878787878787878

f1_score:0.6918238993710691

precision:0.7051282051282052

recall:0.6790123456790124


______60/40______


confusion_matrix:[[173  28]
 [ 44  63]]

Accuracy:0.76623376