In [14]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.linear_model import LogisticRegression

In [2]:
from google.colab import files
uploaded = files.upload()

Saving diabetes.csv to diabetes.csv


In [6]:
df = pd.read_csv("diabetes.csv")
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [7]:
df.shape

(768, 9)

In [8]:
df.dtypes

Pregnancies                   int64
Glucose                       int64
BloodPressure                 int64
SkinThickness                 int64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
Outcome                       int64
dtype: object

In [10]:
df.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [9]:
df['Outcome'].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [17]:
X = df.drop(["Outcome"], axis=1)
y = df["Outcome"]

In [18]:
# Стандартизируем данные
scaler = StandardScaler()
X_st = scaler.fit_transform(X)

# Нормализуем данные
scaler = MinMaxScaler()
X_norm = scaler.fit_transform(X)

In [19]:
# Knn модель со стандартизированными данными

X_train, X_test, y_train, y_test = train_test_split(X_st, y, test_size = 0.3, random_state = 42)

In [20]:
# Выбираем наилучшее число соседей
scores = []
for i in range(1,15):
    knn = KNeighborsClassifier(i)
    knn.fit(X_train, y_train)
    answers = knn.predict(X_test)

    scores.append(knn.score(X_test, y_test))

for i in range(0, len(scores)): print(i,': ', scores[i])

0 :  0.6233766233766234
1 :  0.670995670995671
2 :  0.70995670995671
3 :  0.6926406926406926
4 :  0.6926406926406926
5 :  0.6883116883116883
6 :  0.7186147186147186
7 :  0.683982683982684
8 :  0.7012987012987013
9 :  0.6926406926406926
10 :  0.7012987012987013
11 :  0.7142857142857143
12 :  0.7142857142857143
13 :  0.7186147186147186


In [28]:
# Создаём модель 
knn_st = KNeighborsClassifier(7)

# Обучаем модели
knn_st.fit(X_train, y_train)

# Предсказываем значения
y_pred = knn_st.predict(X_test)

# Смотрим матрицу ошибок
print(confusion_matrix(y_test, y_pred), '\n')

# Оценка модели
accuracy_knn_st = accuracy_score(y_test, y_pred)
recall_knn_st = recall_score(y_test, y_pred)
precision_knn_st = precision_score(y_test, y_pred)

print('Accuracy:', accuracy_knn_st)
print('Recall:', recall_knn_st)
print('Precision:', precision_knn_st)

[[123  28]
 [ 38  42]] 

Accuracy: 0.7142857142857143
Recall: 0.525
Precision: 0.6


In [23]:
# Knn модель со нормализированными данными

X_train, X_test, y_train, y_test = train_test_split(X_norm, y, test_size = 0.3, random_state = 42)

In [24]:
# Выбираем наилучшее число соседей для новой модели
scores = []
for i in range(1,15):
    knn = KNeighborsClassifier(i)
    knn.fit(X_train, y_train)
    answers = knn.predict(X_test)

    scores.append(knn.score(X_test, y_test))

for i in range(0, len(scores)): print(i,': ', scores[i])

0 :  0.6753246753246753
1 :  0.6796536796536796
2 :  0.6926406926406926
3 :  0.696969696969697
4 :  0.6796536796536796
5 :  0.6666666666666666
6 :  0.7142857142857143
7 :  0.696969696969697
8 :  0.7229437229437229
9 :  0.7056277056277056
10 :  0.7012987012987013
11 :  0.6883116883116883
12 :  0.6926406926406926
13 :  0.6883116883116883


In [27]:
# Создаём модель 
knn_norm = KNeighborsClassifier(5)

# Обучаем модель
knn_norm.fit(X_train, y_train)

# Предсказываем значения
y_pred = knn_norm.predict(X_test)

# Смотрим матрицу ошибок
print(confusion_matrix(y_test, y_pred), '\n')

# Оценка модели
accuracy_knn_norm = accuracy_score(y_test, y_pred)
recall_knn_norm = recall_score(y_test, y_pred)
precision_knn_norm = precision_score(y_test, y_pred)

print('Accuracy:', accuracy_knn_norm)
print('Recall:', recall_knn_norm)
print('Precision:', precision_knn_norm)

[[118  33]
 [ 41  39]] 

Accuracy: 0.6796536796536796
Recall: 0.4875
Precision: 0.5416666666666666


In [29]:
# Логистическая регрессия с стандартизированными данными

X_train, X_test, y_train, y_test = train_test_split(X_st, y, test_size = 0.3, random_state = 42)

In [30]:
# Cоздаём модель 
lr_st = LogisticRegression()

# Обучаем модели
lr_st.fit(X_train, y_train)

# Предсказываем значения
y_pred = lr_st.predict(X_test)

# Смотрим матрицу ошибок
print(confusion_matrix(y_test, y_pred), '\n')

# Оценка модели
accuracy_lr_st = accuracy_score(y_test, y_pred)
recall_lr_st = recall_score(y_test, y_pred)
precision_lr_st = precision_score(y_test, y_pred)

print('Accuracy: ', accuracy_lr_st)
print('Recall: ', recall_lr_st)
print('Precision: ', precision_lr_st)

[[120  31]
 [ 30  50]] 

Accuracy:  0.7359307359307359
Recall:  0.625
Precision:  0.6172839506172839


In [33]:
# Логистическая регрессия с нормализованными данными

X_train, X_test, y_train, y_test = train_test_split(X_norm, y, test_size = 0.3, random_state = 42)

In [35]:
# Создаём модель 
lr_norm = LogisticRegression()

# Обучаем модель
lr_norm.fit(X_train, y_train)

# Предсказываем значения
y_pred = lr_norm.predict(X_test)

# Смотрим матрицу ошибок
print(confusion_matrix(y_test, y_pred), '\n')

# Оценка модели
accuracy_lr_norm = accuracy_score(y_test, y_pred)
recall_lr_norm = recall_score(y_test, y_pred)
precision_lr_norm = precision_score(y_test, y_pred)

print('Accuracy: ', accuracy_lr_norm)
print('Recall: ', recall_lr_norm)
print('Precision: ', precision_lr_norm)

[[127  24]
 [ 35  45]] 

Accuracy:  0.7445887445887446
Recall:  0.5625
Precision:  0.6521739130434783


In [38]:
print('___________ACCURACY___________\n')
print('Knn + st-: ', accuracy_knn_st)
print('Knn + norm: ', accuracy_knn_norm)
print('LR + st: ', accuracy_lr_st)
print('LR + norm: ', accuracy_lr_norm)

print('\n___________RECALL___________\n')
print('Knn + st: ', recall_knn_st)
print('Knn + norm: ', recall_knn_norm)
print('LR + st: ', recall_lr_st)
print('LR + norm: ', recall_lr_norm)

print('\n___________PRECISION___________\n')
print('Knn + st: ', precision_knn_st)
print('Knn + norm: ', precision_knn_norm)
print('LR + st: ', precision_lr_st)
print('LR + norm: ', precision_lr_norm)
# Лучший результат получился в логистической регрессии с нормализованными данными, тк в двух оценках из трех получились максимальные результаты

___________ACCURACY___________

Knn + st-:  0.7142857142857143
Knn + norm:  0.6796536796536796
LR + st:  0.7359307359307359
LR + norm:  0.7445887445887446

___________RECALL___________

Knn + st:  0.525
Knn + norm:  0.4875
LR + st:  0.625
LR + norm:  0.5625

___________PRECISION___________

Knn + st:  0.6
Knn + norm:  0.5416666666666666
LR + st:  0.6172839506172839
LR + norm:  0.6521739130434783
