In [21]:
import pandas as pd


In [22]:
data = pd.read_csv("diabetes.csv")
data

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [23]:
X = data.drop(['Outcome'], axis=1)
y = data['Outcome']

In [24]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=1/3, random_state=42)

tree = DecisionTreeClassifier(criterion='gini', max_depth=4, random_state=1)

tree.fit(X_train, y_train)
y_pred_tree = tree.predict(X_test)

In [25]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

print("Accuracy:", accuracy_score(y_test,y_pred_tree))
print("Recall:", recall_score(y_test,y_pred_tree))
print("Precision:", precision_score(y_test,y_pred_tree))

Accuracy: 0.72265625
Recall: 0.7209302325581395
Precision: 0.5688073394495413


In [26]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

kf = KFold(n_splits=5)

tree = DecisionTreeClassifier(criterion='gini', max_depth=4, random_state=1)

cvs = cross_val_score(estimator=tree, X=X, y=y, cv=kf, scoring='accuracy')

print(cvs)
print(cvs.mean())

[0.73376623 0.65584416 0.78571429 0.82352941 0.7254902 ]
0.7448688566335624


In [27]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_norm = scaler.fit_transform(X)
X_norm

array([[0.35294118, 0.74371859, 0.59016393, ..., 0.50074516, 0.23441503,
        0.48333333],
       [0.05882353, 0.42713568, 0.54098361, ..., 0.39642325, 0.11656704,
        0.16666667],
       [0.47058824, 0.91959799, 0.52459016, ..., 0.34724292, 0.25362938,
        0.18333333],
       ...,
       [0.29411765, 0.6080402 , 0.59016393, ..., 0.390462  , 0.07130658,
        0.15      ],
       [0.05882353, 0.63316583, 0.49180328, ..., 0.4485842 , 0.11571307,
        0.43333333],
       [0.05882353, 0.46733668, 0.57377049, ..., 0.45305514, 0.10119556,
        0.03333333]])

In [28]:
X_train,X_test,y_train,y_test = train_test_split(X_norm, y, test_size=1/3, random_state=42)
tree.fit(X_train, y_train)
y_pred_tree = tree.predict(X_test)

In [29]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

print("Accuracy:", accuracy_score(y_test,y_pred_tree))
print("Recall:", recall_score(y_test,y_pred_tree))
print("Precision:", precision_score(y_test,y_pred_tree))

Accuracy: 0.72265625
Recall: 0.7209302325581395
Precision: 0.5688073394495413


In [30]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_st = scaler.fit_transform(X)
X_st

array([[ 0.63994726,  0.84832379,  0.14964075, ...,  0.20401277,
         0.46849198,  1.4259954 ],
       [-0.84488505, -1.12339636, -0.16054575, ..., -0.68442195,
        -0.36506078, -0.19067191],
       [ 1.23388019,  1.94372388, -0.26394125, ..., -1.10325546,
         0.60439732, -0.10558415],
       ...,
       [ 0.3429808 ,  0.00330087,  0.14964075, ..., -0.73518964,
        -0.68519336, -0.27575966],
       [-0.84488505,  0.1597866 , -0.47073225, ..., -0.24020459,
        -0.37110101,  1.17073215],
       [-0.84488505, -0.8730192 ,  0.04624525, ..., -0.20212881,
        -0.47378505, -0.87137393]])

In [47]:
X_train,X_test,y_train,y_test = train_test_split(X_st, y, test_size=1/3, random_state=42)
tree.fit(X_train, y_train)
y_pred_tree = tree.predict(X_test)

In [48]:
print("Accuracy:", accuracy_score(y_test,y_pred_tree))
print("Recall:", recall_score(y_test,y_pred_tree))
print("Precision:", precision_score(y_test,y_pred_tree))

Accuracy: 0.71875
Recall: 0.6744186046511628
Precision: 0.5686274509803921


In [49]:
# Дерево, обученное на стандартизированных данных дает такую же оценку, как и на нормализованных.

In [50]:
# Попробуем поиграть с параметрами модели дерева класификации.
# Для начала переберем значения максимальной глубины дерева и посмотрим, как от этого изменится результат.
for i in range(6):
    tree = DecisionTreeClassifier(criterion='gini', max_depth=i+1, random_state=1)
    X_train,X_test,y_train,y_test = train_test_split(X_norm, y, test_size=1/3, random_state=42)
    tree.fit(X_train, y_train)
    y_pred_tree = tree.predict(X_test)
    print("max_depth=", i+1)
    print("Accuracy:", accuracy_score(y_test,y_pred_tree))
    print("Recall:", recall_score(y_test,y_pred_tree))
    print("Precision:", precision_score(y_test,y_pred_tree))
    print("-"*60)

max_depth= 1
Accuracy: 0.71875
Recall: 0.313953488372093
Precision: 0.675
------------------------------------------------------------
max_depth= 2
Accuracy: 0.71875
Recall: 0.313953488372093
Precision: 0.675
------------------------------------------------------------
max_depth= 3
Accuracy: 0.69140625
Recall: 0.686046511627907
Precision: 0.5315315315315315
------------------------------------------------------------
max_depth= 4
Accuracy: 0.72265625
Recall: 0.7209302325581395
Precision: 0.5688073394495413
------------------------------------------------------------
max_depth= 5
Accuracy: 0.76171875
Recall: 0.5697674418604651
Precision: 0.6712328767123288
------------------------------------------------------------
max_depth= 6
Accuracy: 0.71484375
Recall: 0.7441860465116279
Precision: 0.5565217391304348
------------------------------------------------------------


In [51]:
# Заметим, что с повышением глубины оценка растет. В особенности это заметно на Recall

# теперь проведем тот же опыт, но заменим criterion на 'entropy'
for i in range(6):
    tree = DecisionTreeClassifier(criterion='entropy', max_depth=i+1, random_state=1)
    X_train,X_test,y_train,y_test = train_test_split(X_norm, y, test_size=1/3, random_state=42)
    tree.fit(X_train, y_train)
    y_pred_tree = tree.predict(X_test)
    print("max_depth=", i+1)
    print("Accuracy:", accuracy_score(y_test,y_pred_tree))
    print("Recall:", recall_score(y_test,y_pred_tree))
    print("Precision:", precision_score(y_test,y_pred_tree))
    print("-"*60)

max_depth= 1
Accuracy: 0.71875
Recall: 0.313953488372093
Precision: 0.675
------------------------------------------------------------
max_depth= 2
Accuracy: 0.71875
Recall: 0.313953488372093
Precision: 0.675
------------------------------------------------------------
max_depth= 3
Accuracy: 0.6875
Recall: 0.6976744186046512
Precision: 0.5263157894736842
------------------------------------------------------------
max_depth= 4
Accuracy: 0.7265625
Recall: 0.6744186046511628
Precision: 0.58
------------------------------------------------------------
max_depth= 5
Accuracy: 0.71875
Recall: 0.7558139534883721
Precision: 0.5603448275862069
------------------------------------------------------------
max_depth= 6
Accuracy: 0.71875
Recall: 0.6744186046511628
Precision: 0.5686274509803921
------------------------------------------------------------


In [54]:
#сравнивая оценки, можно заметить, что с нашими данными энтропия работает хуже чем gini.
from sklearn.linear_model import LogisticRegression

re = LogisticRegression()
cvs = cross_val_score(estimator=re, X=X_norm, y=y, cv=kf, scoring='accuracy')
print(cvs)
print(cvs.mean())

[0.77272727 0.68831169 0.77272727 0.81045752 0.76470588]
0.7617859264918089


In [55]:
from sklearn.neighbors import KNeighborsClassifier
scores = []
X_train,X_test,y_train,y_test = train_test_split(X_norm, y, test_size=1/3, random_state=42)
# для начала подберем оптимальное число соседей
for i in range(1,15):

    knn = KNeighborsClassifier(i)
    knn.fit(X_train,y_train)
    answers = knn.predict(X_test)
    
    scores.append(knn.score(X_test,y_test))

print(scores)

[0.68359375, 0.6796875, 0.6796875, 0.69140625, 0.6796875, 0.69140625, 0.7109375, 0.7109375, 0.71484375, 0.6953125, 0.6875, 0.69140625, 0.6953125, 0.69921875]


In [57]:
knn = KNeighborsClassifier(5)
cvs = cross_val_score(estimator=knn, X=X_norm, y=y, cv=kf, scoring='accuracy')

print(cvs)
print(cvs.mean())

[0.77922078 0.66883117 0.73376623 0.78431373 0.73202614]
0.7396316102198455


In [None]:
# В предыдущем опыте у меня вышло accuracy = 0.6796875 для KNN и accuracy = 0.765625 для логисстической регрессии.
# Следовательно accuracy для логисстической регрессии почти не изменилось, но для KNN оно увеличилось, если делать его через кроссвалидацию.