In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('diabetes.csv')

# Оценка количества дупликатов и их устранение
df_full_size = len(df)
df = df.drop_duplicates()
print(f'Количество дубликатов в датасете: {df_full_size - len(df)}')

# Заменяем нулевые занчения на средние значения соотвествующих стоблцов
for column_name in ['Glucose', 'BloodPressure', 'SkinThickness', 'BMI', 'DiabetesPedigreeFunction']:
    column_values = df[column_name]
    column_mean_value = column_values[column_values != 0].mean()
    df[column_name] = column_values.mask(column_values == 0).fillna(column_mean_value)

# Вводим новый класс диабета с нулевым инсулином
first_df = df.copy()
second_df = df.copy()
first_df = df[df['Insulin'] == 0].drop('Insulin', axis=1)
second_df = df[df['Insulin'] != 0].drop('Insulin', axis=1)

print(f"Отношение размеров кластеров для первой модели: {sum(first_df['Outcome'] == 0) / len(first_df)}")
print(f"Отношение размеров кластеров для второй модели: {sum(second_df['Outcome'] == 0) / len(second_df)}")

scaler = StandardScaler().set_output(transform="pandas")

# Разделение выборки на обучающую и тестовую
first_train, first_test = train_test_split(first_df, test_size=.2, random_state=42)
first_train_X, first_train_y = scaler.fit_transform(first_train.loc[:, first_train.columns != 'Outcome'].copy()), first_train['Outcome'].copy()
first_test_X, first_test_y = scaler.fit_transform(first_test.loc[:, first_test.columns != 'Outcome'].copy()), first_test['Outcome'].copy()

second_train, second_test = train_test_split(second_df, test_size=.2, random_state=42)
second_train_X, second_train_y = scaler.fit_transform(second_train.loc[:, second_train.columns != 'Outcome'].copy()), second_train['Outcome'].copy()
second_test_X, second_test_y = scaler.fit_transform(second_test.loc[:, second_test.columns != 'Outcome'].copy()), second_test['Outcome'].copy()

Количество дубликатов в датасете: 0
Отношение размеров кластеров для первой модели: 0.6310160427807486
Отношение размеров кластеров для второй модели: 0.6700507614213198


In [3]:
print('Матрица корреляции датасета для первой модели')
first_df.corr().style.background_gradient(cmap='coolwarm')

Матрица корреляции датасета для первой модели


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,BMI,DiabetesPedigreeFunction,Age,Outcome
Pregnancies,1.0,0.072056,0.161614,0.070743,0.101627,-0.01113,0.409821,0.179527
Glucose,0.072056,1.0,0.244162,0.19613,0.251536,0.130751,0.228564,0.473633
BloodPressure,0.161614,0.244162,1.0,0.132995,0.294533,0.080659,0.313439,0.127841
SkinThickness,0.070743,0.19613,0.132995,1.0,0.363946,0.006726,0.092266,0.161869
BMI,0.101627,0.251536,0.294533,0.363946,1.0,0.12056,0.024331,0.367581
DiabetesPedigreeFunction,-0.01113,0.130751,0.080659,0.006726,0.12056,1.0,0.068771,0.164146
Age,0.409821,0.228564,0.313439,0.092266,0.024331,0.068771,1.0,0.14038
Outcome,0.179527,0.473633,0.127841,0.161869,0.367581,0.164146,0.14038,1.0


In [4]:
print('Матрица корреляции датасета для второй модели')
second_df.corr().style.background_gradient(cmap='coolwarm')

Матрица корреляции датасета для второй модели


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,BMI,DiabetesPedigreeFunction,Age,Outcome
Pregnancies,1.0,0.198337,0.21374,0.095997,-0.023652,-0.000402,0.68092,0.258846
Glucose,0.198337,1.0,0.210108,0.198862,0.209444,0.1367,0.343259,0.515362
BloodPressure,0.21374,0.210108,1.0,0.232342,0.303632,-0.020834,0.299845,0.192819
SkinThickness,0.095997,0.198862,0.232342,1.0,0.664752,0.154056,0.170694,0.257854
BMI,-0.023652,0.209444,0.303632,0.664752,1.0,0.156629,0.071695,0.271103
DiabetesPedigreeFunction,-0.000402,0.1367,-0.020834,0.154056,0.156629,1.0,0.076609,0.200759
Age,0.68092,0.343259,0.299845,0.170694,0.071695,0.076609,1.0,0.352982
Outcome,0.258846,0.515362,0.192819,0.257854,0.271103,0.200759,0.352982,1.0


In [5]:
def logRegr(train_X, train_y, test_X, test_y):
    clf = LogisticRegression(
        penalty='l2',
        tol=1e-5,
        fit_intercept=True,
        solver='liblinear',
        max_iter=200,
        random_state=42
    ).fit(train_X, train_y)

    y_true = test_y
    y_pred = clf.predict(test_X)
    print(f'logRegr: Accuracy = {accuracy_score(y_true, y_pred)}')
    print(f'logRegr: Recall = {recall_score(y_true, y_pred)}')
    print(f'logRegr: Precision = {precision_score(y_true, y_pred)}\n')

In [19]:
def rndForest(train_X, train_y, test_X, test_y):
    clf = RandomForestClassifier(
        n_estimators=300,
        criterion='entropy',
        max_depth=100,
        min_samples_split=6,
        max_features='log2',
        bootstrap=False,
        n_jobs=10,
        random_state=42,
    ).fit(train_X, train_y)

    y_true = test_y
    y_pred = clf.predict(test_X)
    print(f'rndForest: Accuracy = {accuracy_score(y_true, y_pred)}')
    print(f'rndForest: Recall = {recall_score(y_true, y_pred)}')
    print(f'rndForest: Precision = {precision_score(y_true, y_pred)}\n')

In [7]:
def SVM(train_X, train_y, test_X, test_y):
    clf = svm.SVC(
        kernel='poly',
        degree=5,
        random_state=42,
    ).fit(train_X, train_y)

    y_true = test_y
    y_pred = clf.predict(test_X)
    print(f'SVM: Accuracy = {accuracy_score(y_true, y_pred)}')
    print(f'SVM: Recall = {recall_score(y_true, y_pred)}')
    print(f'SVM: Precision = {precision_score(y_true, y_pred)}\n')

In [22]:
logRegr(first_train_X, first_train_y, first_test_X, first_test_y)
rndForest(first_train_X, first_train_y, first_test_X, first_test_y)
SVM(first_train_X, first_train_y, first_test_X, first_test_y)

logRegr: Accuracy = 0.8466666666666667
logRegr: Recall = 0.6357142857142857
logRegr: Precision = 0.8142857142857143

rndForest: Accuracy = 0.8033333333333332
rndForest: Recall = 0.8071428571428572
rndForest: Precision = 0.7738461538461539

SVM: Accuracy = 0.8066666666666666
SVM: Recall = 0.3857142857142857
SVM: Precision = 0.9



In [23]:
logRegr(second_train_X, second_train_y, second_test_X, second_test_y)
rndForest(second_train_X, second_train_y, second_test_X, second_test_y)
SVM(second_train_X, second_train_y, second_test_X, second_test_y)

logRegr: Accuracy = 0.910126582278481
logRegr: Recall = 0.78
logRegr: Precision = 0.8083333333333333

rndForest: Accuracy = 0.8548101265822785
rndForest: Recall = 0.8400000000000001
rndForest: Precision = 0.7866666666666666

SVM: Accuracy = 0.859493670886076
SVM: Recall = 0.5
SVM: Precision = 0.8142857142857143

