In [158]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score

In [65]:
df = pd.read_csv('diabetes.csv')

# Оценка количества дупликатов и их устранение
df_full_size = len(df)
df = df.drop_duplicates()
print(f'Количество дубликатов: {df_full_size - len(df)}')

# Заменяем нулевые занчения на средние значения соотвествующих стоблцов
for column_name in ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction']:
    column_values = df[column_name]
    column_mean_value = column_values[column_values != 0].mean()
    df[column_name] = column_values.mask(column_values == 0).fillna(column_mean_value)

# Разделение выборки на обучающую и тестовую
train, test = train_test_split(df, test_size=.2, random_state=42)
train_X, train_y = train.loc[:, train.columns != 'Outcome'].copy(), train['Outcome']
test_X, test_y = test.loc[:, test.columns != 'Outcome'].copy(), test['Outcome']

Количество дубликатов: 0


In [154]:
clf = LogisticRegression(
    penalty='l2',
    tol=1e-5,
    fit_intercept=True,
    solver='liblinear',
    max_iter=200
).fit(train_X, train_y)

y_true = test_y
y_pred = clf.predict(test_X)
print(f'Accuracy = {accuracy_score(y_true, y_pred)}')
print(f'Recall = {recall_score(y_true, y_pred)}')
print(f'Precision = {precision_score(y_true, y_pred)}')

Accuracy = 0.7857142857142857
Recall = 0.6
Precision = 0.75


In [217]:
clf = RandomForestClassifier(
    n_estimators=150,
    criterion='entropy',
    max_depth=100,
    min_samples_split=6,
    max_features='log2',
    bootstrap=False
).fit(train_X, train_y)

y_true = test_y
y_pred = clf.predict(test_X)
print(f'Accuracy = {accuracy_score(y_true, y_pred)}')
print(f'Recall = {recall_score(y_true, y_pred)}')
print(f'Precision = {precision_score(y_true, y_pred)}')

Accuracy = 0.7467532467532467
Recall = 0.6909090909090909
Precision = 0.6333333333333333
