In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# Load the raw table and discard exact duplicate rows (first occurrence kept),
# reporting the duplicate count before and after.
df = pd.read_csv("diabetes.csv")

print(f"Количество дубликатов до удаления: {df.duplicated().sum()}")
df = df.loc[~df.duplicated()]
print(f"Количество дубликатов после удаления: {df.duplicated().sum()}")

# Each cohort is every healthy row (Outcome == 0) plus the diabetic rows that
# match an insulin rule: Insulin == 0 is labelled "type 1", Insulin > 0 "type 2".
# NOTE(review): in some public diabetes datasets a zero Insulin value marks a
# missing measurement rather than a real reading; confirm this rule is intended.
df_type1 = df.loc[df['Outcome'].eq(0) | df['Insulin'].eq(0)]
df_type2 = df.loc[df['Outcome'].eq(0) | df['Insulin'].gt(0)]

# All healthy rows are in both cohorts, so the non-healthy rows of a cohort
# are exactly its diabetic members.
print(f"Количество людей с диабетом 1 типа: {df_type1['Outcome'].ne(0).sum()}")
print(f"Количество людей с диабетом 2 типа: {df_type2['Outcome'].ne(0).sum()}")
print(f"Количество здоровых людей: {df['Outcome'].eq(0).sum()}")

# Target vector and feature matrix for each cohort.
y_type1 = df_type1['Outcome']
X_type1 = df_type1.drop(columns=['Outcome'])

y_type2 = df_type2['Outcome']
X_type2 = df_type2.drop(columns=['Outcome'])

# 80/20 hold-out split per cohort, stratified on the target, with a fixed seed.
X_train_type1, X_test_type1, y_train_type1, y_test_type1 = train_test_split(
    X_type1,
    y_type1,
    test_size=0.2,
    random_state=42,
    stratify=y_type1,
)
X_train_type2, X_test_type2, y_train_type2, y_test_type2 = train_test_split(
    X_type2,
    y_type2,
    test_size=0.2,
    random_state=42,
    stratify=y_type2,
)

# Oversample the training portions only; the test portions are left untouched.
smote = SMOTE(random_state=42)
X_train_type1_balanced, y_train_type1_balanced = smote.fit_resample(
    X_train_type1, y_train_type1
)
X_train_type2_balanced, y_train_type2_balanced = smote.fit_resample(
    X_train_type2, y_train_type2
)

# Standardise each cohort with its OWN scaler. Re-fitting one shared scaler
# would overwrite the type-1 statistics as soon as the type-2 data is fitted,
# so type-1 inputs could no longer be transformed consistently afterwards.
# Each scaler is fitted on the balanced training data only and then applied
# unchanged to the matching test split.
scaler_type1 = StandardScaler()
X_train_type1_scaled = scaler_type1.fit_transform(X_train_type1_balanced)
X_test_type1_scaled = scaler_type1.transform(X_test_type1)

scaler_type2 = StandardScaler()
X_train_type2_scaled = scaler_type2.fit_transform(X_train_type2_balanced)
X_test_type2_scaled = scaler_type2.transform(X_test_type2)

# Backward-compatible alias: the name `scaler` previously ended up fitted on
# the type-2 training data, so keep it pointing at that scaler.
scaler = scaler_type2

# Three classifiers are trained per cohort on the balanced, scaled training
# data and then used to predict the scaled test split. `fit` returns the
# estimator itself, so construction and fitting are chained.

# --- Logistic regression (default hyperparameters) ---
log_reg_type1 = LogisticRegression().fit(X_train_type1_scaled, y_train_type1_balanced)
y_pred_type1_log_reg = log_reg_type1.predict(X_test_type1_scaled)

log_reg_type2 = LogisticRegression().fit(X_train_type2_scaled, y_train_type2_balanced)
y_pred_type2_log_reg = log_reg_type2.predict(X_test_type2_scaled)

# --- Support vector classifier (default hyperparameters) ---
svm_type1 = SVC().fit(X_train_type1_scaled, y_train_type1_balanced)
y_pred_type1_svm = svm_type1.predict(X_test_type1_scaled)

svm_type2 = SVC().fit(X_train_type2_scaled, y_train_type2_balanced)
y_pred_type2_svm = svm_type2.predict(X_test_type2_scaled)

# --- Random forest (seeded for reproducibility) ---
rf_type1 = RandomForestClassifier(random_state=42).fit(
    X_train_type1_scaled, y_train_type1_balanced
)
y_pred_type1_rf = rf_type1.predict(X_test_type1_scaled)

rf_type2 = RandomForestClassifier(random_state=42).fit(
    X_train_type2_scaled, y_train_type2_balanced
)
y_pred_type2_rf = rf_type2.predict(X_test_type2_scaled)

def print_metrics(y_true, y_pred, model_name):
    """Print accuracy, precision and recall for binary 0/1 predictions.

    Label ``1`` is the positive class and ``0`` the negative class. Samples
    whose true or predicted label is neither 0 nor 1 are not counted.

    Args:
        y_true: Ground-truth labels (list, NumPy array or pandas Series).
        y_pred: Predicted labels, in the same order and of the same length
            as ``y_true``.
        model_name: Text inserted into the printed header line.

    Raises:
        ValueError: If ``y_true`` and ``y_pred`` contain a different number
            of labels.
    """
    import numpy as np  # local import: the file itself only imports pandas

    # Convert to flat arrays so labels are compared strictly by position.
    # This makes plain lists work and stops two pandas Series with different
    # indexes from being index-aligned (which would silently miscount).
    actual = np.asarray(y_true).ravel()
    predicted = np.asarray(y_pred).ravel()
    if actual.shape != predicted.shape:
        raise ValueError(
            f"y_true and y_pred must have the same length, "
            f"got {actual.size} and {predicted.size}"
        )

    actual_pos, actual_neg = actual == 1, actual == 0
    predicted_pos, predicted_neg = predicted == 1, predicted == 0

    tp = int(np.count_nonzero(actual_pos & predicted_pos))
    tn = int(np.count_nonzero(actual_neg & predicted_neg))
    fp = int(np.count_nonzero(actual_neg & predicted_pos))
    fn = int(np.count_nonzero(actual_pos & predicted_neg))

    # Every ratio falls back to 0 when its denominator is empty, so empty
    # input prints 0.00 instead of dividing by zero.
    total = tp + fp + tn + fn
    accuracy = (tp + tn) / total if total > 0 else 0
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0

    print(f"Метрики для {model_name}:")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}\n")

# Report every (cohort, model) pair in a fixed order: for each model, the
# type-1 cohort first and then the type-2 cohort.
for _y_true, _y_pred, _title in (
    (y_test_type1, y_pred_type1_log_reg, "Диабет 1 типа (Логистическая регрессия)"),
    (y_test_type2, y_pred_type2_log_reg, "Диабет 2 типа (Логистическая регрессия)"),
    (y_test_type1, y_pred_type1_svm, "Диабет 1 типа (SVM)"),
    (y_test_type2, y_pred_type2_svm, "Диабет 2 типа (SVM)"),
    (y_test_type1, y_pred_type1_rf, "Диабет 1 типа (Случайный лес)"),
    (y_test_type2, y_pred_type2_rf, "Диабет 2 типа (Случайный лес)"),
):
    print_metrics(_y_true, _y_pred, _title)

Количество дубликатов до удаления: 0
Количество дубликатов после удаления: 0
Количество людей с диабетом 1 типа: 138
Количество людей с диабетом 2 типа: 130
Количество здоровых людей: 500
Метрики для Диабет 1 типа (Логистическая регрессия):
Accuracy: 0.83
Precision: 0.57
Recall: 0.86

Метрики для Диабет 2 типа (Логистическая регрессия):
Accuracy: 0.78
Precision: 0.48
Recall: 0.85

Метрики для Диабет 1 типа (SVM):
Accuracy: 0.79
Precision: 0.51
Recall: 0.79

Метрики для Диабет 2 типа (SVM):
Accuracy: 0.82
Precision: 0.54
Recall: 0.77

Метрики для Диабет 1 типа (Случайный лес):
Accuracy: 0.83
Precision: 0.58
Recall: 0.79

Метрики для Диабет 2 типа (Случайный лес):
Accuracy: 0.86
Precision: 0.62
Recall: 0.77

