In [45]:
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, cross_val_score, LeaveOneOut, ShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression

# **Данные**

In [68]:
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = pd.read_csv("pima-indians-diabetes.data.csv", names=names)
array = data.values
X = array[:,0:8]
Y = array[:,8]

На своих данных модели выдавали ошибку, поэтому взял данные указанные в задании

# **Обучающая, валидационная, тестовая выборка**

**Train and Test Sets**

In [54]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=7)
model = LogisticRegression(solver='liblinear')
model.fit(X_train, Y_train)
result = model.score(X_test, Y_test)
print(f"Accuracy: {round(result*100)} %")

Accuracy: 76 %


Точность результатов модели - 76%, значит, модель более менее точно предсказывает результат

**k-fold Cross-Validation**

In [55]:
kfold = KFold(n_splits=10, random_state=7, shuffle=True)
results = cross_val_score(model, X, Y, cv=kfold)
print(f"Mean accuracy: {round(results.mean()*100.0)} %")
print(f"Deviation accuracy: {round( results.std()*100.0)} %")

Mean accuracy: 77 %
Deviation accuracy: 5 %


Среднее значение точности - 77%, величина отклонения - 5%

**Leave One Out Cross-Validation**

In [67]:
loocv = LeaveOneOut()
model = LogisticRegression(solver='liblinear')
results = cross_val_score(model, X, Y, cv=loocv)
print(f"Mean accuracy: {round(results.mean()*100.0)} %")
print(f"Deviation accuracy: {round( results.std()*100.0)} %")

Mean accuracy: 77 %
Deviation accuracy: 42 %


Среднее значение точности - 77%, величина отклонения - 42%


**Repeated Random Test-Train Splits**

In [57]:
kfold = ShuffleSplit(n_splits=10, test_size=0.33, random_state=7)
results = cross_val_score(model, X, Y, cv=kfold)
print(f"Mean accuracy: {round(results.mean()*100.0)} %")
print(f"Deviation accuracy: {round( results.std()*100.0)} %")

Mean accuracy: 77 %
Deviation accuracy: 2 %


Среднее значение точности - 77%, величина отклонения - 2%

# **Метрики качества моделей**

**Classification Accuracy**

In [58]:
kfold = KFold(n_splits=10, random_state=7, shuffle=True)
model = LogisticRegression(solver='liblinear')
scoring = 'accuracy'
results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
print(f"Mean accuracy: {round(results.mean()*100.0)} %")
print(f"Deviation accuracy: {round( results.std()*100.0)} %")

Mean accuracy: 77 %
Deviation accuracy: 5 %


Среднее значение точности - 77%, величина отклонения - 5%

**Logistic Loss**

In [59]:
kfold = KFold(n_splits=10, random_state=7, shuffle=True)
model = LogisticRegression(solver='liblinear')
scoring = 'neg_log_loss'
results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
print(f"Mean Logloss: {(results.mean())} %")
print(f"Deviation Logloss: {( results.std())} %")

Mean Logloss: -0.49356237960494703 %
Deviation Logloss: 0.04206771628974085 %


Среднее значение Logloss - -49%, величина отклонения - 5%

**Area Under ROC Curve**

In [60]:
kfold = KFold(n_splits=10, random_state=7, shuffle=True)
model = LogisticRegression(solver='liblinear')
scoring = 'roc_auc'
results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
print(f"Mean AUC: {(results.mean())} %")
print(f"Deviation AUC: {( results.std())} %")

Mean AUC: 0.8257627921677674 %
Deviation AUC: 0.05005520303752424 %


Среднее значение AUC - 82%, величина отклонения - 5%

**Confusion Matrix**

In [61]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33,random_state=7)
model = LogisticRegression(solver='liblinear')
model.fit(X_train, Y_train)
predicted = model.predict(X_test)
matrix = confusion_matrix(Y_test, predicted)
matrix

array([[141,  21],
       [ 41,  51]])

**Classification Report**

In [62]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33,random_state=7)
model = LogisticRegression(solver='liblinear')
model.fit(X_train, Y_train)
predicted = model.predict(X_test)
report = classification_report(Y_test, predicted)
print(report)

              precision    recall  f1-score   support

         0.0       0.77      0.87      0.82       162
         1.0       0.71      0.55      0.62        92

    accuracy                           0.76       254
   macro avg       0.74      0.71      0.72       254
weighted avg       0.75      0.76      0.75       254



**Mean Absolute Error**

In [63]:
kfold = KFold(n_splits=10, random_state=7, shuffle=True)
model = LinearRegression()
scoring = 'neg_mean_absolute_error'
results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
print(f"Mean MAE: {(results.mean())} %")
print(f"Deviation MAE: {( results.std())} %")

Mean MAE: -0.3368404421024537 %
Deviation MAE: 0.019780732404569353 %


В данном случае средняя абсолютная ошибка -33%, а величина отклонения 2%

**Mean Squared Error**

In [64]:
kfold = KFold(n_splits=10, random_state=7, shuffle=True)
model = LinearRegression()
scoring = 'neg_mean_squared_error'
results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
print(f"Mean AUC: {(results.mean())} %")
print(f"Deviation AUC: {( results.std())} %")

Mean AUC: -0.16320738575907065 %
Deviation AUC: 0.020251428034588136 %


В данном случае величина средней квадратической ошибки -16%, а величина отклонения 2%

**R2**

In [65]:
kfold = KFold(n_splits=10, random_state=7, shuffle=True)
model = LinearRegression()
scoring = 'r2'
results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
print(f"Mean R2: {(results.mean())} %")
print(f"Deviation R2: {( results.std())} %")

Mean R2: 0.26432881386388385 %
Deviation R2: 0.10236184260132887 %


Прогнозы не соответствуют фактическим значениям