In [21]:
import numpy as np
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, classification_report, recall_score

def train_best_model(X):
    """
    Обучает и выбирает лучшую модель на основе кросс-валидации с учетом групп
    Возвращает обученную модель с наивысшим recall для класса 1
    
    Параметры:
    X - матрица признаков (n_samples, n_features)
    y - вектор меток (n_samples,)
    groups - вектор групп для кросс-валидации (n_samples,)
    """
    n_subs = X.shape[0]
    print(X.shape)
    y = np.array([0]*(n_subs//2) + [1]*(n_subs//2))
    groups = np.repeat(np.arange(n_subs//2), 2)
    models = {
        'Logistic Regression (L1)': make_pipeline(
            StandardScaler(),
            LogisticRegression(penalty='l1', solver='saga', max_iter=10000, random_state=42)
        ),
        'Linear SVM': make_pipeline(
            StandardScaler(),
            SVC(kernel='linear', random_state=42)
        ),
        'Random Forest': RandomForestClassifier(n_estimators=100, max_depth=3, random_state=42),
        'Gradient Boosting': GradientBoostingClassifier(n_estimators=50, learning_rate=0.1, max_depth=2, random_state=42),
        'SVM (RBF)': make_pipeline(
            StandardScaler(),
            SVC(kernel='rbf', random_state=42)
        ),
        'MLP': make_pipeline(
            StandardScaler(),
            MLPClassifier(hidden_layer_sizes=(50,), alpha=0.01, max_iter=1000, random_state=42)
        )
    }

    logo = LeaveOneGroupOut()
    best_recall = 0
    best_model = None
    best_model_name = ''

    # Перебор моделей и выбор лучшей по recall класса 1
    for model_name, model in models.items():
        recalls = []
        for train_idx, test_idx in logo.split(X, y, groups):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]
            
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            recalls.append(recall_score(y_test, y_pred, pos_label=1))  # focus on recall for class 1
        
        mean_recall = np.mean(recalls)
        print(f'{model_name} - Средний recall класса 1: {mean_recall:.3f}')
        
        if mean_recall > best_recall:
            best_recall = mean_recall
            best_model = model
            best_model_name = model_name


    best_model = models['Gradient Boosting']
    # Финалное обучение лучшей модели на всех данных
    print(f'\nЛучшая модель: {best_model_name} с recall класса 1: {best_recall:.3f}')
    best_model.fit(X, y)
    
    return best_model

In [21]:
test_path = '/home/aaanpilov/diploma/project/numpy_matrixes/average_matrix/test/'
train_path = '/home/aaanpilov/diploma/project/numpy_matrixes/average_matrix/HC/'

In [22]:
options = ['auc', 'max', 'min', 'max_min']

for option in options:
    print(option)
    matrix = np.load(train_path + option + '.npy')
    matrix_test = np.load(test_path + option + '.npy')
    model = train_best_model(matrix)

    n_test = matrix_test.shape[0]    
    labels_test = np.array([0] * (n_test//2) + [1] * (n_test//2))
    print(classification_report(labels_test, model.predict(matrix_test)))
    print(model.predict(matrix_test))
    break

auc
(34, 132)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Logistic Regression (L1) - Средний recall класса 1: 0.500
Linear SVM - Средний recall класса 1: 0.471


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Random Forest - Средний recall класса 1: 0.441


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Gradient Boosting - Средний recall класса 1: 0.471
SVM (RBF) - Средний recall класса 1: 0.529


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


MLP - Средний recall класса 1: 0.382


KeyError: 'SVM (RBF)'

In [11]:
train_path = '/home/aaanpilov/diploma/project/numpy_matrixes/average_matrix/HC/auc.npy'
test_path = '/home/aaanpilov/diploma/project/numpy_matrixes/average_matrix/propose/test/auc'

matrix = np.load(train_path)
model = train_best_model(matrix)

for k in range(5):
    matrix_test = np.load(test_path + str(k) + '.npy')
    n_test = matrix_test.shape[0]    
    labels_test = np.array([0] * (n_test//2) + [1] * (n_test//2))
    print(classification_report(labels_test, model.predict(matrix_test)))
    print(model.predict(matrix_test))

(34, 132)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Logistic Regression (L1) - Средний recall класса 1: 0.500
Linear SVM - Средний recall класса 1: 0.471


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Random Forest - Средний recall класса 1: 0.441


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Gradient Boosting - Средний recall класса 1: 0.471
SVM (RBF) - Средний recall класса 1: 0.529


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


MLP - Средний recall класса 1: 0.382

Лучшая модель: SVM (RBF) с recall класса 1: 0.529
              precision    recall  f1-score   support

           0       0.53      1.00      0.69        10
           1       1.00      0.10      0.18        10

    accuracy                           0.55        20
   macro avg       0.76      0.55      0.44        20
weighted avg       0.76      0.55      0.44        20

[0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0]
              precision    recall  f1-score   support

           0       0.47      0.80      0.59        10
           1       0.33      0.10      0.15        10

    accuracy                           0.45        20
   macro avg       0.40      0.45      0.37        20
weighted avg       0.40      0.45      0.37        20

[1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0]
              precision    recall  f1-score   support

           0       0.47      0.90      0.62        10
           1       0.00      0.00      0.00        10

    accura

In [22]:
train_path = '/home/aaanpilov/diploma/project/numpy_matrixes/average_matrix/test/auc.npy'
test_path = '/home/aaanpilov/diploma/project/numpy_matrixes/average_matrix/propose/HC/auc'

matrix = np.load(train_path)
model = train_best_model(matrix)

for k in range(5):
    matrix_test = np.load(test_path + str(k) + '.npy')
    n_test = matrix_test.shape[0]    
    labels_test = np.array([0] * (n_test//2) + [1] * (n_test//2))
    print(classification_report(labels_test, model.predict(matrix_test)))
    print(model.predict(matrix_test))

(20, 132)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Logistic Regression (L1) - Средний recall класса 1: 0.300
Linear SVM - Средний recall класса 1: 0.450


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Random Forest - Средний recall класса 1: 0.350


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Gradient Boosting - Средний recall класса 1: 0.450
SVM (RBF) - Средний recall класса 1: 0.450


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


MLP - Средний recall класса 1: 0.350

Лучшая модель: Linear SVM с recall класса 1: 0.450
              precision    recall  f1-score   support

           0       0.50      1.00      0.67        17
           1       0.00      0.00      0.00        17

    accuracy                           0.50        34
   macro avg       0.25      0.50      0.33        34
weighted avg       0.25      0.50      0.33        34

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
              precision    recall  f1-score   support

           0       0.52      0.94      0.67        17
           1       0.67      0.12      0.20        17

    accuracy                           0.53        34
   macro avg       0.59      0.53      0.43        34
weighted avg       0.59      0.53      0.43        34

[0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0]
              precision    recall  f1-score   support

           0       0.53      1.00      0.69        17
        

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
