In [None]:
%pip uninstall librosa
%pip install librosa==0.9.2

In [None]:
%pip install pydub

In [57]:
import librosa
import numpy as np
from pydub import AudioSegment
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

## load cdpd

In [17]:
df = pd.read_csv('../data/CDPD/training_data.csv') 

In [18]:
len(df)

942

In [19]:
df = df[df['Locations'].isin(['AV+PV+TV+MV', 'AV+PV+MV', 'AV+AV+PV+PV+TV+MV', 'AV+MV+MV', 
                                        'AV+PV+MV+Phc+Phc', 'AV+PV+TV+TV+MV', 'AV+AV+MV+MV', 'AV+AV+PV+TV+MV', 'AV+PV+TV+MV+Phc', 'AV+AV+AV+MV', 'AV+AV+PV+TV+MV+MV', ])&(df['Murmur']!='Unknown')]

In [20]:
len(df)

595

In [34]:
df[df['Patient ID'].isna()]

Unnamed: 0,Patient ID,Locations,Age,Sex,Height,Weight,Pregnancy status,Murmur,Murmur locations,Most audible location,...,Systolic murmur grading,Systolic murmur pitch,Systolic murmur quality,Diastolic murmur timing,Diastolic murmur shape,Diastolic murmur grading,Diastolic murmur pitch,Diastolic murmur quality,Campaign,Additional ID


In [21]:
df.head()

Unnamed: 0,Patient ID,Locations,Age,Sex,Height,Weight,Pregnancy status,Murmur,Murmur locations,Most audible location,...,Systolic murmur grading,Systolic murmur pitch,Systolic murmur quality,Diastolic murmur timing,Diastolic murmur shape,Diastolic murmur grading,Diastolic murmur pitch,Diastolic murmur quality,Campaign,Additional ID
0,2530,AV+PV+TV+MV,Child,Female,98.0,15.9,False,Absent,,,...,,,,,,,,,CC2015,
1,9979,AV+PV+TV+MV,Child,Female,103.0,13.1,False,Present,AV+MV+PV+TV,TV,...,III/VI,High,Harsh,,,,,,CC2015,
3,13918,AV+PV+TV+MV,Child,Male,98.0,15.9,False,Present,TV,TV,...,I/VI,Low,Blowing,,,,,,CC2015,
4,14241,AV+PV+TV+MV,Child,Male,87.0,11.2,False,Present,AV+MV+PV+TV,PV,...,II/VI,Low,Harsh,,,,,,CC2015,
5,14998,AV+PV+TV+MV,Child,Male,,,False,Absent,,,...,,,,,,,,,CC2015,


In [40]:
# Определение порядка объединения файлов
order = ['MV', 'AV', 'PV', 'Ph', 'TV']
directory = '../data/CDPD/training_data/'

def combine_audio(filename):
    # Получение списка файлов для текущей строки
    files_to_combine = []
    for location in order:
        file_name = f"{directory}{filename}_{location}.wav"
        if os.path.exists(file_name):
            files_to_combine.append(file_name)
    
    # Объединение файлов
    combined_audio = AudioSegment.empty()
    for file in files_to_combine:
        audio = AudioSegment.from_wav(file)
        combined_audio += audio
    
    # Сохранение объединенного аудио
    output_file_name = f"{directory}{filename}_combined.wav"
    combined_audio.export(output_file_name, format="wav")
    return output_file_name

# Применение функции к каждой строке
# df.apply(combine_audio, axis=1)


In [52]:

def load_audio_data(file_names):
    X = []
    for file_name in tqdm(file_names):
        # Объединение аудиофайлов
        combined_file_name = combine_audio(file_name)
        
        # Загрузка объединенного аудиофайла и извлечение признаков
        audio, sr = librosa.load(combined_file_name)
        mfcc = librosa.feature.mfcc(y=audio, sr=sr)
        mfcc_mean = np.mean(mfcc, axis=1)
        
        # Удаление временного объединенного файла
        os.remove(combined_file_name)
        
        X.append(mfcc_mean)
    
    return np.array(X)

## load cda

In [65]:
patients = "../data/Пациенты.csv"
norm_dir = "../data/норма"
patology_dir = "../data/патология"

In [66]:
def find_wav_files(directory):
    X = []

    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith(".wav"):
                audio, sr = librosa.load(os.path.join(root, file))
                mfcc = librosa.feature.mfcc(y=audio, sr=sr)
                mfcc_mean = np.mean(mfcc, axis=1)
                
                X.append(mfcc_mean)
    
    return X

## prepare data

In [76]:
cda = find_wav_files(norm_dir)
norm = len(cda)
cda.extend(find_wav_files(patology_dir))
patology = len(cda) - norm

cda = np.array(cda)
cda_labels = [1]* norm + [0] * patology

In [None]:
file_names_cdpd = list(df['Patient ID']) 

# 1 - patology, 0 - normal
cdpd_data = load_audio_data(file_names_cdpd)
cdpd_labels = np.array(df['Murmur'].apply(lambda x: 1 if x=='Absent' else 0))

In [79]:
# Разделение на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(cdpd_data, cdpd_labels, test_size=0.2, random_state=42)

# Масштабирование признаков
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [80]:
# Добавление новых данных в тестовую выборку
X_test_cdatest = np.vstack((X_test, cda))
y_test_cdatest = np.concatenate((y_test, cda_labels))

X_test_scaled_cdatest = scaler.transform(X_test_cdatest)

print("Новые размеры тестовых данных:")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")
print(f"X_test_cdatest shape: {X_test_cdatest.shape}")
print(f"y_test_cdatest shape: {y_test_cdatest.shape}")

Новые размеры тестовых данных:
X_test shape: (119, 20)
y_test shape: (119,)
X_test_cdatest shape: (170, 20)
y_test_cdatest shape: (170,)


In [90]:
# cdpd cda train test
X_train_cda, X_test_cda, y_train_cda, y_test_cda = train_test_split(cda, cda_labels, test_size=0.2, random_state=42)


X_train_cdpd_cda = np.vstack((X_train, X_train_cda))
y_train_cdpd_cda = np.concatenate((y_train, y_train_cda))
X_test_cdpd_cda = np.vstack((X_test, X_test_cda))
y_test_cdpd_cda = np.concatenate((y_test, y_test_cda))

scaler = StandardScaler()
X_train_scaled_cdpd_cda = scaler.fit_transform(X_train_cdpd_cda)
X_test_scaled_cdpd_cda = scaler.transform(X_test_cdpd_cda)

## models

In [84]:
models = {
    'LogisticRegression': {
        'model': LogisticRegression(max_iter=1000),
        'params': {
            'C': [0.1, 1, 10],
            'penalty': ['l2']
        }
    },
    'RandomForest': {
        'model': RandomForestClassifier(),
        'params': {
            'n_estimators': [50, 100, 200],
            'max_depth': [None, 5, 10]
        }
    },
    'GradientBoosting': {
        'model': GradientBoostingClassifier(),
        'params': {
            'n_estimators': [50, 100, 200],
            'learning_rate': [0.1, 0.5, 1]
        }
    },
    'SVC': {
        'model': SVC(probability=True),
        'params': {
            'C': [0.1, 1, 10],
            'kernel': ['linear', 'rbf']
        }
    }
}

## experiments

### cdpd full

In [81]:
def train_val(config, X_train_scaled, X_test_scaled, y_train, y_test):
    results = {}
    for name, config in models.items():
        grid_search = GridSearchCV(config['model'], config['params'], cv=5)
        grid_search.fit(X_train_scaled, y_train)
        
        best_model = grid_search.best_estimator_
        y_pred = best_model.predict(X_test_scaled)
        y_pred_proba = best_model.predict_proba(X_test_scaled)[:, 1]
        
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        # пресижн и рекол хде, вы шо
        auc = roc_auc_score(y_test, y_pred_proba)
        
        results[name] = {
            'Accuracy': accuracy,
            'F1-score': f1,
            'AUC-ROC': auc,
            'Best params': grid_search.best_params_
        }

    # Вывод результатов
    for model_name, metrics in results.items():
        print(f"Модель: {model_name}")
        print(f"Accuracy: {metrics['Accuracy']:.3f}")
        print(f"F1-score: {metrics['F1-score']:.3f}")
        print(f"AUC-ROC: {metrics['AUC-ROC']:.3f}")
        print(f"Лучшие параметры: {metrics['Best params']}")
        print()

### cdpd full

In [85]:
train_val(config,  X_train_scaled, X_test_scaled, y_train, y_test)

Модель: LogisticRegression
Accuracy: 0.866
F1-score: 0.922
AUC-ROC: 0.724
Лучшие параметры: {'C': 1, 'penalty': 'l2'}

Модель: RandomForest
Accuracy: 0.840
F1-score: 0.905
AUC-ROC: 0.728
Лучшие параметры: {'max_depth': None, 'n_estimators': 50}

Модель: GradientBoosting
Accuracy: 0.798
F1-score: 0.878
AUC-ROC: 0.699
Лучшие параметры: {'learning_rate': 1, 'n_estimators': 100}

Модель: SVC
Accuracy: 0.840
F1-score: 0.906
AUC-ROC: 0.739
Лучшие параметры: {'C': 1, 'kernel': 'rbf'}



### cdpd + cda test

In [86]:
train_val(config,  X_train_scaled, X_test_scaled_cdatest, y_train, y_test_cdatest)

Модель: LogisticRegression
Accuracy: 0.629
F1-score: 0.755
AUC-ROC: 0.563
Лучшие параметры: {'C': 1, 'penalty': 'l2'}

Модель: RandomForest
Accuracy: 0.618
F1-score: 0.749
AUC-ROC: 0.799
Лучшие параметры: {'max_depth': 5, 'n_estimators': 100}

Модель: GradientBoosting
Accuracy: 0.576
F1-score: 0.712
AUC-ROC: 0.749
Лучшие параметры: {'learning_rate': 0.5, 'n_estimators': 200}

Модель: SVC
Accuracy: 0.606
F1-score: 0.739
AUC-ROC: 0.809
Лучшие параметры: {'C': 1, 'kernel': 'rbf'}



### cdpd cda full

In [91]:
train_val(config, X_train_scaled_cdpd_cda, X_test_scaled_cdpd_cda, y_train_cdpd_cda, y_test_cdpd_cda)

Модель: LogisticRegression
Accuracy: 0.885
F1-score: 0.927
AUC-ROC: 0.806
Лучшие параметры: {'C': 10, 'penalty': 'l2'}

Модель: RandomForest
Accuracy: 0.838
F1-score: 0.897
AUC-ROC: 0.807
Лучшие параметры: {'max_depth': None, 'n_estimators': 200}

Модель: GradientBoosting
Accuracy: 0.808
F1-score: 0.874
AUC-ROC: 0.768
Лучшие параметры: {'learning_rate': 0.5, 'n_estimators': 100}

Модель: SVC
Accuracy: 0.862
F1-score: 0.913
AUC-ROC: 0.825
Лучшие параметры: {'C': 1, 'kernel': 'rbf'}



### 