In [1]:
import os
import pandas as pd
import numpy as np
import torch
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from scipy.stats import hmean
from torch.utils.data import Dataset
from datasets import Dataset as HFDataset
from transformers import (
    Wav2Vec2FeatureExtractor,
    WavLMForSequenceClassification,
    Wav2Vec2Processor,
    Wav2Vec2ForSequenceClassification,
    TrainingArguments,
    Trainer
)
import torchaudio
from sklearn.model_selection import train_test_split




In [None]:
# import subprocess
# from pathlib import Path
# from tqdm import tqdm

# def convert_opus_to_wav(opus_dir: str = "train_opus/audio", output_dir_name: str = "train_opus/converted_wav"):
#     """
#     Конвертирует все .opus файлы из opus_dir в .wav формат.
#     Сохраняет результат в поддиректорию output_dir_name.
#     """
#     input_dir = Path(opus_dir)
#     output_dir = Path(output_dir_name)
#     output_dir.mkdir(exist_ok=True, parents=True)

#     opus_files = list(input_dir.glob("*.opus"))
#     if not opus_files:
#         print(f"❌ В каталоге {input_dir} нет .opus файлов.")
#         return

#     print(f"Найдено {len(opus_files)} файлов. Начинаю конвертацию...")

#     for input_file in tqdm(opus_files, desc="Конвертация", ncols=100):
#         output_file = output_dir / (input_file.stem + ".wav")
#         cmd = [
#             "ffmpeg",
#             "-y",
#             "-i", str(input_file),
#             "-vn",             # без видео
#             "-acodec", "pcm_s16le",  # кодек WAV
#             "-ar", "16000",
#             "-ac", "1",
#             str(output_file)
#         ]
#         subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

#     print(f"\n✅ Конвертация завершена. WAV файлы сохранены в: {output_dir.resolve()}")

# # convert_opus_to_wav()

In [3]:
# convert_opus_to_wav(opus_dir='test_opus/audio', output_dir_name='test_opus/converted_wav')

In [4]:
AUDIO_DIR = "train_opus/converted_wav"
TEST_AUDIO_DIR = "test_opus/converted_wav"

In [None]:
files = [
    os.path.join(AUDIO_DIR, f)
    for f in os.listdir(AUDIO_DIR)
    if f.endswith(".wav") and not f.startswith("._")
]
files.sort()

In [None]:
import json

# читаем JSON-файл
with open("train_opus/word_bounds.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# проверим, что это словарь
if isinstance(data, dict):
    keys = set(data.keys())
    print(keys)
else:
    print("JSON не является словарем")

{'2883168064978289286962456424961447069044', '8005722711767016062430335415805998639002', '0807034267050836428657478219916591877913', '2633195268521567671353352048854874613311', '2904494198991114897879942119007153759672', '6214764445254568534252024113366737242967', '4276442970568457483110318169630731980700', '3575432742868368470055029349597116530125', '7175828636061126445497238762466143939095', '7609183063548200902957867960937542087786', '0109496367485877925844696080587555622611', '2042594498677904733647505806786464762736', '5395610070653005853999251853526289810390', '0175826257925558685820601823619982955605', '8735310971798020498009811213390462209231', '6042956831190211358189457545763360149580', '1593563774254683665722565643513224002448', '4008664491391320980026886169210542581509', '0176992209892051100573442990430367060886', '3452974102932398775822392967062016142045', '1120233451485851900203713935102693801208', '9228911853837767193578724140573484797315', '582901555053603982072316485126

In [None]:
train = pd.DataFrame({
    "id": [os.path.basename(f).replace(".wav", "") for f in files],
    "filename": files
})

train["label"] = train["id"].apply(lambda x: 1 if x in keys else 0)

In [8]:
# Разделяем на train/val
train_df, val_df = train_test_split(train, test_size=0.01, stratify=train["label"], random_state=42)

In [9]:
class AudioDataset(Dataset):
    def __init__(self, df, processor, target_sampling_rate=16000):
        self.df = df.reset_index(drop=True)
        self.processor = processor
        self.resampler = torchaudio.transforms.Resample(new_freq=target_sampling_rate)
        self.target_sr = target_sampling_rate

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        path = str(self.df.loc[idx, "filename"]).replace("\\", "/")
        label = int(self.df.loc[idx, "label"])
        waveform, sr = torchaudio.load(path)
        if sr != self.target_sr:
            waveform = self.resampler(waveform)
        waveform = waveform.squeeze()
        inputs = self.processor(waveform, sampling_rate=self.target_sr, return_tensors="pt", padding=True)
        inputs["labels"] = torch.tensor(label, dtype=torch.long)
        return {k: v.squeeze(0) for k, v in inputs.items()}

In [10]:
# model_name = "facebook/wav2vec2-base"
folder_name = "wavLM-1"
model_name = "microsoft/wavlm-base-plus"
processor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
model = WavLMForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    problem_type="single_label_classification",
    use_safetensors=True
)

Some weights of WavLMForSequenceClassification were not initialized from the model checkpoint at microsoft/wavlm-base-plus and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
train_dataset = AudioDataset(train_df, processor)
val_dataset = AudioDataset(val_df, processor)

In [None]:
def compute_metrics(pred):
    preds = torch.argmax(torch.tensor(pred.predictions), dim=1)
    labels = torch.tensor(pred.label_ids)

    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds)
    precision = precision_score(labels, preds)
    recall = recall_score(labels, preds)
    tn, fp, fn, tp = confusion_matrix(labels, preds).ravel()

    num_pos = tp + fn
    num_neg = tn + fp

    frr = fn / num_pos if num_pos > 0 else 0.0
    far = fp / num_neg if num_neg > 0 else 0.0

    valid_values = [1 - frr, 1 - far]
    harmonic_mean_score = hmean(valid_values) if all(v > 0 for v in valid_values) else 0.0

    return {
        "accuracy": acc,
        "f1": f1,
        "precision": precision,
        "recall": recall,
        "harmonic_mean": harmonic_mean_score
    }


training_args = TrainingArguments(
    output_dir=f"./{folder_name}",

    num_train_epochs=3,
    per_device_train_batch_size=28,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    lr_scheduler_type='cosine',
    gradient_accumulation_steps=2,

    logging_dir='./logs',
    logging_steps=20,
    save_steps=640,
    save_total_limit=2,
    save_strategy='steps',

    eval_strategy='steps',
    eval_steps=640,
    fp16=torch.cuda.is_available(),
    load_best_model_at_end=True,
    metric_for_best_model="harmonic_mean",

    seed=42,
)

In [13]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=processor,
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = Trainer(


Step,Training Loss,Validation Loss


KeyboardInterrupt: 

In [14]:
test_files = [
    os.path.join(TEST_AUDIO_DIR, f)
    for f in os.listdir(TEST_AUDIO_DIR)
    if f.endswith(".wav") and not f.startswith("._")
]
test_files.sort()

In [15]:
test = pd.DataFrame({
    "id": [os.path.basename(f).replace(".wav", "") for f in test_files],
    "filename": test_files
})

test["label"] = test["id"].apply(lambda x: 1 if x in keys else 0)

In [16]:
test_dataset = AudioDataset(test, processor)

In [17]:
predictions = trainer.predict(test_dataset)



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [18]:
submission = pd.DataFrame({
    "id": test["id"],
    "label": predictions.predictions.argmax(axis=1)
})

In [None]:
submission.to_csv("WavLM-4.csv", index=False)

In [None]:
wavlm_preds = predictions.predictions

In [25]:
wav2vec_preds = np.load("wav2vec_preds.npy")

In [35]:
combined_preds = (wavlm_preds * 0.6 + wav2vec_preds * 0.4)

In [36]:
submission = pd.DataFrame({
    "id": test["id"],
    "label": combined_preds.argmax(axis=1)
})
# submission.to_csv("wavLM-wav2vec3.csv", index=False)