In [None]:
# install libs
import os

if not os.path.exists('/kaggle/working/flag'):
    !pip3 install huggingsound --user
    !pip3 install num2words

    with open('flag', 'w') as fout:
        fout.write('flag')

In [None]:
# Utility
from sklearn.model_selection import KFold
from num2words import num2words
import numpy as np
import pandas as pd 
import random
import json
import yaml

# Audio
import librosa
import IPython.display as ipd
from tqdm.notebook import tqdm

# Graphics 
%matplotlib inline
import librosa.display
import matplotlib.pyplot as plt

# Models
import torch
from huggingsound import SpeechRecognitionModel
from huggingsound import TrainingArguments, ModelArguments, TokenSet
import pytorch_lightning as pl
from sklearn.model_selection import train_test_split

# Metrics
from jiwer import wer

# Consts and Settings
import warnings

# Suppress every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Seed passed to seed_everything() and used as random_state for the split.
SEED = 42

# Identifier of the pretrained checkpoint given to SpeechRecognitionModel.
MODEL_ID = 'jonatasgrosman/wav2vec2-large-xlsr-53-russian'

# Dataset root and the train / test audio folders beneath it.
PATH = '/kaggle/input/speech-to-text-russian/data/data/'
PATH_TRAIN = f'{PATH}train_wavs/'
PATH_TEST = f'{PATH}test_wavs/'

# work_mode selects between the writable working dir and a stored results dir.
work_mode = True
if work_mode:
    PATH_WORKING = '/kaggle/working/'
else:
    PATH_WORKING = '/kaggle/input/speech-to-text-russian/results-4/'


def seed_everything(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy's legacy global generator and PyTorch
    (CPU and all CUDA devices), and puts cuDNN into deterministic mode.

    Parameters
    ----------
    seed : int
        Value used for every generator.
    """
    random.seed(seed)
    # Stored for any child process started later; the hash seed of the
    # already-running interpreter is fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Both CUDA calls are silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN pick kernels by timing them, which can vary
    # between runs and defeats the deterministic flag above.
    torch.backends.cudnn.benchmark = False

# Fix all RNG seeds, then release cached GPU memory held by PyTorch.
seed_everything(SEED)
torch.cuda.empty_cache()

# Choose the compute device: GPU when one is visible, CPU otherwise.
cuda = torch.cuda.is_available()
if cuda:
    device_name = 'cuda'
else:
    device_name = 'cpu'
device = torch.device(device_name)

# Last expression of the cell: shows which device was picked.
device_name

In [None]:
# Decode one training clip with librosa and embed a player for the file
example_file = f'{PATH_TRAIN}0.wav'
audio, sample_rate = librosa.load(example_file)

ipd.Audio(example_file, rate=sample_rate)

In [None]:
# Plot our example audio file's waveform
# The figure size is set through rcParams, i.e. as a matplotlib-wide default
# rather than for this figure only.
plt.rcParams['figure.figsize'] = (20,5)
# Title and y-label are set through the pyplot interface before drawing.
plt.title('Waveform of Audio Example')
plt.ylabel('Amplitude')

# Assigning the return value to `_` keeps it out of the cell output.
# NOTE(review): no `sr` is passed, so waveshow uses its own default sample
# rate — presumably equal to the one librosa.load produced; confirm.
_ = librosa.display.waveshow(audio)

In [None]:
# Mel-scaled power spectrogram of the example clip, converted to decibels
# relative to its maximum value.
# The signal is passed as `y=`: recent librosa releases accept the audio
# argument of melspectrogram by keyword only, so the positional form fails.
mel_spec = librosa.feature.melspectrogram(y=audio, sr=sample_rate)
mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

librosa.display.specshow(
    mel_spec_db, x_axis='time', y_axis='mel')
plt.colorbar()
# Trailing semicolon suppresses the Text repr, as in the STFT cell.
plt.title('Mel Spectrogram');

In [None]:
# Get spectrogram using Librosa's Short-Time Fourier Transform (stft)
# np.abs is applied to the stft output to get the values that are then
# converted to decibels below.
spec = np.abs(librosa.stft(audio))
# ref=np.max expresses the values relative to the maximum of `spec`.
spec_db = librosa.amplitude_to_db(spec, ref=np.max)  # Decibels

# Use log scale to view frequencies
librosa.display.specshow(spec_db, y_axis='log', x_axis='time')
plt.colorbar()
# The trailing semicolon keeps the title's return value out of the cell output.
plt.title('Audio Spectrogram');

In [None]:
# Read the training transcriptions that the filter below is applied to
train_labels = pd.read_csv(f'{PATH}train_labels.csv')

# Characters removed from every transcription: ASCII digits, the underscore,
# lowercase Latin letters and the letters 'і', 'ї', 'ґ'. Built once instead of
# on every call.
_BAD_SYMBOLS = frozenset('0123456789_abcdefghijklmnopqrstuvwxyzіїґ')
# Characters trimmed from both ends of the cleaned transcription.
_EDGE_SYMBOLS = '0123456789.- _'


def str_filter(utterance):
    """Normalise one transcription string.

    Steps:
      1. Split on whitespace and spell out every all-digit token in Russian
         with ``num2words``.
      2. Re-join with single spaces and drop every character in
         ``_BAD_SYMBOLS``.
      3. Trim ``_EDGE_SYMBOLS`` from both ends.

    The original code called ``utterance.strip(...)`` and discarded the
    result, so nothing was ever trimmed. The trim is now applied, and it runs
    last so that a number at the start or end of the utterance is still spelled
    out instead of being stripped away.

    Parameters
    ----------
    utterance : str
        Raw transcription.

    Returns
    -------
    str
        Cleaned transcription.
    """
    words = [
        num2words(token, lang='ru') if token.isdigit() else token
        for token in utterance.split()
    ]
    cleaned = ''.join(ch for ch in ' '.join(words) if ch not in _BAD_SYMBOLS)
    return cleaned.strip(_EDGE_SYMBOLS)

# Clean every reference transcription, then preview the first rows
filtered_expected = train_labels['Expected'].apply(str_filter)
train_labels['Expected'] = filtered_expected
train_labels.head()

In [None]:
# Read the sample submission file and preview its first rows
sample_sub = pd.read_csv(f'{PATH}sample_submission.csv')
sample_sub.head()

## Predicting without Fine-tuning

In [None]:
def predict(sample_sub, model=None, batch_size=1, path_test=PATH_TEST):
    """Transcribe the clips listed in ``sample_sub``.

    Parameters
    ----------
    sample_sub : pandas.DataFrame
        Frame with an 'Id' column; each id maps to ``<path_test><Id>.wav``.
    model : optional
        Object exposing ``transcribe(paths, batch_size=...)``. When ``None``,
        a ``SpeechRecognitionModel`` for ``MODEL_ID`` is created.
    batch_size : int
        Forwarded to ``model.transcribe``.
    path_test : str
        Directory prefix (with trailing slash) of the audio files.

    Returns
    -------
    pandas.DataFrame
        A new frame indexed by 'Id' with a 'Predicted' column. The input frame
        is left unmodified.
    """
    # Read ids straight from the column. iterrows() builds one Series per row
    # and can upcast an integer Id to float when the other columns are numeric
    # or NaN, which would yield paths such as '0.0.wav'.
    audio_path = [path_test + str(clip_id) + '.wav' for clip_id in sample_sub['Id']]

    if model is None:
        model = SpeechRecognitionModel(MODEL_ID, device=device_name)

    transcriptions = model.transcribe(audio_path, batch_size=batch_size)
    predicted = [item['transcription'] for item in transcriptions]

    # assign() returns a copy, so a caller passing a slice (e.g. .iloc[:100])
    # neither gets its frame mutated nor triggers SettingWithCopyWarning.
    return sample_sub.assign(Predicted=predicted).set_index('Id')

In [None]:
# predict = predict(sample_sub.iloc[:100])
# predict.to_csv('answer_no_tuning.csv')
# predict.head()

## Fine-Tuning model

In [None]:
# Pretrained model to be fine-tuned, and the directory handed to finetune()
model = SpeechRecognitionModel(MODEL_ID, device=device_name)
output_dir = './fine-tuned/'

# Token set for fine-tuning: the 32 lowercase Cyrillic letters below, one token each
tokens = list('абвгдежзийклмнопрстуфхцчшщъыьэюя')
token_set = TokenSet(tokens)

In [None]:
def _labels_to_records(labels):
    """Build the list of {'path', 'transcription'} dicts for a labels frame."""
    # Columns are read directly: iterrows() may upcast an integer Id to float
    # and produce paths such as '0.0.wav'.
    return [
        {'path': PATH_TRAIN + str(clip_id) + '.wav', 'transcription': text}
        for clip_id, text in zip(labels['Id'], labels['Expected'])
    ]


def fine_tune(model, X, train_batch_size=3, eval_batch_size=3, test_size=0.2):
    """Fine-tune ``model`` on the labelled clips in ``X``.

    ``X`` is split into train and evaluation parts with ``train_test_split``
    (``random_state=SEED``), both parts are converted to path/transcription
    records, and ``model.finetune`` is called with the module-level
    ``output_dir`` and ``token_set``.

    Parameters
    ----------
    model
        Object exposing ``finetune(...)``.
    X : pandas.DataFrame
        Frame with 'Id' and 'Expected' columns.
    train_batch_size, eval_batch_size : int
        Per-device batch sizes set on ``TrainingArguments``.
    test_size : float
        Share of ``X`` held out for evaluation.

    Returns
    -------
    The same ``model`` object that was passed in.
    """
    labels_train, labels_val = train_test_split(X, test_size=test_size, random_state=SEED)

    train_data = _labels_to_records(labels_train)
    eval_data = _labels_to_records(labels_val)

    # and finally, fine-tune your model
    training_args = TrainingArguments()
    training_args.per_device_train_batch_size = train_batch_size
    training_args.per_device_eval_batch_size = eval_batch_size
    training_args.logging_steps = 1000
    # Allows each call to reuse output_dir even if it already has content.
    training_args.overwrite_output_dir = True

    model.finetune(
        output_dir,
        train_data=train_data,
        eval_data=eval_data,
        token_set=token_set,
        training_args=training_args,
    )

    return model

In [None]:
kf = KFold(n_splits=5)

for _, test_index in kf.split(train_labels):
    X = train_labels.iloc[test_index]
    model = fine_tune(model, X)
    predict = predict(sample_sub, model=model)
    predict.to_csv('answer_with_tuning.csv')

In [None]:
predict.head()