<a href="https://colab.research.google.com/github/Aapng-cmd/fish/blob/master/voice_synth.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [40]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import librosa
import librosa.display
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
from google.colab import files
files.upload()

In [41]:
import speech_recognition as sr
import os
from pydub import AudioSegment
from pydub.silence import split_on_silence
import nltk
from nltk.tokenize import sent_tokenize

# nltk.download('punkt')  # download the Punkt tokenizer models

def transcribe_audio(path):
    with sr.AudioFile(path) as source:
        audio_listened = r.record(source)
        try:
            text = r.recognize_google(audio_listened, language="ru-RU")
        except sr.UnknownValueError as e:
            print("Error:", str(e))
            text = ""
        return text

def get_large_audio_transcription_on_silence(path):
    sound = AudioSegment.from_file(path)
    chunks = split_on_silence(sound, min_silence_len=500, silence_thresh=sound.dBFS-14, keep_silence=500)
    folder_name = "audio-chunks"
    if not os.path.isdir(folder_name):
        os.mkdir(folder_name)
    whole_text = ""
    for i, audio_chunk in enumerate(chunks, start=1):
        chunk_filename = os.path.join(folder_name, f"chunk{i}.wav")
        audio_chunk.export(chunk_filename, format="wav")
        text = transcribe_audio(chunk_filename)
        whole_text += text + " "
    sentences = sent_tokenize(whole_text)
    return ". ".join(sentences)

r = sr.Recognizer()
for i in range(1):
    path = f"audio/sample{i}.wav"
    text = get_large_audio_transcription_on_silence(path)
    with open(f"data/sample{i}.txt", "w") as f:
        f.write(text)

    print(text)

подушка кровать корова наушники футболка телевизор приставка


In [50]:
class MyDataset(Dataset):
    def __init__(self, audio_files, text_files):
        self.audio_files = audio_files
        self.text_files = text_files

    def __len__(self):
        return len(self.audio_files)

    def __getitem__(self, idx):
        audio_file = self.audio_files[idx]
        text_file = self.text_files[idx]

        # Load audio file with error handling
        try:
            audio, sr = librosa.load(audio_file)
        except Exception as e:
            print(f"Error loading audio file: {audio_file}, {e}")
            audio = torch.zeros(1, 80)  # Replace with a placeholder if loading fails
            sr = 22050  # Default sample rate

        # Extract MFCC features from audio data
        mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=80)
        mfccs = mfccs.T  # Transpose to (time_steps, features)

        # Load text transcription
        with open(text_file, 'r') as f:
            text = f.read()

        # Tokenize text
        tokens = word_tokenize(text)

        # Convert tokens to numerical IDs (you'll need a vocabulary for this)
        vocab = {'<pad>': 0, '<unk>': 1}  # example vocabulary
        token_ids = [vocab.get(token, 1) for token in tokens]

        # Preprocess audio and text
        audio_tensor = torch.tensor(mfccs)  # (time_steps, features)
        text_tensor = torch.tensor(token_ids)

        return audio_tensor, text_tensor

In [51]:
dataset = MyDataset([f'audio/sample{i}.wav' for i in range(1)], [f'data/sample{i}.txt' for i in range(1)])
data_loader = DataLoader(dataset, batch_size=32, shuffle=True)

In [48]:
class FastSpeech(nn.Module):
    def __init__(self):
        super(FastSpeech, self).__init__()
        self.encoder = nn.LSTM(input_size=80, hidden_size=256, num_layers=2, batch_first=True)
        self.decoder = nn.Linear(256, 7)  # Output dimension matches the shape of the text tensor

    def forward(self, x):
        x, _ = self.encoder(x)
        x = self.decoder(x[:, -1, :])  # Take the last hidden state and pass it through the decoder
        return x

In [52]:
model = FastSpeech()
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(100):
    for batch in data_loader:
        audio, text = batch
        audio = audio.to(device)
        text = text.to(device)

        # Forward pass
        output = model(audio)

        # Calculate the loss
        loss = criterion(output, text.float())

        # Backpropagate the loss
        loss.backward()

        # Update the model parameters
        optimizer.step()

        # Reset the gradients
        optimizer.zero_grad()

        # Print the loss for the current batch
        print(f'Epoch {epoch+1}, Loss: {loss.item()}')

Epoch 1, Loss: 1.0086119174957275
Epoch 2, Loss: 0.6742094159126282
Epoch 3, Loss: 0.4082925319671631
Epoch 4, Loss: 0.14254361391067505
Epoch 5, Loss: 0.019336888566613197
Epoch 6, Loss: 0.0608539916574955
Epoch 7, Loss: 0.09472382813692093
Epoch 8, Loss: 0.07359181344509125
Epoch 9, Loss: 0.03889273852109909
Epoch 10, Loss: 0.016895201057195663
Epoch 11, Loss: 0.010884399525821209
Epoch 12, Loss: 0.013557950966060162
Epoch 13, Loss: 0.01785932667553425
Epoch 14, Loss: 0.020316969603300095
Epoch 15, Loss: 0.020191740244627
Epoch 16, Loss: 0.018096206709742546
Epoch 17, Loss: 0.015030339360237122
Epoch 18, Loss: 0.011885138228535652
Epoch 19, Loss: 0.009230030700564384
Epoch 20, Loss: 0.00728000421077013
Epoch 21, Loss: 0.005984848830848932
Epoch 22, Loss: 0.0051641869358718395
Epoch 23, Loss: 0.004624220542609692
Epoch 24, Loss: 0.004236070904880762
Epoch 25, Loss: 0.0039472379721701145
Epoch 26, Loss: 0.0037458795122802258
Epoch 27, Loss: 0.0036188450176268816
Epoch 28, Loss: 0.00352

In [53]:
# Save the model after training
torch.save(model.state_dict(), 'models/fastspeech.pt')