<a href="https://colab.research.google.com/github/Aapng-cmd/fish/blob/master/voice_synth.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import librosa
import librosa.display
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [5]:
from google.colab import files
files.upload()

Saving sample0.wav to sample0.wav


{'sample0.wav': b'RIFF$0\t\x00WAVEfmt \x10\x00\x00\x00\x01\x00\x01\x00D\xac\x00\x00\x88X\x01\x00\x02\x00\x10\x00data\x000\t\x00\n\x00\n\x00\x07\x00\x06\x00\x05\x00\x03\x00\x00\x00\xfe\xff\xfe\xff\xfe\xff\xff\xff\x01\x00\x03\x00\x04\x00\x04\x00\x04\x00\x03\x00\x03\x00\x01\x00\xff\xff\xfd\xff\xfb\xff\xf8\xff\xf8\xff\xf8\xff\xf9\xff\xfb\xff\xfe\xff\x02\x00\x06\x00\t\x00\n\x00\x0b\x00\t\x00\x06\x00\x02\x00\xfe\xff\xf9\xff\xf4\xff\xf1\xff\xf0\xff\xf1\xff\xf2\xff\xf6\xff\xf9\xff\xfc\xff\xff\xff\x03\x00\x06\x00\x08\x00\t\x00\t\x00\x07\x00\x06\x00\x04\x00\x03\x00\x01\x00\x00\x00\xff\xff\x00\x00\x00\x00\x01\x00\x02\x00\x03\x00\x04\x00\x04\x00\x04\x00\x03\x00\x01\x00\xfe\xff\xfd\xff\xfd\xff\xfd\xff\xfe\xff\xff\xff\x00\x00\x02\x00\x03\x00\x05\x00\x06\x00\x08\x00\x07\x00\x06\x00\x05\x00\x04\x00\x03\x00\x04\x00\x04\x00\x03\x00\x02\x00\x01\x00\x01\x00\xfe\xff\xfc\xff\xfb\xff\xfa\xff\xf8\xff\xf6\xff\xf5\xff\xf5\xff\xf4\xff\xf5\xff\xf8\xff\xfb\xff\xff\xff\x02\x00\x06\x00\x08\x00\n\x00\n\x00\n\x00\x08\

In [12]:
import speech_recognition as sr
import os
from pydub import AudioSegment
from pydub.silence import split_on_silence
import nltk
from nltk.tokenize import sent_tokenize

# nltk.download('punkt')  # download the Punkt tokenizer models

def transcribe_audio(path):
    with sr.AudioFile(path) as source:
        audio_listened = r.record(source)
        try:
            text = r.recognize_google(audio_listened, language="ru-RU")
        except sr.UnknownValueError as e:
            print("Error:", str(e))
            text = ""
        return text

def get_large_audio_transcription_on_silence(path):
    sound = AudioSegment.from_file(path)
    chunks = split_on_silence(sound, min_silence_len=500, silence_thresh=sound.dBFS-14, keep_silence=500)
    folder_name = "audio-chunks"
    if not os.path.isdir(folder_name):
        os.mkdir(folder_name)
    whole_text = ""
    for i, audio_chunk in enumerate(chunks, start=1):
        chunk_filename = os.path.join(folder_name, f"chunk{i}.wav")
        audio_chunk.export(chunk_filename, format="wav")
        text = transcribe_audio(chunk_filename)
        whole_text += text + " "
    sentences = sent_tokenize(whole_text)
    return ". ".join(sentences)

r = sr.Recognizer()
for i in range(1):
    path = f"audio/sample{i}.wav"
    text = get_large_audio_transcription_on_silence(path)
    with open(f"data/sample{i}.txt", "w") as f:
        f.write(text)

    print(text)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


подушка кровать корова наушники футболка телевизор приставка


In [13]:
class MyDataset(Dataset):
    def __init__(self, audio_files, text_files):
        self.audio_files = audio_files
        self.text_files = text_files

    def __len__(self):
        return len(self.audio_files)

    def __getitem__(self, idx):
        audio_file = self.audio_files[idx]
        text_file = self.text_files[idx]

        # Load audio file
        audio, sr = librosa.load(audio_file)

        # Load text transcription
        with open(text_file, 'r') as f:
            text = f.read()

        # Tokenize text
        tokens = word_tokenize(text)

        # Convert tokens to numerical IDs (you'll need a vocabulary for this)
        vocab = {'<pad>': 0, '<unk>': 1}  # example vocabulary
        token_ids = [vocab.get(token, 1) for token in tokens]

        # Preprocess audio and text
        audio = audio.reshape(-1, 1)  # (T, 1)
        text_tensor = torch.tensor(token_ids)

        return audio, text_tensor

In [14]:
dataset = MyDataset([f'audio/sample{i}.wav' for i in range(1)], [f'data/sample{i}.txt' for i in range(1)])
data_loader = DataLoader(dataset, batch_size=32, shuffle=True)

In [15]:
class FastSpeech(nn.Module):
    def __init__(self):
        super(FastSpeech, self).__init__()
        self.encoder = nn.LSTM(input_size=80, hidden_size=256, num_layers=2, batch_first=True)
        self.decoder = nn.Linear(256, 7)  # Output dimension matches the shape of the text tensor

    def forward(self, x):
        x, _ = self.encoder(x)
        x = self.decoder(x[:, -1, :])  # Take the last hidden state and pass it through the decoder
        return x

In [16]:
model = FastSpeech()
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(100):
    for batch in data_loader:
        audio, text = batch
        audio = audio.to(device)
        text = text.to(device)

        # Forward pass
        output = model(audio)

        # Calculate the loss
        loss = criterion(output, text.float())

        # Backpropagate the loss
        loss.backward()

        # Update the model parameters
        optimizer.step()

        # Reset the gradients
        optimizer.zero_grad()

        # Print the loss for the current batch
        print(f'Epoch {epoch+1}, Loss: {loss.item()}')

RuntimeError: input.size(-1) must be equal to input_size. Expected 80, got 1

In [None]:
# Save the model after training
torch.save(model.state_dict(), 'models/fastspeech.pt')