In [2]:
# Importing all the libraries that we need
import re
import numpy as np
import pandas as pd
import librosa
import random
import unicodedata
import string
import tqdm
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from tqdm.notebook import tqdm_notebook
tqdm_notebook.pandas()

# Select the compute device: CUDA when torch reports it as available, otherwise the CPU.
# Used below as the target device for the tensors and models created in this notebook.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# Load the CSV whose `paths` column holds audio file paths and whose
# `traductions` column holds the matching text (both columns are used below).
DF = pd.read_csv("transcript.csv")

In [4]:
# Reserved vocabulary indices: CLS is fed to the decoder as its first input,
# SEP is appended to every target sentence as the end marker.
CLS_token = 0
SEP_token = 1


class Lang:
    """Vocabulary for one language: word <-> index maps plus word frequencies.

    Indices 0 and 1 are reserved for the "CLS" and "SEP" markers, so the first
    real word receives index 2. `max_length` tracks the largest number of
    space-separated tokens seen in a single sentence.
    """

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "CLS", 1: "SEP"}
        self.n_words = 2  # CLS and SEP are already registered
        self.max_length = 0

    def addSentence(self, sentence):
        """Register every token of `sentence` (split on single spaces)."""
        tokens = sentence.split(' ')
        self.max_length = max(self.max_length, len(tokens))
        for token in tokens:
            self.addWord(token)

    def addWord(self, word):
        """Count `word`, assigning it the next free index on first sight."""
        if word in self.word2index:
            self.word2count[word] += 1
            return
        new_index = self.n_words
        self.word2index[word] = new_index
        self.word2count[word] = 1
        self.index2word[new_index] = word
        self.n_words = new_index + 1
            
            
# Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Return `s` with combining accent marks removed.

    The string is decomposed with NFD normalisation and every character in
    the Unicode category 'Mn' (non-spacing mark) is dropped.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

# Lowercase, trim, and remove non-letter characters


def normalizeString(s):
    """Lowercase `s`, strip accents and keep only letters and `.!?`.

    Steps: cast to str, lowercase, remove accents, put a space before each of
    `.`, `!`, `?`, then collapse every run of other non-letter characters into
    a single space.

    The result is stripped at the end: a sentence that starts or ends with
    digits or other removed characters (e.g. "1 In the beginning") would
    otherwise keep a leading/trailing space, which becomes an empty-string
    token once the sentence is split on ' '.

    :param s: value to normalise; it is passed through `str()` first.
        NOTE(review): a missing value therefore presumably becomes the
        literal word "nan" — confirm this is intended.
    :return: the normalised string.
    """
    s = str(s)
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    # Strip again: the substitutions above can re-introduce edge whitespace.
    return s.strip()



In [5]:
%%time

# Normalise every sentence, then build the output vocabulary from the result
output_lang = Lang("english")

DF["traductions"] = DF["traductions"].apply(normalizeString)

DF["traductions"].apply(output_lang.addSentence)

print("It is done !")

It is done !
CPU times: user 2.16 s, sys: 13.3 ms, total: 2.18 s
Wall time: 2.18 s


In [6]:
# Largest number of space-separated tokens found in a single sentence
output_lang.max_length

106

In [7]:

def indexesFromSentence(lang, sentence):
    """Map each space-separated token of `sentence` to its index in `lang`.

    Raises KeyError for a token that is missing from `lang.word2index`.
    """
    lookup = lang.word2index
    token_ids = []
    for token in sentence.split(' '):
        token_ids.append(lookup[token])
    return token_ids


def tensorFromSentence(lang, sentence):
    """Encode `sentence` as a (n_tokens + 1, 1) long tensor ending with SEP.

    The tensor is created on the notebook-level `device`.
    """
    token_ids = indexesFromSentence(lang, sentence) + [SEP_token]
    column = torch.tensor(token_ids, dtype=torch.long, device=device)
    return column.view(-1, 1)


In [8]:
def wav2raw(path_wav, device, duration=6, srate=16000, send_wave=True):
    """Load an audio file with librosa as a mono waveform tensor.

    :param path_wav: path of the audio file to open.
    :param device: torch device the returned tensor is moved to.
    :param duration: maximum number of seconds to read from the file.
    :param srate: sampling rate requested from librosa.
    :param send_wave: when truthy, return the waveform tensor; otherwise
        return `path_wav` unchanged (the file is still loaded first).
    :return: a tensor of shape (1, 1, n_samples) on `device`, or `path_wav`.
        Clips shorter than `duration` are NOT zero-padded, so `n_samples`
        varies from file to file up to `duration * srate`.
    """
    # int(): a fractional duration would otherwise give a float slice bound,
    # which raises TypeError when the clip has to be truncated.
    max_length = int(duration * srate)

    # Open the audio file (librosa already limits the read to `duration` s)
    wave, _ = librosa.load(path_wav, duration=duration, mono=True, sr=srate)

    # Truncate if too long; a shorter clip is left as-is (no padding)
    wave = wave[:max_length]

    if not send_wave:
        return path_wav

    # (n_samples,) -> (batch=1, channels=1, n_samples)
    wave = wave.reshape(1, 1, wave.shape[0])
    return torch.tensor(wave).to(device)

In [9]:
%%time
# The longest sequence: measure each of the first five clips in samples
# (capped at 10 s). The previous version asked wav2raw for the *paths*
# (send_wave=False), so argmax/max compared path strings, not clip lengths.
values = DF['paths'][0:5].progress_apply(
    lambda path: wav2raw(path, device, duration=10, srate=16000).shape[-1]
).values
max_path = np.argmax(values)  # position of the longest clip
max_val = np.max(values)      # its length in samples

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


CPU times: user 818 ms, sys: 17.3 ms, total: 835 ms
Wall time: 850 ms


In [10]:
# np.argmax returns a position, so select positionally rather than by label
spec_path = DF["paths"].iloc[max_path]

In [11]:
# Sanity check: load the selected clip with wav2raw's default duration and
# sampling rate, then inspect the shape of the resulting tensor
spec = wav2raw(spec_path, device)
spec.shape

torch.Size([1, 1, 96000])

In [12]:
# Show which audio file was selected
print(spec_path)

../english-bible/Genesis/Genesis_1-5.wav


In [23]:
class EncoderRNN(nn.Module):
    """Raw-audio encoder: a stack of Conv1d/MaxPool1d stages, then a bidirectional GRU.

    The convolutional stages shorten the waveform in time while expanding it
    to `hidden_size` channels; the GRU then reads the resulting sequence in
    both directions and the two directions are averaged.

    :param hidden_size: number of convolution channels and GRU hidden units.
    """

    def __init__(self,  hidden_size=256):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_dir = 2
        self.batch_size = 1 # For the moment

        self.conv1d1 = nn.Conv1d(1,  self.hidden_size, kernel_size=5)
        self.conv1d2 = nn.Conv1d(self.hidden_size, self.hidden_size, kernel_size=5)
        self.conv1d3 = nn.Conv1d(self.hidden_size, self.hidden_size, kernel_size=3)
        self.conv1d4 = nn.Conv1d(self.hidden_size, self.hidden_size, kernel_size=3)

        self.pool1d1 = nn.MaxPool1d(kernel_size=3)
        self.pool1d2 = nn.MaxPool1d(kernel_size=3)
        self.pool1d3 = nn.MaxPool1d(kernel_size=3)
        self.pool1d4 = nn.MaxPool1d(kernel_size=3)

        # dropout_1 is defined but not used by forward()
        self.dropout_1 = nn.Dropout(0.3)
        self.dropout_2 = nn.Dropout(0.2)

        self.gru_1 = nn.GRU(hidden_size, hidden_size, batch_first=True, num_layers=1, bidirectional=True)

    def forward(self, wave, hidden):
        """Encode one waveform.

        :param wave: tensor of shape (batch, 1, n_samples); the rest of the
            notebook always uses batch == 1.
        :param hidden: initial GRU state of shape (2, batch, hidden_size),
            e.g. the value returned by `initHidden()`.
        :return: `(output, hidden)` where `output` is the direction-averaged
            GRU output with the batch axis squeezed away when batch == 1
            (shape (seq_len, hidden_size)) and `hidden` is the final GRU state.
            As a side effect `self.last_state` is set to the final state of
            the backward direction, shape (1, batch, hidden_size).
        """
        # Convolutional base: five conv + pool stages.
        features = self.pool1d1(self.conv1d1(wave))
        features = self.pool1d2(self.conv1d2(features))
        # NOTE(review): conv1d3 is applied twice (its weights are shared by two
        # stages) and pool1d2 is reused here. This looks like a copy-paste
        # slip, but it is kept because removing a stage changes the length of
        # the encoder output that the decoder's max_length depends on.
        features = self.pool1d2(self.conv1d3(features))
        features = self.pool1d3(self.conv1d3(features))
        features = self.pool1d4(self.conv1d4(features))

        batch, _, seq_len = features.shape

        # Swap axes from the conv layout (batch, features, seq_length) to the
        # GRU layout (batch, seq_length, features). This must be a transpose:
        # reshape() only reinterprets the flat memory and would mix feature
        # channels and time steps together.
        output = features.transpose(1, 2).contiguous()
        output, hidden = self.gru_1(output, hidden)

        # Split the concatenated GRU output into its two directions
        directions = output.view(batch, seq_len, self.num_dir, self.hidden_size)
        forward = directions[:, :, 0, :]
        backward = directions[:, :, 1, :]
        # Average the forward and backward passes to form the output
        output = (forward + backward) / 2
        output = F.relu(output)
        # Pass through the dropout layer
        output = self.dropout_2(output)
        # Keep the final state of the last direction; train() uses it to
        # initialise the decoder. It already lives on the module's device.
        self.last_state = hidden[-1:, :, :]
        return output.squeeze(0), hidden

    def initHidden(self):
        """Return a zero initial GRU state on the same device as the module."""
        module_device = next(self.parameters()).device
        return torch.zeros(self.num_dir, self.batch_size, self.hidden_size, device=module_device)

In [24]:
# Smoke-test the encoder on the sample clip (a single forward pass is enough)
encoder = EncoderRNN().to(device)
hidden = encoder.initHidden()
enc_out, enc_state = encoder(spec, hidden)
print(enc_out.shape)

torch.Size([394, 256])


In [25]:
# Number of encoder time steps the decoder's attention layer is sized for;
# train() zero-pads shorter encoder outputs up to this length.
MAX_LENGTH = 1000  # alternative considered: enc_out.shape[0]

In [26]:
class AttnDecoderRNN(nn.Module):
    """One-step GRU decoder with attention over a fixed-length encoder output.

    The attention scores are computed from the embedded input token, the
    previous hidden state and the previous attention weights; they are sized
    for exactly `max_length` encoder time steps.

    :param output_size: size of the output vocabulary.
    :param max_length: number of rows expected in `encoder_outputs`.
    :param dropout_p: dropout probability applied to the token embedding.
    :param hidden_size: embedding size and GRU hidden size.
    """

    def __init__(self,  output_size, max_length, dropout_p=0.1,  hidden_size=256):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2 + self.max_length, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, attn_weights, encoder_outputs):
        """Decode a single token.

        :param input: tensor holding one token index.
        :param hidden: previous GRU state, shape (1, 1, hidden_size).
        :param attn_weights: previous attention weights, shape (1, max_length).
        :param encoder_outputs: tensor of shape (max_length, hidden_size).
        :return: `(output, hidden, attn_weights)` — log-probabilities over the
            vocabulary with shape (1, output_size), the new GRU state and the
            new attention weights.
        """
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        # Attention energies from the token, the state and the previous weights
        attn_input = torch.cat((embedded[0], hidden[0], attn_weights), 1)
        attn_weights = F.softmax(self.attn(attn_input), dim=1)

        # Weighted sum of the encoder outputs: (1, 1, max_length) x (1, max_length, hidden)
        context = torch.bmm(attn_weights.unsqueeze(0),
                            encoder_outputs.unsqueeze(0))

        # Combine the context with the embedded token to predict the next word
        combined = torch.cat((embedded[0], context[0]), 1)
        combined = F.relu(self.attn_combine(combined).unsqueeze(0))
        output, hidden = self.gru(combined, hidden)

        # Log-probabilities for the next word
        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        """Return zero `(hidden, attn_weights)` on the same device as the module."""
        module_device = next(self.parameters()).device
        hidden = torch.zeros(1, 1, self.hidden_size, device=module_device)
        attn_weights = torch.zeros(1, self.max_length, device=module_device)
        return hidden, attn_weights

In [27]:
# Smoke-test the decoder: size its attention to the encoder output produced
# above instead of hard-coding that length, and start from the CLS token
decoder = AttnDecoderRNN(output_lang.n_words, max_length=enc_out.shape[0], dropout_p=0.1).to(device)
hidden, attn_w = decoder.initHidden()
x = torch.tensor([CLS_token], device=device)
dec_out, _, _ = decoder(x, hidden, attn_w, enc_out)

In [28]:
#teacher_forcing_ratio = 0.5


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer,
          decoder_optimizer, criterion, max_length=MAX_LENGTH):
    """Run one teacher-forced training step on a single (audio, sentence) pair.

    The whole waveform is encoded in one call, the encoder output is copied
    into a zero tensor of `max_length` rows, and the decoder is then fed the
    target tokens one by one (teacher forcing is always on).

    :return: the summed loss divided by the number of target tokens.
    """
    # Fresh encoder state and decoder attention weights for this sample
    encoder_hidden = encoder.initHidden()
    _, decoder_attention = decoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    target_length = target_tensor.size(0)

    # Encode the raw audio in a single pass (unlike the NLP From Scratch
    # tutorial, which feeds the encoder token by token)
    encoded, encoder_hidden = encoder(input_tensor, encoder_hidden)
    n_steps, n_features = encoded.shape

    # Copy the encoded sequence into the top rows of a zero-filled buffer
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
    encoder_outputs[:n_steps, :n_features] = encoded

    # The decoder starts from CLS and from the encoder's last state
    decoder_input = torch.tensor([[CLS_token]], device=device)
    decoder_hidden = encoder.last_state

    # Teacher forcing: the ground-truth token is always the next input
    loss = 0
    for step in range(target_length):
        expected = target_tensor[step]
        decoder_output, decoder_hidden, decoder_attention = decoder(
            decoder_input, decoder_hidden, decoder_attention, encoder_outputs)
        loss += criterion(decoder_output, expected)
        decoder_input = expected

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [29]:
import time
import math


def asMinutes(s):
    """Format a duration given in seconds as '<minutes>m <seconds>s'."""
    minutes = math.floor(s / 60)
    seconds = s - minutes * 60
    return '%dm %ds' % (minutes, seconds)



In [30]:

def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    """Train `encoder` and `decoder` for `n_iters` single-sample iterations.

    Samples are read from the `paths` and `traductions` columns of the
    notebook-level `DF`. The data is walked epoch by epoch: the list is
    reshuffled at the start of every pass, and `n_iters` may exceed the number
    of samples (the position wraps around instead of running off the list).

    :param encoder: the EncoderRNN to train.
    :param decoder: the AttnDecoderRNN to train.
    :param n_iters: total number of training iterations (one sample each).
    :param print_every: print the mean loss of the last `print_every` iterations.
    :param plot_every: record the mean loss of the last `plot_every` iterations
        for the final plot.
    :param learning_rate: NOTE(review): currently NOT forwarded to the
        optimisers — both Adam instances use their default step size. Kept in
        the signature for compatibility; wiring it in would change training
        behaviour for existing callers, so confirm before doing so.
    :raises ValueError: if iterations are requested but `DF` has no rows.
    """
    since = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.Adam(encoder.parameters())
    decoder_optimizer = optim.Adam(decoder.parameters())

    criterion = nn.NLLLoss()

    # Select the two columns explicitly rather than relying on column order
    training_pairs = DF[["paths", "traductions"]].values.tolist()
    n_pairs = len(training_pairs)
    if n_iters > 0 and n_pairs == 0:
        raise ValueError("DF contains no training pairs")

    for step in range(1, n_iters + 1):
        # Position inside the current epoch; wraps so n_iters can span epochs
        position = (step - 1) % n_pairs

        # Shuffle the list at the beginning of an epoch
        if position == 0:
            random.shuffle(training_pairs)

        path, sentence = training_pairs[position]
        input_tensor = wav2raw(path, device)
        target_tensor = tensorFromSentence(output_lang, sentence)

        loss = train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)

        print_loss_total += loss
        plot_loss_total += loss

        # Report once exactly `print_every` losses have been accumulated
        if step % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print("Iteration: {} \t Evolution: {} % \t Loss: {} " .format(step, round(step / n_iters * 100, 1), round(print_loss_avg, 4)))

        if step % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    print("Time taken for training: {}".format(asMinutes(time.time() - since)))

    showPlot(plot_losses)

In [31]:
import matplotlib.pyplot as plt
# NOTE(review): 'agg' is a non-interactive backend, so figures created by
# showPlot are presumably not rendered inline in the notebook — confirm this
# is intended.
plt.switch_backend('agg')
import matplotlib.ticker as ticker



def showPlot(points):
    """Plot `points` as a line with y-axis ticks every 0.2.

    :param points: sequence of values to plot (the averaged training losses).
    """
    # plt.subplots() already creates the figure; a preceding plt.figure()
    # would only leave an extra, empty figure open.
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    ax.plot(points)

In [32]:
# Three times the number of rows in DF (the iteration count passed to trainIters below)
DF.shape[0] * 3

93249

In [None]:
%%time

encoder1 = EncoderRNN().to(device)
attn_decoder1 = AttnDecoderRNN(output_lang.n_words, dropout_p=0.1, max_length=MAX_LENGTH).to(device)

# Three times the dataset size, derived from DF instead of a hard-coded count
N_ITERS = DF.shape[0] * 3
trainIters(encoder1, attn_decoder1, N_ITERS, print_every=200)

Iteration: 200 	 Evolution: 0.2 % 	 Loss: 5.934 
Iteration: 400 	 Evolution: 0.4 % 	 Loss: 5.536 
Iteration: 600 	 Evolution: 0.6 % 	 Loss: 5.1362 
Iteration: 800 	 Evolution: 0.9 % 	 Loss: 4.8368 
Iteration: 1000 	 Evolution: 1.1 % 	 Loss: 4.7133 
Iteration: 1200 	 Evolution: 1.3 % 	 Loss: 4.7312 
Iteration: 1400 	 Evolution: 1.5 % 	 Loss: 4.4528 
Iteration: 1600 	 Evolution: 1.7 % 	 Loss: 4.5729 
Iteration: 1800 	 Evolution: 1.9 % 	 Loss: 4.1571 
Iteration: 2000 	 Evolution: 2.1 % 	 Loss: 4.3228 
Iteration: 2200 	 Evolution: 2.4 % 	 Loss: 4.545 
Iteration: 2400 	 Evolution: 2.6 % 	 Loss: 4.3164 
Iteration: 2600 	 Evolution: 2.8 % 	 Loss: 4.1996 
Iteration: 2800 	 Evolution: 3.0 % 	 Loss: 3.3614 
Iteration: 3000 	 Evolution: 3.2 % 	 Loss: 3.2136 
Iteration: 3200 	 Evolution: 3.4 % 	 Loss: 3.8517 
Iteration: 3400 	 Evolution: 3.6 % 	 Loss: 3.7372 
Iteration: 3600 	 Evolution: 3.9 % 	 Loss: 3.8455 
Iteration: 3800 	 Evolution: 4.1 % 	 Loss: 2.9968 
Iteration: 4000 	 Evolution: 4.3 % 	 L

In [None]:
# Idea: compute max_length directly in the encoder so that it can be passed to the decoder automatically
