# Translation English to German Audio Experiment
## Experiment 4
### Text Translation & TTS 1

https://huggingface.co/blog/speecht5


# Import Libraries

In [1]:
import torch
from transformers import *
from datasets import load_dataset
import soundfile as sf



# Text To Translate

In [2]:
# Sample English passage used as the input text for the translation step.
# NOTE(review): the last sentence ends with a stray `"` that has no matching opening
# quote; it is part of the string, so it is passed on to the tokenizer with the rest
# of the text — confirm whether it is intended.
article = """
Albert Einstein ( 14 March 1879 – 18 April 1955) was a German-born theoretical physicist, widely acknowledged to be one of the greatest physicists of all time. 
Einstein is best known for developing the theory of relativity, but he also made important contributions to the development of the theory of quantum mechanics. 
Relativity and quantum mechanics are together the two pillars of modern physics."
"""

# Setup Translation Model

In [3]:
def get_translation_model_and_tokenizer(src_lang, dst_lang):
  """
  Load the Helsinki-NLP OPUS-MT translation model and tokenizer for a language pair.

  See the language codes here: https://developers.google.com/admin-sdk/directory/v1/languages
  For the 3-character language codes, you can google for the code!

  Args:
    src_lang: code of the language to translate from, e.g. "en".
    dst_lang: code of the language to translate into, e.g. "de".

  Returns:
    A (model, tokenizer) tuple, both loaded from the checkpoint
    "Helsinki-NLP/opus-mt-{src_lang}-{dst_lang}".
  """
  # Build the checkpoint name from the function arguments rather than from
  # notebook-level globals, so the requested pair is always the one loaded and
  # the function does not depend on which cells have been run.
  model_name = f"Helsinki-NLP/opus-mt-{src_lang}-{dst_lang}"
  # initialize the tokenizer & model from the same checkpoint
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
  # return them for use
  return model, tokenizer

In [4]:
# language pair: translate from English ("en") into German ("de")
src, dst = "en", "de"

# load the translation model and tokenizer for that pair
model, tokenizer = get_translation_model_and_tokenizer(src_lang=src, dst_lang=dst)

loading configuration file config.json from cache at C:\Users\Connor/.cache\huggingface\hub\models--Helsinki-NLP--opus-mt-en-de\snapshots\6183067f769a302e3861815543b9f312c71b0ca4\config.json
Model config MarianConfig {
  "_name_or_path": "Helsinki-NLP/opus-mt-en-de",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "swish",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "MarianMTModel"
  ],
  "attention_dropout": 0.0,
  "bad_words_ids": [
    [
      58100
    ]
  ],
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 512,
  "decoder_attention_heads": 8,
  "decoder_ffn_dim": 2048,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 58100,
  "decoder_vocab_size": 58101,
  "dropout": 0.1,
  "encoder_attention_heads": 8,
  "encoder_ffn_dim": 2048,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "gradient_check

# Translate

In [5]:
# tokenise the article into a PyTorch tensor of token ids,
# truncating anything beyond 512 tokens
inputs = tokenizer.encode(
  article,
  truncation=True,
  max_length=512,
  return_tensors="pt",
)
print(inputs)

tensor([[ 7799, 39858,    20,   536,  1290,   268,  3977,   112,   268,   757,
         18170,    27,    58,    14,   586,    13,  4904, 15823, 38818,     2,
         10884, 20420,    12,    43,   128,     7,     4,  7833, 38818,     6,
             7,    92,   160,     3, 39858,    19,   517,  1369,    23,  3121,
             4,  8807,     7,  5049,   658,     2,   144,   137,   115,   319,
           501,  6820,    12,     4,   478,     7,     4,  8807,     7, 35266,
         35330,     3,   465,  1270, 24370,     8, 35266, 35330,    48,   848,
             4,   254, 26364,     7,  1457, 19419,   221,     0]])


In [6]:
# run the translation model with beam search (3 beams)
beam_outputs = model.generate(inputs, num_beams=3)
print("Beam Outputs")
# take the first returned sequence and decode it, dropping special tokens
best_sequence = beam_outputs[0]
translated_text = tokenizer.decode(best_sequence, skip_special_tokens=True)
print(translated_text)

Beam Outputs
Albert Einstein (14. März 1879 – 18. April 1955) war ein in Deutschland geborener theoretischer Physiker, der weithin als einer der größten Physiker aller Zeiten anerkannt wurde. Einstein ist am besten für die Entwicklung der Relativitätstheorie bekannt, leistete aber auch wichtige Beiträge zur Entwicklung der Quantenmechanik. Relativität und Quantenmechanik sind zusammen die beiden Säulen der modernen Physik."


In [7]:
# sanity check: show the Python type of the decoded translation
text_type = type(translated_text)
print(text_type)

<class 'str'>


# Setup TTS Model

In [8]:
# the processor and the TTS model are both loaded from the same checkpoint
tts_checkpoint = "microsoft/speecht5_tts"
tts_processor = SpeechT5Processor.from_pretrained(tts_checkpoint)
tts_model = SpeechT5ForTextToSpeech.from_pretrained(tts_checkpoint)

loading configuration file preprocessor_config.json from cache at C:\Users\Connor/.cache\huggingface\hub\models--microsoft--speecht5_tts\snapshots\30fcde30f19b87502b8435427b5f5068e401d5f6\preprocessor_config.json
Feature extractor SpeechT5FeatureExtractor {
  "do_normalize": false,
  "feature_extractor_type": "SpeechT5FeatureExtractor",
  "feature_size": 1,
  "fmax": 7600,
  "fmin": 80,
  "frame_signal_scale": 1.0,
  "hop_length": 16,
  "mel_floor": 1e-10,
  "num_mel_bins": 80,
  "padding_side": "right",
  "padding_value": 0.0,
  "processor_class": "SpeechT5Processor",
  "reduction_factor": 2,
  "return_attention_mask": true,
  "sampling_rate": 16000,
  "win_function": "hann_window",
  "win_length": 64
}

loading file spm_char.model from cache at C:\Users\Connor/.cache\huggingface\hub\models--microsoft--speecht5_tts\snapshots\30fcde30f19b87502b8435427b5f5068e401d5f6\spm_char.model
loading file added_tokens.json from cache at C:\Users\Connor/.cache\huggingface\hub\models--microsoft--spe

# Tokenise Inputs

In [9]:
# run the translated text through the TTS processor, requesting PyTorch tensors
tts_inputs = tts_processor(return_tensors="pt", text=translated_text)

# Speaker Embeddings 

In [10]:

# dataset of x-vectors; one entry is used as the speaker embedding for TTS
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")

# pick entry 7306, convert its x-vector to a tensor and add a leading dimension
xvector = embeddings_dataset[7306]["xvector"]
speaker_embeddings = torch.tensor(xvector).unsqueeze(0)

# Generation
Generate the speech spectrogram, convert it to a waveform with the vocoder, and write the audio to a WAV file.

In [11]:
# Load the HiFi-GAN vocoder used to turn a spectrogram into an audio waveform.
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Run the TTS model once to get the spectrogram, then vocode that same spectrogram.
# Calling generate_speech a second time with vocoder=... would repeat the whole
# generation and leave `spectrogram` unused.
spectrogram = tts_model.generate_speech(tts_inputs["input_ids"], speaker_embeddings)
with torch.no_grad():  # inference only; keeps `speech` free of autograd so .numpy() works
  speech = vocoder(spectrogram)

# 16000 matches the "sampling_rate" reported in the vocoder's config.
sf.write("tts_example.wav", speech.numpy(), samplerate=16000)

loading configuration file config.json from cache at C:\Users\Connor/.cache\huggingface\hub\models--microsoft--speecht5_hifigan\snapshots\bb6f429406e86a9992357a972c0698b22043307d\config.json
Model config SpeechT5HifiGanConfig {
  "architectures": [
    "SpeechT5HifiGan"
  ],
  "initializer_range": 0.01,
  "leaky_relu_slope": 0.1,
  "model_in_dim": 80,
  "model_type": "hifigan",
  "normalize_before": true,
  "resblock_dilation_sizes": [
    [
      1,
      3,
      5
    ],
    [
      1,
      3,
      5
    ],
    [
      1,
      3,
      5
    ]
  ],
  "resblock_kernel_sizes": [
    3,
    7,
    11
  ],
  "sampling_rate": 16000,
  "torch_dtype": "float32",
  "transformers_version": "4.34.1",
  "upsample_initial_channel": 512,
  "upsample_kernel_sizes": [
    8,
    8,
    8,
    8
  ],
  "upsample_rates": [
    4,
    4,
    4,
    4
  ]
}

loading weights file pytorch_model.bin from cache at C:\Users\Connor/.cache\huggingface\hub\models--microsoft--speecht5_hifigan\snapshots\bb6f

# Output

In [12]:
from IPython.display import Audio

# Play back the WAV file written by the generation cell ("tts_example.wav").
# The filename here must match the one passed to sf.write above.
Audio(filename='tts_example.wav', autoplay=True)
