In [18]:
%%capture
!pip install transformers
!pip install sentencepiece
!pip install keybert
!pip install noisereduce
!pip install soundfile


In [22]:
def main(audio_file):
  import os
  import librosa
  import torch
  from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
  from keybert import KeyBERT

  def convert_extension(audio_file):
    filename, file_extension = os.path.splitext(audio_file)
    if (str(file_extension)) != ".flac":
      print('another extension')
      audio_file = str(filename) + ".flac"
      print(audio_file)
      return audio_file

  def translate(tokenizer, model, transcript):
    batch = tokenizer([transcript], return_tensors='pt')
    generated_ids = model.generate(**batch)
    translated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return translated_text

  def transcript(tokenizer, model, audio_file):

    print('---File Converter---')
    convert_extension(audio_file)
    
    transcript = ""

    # Stream over 20 seconds chunks
    stream = librosa.stream(
        audio_file, block_length=20, frame_length=16000, hop_length=16000
    )

    for speech in stream:
        if len(speech.shape) > 1:
            speech = speech[:, 0] + speech[:, 1]

        input_values = tokenizer(speech, return_tensors="pt").input_values
        logits = model(input_values).logits

        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = tokenizer.decode(predicted_ids[0])
        transcript += transcription.lower() + " "
        
    return transcript

  tokenizer_transcribe = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-large-960h-lv60-self")
  model_transcribe = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h-lv60-self")
  tokenizer_translate = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-ur")
  model_translate = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-ur")


  transcription = transcript(tokenizer_transcribe, model_transcribe, audio_file)
  print('---Transcription---')
  print(transcription)

  translation = translate(tokenizer_translate, model_translate, transcription)
  print('---Translation---')
  print(translation)

  kw_model = KeyBERT()
  keywords = kw_model.extract_keywords(transcription, highlight=True)
  print(keywords)

audio_file = "/content/audio13.flac"
check=main(audio_file)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'Wav2Vec2CTCTokenizer'. 
The class this function is called from is 'Wav2Vec2Tokenizer'.


Downloading pytorch_model.bin:   0%|          | 0.00/1.18G [00:00<?, ?B/s]

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h-lv60-self and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

Downloading source.spm:   0%|          | 0.00/797k [00:00<?, ?B/s]

Downloading target.spm:   0%|          | 0.00/828k [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/1.82M [00:00<?, ?B/s]



Downloading pytorch_model.bin:   0%|          | 0.00/292M [00:00<?, ?B/s]

---File Converter---
---Transcription---
ostaveca pope erson tusd 




---Translation---
پوپ کی طرف سے مخالفت


Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

[('pope', 0.5789), ('ostaveca', 0.5685), ('tusd', 0.4242), ('erson', 0.3591)]
