##### Install torch and the other required packages

In [None]:
#You should only need to install torch once, you can comment the line out after this and only install transformers and datasets every time
#%pip install torch==1.10.0+cu113 torchvision==0.11.1+cu113 torchaudio==0.10.0+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html
!pip install transformers
!pip install datasets

##### Make the files available in Colab. This can be done via Google Drive:

In [None]:
# List the contents of the current Colab working directory to see which files are already there
! ls

sample_data


In [None]:
# Connect to Google Drive: mounts it at /content/drive so its files can be
# reached from this runtime
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
#Copy the path to the folder you want to the library in colab
! cp "/content/drive/mydrive/Plugg/Examensarbete/Ljudfiler/"
#check your library again to see if the files you want are there
! ls

cp: missing destination file operand after '/content/drive/mydrive/Plugg/Examensarbete/ModernaTider/ModernaTider_Ljudfiler/'
Try 'cp --help' for more information.
drive  fil_1_NERKORTAD.mp3  sample_data


Alternatively, upload the files from your computer by clicking "upload files" in the sidebar on the left.

##### Choosing processor and model

In [None]:
import torch
import torchaudio
from datasets import load_dataset
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import os

# The processor and the CTC model are loaded from the same pretrained checkpoint.
MODEL_ID = "KBLab/wav2vec2-large-voxrex-swedish"
processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)

# Note: the input rate used here (44.1 kHz) differs from the one used for the Ekot file.
# NOTE(review): the file loaded further down reports a sampling rate of 48000 —
# confirm that 44_100 matches the audio that is actually transcribed.
resampler = torchaudio.transforms.Resample(orig_freq=44_100, new_freq=16_000)

Downloading:   0%|          | 0.00/212 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/211 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/421 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

  "Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 "


Downloading:   0%|          | 0.00/1.18G [00:00<?, ?B/s]

### Transcribing only one file step by step to test if it works

Below I read one mp3 file into a torch tensor.

In [None]:
# Read the test file: torchaudio.load returns the waveform tensor and the file's
# sampling rate. Both are printed, one per line.
audio_path = "De_1.mp3"
speech_array, sampling_rate = torchaudio.load(audio_path)
print(speech_array, sampling_rate, sep="\n")

tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0213, 0.0211, 0.0219]])
48000


We see that the file's sampling rate (48 000 Hz) differs from the 16 000 Hz that is passed to the processor, so the audio has to be resampled.

In [None]:
# Resample to 16 000 Hz from the rate the file actually reported (sampling_rate)
# rather than from a hard-coded input rate, so the audio is not stretched or
# compressed when the two differ.
if sampling_rate == 16_000:
    resampled = speech_array
else:
    resampled = torchaudio.transforms.Resample(sampling_rate, 16_000)(speech_array)

# torchaudio.load returns a (channels, samples) tensor. The channel axis is kept
# (no squeeze) so that new_ekot[0] is the first channel for mono files as well as
# stereo ones; squeezing a mono file made new_ekot[0] a single number.
new_ekot = resampled.numpy()
print(new_ekot)
print(new_ekot[0])      # all samples of the first channel
print(new_ekot[0][:2])  # its first two samples

[0.         0.         0.         ... 0.02288184 0.02171886 0.02218858]
0.0


IndexError: ignored

In [None]:
# new_ekot is 1-D for a squeezed mono file and (channels, samples) otherwise.
# Pick the samples of the first channel in both cases; indexing a 1-D array with
# [0] would give a single number whose size is always 1.
first_channel = new_ekot if new_ekot.ndim == 1 else new_ekot[0]
print(first_channel.size)
# Number of datapoints in one fifteenth of the file, i.e. one minute when the
# recording is 15 minutes long (as Ekot is)
print(first_channel.size/15)

185574
0.06666666666666667


In [None]:
# new_ekot is 1-D for a squeezed mono file and (channels, samples) otherwise.
# Use the first channel's samples in both cases; indexing a 1-D array with [0]
# gives a single number, which made sample_length 0 and the slice below fail.
first_channel = new_ekot if new_ekot.ndim == 1 else new_ekot[0]

# One fifteenth of the file: about one minute for a 15-minute recording.
sample_length = first_channel.size // 15
inputs = processor(first_channel[:sample_length], sampling_rate=16_000, return_tensors="pt", padding=True)
print(inputs)

{'input_values': tensor([[ 0.0027,  0.0027,  0.0027,  ..., -0.7621, -0.5213, -0.5096]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]], dtype=torch.int32)}


In [None]:
# Forward pass without gradient tracking; only the logits are needed.
with torch.no_grad():
    logits = model(
        inputs.input_values,
        attention_mask=inputs.attention_mask,
    ).logits

# Pick the highest-scoring token id per frame and decode the ids to text.
predicted_ids = logits.argmax(dim=-1)
text = processor.batch_decode(predicted_ids)
print(text)

  return (input_length - kernel_size) // stride + 1


['dagens eko kvart i fem med utbrett supande bland effensvenskar i bosnien karlbilt får regeringsnixs efter uttalandet om natomedlemskap apoteken kan inte reglerna vägrar lämna ut fria läkemedel utan betalning och fullt slalomföre i sydsverige här ekot med barbro nordvall och susanne rodiner det var stora alkoholproblem med fylla både hos meniga och officerare hos den femte svenska effenbataljonen i bosnien det skriver tidningen expressen i dag ekot har talat med kaptenen ulf rydström som var rättstjänstbefäl ett slags bataljonspolis han bekräftar att alkoholproblemen var stora och diskuterades ofta jag skulle nästan vilja påstå att det diskuterades dagligen vacr fylleriet så omfattande ja jag skulle vilja påstå att det var det den femte svenska effenbataljonen kom hem för snart ett år sedan nu diskuteras problemen med alkohol och fylleri']


### Transcribing all audio files in one loop

I made a loop so all audio files can be transcribed in one go. I named the files De_1, De_2 etc.

In [None]:
from google.colab import files

for number in range(1, 11): #om du skriver 1, 4 så kommer den att köra 1, 2 och 3.

    filename = 'De_%d.mp3' % number
    speech_array, sampling_rate = torchaudio.load(filename)
    print("File De_", number)

    new_ekot = resampler(speech_array).squeeze().numpy()
    print("entire sample length:", new_ekot[0].size)

    sample_length = int(new_ekot[0].size/15) #about one minute.
    sample_length_original = int(new_ekot[0].size/15)
    print("1 minute sample_length:", sample_length)

    inputs = processor(new_ekot[0][:sample_length], sampling_rate=16_000, return_tensors="pt", padding=True)


    #First I transcribe the first minute:
    start = 0
    with torch.no_grad():
      logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
    predicted_ids = torch.argmax(logits, dim=-1)

    text = processor.batch_decode(predicted_ids)
    total = text[0]

    del logits
    del predicted_ids
    del text

    print(1, "first datapoint transcribed:", start)
    print(1, "last datapoint transcribed:", sample_length)
    print(f'Round 1 done.')

    #Then I create a loop to transcribe the rest of the file
    for i in range(2, 17):
      if sample_length < sample_length_original *15: #when sample length is bigger than the actual file, the
        start = sample_length # picks up where the last datapoint was transcribed in round 1
        sample_length = sample_length + sample_length_original #add one more minute to transcribe
        print(f'Round {i}, first datapoint transcribed: {start}')
        print(f'Round {i}, last datapoint transcribed: {sample_length}')

        inputs = processor(new_ekot[0][start:sample_length], sampling_rate=16_000, return_tensors="pt", padding=True)
        with torch.no_grad():
          logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits

        predicted_ids = torch.argmax(logits, dim=-1)
        text = processor.batch_decode(predicted_ids)
        total += text[0]

        # delete what is not needed so colab won't crash
        del logits
        del predicted_ids
        del text
        print(f'Round {i} done.')


      else:
        # now the file should be fully transcribed

        # delete what is not needed so colab won't crash
        del new_ekot
        del speech_array
        del sampling_rate
        del sample_length
        del sample_length_original

        #create a text file and save it to drive
        text_file_name = "De_%d.txt" % number
        text_file = open(text_file_name, "w")
        n = text_file.write(total)
        text_file.close()
        !cp {text_file_name} "/content/drive/MyDrive/Plugg/Examensarbete/Transkriberingar"

        print(f'Final round done.')
        break


File fil_ 1
entire sample length: 1
1 minute sample_length: 0


IndexError: ignored