In [1]:
import pandas as pd
import pickle as pkl
import os
import librosa

In [2]:
# Data directory, resolved against the current working directory.
path = os.path.join(os.getcwd(), "data")
# Table with one row per audio file and its language label.
db = pd.read_csv(os.path.join(path, "file_language.csv"))

In [3]:
# Preview only the first rows instead of rendering the whole table.
db.head()

Unnamed: 0.1,Unnamed: 0,file,language
0,0,acoruna_barco_m67780_a84524_audio_converted.mp3,Galician
1,1,barcelona_Ovvero_m68255_a84999_audio_converted...,Bosnian
2,2,berlin_Dirk_m67384_a84128_audio_converted.mp3,German


In [5]:
# Split the table into one DataFrame per language, in order of first appearance.
# A single groupby pass replaces one boolean filter per language; groupby also
# drops rows whose language is missing, so no empty group is produced for NaN.
# Rows labelled "corrupted" are excluded.
databases = [
    group
    for language, group in db.groupby("language", sort=False)
    if language != "corrupted"
]
databases

[   Unnamed: 0                                             file  language
 0           0  acoruna_barco_m67780_a84524_audio_converted.mp3  Galician,
    Unnamed: 0                                               file language
 1           1  barcelona_Ovvero_m68255_a84999_audio_converted...  Bosnian,
    Unnamed: 0                                           file language
 2           2  berlin_Dirk_m67384_a84128_audio_converted.mp3   German]

# The Model

In [7]:
from transformers import WhisperForConditionalGeneration, WhisperProcessor

# Processor and model are loaded from the same checkpoint id.
checkpoint = "openai/whisper-tiny"
processor = WhisperProcessor.from_pretrained(checkpoint)
model = WhisperForConditionalGeneration.from_pretrained(checkpoint)

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
# Transcribe every audio file, one language group at a time. Results are keyed
# by file name in `transcriptions` and pickled under `path` as the run
# progresses; languages the processor rejects are collected separately.
transcriptions, unsupported_languages = {}, []
non_language_databases = []
transcriptions_file, unsupported_languages_file = "transcriptions.pkl", "unsupported_languages.pkl"


def _dump_pickle(obj, filename):
    """Pickle `obj` to `filename` inside the data directory `path`."""
    with open(os.path.join(path, filename), "wb") as f:
        pkl.dump(obj=obj, file=f)


for database in databases:
    try:
        # The lowercase label is what gets passed to the processor below.
        language = list(database["language"])[0].lower()
        print(language)
    except (KeyError, IndexError, AttributeError):
        # KeyError: no "language" column; IndexError: empty group;
        # AttributeError: non-string label (e.g. NaN). Set aside and move on.
        non_language_databases.append(database)
        continue
    try:
        forced_decoder_ids = processor.get_decoder_prompt_ids(language=language, task="transcribe")
    except ValueError:
        # Treated as "language not supported": record it and skip the group.
        unsupported_languages.append(language)
        _dump_pickle(unsupported_languages, unsupported_languages_file)
        continue

    i = 0
    for file in database["file"]:
        filepath = os.path.join(os.getcwd(), "uploads", file)
        # Load at 16 kHz; the same rate is forwarded to the processor.
        signal, sr = librosa.load(filepath, sr=16000)
        input_features = processor(signal, sampling_rate=sr, return_tensors="pt").input_features
        predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)
        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
        # An empty decode result is stored as an empty string.
        transcriptions[file] = transcription[0] if transcription else ""
        # Checkpoint on the first file of the group and every 60 files after.
        if i % 60 == 0:
            print(f"Reached iteration {i} storing results")
            _dump_pickle(transcriptions, transcriptions_file)
        i += 1

    # Final checkpoint once the whole language group has been processed.
    print(f"Reached iteration {i} storing results")
    _dump_pickle(transcriptions, transcriptions_file)

galician


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Reached iteration 0 storing results
Reached iteration 1 storing results
bosnian
Reached iteration 0 storing results
Reached iteration 1 storing results
german
Reached iteration 0 storing results
Reached iteration 1 storing results


In [17]:
# Combine each file's language label with its transcription and save the
# result as full_db.csv in the data directory.

# Get data from the original language CSV
language_df = pd.read_csv(os.path.join(path, "file_language.csv"))

# Look up every file's transcription in one vectorized step. Series.map with a
# dict leaves NaN for files that have no entry in `transcriptions`.
full_df = language_df[["file", "language"]].assign(
    transcription=language_df["file"].map(transcriptions)
)

# Save to CSV in the format notebook 4 expects
full_df.to_csv(os.path.join(path, "full_db.csv"), index=False)

print(f"Created full_db.csv with {len(full_df)} entries")
full_df.head()

Created full_db.csv with 3 entries


Unnamed: 0,file,language,transcription
0,acoruna_barco_m67780_a84524_audio_converted.mp3,Galician,"En esta zona, tenemos 3 placas de discapacita..."
1,barcelona_Ovvero_m68255_a84999_audio_converted...,Bosnian,"Ate, da ste gravati. Ja, da ne? Ja, svoj mojo..."
2,berlin_Dirk_m67384_a84128_audio_converted.mp3,German,Also hier gibt es eine Kreuzung und neben den...
