In [1]:
%%capture
!pip install datasets
!pip install transformers==4.28.0

In [1]:
import tensorflow as tf
from transformers import TFWav2Vec2ForCTC, Wav2Vec2Processor

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
#load HDF5 from local
my_model="/content/drive/Shareddrives/C23-PR569/Model/Model_HDF5"
processor = Wav2Vec2Processor.from_pretrained(my_model)
model = TFWav2Vec2ForCTC.from_pretrained(my_model)


TFWav2Vec2ForCTC has backpropagation operations that are NOT supported on CPU. If you wish to train/fine-tine this model, you need a GPU or a TPU
All model checkpoint layers were used when initializing TFWav2Vec2ForCTC.

All the layers of TFWav2Vec2ForCTC were initialized from the model checkpoint at /content/drive/Shareddrives/C23-PR569/Model/Model_HDF5.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFWav2Vec2ForCTC for predictions without further training.


In [None]:
model.summary()

Model: "tf_wav2_vec2_for_ctc"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 wav2vec2 (TFWav2Vec2MainLay  multiple                 315438720 
 er)                                                             
                                                                 
 dropout_98 (Dropout)        multiple                  0         
                                                                 
 lm_head (Dense)             multiple                  48175     
                                                                 
Total params: 315,486,895
Trainable params: 315,486,895
Non-trainable params: 0
_________________________________________________________________


In [3]:
import librosa

# **Dimulai dari sini perubahan codenya**


In [4]:
def resample_audio(audio, original_sample_rate, target_sample_rate):
    resampled_audio = librosa.resample(audio, orig_sr=original_sample_rate, target_sr=target_sample_rate)
    return resampled_audio

def predict_from_file(file_path):
    speech, sample_rate = librosa.load(file_path)
    #target_sample_rate = 16000                                                    # klu ga support hilangin aja yang ku comment ini
    #speech = resample_audio(speech, sample_rate, target_sample_rate)              #  klu ga support hilangin aja yang ku comment ini
    input_values = processor(speech, return_tensors="tf").input_values
    logits = model(input_values).logits
    predicted_ids = tf.argmax(logits, axis=-1)
    transcription = processor.decode(predicted_ids[0])
    return transcription

In [5]:
def calculate_cer(original_word, transcribed_word):
    original_len = len(original_word)
    transcribed_len = len(transcribed_word)
    distance = edit_distance(original_word, transcribed_word)
    cer = min((distance / original_len) * 100, 100)  # Limit the maximum CER to 100
    return cer

def get_rating(original_text, transcribed_text):
    original_words = original_text.split()
    transcribed_words = transcribed_text.split()
    wer = 0
    reduc1 = 0
    total_cer = 0
    average_cer=0
    reduc2 = 0
    rating_reduction = 0

    for original_word, transcribed_word in zip(original_words, transcribed_words):
        cer = calculate_cer(original_word, transcribed_word)
        if 60 <= cer <= 100:
            rating_reduction = 4
        elif cer == 0:
            rating_reduction = 0
        else:
            total_cer += cer

        print("cer :", cer)
        print("rate :",rating_reduction)
        reduc1 += rating_reduction

    if len(original_words) > 0:
        average_cer = total_cer / len(original_words)
        if average_cer < 15:
            reduc2 = 0
        elif average_cer < 25:
            reduc2 = 1
        elif average_cer < 45:
            reduc2 = 2
        else:
            reduc2 = 3
        print("aver",average_cer)
        rating = 5 - reduc1 - reduc2
        rating = max(rating, 1)

    return rating

def edit_distance(a, b):
    m = len(a)
    n = len(b)
    dp = [[0] * (n + 1) for _ in range(m + 1)]

    for i in range(m + 1):
        dp[i][0] = i

    for j in range(n + 1):
        dp[0][j] = j

    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if a[i - 1] == b[j - 1]:
                dp[i][j] = dp[i - 1][j - 1]
            else:
                dp[i][j] = 1 + min(dp[i - 1][j - 1], dp[i][j - 1], dp[i - 1][j])

    return dp[m][n]

In [43]:
#Predict from wav file to text
file_path = "/content/drive/Shareddrives/C23-PR569/data/Audio_Dataset/114/114000.mp3"
predicted_transcription = predict_from_file(file_path)

It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


In [45]:
transcribed_text = predicted_transcription
original_text = "بِسْمِ اللّٰهِ الرَّحْمٰنِ الرَّحِيْمِ"

rate = get_rating(original_text, transcribed_text)
print("text :", transcribed_text )
print("rating", rate)


cer : 100
rate : 4
cer : 100
rate : 4
cer : 45.45454545454545
rate : 4
aver 11.363636363636363
text : بُْمِلَّهِ الرَّحْمَانٍ الرَّحِيمٍ
rating 1
