In [1]:
# load transcription from all whisper fine-tuned models
# get list 

import glob
dir_list = glob.glob("/home/sz38235/workDir/elic/whisper_test/*.csv")
dir_list

['/home/sz38235/workDir/elic/whisper_test/ckm016-2023-06-10-Klana_09_whisper_all.csv',
 '/home/sz38235/workDir/elic/whisper_test/ckm015-2023-06-08-Klana_05_whisper_all.csv',
 '/home/sz38235/workDir/elic/whisper_test/ckm016-2023-06-10-Klana_07_whisper_all.csv',
 '/home/sz38235/workDir/elic/whisper_test/ckm016-2023-06-10-Klana_08_whisper_all.csv',
 '/home/sz38235/workDir/elic/whisper_test/ckm015-2023-06-08-Klana_04_whisper_all.csv',
 '/home/sz38235/workDir/elic/whisper_test/ckm016-2023-06-10-Klana_06_whisper_all.csv',
 '/home/sz38235/workDir/elic/whisper_test/ckm016-2023-06-10-Klana_05_whisper_all.csv',
 '/home/sz38235/workDir/elic/whisper_test/ckm009-2023-05-26-Žminj_02_whisper_all.csv',
 '/home/sz38235/workDir/elic/whisper_test/ckm016-2023-06-10-Klana_04_whisper_all.csv',
 '/home/sz38235/workDir/elic/whisper_test/ckm015-2023-06-08-Klana_06_whisper_all.csv',
 '/home/sz38235/workDir/elic/whisper_test/ckm009-2023-05-26-Žminj_03_whisper_all.csv',
 '/home/sz38235/workDir/elic/whisper_test

In [None]:
dir_list_2 = ['/home/sz38235/workDir/elic/whisper_test/ckm015-2023-06-08-Klana_02_whisper_all.csv',
'/home/sz38235/workDir/elic/whisper_test/ckm016-2023-06-10-Klana_02_whisper_all.csv',
'/home/sz38235/workDir/elic/whisper_test/ckm009-2023-05-26-Žminj_01_whisper_all.csv']

In [2]:
len(dir_list)

19

In [3]:
import re
import pandas as pd
from Bio import pairwise2
from thefuzz import fuzz
import numpy as np

def clean_text(text):
    if text == 'x':
        return ''
    # remove () pattern, will leave a ) to be removed
    text = re.sub(r'\(([^)]+)\)', '', text)
    special_char_list = [',', '.', '?', '#', '@', '-', ')', ':']
    for char in special_char_list:
        text = text.replace(char, '')
    # change all text to lower cases
    text = text.lower()
    return text

def get_clean_word_list(text):
    new_text = clean_text(text)
    word_list = new_text.split(' ')
    word_list_clean = list(filter(None, word_list))
    return word_list_clean

def get_aligned_list(l1, l2):
    # get aligned list, along with wer type and fuzz match score
    alignments = pairwise2.align.globalxx(l1, 
                                      l2,
                                      gap_char=['-']
                                     )
    l1_align = alignments[0].seqA
    l2_align = alignments[0].seqB
    score = []
    for i in range(len(l1_align)):
        score.append(word_match_score(l1_align[i], l2_align[i]))
    return l1_align, l2_align, score


def word_match_score(t1, t2):
    if t1 == '-' or t2 == '-':
        return 0
    else:
        score = fuzz.ratio(t1, t2)
        return score

def get_matched(t1, t2):
    #print (t1, t2)
    if t1 == t2:
            return 1
    else:
        return 0
    
def get_unmatched_cand(match_list, idx):
    cand = []
    idx_org = idx
    if match_list[idx] == 1:
        return [idx]
    else:
        while match_list[idx] != 1 and idx > 0:
            cand.append(idx)
            idx = idx - 1 # before
        idx = idx_org
        while match_list[idx] != 1 and idx < len(match_list) -1:
            cand.append(idx)
            idx = idx + 1 # after
        return list(set(cand))
    
def get_can_score(idx,match_cand, df_align):
    if len(match_cand) == 1:
        return [100]
    score_all = []
    for cand in match_cand:
        t1 = df_align['manual'][idx]
        t2 = df_align['model'][cand]
        score = word_match_score(t1, t2)
        score_all.append(score)
    return score_all

def get_fuzzy_cand(match_cand, match_cand_score, df_align):
    max_cand = np.max(match_cand_score)
    if max_cand > 60:
        # fuzzy matched
        max_cand_id = match_cand[match_cand_score.index(max_cand)]
        #print (max_cand_id)
        word = df_align['model'][max_cand_id]
        return word, max_cand, max_cand_id
    else:
        return '-', 0, '-'
    
def get_error_type(t1, t2):
    if (t1 != '-') and (t2 != '-') and (t1 != t2):
        return 's'
    if (t1 != '-') and (t2 == '-'):
        return 'd'
    if (t1 == '-') and (t2 != '-'):
        return 'i'
    if (t1 != '-') and (t2 != '-') and (t1 == t2):
        return 'c'
    if t1 == '':
        return ""
    
def get_wer_from_df(df):
    s = len(df[df['error_type_correct'] == 's'])
    d = len(df[df['error_type_correct'] == 'd'])
    i = len(df[df['error_type_correct'] == 'i'])
    c = len(df[df['error_type_correct'] == 'c'])
    n = s + d + c
    if n == 0:
        return "inf"
    wer = (s + d + i)/n * 100
    return wer,s,d,i,c,n

def get_df_align_and_wer_info(df, model_name, file_name):
    save_dir = '/home/sz38235/workDir/elic/whisper_wer/' + file_name.split('/')[-1].split('whisper')[0] + model_name + '.csv'
    manual_all = []
    model_all = []
    model_fuzzy_all = []
    error_type_all = []
    wer_all = []
    for id in range(len(df)):
        text1 = df['text'][id]
        text2 = df[model_name][id]
        l1 = get_clean_word_list(text1)
        l2 = get_clean_word_list(text2)
        if l1 == []:
            l1 = [""]
        if l2 == []:
            l2 = [""]
        #print (l1, l2)
        df_align = pd.DataFrame(columns = ['manual', 'model'])
        df_align['manual'] = get_aligned_list(l1, l2)[0]
        df_align['model'] = get_aligned_list(l1, l2)[1]
        df_align['score'] = get_aligned_list(l1, l2)[2]
        df_align['match'] = df_align.apply(lambda x:get_matched(x.manual, x.model), axis = 1)
        df_align['idx'] = df_align.index
        df_align['match_cand'] = df_align.apply(lambda x: get_unmatched_cand(df_align['match'].tolist(), x.idx), axis = 1)
        df_align['match_cand_score'] = df_align.apply(lambda x: get_can_score(x.idx, x.match_cand, df_align), axis = 1)
        df_align['model_fuzzy'] = df_align.apply(lambda x: get_fuzzy_cand(x.match_cand, x.match_cand_score, df_align)[0], axis = 1)
        df_align['model_fuzzy_id'] = df_align.apply(lambda x: get_fuzzy_cand(x.match_cand, x.match_cand_score, df_align)[2], axis = 1)
        df_align['model_fuzzy_score'] = df_align.apply(lambda x: get_fuzzy_cand(x.match_cand, x.match_cand_score, df_align)[1], axis = 1)
        df_align['error_type'] = df_align.apply(lambda x: get_error_type(x.manual, x.model), axis = 1)
        sub_id = df_align[(df_align['error_type']=='d') & (df_align['model_fuzzy_id'] != '-')]['model_fuzzy_id'].tolist()
        del_sub_id = df_align[(df_align['error_type']=='d') & (df_align['model_fuzzy_id'] != '-')].index.tolist()
        error_type = df_align['error_type'].tolist()
        for id in del_sub_id:
            error_type[id] = 's'
        df_align['error_type_correct'] = error_type
        df_align_clean = df_align[~df_align.index.isin(sub_id)]

        manual_curr = df_align_clean['manual'].tolist()
        model_fuzzy_curr  = df_align_clean['model_fuzzy'].tolist()
        error_type  = df_align_clean['error_type_correct'].tolist()
        model_curr = df_align_clean['model'].tolist()
        wer_curr = get_wer_from_df(df_align_clean)
        
        manual_all = manual_all + manual_curr
        model_fuzzy_all = model_fuzzy_all + model_fuzzy_curr
        error_type_all = error_type_all + error_type
        model_all = model_all + model_curr
        wer_all.append(wer_curr)
    df_all = pd.DataFrame(columns = ['manual', 'model', 'model_fuzzy', 'error_type'])
    df_all['manual'] = manual_all
    df_all['model'] = model_all
    df_all['model_fuzzy'] = model_fuzzy_all
    df_all['error_type'] = error_type_all
    
    df_all.to_csv(save_dir)
    return wer_all



In [7]:
for file_id in range(len(dir_list)):
#for file_id in range(1):
    file_name = dir_list[file_id]
    print (file_name)
    df = pd.read_csv(file_name)
    for model_id in range(1, 11):
        model_name = 'transcription_v' + str(int(model_id))
        print (model_name)
        col = model_name + '_wer'
        df[col] = get_df_align_and_wer_info(df, model_name, file_name)
    df['text_clean']= df['text'].apply(lambda x: clean_text(x))
    df.to_csv('/home/sz38235/workDir/elic/whisper_wer/' + file_name.split('/')[-1].split('whisper')[0]  + 'all_model_wer.csv')
    

/home/sz38235/workDir/elic/whisper_test/ckm016-2023-06-10-Klana_09_whisper_all.csv
transcription_v1
transcription_v2
transcription_v3
transcription_v4
transcription_v5
transcription_v6
transcription_v7
transcription_v8
transcription_v9
transcription_v10
/home/sz38235/workDir/elic/whisper_test/ckm015-2023-06-08-Klana_05_whisper_all.csv
transcription_v1
transcription_v2
transcription_v3
transcription_v4
transcription_v5
transcription_v6
transcription_v7
transcription_v8
transcription_v9
transcription_v10
/home/sz38235/workDir/elic/whisper_test/ckm016-2023-06-10-Klana_07_whisper_all.csv
transcription_v1
transcription_v2
transcription_v3
transcription_v4
transcription_v5
transcription_v6
transcription_v7
transcription_v8
transcription_v9
transcription_v10
/home/sz38235/workDir/elic/whisper_test/ckm016-2023-06-10-Klana_08_whisper_all.csv
transcription_v1
transcription_v2
transcription_v3
transcription_v4
transcription_v5
transcription_v6
transcription_v7
transcription_v8
transcription_v9
tr

In [8]:
df

Unnamed: 0.1,Unnamed: 0,chunk_name,file_tier,start,end,start_id,end_id,text,transcription_v1,transcription_v2,...,transcription_v2_wer,transcription_v3_wer,transcription_v4_wer,transcription_v5_wer,transcription_v6_wer,transcription_v7_wer,transcription_v8_wer,transcription_v9_wer,transcription_v10_wer,text_clean
0,0,ckm016-2023-06-10-Klana_01_1,1,0.219319,8.219319,0,2,"Pa- sad moremo po naše, sa se moremo prehitit ...","Pa, sad moremo po naše, sad se moremo prehit i...","Pa, sad moremo po naše, sad se moremo prehit i...",...,"(13.333333333333334, 2, 0, 0, 13, 15)","(13.333333333333334, 2, 0, 0, 13, 15)","(1066.6666666666665, 5, 2, 153, 8, 15)","(73.33333333333333, 1, 1, 9, 13, 15)","(13.333333333333334, 2, 0, 0, 13, 15)","(46.666666666666664, 4, 2, 1, 9, 15)","(13.333333333333334, 2, 0, 0, 13, 15)","(333.33333333333337, 3, 1, 46, 11, 15)","(80.0, 5, 2, 5, 8, 15)",pa sad moremo po naše sa se moremo prehitit i ...
1,1,ckm016-2023-06-10-Klana_01_2,1,85.947592,93.947592,2,3,Mhm.,MD- Mi smo morali recitirat pjesu. P,A.,...,inf,"(1200.0, 0, 1, 11, 0, 1)","(0.0, 0, 0, 0, 1, 1)","(0.0, 0, 0, 0, 1, 1)","(0.0, 0, 0, 0, 1, 1)","(0.0, 0, 0, 0, 1, 1)","(0.0, 0, 0, 0, 1, 1)",inf,"(600.0, 0, 1, 5, 0, 1)",mhm
2,2,ckm016-2023-06-10-Klana_01_3,1,216.672562,224.672562,3,4,((laughs)),Više. A on je prišal na na na na na na na na n...,.)laughs). A on je prišal poklalala pok pok po...,...,"(3200.0, 0, 1, 31, 0, 1)","(7500.0, 0, 1, 74, 0, 1)","(900.0, 0, 1, 8, 0, 1)","(6600.0, 0, 1, 65, 0, 1)","(4400.0, 0, 1, 43, 0, 1)","(700.0, 0, 1, 6, 0, 1)","(1800.0, 0, 1, 17, 0, 1)","(2300.0, 0, 1, 22, 0, 1)","(1500.0, 0, 1, 14, 0, 1)",
3,3,ckm016-2023-06-10-Klana_01_4,1,394.253003,402.253003,4,5,((laughs)),".)laughing). To je, evo, to bi bilo od mojega ...",".)laughs). To je, evo, to bi bilo od mojega kl...",...,"(2900.0, 0, 1, 28, 0, 1)","(400.0, 0, 1, 3, 0, 1)","(400.0, 0, 1, 3, 0, 1)",inf,"(400.0, 0, 1, 3, 0, 1)","(1100.0, 0, 1, 10, 0, 1)",inf,inf,"(800.0, 0, 1, 7, 0, 1)",
4,4,ckm016-2023-06-10-Klana_01_5,1,482.519859,482.962148,5,7,Mhm.,Aha.,Aha.,...,inf,inf,"(0.0, 0, 0, 0, 1, 1)","(0.0, 0, 0, 0, 1, 1)","(0.0, 0, 0, 0, 1, 1)","(0.0, 0, 0, 0, 1, 1)","(0.0, 0, 0, 0, 1, 1)","(0.0, 0, 0, 0, 1, 1)","(0.0, 0, 0, 0, 1, 1)",mhm
5,5,ckm016-2023-06-10-Klana_01_1,2,7.092442,19.011797,0,3,"Rojen san va Klani, sedamnajst dvanajstiga ped...","Rojen san va Klani, sedamnajst iga penes prve....","Rojen san va Klani 17.12.51. Mama je Klanjica,...",...,"(46.15384615384615, 0, 3, 3, 10, 13)","(50.0, 3, 0, 4, 11, 14)","(606.6666666666666, 2, 7, 82, 6, 15)","(480.0, 1, 10, 61, 4, 15)","(142.85714285714286, 2, 3, 15, 9, 14)","(50.0, 3, 0, 4, 11, 14)","(60.0, 2, 2, 5, 11, 15)","(33.33333333333333, 0, 3, 2, 12, 15)","(78.57142857142857, 2, 4, 5, 8, 14)",rojen san va klani sedamnajst dvanajstiga pede...
6,6,ckm016-2023-06-10-Klana_01_2,2,20.443752,30.770797,3,7,@ ((noise)) Nisan puno navadiv (.) tatinih bes...,E Nisan puno navadil u tatinih besidaši on mla...,O- Nisan puno navadiv tatinih besidašnon mladu...,...,"(370.5882352941177, 4, 6, 53, 7, 17)","(82.35294117647058, 7, 3, 4, 7, 17)","(94.11764705882352, 3, 10, 3, 4, 17)","(88.23529411764706, 4, 4, 7, 9, 17)","(76.47058823529412, 4, 4, 5, 9, 17)","(62.5, 5, 2, 3, 9, 16)","(188.23529411764704, 0, 16, 16, 1, 17)","(105.88235294117648, 0, 17, 1, 0, 17)","(105.88235294117648, 8, 5, 5, 4, 17)",nisan puno navadiv tatinih besid aš je on m...
7,7,ckm016-2023-06-10-Klana_01_3,2,32.090036,45.083069,7,11,"Živiv san z mamu, i živiv san s staru nonu. O...",Živil san z mamu i živil san staru nonu. Od nj...,Živil san z mamu i živil san staru nonu. Od nj...,...,"(26.08695652173913, 4, 0, 2, 19, 23)","(26.08695652173913, 4, 1, 1, 18, 23)","(366.66666666666663, 2, 11, 75, 11, 24)","(83.33333333333334, 2, 10, 8, 12, 24)","(13.043478260869565, 1, 1, 1, 21, 23)","(26.08695652173913, 4, 1, 1, 18, 23)","(17.391304347826086, 2, 1, 1, 20, 23)","(25.0, 3, 3, 0, 18, 24)","(70.83333333333334, 12, 3, 2, 9, 24)",živiv san z mamu i živiv san s staru nonu od ...
8,8,ckm016-2023-06-10-Klana_01_4,2,46.318426,54.318426,11,13,si smo govorili va Klani po klanjski. Tako da-...,"Svi smo govorili va klani, poklani. Svi smo go...","Si smo govorili va klani, poklanjski. Tako da,...",...,"(189.4736842105263, 2, 3, 31, 14, 19)","(31.57894736842105, 2, 3, 1, 14, 19)","(90.0, 0, 16, 2, 4, 20)","(16.666666666666664, 2, 1, 0, 15, 18)","(100.0, 2, 14, 4, 4, 20)","(85.0, 0, 15, 2, 5, 20)","(31.57894736842105, 0, 4, 2, 15, 19)","(21.052631578947366, 0, 3, 1, 16, 19)","(84.21052631578947, 4, 7, 5, 8, 19)",si smo govorili va klani po klanjski tako da ...
9,9,ckm016-2023-06-10-Klana_01_5,2,56.711196,66.824463,13,16,"Aš je to bila za nas ena- (.) ena čudna stvar,...","aš je.)hhh). to bila za nas una čudna stvar, n...","aš je, to bila za nas ena čudna stvar, nepozna...",...,"(0.0, 0, 0, 0, 15, 15)","(0.0, 0, 0, 0, 15, 15)","(18.75, 2, 0, 1, 14, 16)","(33.33333333333333, 0, 4, 1, 11, 15)","(33.33333333333333, 2, 1, 2, 12, 15)","(13.333333333333334, 0, 1, 1, 14, 15)","(18.75, 3, 0, 0, 13, 16)","(26.666666666666668, 0, 0, 4, 15, 15)","(50.0, 5, 1, 2, 10, 16)",aš je to bila za nas ena ena čudna stvar nepo...


In [9]:
df['text_clean'].values

array(['pa sad moremo po naše sa se moremo prehitit i počet dakle  kadi ste rojeni',
       'mhm', '', '', 'mhm',
       'rojen san va klani sedamnajst dvanajstiga pedesprve  mama mi je klanjica tata   prišav iz sežane',
       '  nisan puno navadiv  tatinih besid aš je on mlad umrv se kaj san navadiv san navadiv klanjski',
       'živiv san z mamu  i živiv san s staru nonu od njih san se navadiv govorit i  va vrime kada san ja biv dica',
       'si smo govorili va klani po klanjski tako da  da kada smo krenili va školu smo imeli problema s književnin',
       'aš je to bila za nas ena  ena čudna stvar nepoznata i  ja van moran reć',
       'enu stvar iz drugiga razreda ka je za ka se je samo nan mogla dogodit morali smo  vezat  facole črljene',
       'prvimu razredu ki je biv prijet va pionire mi smo morali recitirat pjesmu',
       'a glavninu pjesme je recitirav  naš najbolji učenik i učiteljica nan je ovako rekla',
       'va bile stomanje stavite črljeni facov i pionirsku baretu 