In [116]:
#WER / CER 계산 코드 (구현)
import locale
locale.getpreferredencoding = lambda: "UTF-8"
# locale.getpreferredencoding()
# !pip install Levenshtein
import Levenshtein as Lev
import pandas as pd

def calculate_wer(ref, hyp, debug=False):
    """Align two transcripts word by word and compute the word error rate.

    Both inputs are split on whitespace and aligned with a Levenshtein-style
    dynamic program in which substitution, insertion and deletion each cost 1.
    The best alignment is then traced back to count each kind of operation.

    Args:
        ref: Reference transcript. ``None``/NaN is treated as an empty string.
        hyp: Hypothesis transcript. ``None``/NaN is treated as an empty string.
        debug: When true, print the alignment (one operation per line) followed
            by the operation counts.

    Returns:
        tuple: ``(num_correct, num_substitutions, num_deletions,
        num_insertions, wer)`` where ``wer = (S + D + I) / N`` and ``N`` is the
        number of reference words. When the reference contains no words the
        divisor is 1 instead of 0, so an empty reference yields ``0.0`` for an
        empty hypothesis and the insertion count otherwise (previously this
        case raised ``ZeroDivisionError``).
    """
    # Missing values (e.g. NaN cells read from a CSV) count as empty transcripts.
    if pd.isna(ref):
        ref = ""
    if pd.isna(hyp):
        hyp = ""
    r = ref.split()
    h = hyp.split()

    OP_OK = 0
    OP_SUB = 1
    OP_INS = 2
    OP_DEL = 3

    DEL_PENALTY = 1
    INS_PENALTY = 1
    SUB_PENALTY = 1

    # costs[i][j]: cheapest way to turn the first i reference words into the
    # first j hypothesis words; backtrace[i][j]: the operation chosen there.
    costs = [[0] * (len(h) + 1) for _ in range(len(r) + 1)]
    backtrace = [[OP_OK] * (len(h) + 1) for _ in range(len(r) + 1)]

    # First column: reach an empty hypothesis by deleting every reference word.
    for i in range(1, len(r) + 1):
        costs[i][0] = DEL_PENALTY * i
        backtrace[i][0] = OP_DEL

    # First row: reach the hypothesis from an empty reference by insertions only.
    for j in range(1, len(h) + 1):
        costs[0][j] = INS_PENALTY * j
        backtrace[0][j] = OP_INS

    for i in range(1, len(r) + 1):
        for j in range(1, len(h) + 1):
            if r[i - 1] == h[j - 1]:
                costs[i][j] = costs[i - 1][j - 1]
                backtrace[i][j] = OP_OK
                continue

            substitution_cost = costs[i - 1][j - 1] + SUB_PENALTY
            insertion_cost = costs[i][j - 1] + INS_PENALTY
            deletion_cost = costs[i - 1][j] + DEL_PENALTY

            best = min(substitution_cost, insertion_cost, deletion_cost)
            costs[i][j] = best
            # Ties are resolved in the order substitution, insertion, deletion.
            if best == substitution_cost:
                backtrace[i][j] = OP_SUB
            elif best == insertion_cost:
                backtrace[i][j] = OP_INS
            else:
                backtrace[i][j] = OP_DEL

    # Walk back from the bottom-right corner along the chosen operations.
    i = len(r)
    j = len(h)
    numSub = 0
    numDel = 0
    numIns = 0
    numCor = 0
    lines = []
    if debug:
        print("OP\tREF\tHYP")
    while i > 0 or j > 0:
        op = backtrace[i][j]
        if op == OP_OK:
            numCor += 1
            i -= 1
            j -= 1
            if debug:
                lines.append("OK\t" + r[i] + "\t" + h[j])
        elif op == OP_SUB:
            numSub += 1
            i -= 1
            j -= 1
            if debug:
                lines.append("SUB\t" + r[i] + "\t" + h[j])
        elif op == OP_INS:
            numIns += 1
            j -= 1
            if debug:
                lines.append("INS\t" + "****" + "\t" + h[j])
        elif op == OP_DEL:
            numDel += 1
            i -= 1
            if debug:
                lines.append("DEL\t" + r[i] + "\t" + "****")
    if debug:
        # Operations were collected end-to-start; print them in reading order.
        for line in reversed(lines):
            print(line)
        print("Ncor " + str(numCor))
        print("Nsub " + str(numSub))
        print("Ndel " + str(numDel))
        print("Nins " + str(numIns))

    # Guard the divisor: an empty reference must not raise ZeroDivisionError.
    wer = (numSub + numDel + numIns) / float(max(len(r), 1))
    return numCor, numSub, numDel, numIns, wer


In [117]:
# 위스퍼
def get_mean_wer_whisper(df):
    """Return the mean per-row WER of ``whisper_text`` against ``correct_text``.

    The per-row rates are written to ``df['wer']`` (the frame is modified in
    place) before their mean is returned.
    """
    def _row_wer(row):
        # Index 4 of calculate_wer's result tuple is the error rate.
        return calculate_wer(row['correct_text'], row['whisper_text'])[4]

    df['wer'] = df.apply(_row_wer, axis=1)
    return df['wer'].mean()

# 위스퍼+라마기본
def get_mean_wer_llama(df):
    """Return the mean per-row WER of ``llm_text`` against ``correct_text``.

    The per-row rates are written to ``df['wer']`` (the frame is modified in
    place) before their mean is returned.
    """
    def _row_wer(row):
        # Index 4 of calculate_wer's result tuple is the error rate.
        return calculate_wer(row['correct_text'], row['llm_text'])[4]

    df['wer'] = df.apply(_row_wer, axis=1)
    return df['wer'].mean()

# 위스퍼+라마파인튜닝
def get_mean_wer_llama_ft(df):
    """Return the mean per-row WER of ``llm_output_text`` against ``correct_text``.

    The per-row rates are written to ``df['wer']`` (the frame is modified in
    place) before their mean is returned.
    """
    def _row_wer(row):
        # Index 4 of calculate_wer's result tuple is the error rate.
        return calculate_wer(row['correct_text'], row['llm_output_text'])[4]

    df['wer'] = df.apply(_row_wer, axis=1)
    return df['wer'].mean()

In [118]:

# Run 1: score the `whisper_text` column of this CSV against `correct_text`.
# The file name suggests baseline Whisper-small output (not verifiable here).
# Rebinds the notebook-global `df`.
df = pd.read_csv('atco2_test_dictation_by_whisper_small.csv', encoding='UTF-8')
mean_wer_1 = get_mean_wer_whisper(df)
# Bare expression: shown as the cell output.
mean_wer_1

1.0405270031874403

In [119]:
# Run 2: score the `whisper_text` column of this CSV against `correct_text`.
# The file name suggests fine-tuned Whisper output (not verifiable here).
# Rebinds the notebook-global `df`.
df = pd.read_csv('atco2_test_dictation_by_whisper_finetuned.csv', encoding='UTF-8')
mean_wer_2 = get_mean_wer_whisper(df)
# Bare expression: shown as the cell output.
mean_wer_2

1.0495432499523087

In [120]:
# Run 3: score the `llm_text` column of this CSV against `correct_text`.
# The file name suggests Whisper-small followed by the original LLaMA-2 (not verifiable here).
# Rebinds the notebook-global `df`.
df = pd.read_csv('atco2_test_dictation_by_whisper_small_and_llama2_original.csv', encoding='UTF-8')
mean_wer_3 = get_mean_wer_llama(df)
# Bare expression: shown as the cell output.
mean_wer_3

1.068548207134803

In [121]:
# Run 4: score the `llm_text` column of this CSV against `correct_text`.
# The file name suggests fine-tuned Whisper followed by the original LLaMA-2 (not verifiable here).
# Rebinds the notebook-global `df`.
df = pd.read_csv('atco2_test_dictation_by_whisper_finetuned_and_llama2_original.csv', encoding='UTF-8')
mean_wer_4 = get_mean_wer_llama(df)
# Bare expression: shown as the cell output.
mean_wer_4

0.9536716822147444

In [122]:
# Run 5: score the `llm_output_text` column of this CSV against `correct_text`.
# The file name suggests base Whisper followed by fine-tuned LLaMA-2 (not verifiable here).
# Rebinds the notebook-global `df`.
df = pd.read_csv('Whisper에 llama2 finetuned 붙여서 ATCO2 WER 뽑기_결과.csv', encoding='UTF-8')
mean_wer_5 = get_mean_wer_llama_ft(df)
# Bare expression: shown as the cell output.
mean_wer_5

0.8557662425136696

In [123]:
# Run 6: score the `llm_output_text` column of this CSV against `correct_text`.
# The file name suggests fine-tuned Whisper followed by fine-tuned LLaMA-2 (not verifiable here).
# Rebinds the notebook-global `df`.
df = pd.read_csv('Whisper 파인튜닝한 모델에 llama2 finetuned 붙여서 ATCO2 WER 뽑기_결과.csv', encoding='UTF-8')
mean_wer_6 = get_mean_wer_llama_ft(df)
# Bare expression: shown as the cell output.
mean_wer_6

0.7405766128636935

In [124]:
# Summary of the mean WER for the six runs computed in the cells above.
print("위스퍼 기본모델              : ", mean_wer_1)
print("위스퍼 파인튜닝              : ", mean_wer_2)
print("위스퍼 기본모델 + 라마 기본모델 : ", mean_wer_3)
print("위스퍼 파인튜닝 + 라마 기본모델 : ", mean_wer_4)
print("위스퍼 기본모델 + 라마 파인튜닝 : ", mean_wer_5)
# mean_wer_6 is computed from the fine-tuned-Whisper + fine-tuned-LLaMA CSV;
# it was previously printed under the same label as mean_wer_5.
print("위스퍼 파인튜닝 + 라마 파인튜닝 : ", mean_wer_6)

위스퍼 기본모델              :  1.0405270031874403
위스퍼 파인튜닝              :  1.0495432499523087
위스퍼 기본모델 + 라마 기본모델 :  1.068548207134803
위스퍼 파인튜닝 + 라마 기본모델 :  0.9536716822147444
위스퍼 기본모델 + 라마 파인튜닝 :  0.8557662425136696
위스퍼 기본모델 + 라마 파인튜닝 :  0.7405766128636935


In [127]:
# 알파벳 -> 숫자
import re

# 숫자 단어와 실제 숫자의 매핑
# English number word -> its value written as a digit string.
number_map = dict(
    zero='0', one='1', two='2', three='3', four='4',
    five='5', six='6', seven='7', eight='8', nine='9',
    ten='10', eleven='11', twelve='12', thirteen='13', fourteen='14',
    fifteen='15', sixteen='16', seventeen='17', eighteen='18', nineteen='19',
    twenty='20', thirty='30', forty='40', fifty='50', sixty='60',
    seventy='70', eighty='80', ninety='90',
)

# 숫자 단어를 숫자로 변환하는 함수
def convert_word_to_number(word):
    """Return the integer value of an English number word.

    The word is looked up in the module-level ``number_map`` (whose values are
    digit strings) and the result is converted to ``int``. A word that is not
    a key of ``number_map`` raises ``KeyError``.
    """
    digit_text = number_map[word]
    return int(digit_text)


# thousand, hundred 가 모두 있는 경우만 처리
def replace_thousands_and_hundreds(text):
    """Rewrite "<digit word> thousand <digit word> hundred" as one number.

    Only phrases containing both "thousand" and "hundred" are handled here,
    e.g. "one thousand two hundred" becomes "1200". Text that does not match
    is returned unchanged.
    """
    def _as_number(found):
        # The match always splits into: <digit word>, "thousand", <digit word>, "hundred".
        thousands_word, _, hundreds_word, _ = found.group(0).split()
        total = (convert_word_to_number(thousands_word) * 1000
                 + convert_word_to_number(hundreds_word) * 100)
        return str(total)

    return re.sub(
        r'(\b(?:zero|one|two|three|four|five|six|seven|eight|nine)\sthousand\s(?:zero|one|two|three|four|five|six|seven|eight|nine)\shundred\b)',
        _as_number,
        text,
    )


# thousand 만 있는 경우 처리
def replace_thousands(text):
    """Rewrite "<digit word> thousand" as a number, e.g. "five thousand" -> "5000".

    Text that does not match is returned unchanged.
    """
    def _as_number(found):
        # The match always splits into: <digit word>, "thousand".
        digit_word, _ = found.group(0).split()
        return str(convert_word_to_number(digit_word) * 1000)

    return re.sub(
        r'(\b(?:zero|one|two|three|four|five|six|seven|eight|nine)\sthousand\b)',
        _as_number,
        text,
    )


# hundred 만 있는 경우 처리
def replace_hundreds(text):
    """Rewrite "<digit word> hundred" as a number, e.g. "three hundred" -> "300".

    Text that does not match is returned unchanged.
    """
    def _as_number(found):
        # The match always splits into: <digit word>, "hundred".
        digit_word, _ = found.group(0).split()
        return str(convert_word_to_number(digit_word) * 100)

    return re.sub(
        r'(\b(?:zero|one|two|three|four|five|six|seven|eight|nine)\shundred\b)',
        _as_number,
        text,
    )


# Digit character ('1'..'9') -> the matching English word.
reverse_number_map = dict(zip(
    '123456789',
    ('one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine'),
))
# forty five 등의 경우 four five 로 변환
def convert_big_numbers(text):
    """Turn a tens word followed by a units word into two digit words.

    Example: "forty five" -> "four five", so that the later word-to-digit
    step produces "45". Any run of whitespace between the two words is
    collapsed to a single space in the replacement.

    The pattern is anchored on word boundaries, so a tens word that is only
    the tail of a longer word ("often one") or a units word that is only the
    head of a longer word ("twenty nineteen") is left untouched.
    """
    # Local table: keeps this function independent of the module-level
    # ``reverse_number_map``, which is rebound later in the notebook.
    tens_to_digit_word = {
        'ten': 'one', 'twenty': 'two', 'thirty': 'three', 'forty': 'four',
        'fifty': 'five', 'sixty': 'six', 'seventy': 'seven',
        'eighty': 'eight', 'ninety': 'nine',
    }

    def replace_numbers(match):
        words = match.group(0).split()
        # Only the tens word changes; the units word is kept as written.
        words[0] = tens_to_digit_word[words[0]]
        return ' '.join(words)

    pattern = re.compile(
        r'\b(?:ten|twenty|thirty|forty|fifty|sixty|seventy|eighty|ninety)'
        r'\s+(?:one|two|three|four|five|six|seven|eight|nine)\b'
    )
    return pattern.sub(replace_numbers, text)


# 숫자 단어를 숫자로 변환하는 함수
def convert_words_to_numbers(text):
    """Replace runs of number words with their digits.

    Consecutive number words separated by single spaces are concatenated
    digit-string by digit-string using the module-level ``number_map``
    ("four zero nine one" -> "4091"). The word "decimal" inside such a run
    becomes a decimal point ("one one eight decimal five" -> "118.5").

    Fixes relative to the previous implementation:
      * more than one "decimal" in a run no longer raises ``KeyError``; every
        "decimal" becomes a ".";
      * a run made up only of "decimal" contains no number to convert and is
        left unchanged instead of being rewritten to ".".
    """
    def replace_numbers(match):
        words = match.group(0).split()
        # Nothing numeric in the match: keep the original text.
        if all(word == 'decimal' for word in words):
            return match.group(0)

        # Split the run at each "decimal" and join the digit groups with ".".
        digit_groups = [[]]
        for word in words:
            if word == 'decimal':
                digit_groups.append([])
            else:
                digit_groups[-1].append(number_map[word])
        return '.'.join(''.join(group) for group in digit_groups)

    # One or more number words / "decimal", separated by single spaces.
    pattern = re.compile(r'\b(?:zero|one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen|twenty|thirty|forty|fifty|sixty|seventy|eighty|ninety|decimal)(?: (?:zero|one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen|twenty|thirty|forty|fifty|sixty|seventy|eighty|ninety|decimal))*\b')
    return pattern.sub(replace_numbers, text)


# 떨어져 있는 숫자 붙이기
def attatch_numbers(text):
    """Remove whitespace that sits between two digits, e.g. "1 6" -> "16".

    Lookbehind/lookahead are used so the digits themselves are not consumed
    by the match. The earlier pattern consumed both digits, so in "1 2 3" the
    "2" could not start a second match and the result was "12 3".
    """
    return re.sub(r'(?<=\d)\s+(?=\d)', '', text)

def convert(text):
    """Normalise spoken numbers in ``text`` to digits.

    Applies, in order: thousands+hundreds, thousands, hundreds, tens+units
    splitting, word-to-digit conversion, and joining of space-separated
    digits. The order matters: the longer phrases must be rewritten before the
    generic word-to-digit step sees their individual words.

    Non-string input (``None``, or a NaN cell from a DataFrame) is returned
    unchanged instead of raising ``TypeError`` inside ``re``. This matches
    ``convert_numbers_to_words`` and lets ``calculate_wer`` deal with the
    missing value.
    """
    if not isinstance(text, str):
        return text

    stages = (
        replace_thousands_and_hundreds,
        replace_thousands,
        replace_hundreds,
        convert_big_numbers,
        convert_words_to_numbers,
        attatch_numbers,
    )
    for stage in stages:
        text = stage(text)
    return text

In [128]:

def get_mean_wer_whisper_number(df):
    """Return the mean WER after passing both text columns through ``convert``.

    Adds ``whisper_conv`` and ``correct_conv`` (the converted
    ``whisper_text`` / ``correct_text``) and ``wer`` to ``df`` in place.
    """
    column_pairs = (('whisper_text', 'whisper_conv'), ('correct_text', 'correct_conv'))
    for source_col, converted_col in column_pairs:
        df[converted_col] = df[source_col].apply(convert)

    def _row_wer(row):
        # Index 4 of calculate_wer's result tuple is the error rate.
        return calculate_wer(row['correct_conv'], row['whisper_conv'])[4]

    df['wer'] = df.apply(_row_wer, axis=1)
    return df['wer'].mean()

In [129]:
# Load the transcripts scored in the next cells (rebinds the notebook-global `df`)
# and display the frame as the cell output.
df = pd.read_csv('atco2_test_dictation_by_whisper_finetuned_mh_Tuned.csv', encoding='UTF-8')
df

Unnamed: 0,file,whisper_text,correct_text
0,atco2_test0.wav,oscar kilo foxtrot alfa oscar taxi to holding ...,oscar kilo foxtrot alfa oscar taxi to holding ...
1,atco2_test1.wav,ok call you soon four zero nine one resume to,ok quality 4091 ruzyne tower information lima ...
2,atco2_test2.wav,car kilo echo lima alfa confirm proceeding to ...,oscar kilo echo lima alfa confirm proceeding t...
3,atco2_test3.wav,saysay one delta zurich descend to flight leve...,csa 1 delta zulu descend flight level 100 no s...
4,atco2_test4.wav,are you setting follow one hundred three three...,descending flight level 100 free speed csa 1 d...
...,...,...,...
866,atco2_test866.wav,cindy tar air condor six forty two,sydney tower qantas 642
867,atco2_test867.wav,cross six forty two is in attack good day,qantas 642 sydney tower good day
868,atco2_test868.wav,jetset seven six one wind is coming romeo six ...,jetstar 761 wind is calm runway 16 right clear...
869,atco2_test869.wav,belair and one six right just seven six two off,cleared to land 16 right jetstar 761


In [130]:
# Mean WER after both text columns are passed through convert() (number words -> digits).
# NOTE(review): the variable is named "alphabet" although this is the digit-normalised score.
mean_wer_alphabet = get_mean_wer_whisper_number(df)
mean_wer_alphabet

0.64840090827054

In [131]:
# Same frame scored on the unconverted text columns, for comparison.
# This overwrites the df['wer'] column written by the previous cell.
wer_test = get_mean_wer_whisper(df)
wer_test

0.9923243086589043

In [132]:
# 숫자 -> 알파벳
import re

# 숫자 단어와 실제 숫자의 매핑
# English number word -> digit string. Rebinds the module-level name that an
# earlier cell also defines with the same content.
number_map = dict(
    zero='0', one='1', two='2', three='3', four='4',
    five='5', six='6', seven='7', eight='8', nine='9',
    ten='10', eleven='11', twelve='12', thirteen='13', fourteen='14',
    fifteen='15', sixteen='16', seventeen='17', eighteen='18', nineteen='19',
    twenty='20', thirty='30', forty='40', fifty='50', sixty='60',
    seventy='70', eighty='80', ninety='90',
)

# Digit string -> English word: the full inverse of number_map. Rebinds the
# earlier reverse_number_map, which only covered '1'..'9'.
reverse_number_map = dict(zip(number_map.values(), number_map.keys()))

# 숫자를 단어로 변환하는 함수
def convert_number_to_words(number):
    """Spell a non-negative integer as space-separated English words.

    Examples: 0 -> "zero", 81 -> "eighty one", 1200 -> "one thousand two
    hundred". No hyphens and no "and" are produced.

    The word tables are local, so the result does not depend on which
    notebook cell last rebound the module-level ``reverse_number_map`` (the
    earlier definition covers only 1-9 and would raise ``KeyError`` here).
    Values of one million and above use "million" / "billion" instead of
    repeating "thousand".

    Raises:
        ValueError: if ``number`` is negative.
    """
    below_twenty = (
        'zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven',
        'eight', 'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen',
        'fifteen', 'sixteen', 'seventeen', 'eighteen', 'nineteen',
    )
    # Indexed by the tens digit; slots 0 and 1 are never used.
    tens_words = (
        '', '', 'twenty', 'thirty', 'forty', 'fifty',
        'sixty', 'seventy', 'eighty', 'ninety',
    )

    if number < 0:
        raise ValueError("number must be non-negative, got %r" % (number,))
    if number < 20:
        return below_twenty[number]
    if number < 100:
        tens, units = divmod(number, 10)
        return tens_words[tens] + (' ' + below_twenty[units] if units else '')
    if number < 1000:
        hundreds, remainder = divmod(number, 100)
        words = below_twenty[hundreds] + ' hundred'
        if remainder:
            words += ' ' + convert_number_to_words(remainder)
        return words

    # Largest scale first so that e.g. 1_000_000 is "one million".
    for scale_value, scale_name in ((10 ** 9, 'billion'), (10 ** 6, 'million'), (10 ** 3, 'thousand')):
        if number >= scale_value:
            leading, remainder = divmod(number, scale_value)
            words = convert_number_to_words(leading) + ' ' + scale_name
            if remainder:
                words += ' ' + convert_number_to_words(remainder)
            return words

# 텍스트에서 숫자를 단어로 변환하는 함수
def convert_numbers_to_words(text):
    """Replace every run of digits in ``text`` with its spelled-out form.

    Each ``\\d+`` run is converted with ``convert_number_to_words``. Input
    that is not a ``str`` (for example a NaN cell) is returned unchanged.
    """
    if isinstance(text, str):
        def _spell(found):
            return convert_number_to_words(int(found.group()))

        return re.sub(r'\d+', _spell, text)
    return text

# 테스트
# Quick manual check of convert_numbers_to_words on a sentence with 1-, 2- and 4-digit numbers.
text = "I have 2 apples and 81 oranges. The price is 1200 dollars."
converted_text = convert_numbers_to_words(text)
print(converted_text)  # Expected: I have two apples and eighty one oranges. The price is one thousand two hundred dollars.

# 데이터프레임에서 함수 적용
import pandas as pd

# 예시 데이터프레임
# Small hand-written sample used to eyeball the digit -> word conversion.
data = dict(
    file=["atco2_test0.wav", "atco2_test1.wav", "atco2_test2.wav", "atco2_test3.wav"],
    whisper_text=[
        "Oskarkelo, focus throttle, false car, taxi do...",
        "Ok, koli ty 4091, rzhnět tower informatíčná, ...",
        "Karkila EkoLema Alpha, conform procedím to ta...",
        "CSA1 Delta to Lodi, central at level 108, no ...",
    ],
    correct_text=[
        "Oscar kilo foxtrot alfa Oscar taxi to holding ...",
        "ok quality 4091 ruzyne tower information lima ...",
        "oscar kilo echo lima alfa confirm proceeding t...",
        "csa 1 delta zulu descend flight level 100 no s...",
    ],
)

# Build the frame (rebinds the notebook-global `df`).
df = pd.DataFrame(data)

# Add a converted copy of each text column, hypothesis first, then reference.
for source_col, converted_col in (('whisper_text', 'whisper_conv'), ('correct_text', 'correct_conv')):
    df[converted_col] = df[source_col].apply(convert_numbers_to_words)

print(df[['whisper_conv', 'correct_conv']])

I have two apples and eighty one oranges. The price is one thousand two hundred dollars.
                                        whisper_conv  \
0   Oskarkelo, focus throttle, false car, taxi do...   
1  Ok, koli ty four thousand ninety one, rzhnět t...   
2   Karkila EkoLema Alpha, conform procedím to ta...   
3  CSAone Delta to Lodi, central at level one hun...   

                                        correct_conv  
0  Oscar kilo foxtrot alfa Oscar taxi to holding ...  
1  ok quality four thousand ninety one ruzyne tow...  
2  oscar kilo echo lima alfa confirm proceeding t...  
3  csa one delta zulu descend flight level one hu...  


In [134]:
def get_mean_wer_whisper_alphabet(df):
    """Return the mean WER of ``whisper_text`` after spelling out digits.

    ``whisper_text`` and ``correct_text`` are passed through
    ``convert_numbers_to_words`` into ``whisper_conv`` / ``correct_conv``;
    those columns and ``wer`` are added to ``df`` in place.
    """
    df['whisper_conv'] = df['whisper_text'].apply(convert_numbers_to_words)
    df['correct_conv'] = df['correct_text'].apply(convert_numbers_to_words)

    def _row_wer(row):
        # Index 4 of calculate_wer's result tuple is the error rate.
        return calculate_wer(row['correct_conv'], row['whisper_conv'])[4]

    df['wer'] = df.apply(_row_wer, axis=1)
    return df['wer'].mean()


def get_mean_wer_llama_alphabet(df):
    """Return the mean WER of ``llm_text`` after spelling out digits.

    ``llm_text`` and ``correct_text`` are passed through
    ``convert_numbers_to_words``. The converted hypothesis is stored in a
    column named ``whisper_conv`` even though it comes from ``llm_text``.
    ``whisper_conv``, ``correct_conv`` and ``wer`` are added to ``df`` in place.
    """
    df['whisper_conv'] = df['llm_text'].apply(convert_numbers_to_words)
    df['correct_conv'] = df['correct_text'].apply(convert_numbers_to_words)

    def _row_wer(row):
        # Index 4 of calculate_wer's result tuple is the error rate.
        return calculate_wer(row['correct_conv'], row['whisper_conv'])[4]

    df['wer'] = df.apply(_row_wer, axis=1)
    return df['wer'].mean()


def get_mean_wer_llama_ft_alphabet(df):
    """Return the mean WER of ``llm_output_text`` after spelling out digits.

    ``llm_output_text`` and ``correct_text`` are passed through
    ``convert_numbers_to_words``. The converted hypothesis is stored in a
    column named ``whisper_conv`` even though it comes from
    ``llm_output_text``. ``whisper_conv``, ``correct_conv`` and ``wer`` are
    added to ``df`` in place.
    """
    df['whisper_conv'] = df['llm_output_text'].apply(convert_numbers_to_words)
    df['correct_conv'] = df['correct_text'].apply(convert_numbers_to_words)

    def _row_wer(row):
        # Index 4 of calculate_wer's result tuple is the error rate.
        return calculate_wer(row['correct_conv'], row['whisper_conv'])[4]

    df['wer'] = df.apply(_row_wer, axis=1)
    return df['wer'].mean()



In [135]:
# Run 1 again, this time with digits spelled out in both columns before scoring.
# Rebinds the notebook-global `df`.
df = pd.read_csv('atco2_test_dictation_by_whisper_small.csv', encoding='UTF-8')
alphabet_mean_wer_1 = get_mean_wer_whisper_alphabet(df)
# Bare expression: shown as the cell output.
alphabet_mean_wer_1

0.9026089834468001

In [136]:
# Run 2 again, this time with digits spelled out in both columns before scoring.
# Rebinds the notebook-global `df`.
df = pd.read_csv('atco2_test_dictation_by_whisper_finetuned.csv', encoding='UTF-8')
alphabet_mean_wer_2 = get_mean_wer_whisper_alphabet(df)
# Bare expression: shown as the cell output.
alphabet_mean_wer_2

0.9191547865206413

In [137]:
# Run 3 again (`llm_text` column), with digits spelled out in both columns before scoring.
# Rebinds the notebook-global `df`.
df = pd.read_csv('atco2_test_dictation_by_whisper_small_and_llama2_original.csv', encoding='UTF-8')
alphabet_mean_wer_3 = get_mean_wer_llama_alphabet(df)
# Bare expression: shown as the cell output.
alphabet_mean_wer_3

0.9437961681222905

In [138]:
# Run 4 again (`llm_text` column), with digits spelled out in both columns before scoring.
# Rebinds the notebook-global `df`.
df = pd.read_csv('atco2_test_dictation_by_whisper_finetuned_and_llama2_original.csv', encoding='UTF-8')
alphabet_mean_wer_4 = get_mean_wer_llama_alphabet(df)
# Bare expression: shown as the cell output.
alphabet_mean_wer_4

0.8311526276159446

In [139]:
# Run 5 again (`llm_output_text` column), with digits spelled out in both columns before scoring.
# Rebinds the notebook-global `df`.
df = pd.read_csv('Whisper에 llama2 finetuned 붙여서 ATCO2 WER 뽑기_결과.csv', encoding='UTF-8')
alphabet_mean_wer_5 = get_mean_wer_llama_ft_alphabet(df)
# Bare expression: shown as the cell output.
alphabet_mean_wer_5

0.8550298274371527

In [140]:
# Run 6 again (`llm_output_text` column), with digits spelled out in both columns before scoring.
# Rebinds the notebook-global `df`.
df = pd.read_csv('Whisper 파인튜닝한 모델에 llama2 finetuned 붙여서 ATCO2 WER 뽑기_결과.csv', encoding='UTF-8')
alphabet_mean_wer_6 = get_mean_wer_llama_ft_alphabet(df)
# Bare expression: shown as the cell output.
alphabet_mean_wer_6

0.7574159781717675

In [141]:
# Side-by-side summary. For each run the first line is the WER on the text as
# written (mean_wer_n) and the second line is the WER after digits were
# spelled out as words (alphabet_mean_wer_n).
print("위스퍼 기본모델              : ", mean_wer_1)
print("위스퍼 기본모델              : ", alphabet_mean_wer_1)
print("위스퍼 파인튜닝              : ", mean_wer_2)
print("위스퍼 파인튜닝              : ", alphabet_mean_wer_2)
print("위스퍼 기본모델 + 라마 기본모델 : ", mean_wer_3)
print("위스퍼 기본모델 + 라마 기본모델 : ", alphabet_mean_wer_3)
print("위스퍼 파인튜닝 + 라마 기본모델 : ", mean_wer_4)
print("위스퍼 파인튜닝 + 라마 기본모델 : ", alphabet_mean_wer_4)
print("위스퍼 기본모델 + 라마 파인튜닝 : ", mean_wer_5)
print("위스퍼 기본모델 + 라마 파인튜닝 : ", alphabet_mean_wer_5)
# Run 6 is computed from the fine-tuned-Whisper + fine-tuned-LLaMA CSV; it was
# previously printed under the same label as run 5.
print("위스퍼 파인튜닝 + 라마 파인튜닝 : ", mean_wer_6)
print("위스퍼 파인튜닝 + 라마 파인튜닝 : ", alphabet_mean_wer_6)

위스퍼 기본모델              :  1.0405270031874403
위스퍼 기본모델              :  0.9026089834468001
위스퍼 파인튜닝              :  1.0495432499523087
위스퍼 파인튜닝              :  0.9191547865206413
위스퍼 기본모델 + 라마 기본모델 :  1.068548207134803
위스퍼 기본모델 + 라마 기본모델 :  0.9437961681222905
위스퍼 파인튜닝 + 라마 기본모델 :  0.9536716822147444
위스퍼 파인튜닝 + 라마 기본모델 :  0.8311526276159446
위스퍼 기본모델 + 라마 파인튜닝 :  0.8557662425136696
위스퍼 기본모델 + 라마 파인튜닝 :  0.8550298274371527
위스퍼 기본모델 + 라마 파인튜닝 :  0.7405766128636935
위스퍼 기본모델 + 라마 파인튜닝 :  0.7574159781717675
