In [None]:
import os
import tqdm
import pickle
import shutil
import librosa
import subprocess
import soundfile as sf
import numpy as np
import tensorflow as tf
import csv

In [None]:
# Filled by the ffmpeg fallback path: one [source_path, converted_path] pair
# for every recording whose converted WAV decoded to zero length.
zero_length_audios = []


def _write_16k_wav(input_file, output_file, target_sr=16000):
    """Write a ``target_sr`` version of ``input_file`` to ``output_file``.

    The recording is loaded with librosa at its native rate. If that rate
    differs from ``target_sr`` it is resampled and saved as 16-bit PCM;
    otherwise the source file is copied unchanged. When any of that fails,
    ffmpeg is used as a fallback converter and the result is decoded with
    TensorFlow; outputs that decode to zero length are recorded in
    ``zero_length_audios``.
    """
    try:
        y, sr = librosa.load(input_file, sr=None)
        if sr != target_sr:
            y_resampled = librosa.resample(y, orig_sr=sr, target_sr=target_sr)
            sf.write(output_file, y_resampled, target_sr, 'PCM_16')
        else:
            shutil.copy(input_file, output_file)
    except Exception:
        # `Exception` instead of a bare `except:` so that KeyboardInterrupt /
        # SystemExit still stop the notebook instead of triggering ffmpeg.
        ffmpeg_command = [
            'ffmpeg',
            '-y',  # overwrite a stale/partial output instead of refusing or prompting
            '-i', input_file,
            '-c:a', 'pcm_s16le',
            '-ar', str(target_sr),
            output_file,
        ]

        # problematic audio detection (ffmpeg's exit status is not checked;
        # a missing output makes the read below fail loudly)
        subprocess.run(ffmpeg_command)
        # Must not be named `file`: the caller's loop variable of that name
        # was previously overwritten here with a tensor.
        wav_bytes = tf.io.read_file(output_file)
        audio, _ = tf.audio.decode_wav(wav_bytes)
        if len(audio.numpy()) == 0:
            zero_length_audios.append([input_file, output_file])


def khosapogh_data_preprocessing(root_dir, output_dir):
    """Pair each transcript in ``root_dir`` with its recording and renumber them.

    Every ``*.txt`` file in ``root_dir`` gets the next running index ``n``
    (files are visited in sorted name order so the numbering is reproducible).
    Its lines are joined into one string, each line prefixed by a single
    space, and stored under the key ``str(n)``. Every ``*.wav`` file whose
    name up to the first ``.`` equals the transcript's is written to
    ``output_dir/<n>.wav`` at 16 kHz.

    Args:
        root_dir: Directory containing the ``.txt`` / ``.wav`` files.
        output_dir: Directory receiving the numbered WAV files; created if
            it does not exist.

    Side effects:
        Writes the ``{index: sentence}`` dict to ``arm_sentences.pkl`` in the
        current working directory and may append to ``zero_length_audios``.
    """
    root_files = sorted(os.listdir(root_dir))
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Index the recordings once instead of rescanning the listing per transcript.
    wavs_by_stem = {}
    for name in root_files:
        if name.endswith(".wav"):
            wavs_by_stem.setdefault(name.split(".")[0], []).append(name)

    data_dict = {}
    joining_number = 0
    for text_name in root_files:
        if not text_name.endswith(".txt"):
            continue

        # Transcripts are Armenian text; do not depend on the locale encoding.
        with open(os.path.join(root_dir, text_name), 'r', encoding='utf-8') as f:
            file_contents = f.read()
        data_dict[str(joining_number)] = "".join(
            " " + line for line in file_contents.split('\n')
        )

        output_file = os.path.join(output_dir, str(joining_number) + ".wav")
        for wav_name in wavs_by_stem.get(text_name.split(".")[0], []):
            _write_16k_wav(os.path.join(root_dir, wav_name), output_file)

        joining_number += 1

    with open('arm_sentences.pkl', 'wb') as f:
        pickle.dump(data_dict, f)

In [None]:
# Source folder holding the matching .txt transcripts and .wav recordings.
khosapogh_data_directory = "dataset_new_verified"
# Destination folder for the renumbered recordings.
audio_output_directory = 'Renamed_Audio_Recordings'

khosapogh_data_preprocessing(
    root_dir=khosapogh_data_directory,
    output_dir=audio_output_directory,
)

In [None]:
# NOTE: pickle.load can execute arbitrary code; only load a file that this
# notebook produced itself.
with open('arm_sentences.pkl', 'rb') as f:
    sentences_dict = pickle.load(f)

# Cell output: number of transcripts next to the number of entries in the
# audio output directory, so the two can be compared at a glance.
len(sentences_dict), len(os.listdir(audio_output_directory))

In [None]:
def armenian_sentence_processing(sentence):
    """Reduce a transcript to lowercase Armenian letters, commas, "։" and spaces.

    Steps, in order:
      1. lowercase the text and map the ASCII colon ``:`` to the Armenian
         full stop ``։``;
      2. drop every character that is neither in the range ``ա``..``և`` nor
         one of ``։``, ``,`` or a space;
      3. strip surrounding whitespace (done after filtering, so spaces left
         behind by removed characters cannot hide a trailing ``։``);
      4. remove a single trailing ``։``.

    Args:
        sentence: Raw transcript string (may be empty).

    Returns:
        The normalised string; ``""`` when nothing survives the filter.
    """
    allowed_punctuations = {"։", ",", " "}
    sentence = sentence.lower().replace(":", "։")
    sentence = "".join(
        char for char in sentence
        if "ա" <= char <= "և" or char in allowed_punctuations
    )
    sentence = sentence.strip()
    # endswith() is safe on an empty string, unlike indexing sentence[-1].
    if sentence.endswith("։"):
        sentence = sentence[:-1]
    return sentence

In [None]:
# Normalise every transcript in place; keys are the stringified indices
# "0" .. "len - 1".
sentence_count = len(sentences_dict)
for i in tqdm.tqdm(range(sentence_count)):
    key = str(i)
    raw_sentence = sentences_dict[key]
    sentences_dict[key] = armenian_sentence_processing(raw_sentence)

In [None]:
# Overwrite the pickle with the current contents of sentences_dict ...
with open('arm_sentences.pkl', 'wb') as pkl_out:
    pickle.dump(sentences_dict, pkl_out)

# ... then read it straight back so the displayed value is what is on disk.
with open('arm_sentences.pkl', 'rb') as pkl_in:
    sentences_dict = pickle.load(pkl_in)

sentences_dict

In [None]:
# Directory the resampling loop in this cell writes its output WAV files into.
new_wav_dir = 'wavs_final'

def resample_wav(input_file, output_file, target_sr=16000):
    """Resample one audio file and save the result.

    Args:
        input_file: Path of the audio file to read (loaded at its native rate).
        output_file: Path the resampled audio is written to.
        target_sr: Sample rate of the written file, in Hz.
    """
    samples, native_sr = librosa.load(input_file, sr=None)
    sf.write(
        output_file,
        librosa.resample(samples, orig_sr=native_sr, target_sr=target_sr),
        target_sr,
    )



# The writer does not create missing directories, so make sure the target
# folder exists before the first file is written (needed on a fresh run).
os.makedirs(new_wav_dir, exist_ok=True)

# Resample every .wav from the renamed-recordings folder into new_wav_dir,
# keeping the file names.
for filename in tqdm.tqdm(os.listdir(audio_output_directory)):
    if filename.endswith('.wav'):
        input_file = os.path.join(audio_output_directory, filename)
        output_file = os.path.join(new_wav_dir, filename)
        resample_wav(input_file, output_file)

In [None]:
# Write one row per transcript: the dict key goes in "file_name" and the
# normalised sentence in "normalized_transcription".
csv_file = 'metadata.csv'
# encoding is explicit because the transcripts are Armenian text: with the
# locale default this raises UnicodeEncodeError on non-UTF-8 systems.
# newline='' is what the csv module requires for files it writes.
with open(csv_file, 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["file_name", "normalized_transcription"])
    writer.writerows(sentences_dict.items())