## Data preprocessing for modelling Amharic data

In [3]:
!pip install python_speech_features

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting python_speech_features
  Downloading python_speech_features-0.6.tar.gz (5.6 kB)
Building wheels for collected packages: python-speech-features
  Building wheel for python-speech-features (setup.py) ... [?25l[?25hdone
  Created wheel for python-speech-features: filename=python_speech_features-0.6-py3-none-any.whl size=5888 sha256=8d9e70d9e8354ac141fb31135d604749eb49f1b6c90d16653f7c0e257463e2a9
  Stored in directory: /root/.cache/pip/wheels/b0/0e/94/28cd6afa3cd5998a63eef99fe31777acd7d758f59cf24839eb
Successfully built python-speech-features
Installing collected packages: python-speech-features
Successfully installed python-speech-features-0.6


### Preprocessing steps:
1. Load labels
2. Sample each audio at 44100 Hz
3. Convert mono to stereo
4. Resize audios
5. Generate a pandas dataframe
6. Save preprocessed audios and transcriptions to a new folder
7. Split the data into train and valid corpora
8. Save train and valid corpora


In [17]:
import librosa   #for audio processing
import IPython.display as ipd
import matplotlib.pyplot as plt
import numpy as np
from scipy.io import wavfile #for audio processing
import pandas as pd 
import warnings
warnings.filterwarnings("ignore")

In [18]:
# Mount Google Drive under /content/drive (Colab only); the dataset paths used
# in this notebook all live below this mount point.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [19]:
import os
# Make the project folder the working directory, so relative paths used later
# (e.g. "train_corpus.json" / "valid_corpus.json") resolve inside it.
os.chdir("/content/drive/My Drive/AMHARIC")
# Last expression of the cell: displays the folder contents.
os.listdir()

['README.md',
 'data',
 'kaldi-script',
 'lang',
 'lm',
 'models',
 'train_corpus.json',
 'valid_corpus.json',
 'model_1.png']

In [20]:
import librosa  # for audio processing
import librosa.display
import IPython.display as ipd
import matplotlib.pyplot as plt
import numpy as np
from scipy.io import wavfile  # for audio processing
from numpy.lib.stride_tricks import as_strided
from mpl_toolkits.axes_grid1 import make_axes_locatable
from IPython.display import Audio
import sklearn
import pandas as pd
import json
import os
from os.path import exists
import warnings
warnings.filterwarnings("ignore")
# from cleaner import CleanDataFrame

In [24]:
# Locations of the training data on the mounted drive. All absolute paths are
# derived from one root so the base directory is written only once.
_TRAIN_ROOT = '/content/drive/My Drive/AMHARIC/data/train/'

train_wav_location = os.path.join(_TRAIN_ROOT, 'wav/')                  # source recordings
train_changed_wav_location = os.path.join(_TRAIN_ROOT, 'changed_wav/')  # preprocessed output
train_txt_location = _TRAIN_ROOT + "trsTrain.txt"                       # transcript file

# Relative dataset folder names.
train_dataset_location = 'train/'
test_dataset_location = 'test/'
lexicon_location = 'train/'

In [28]:
def convert_to_sterio(audio: np.array) -> np.array:
    """Return a two-channel version of ``audio``.

    A one-dimensional array is duplicated into two identical columns, giving
    an array of shape ``(n_samples, 2)``. An array with any other number of
    dimensions is handed back unchanged (the same object).
    """
    # Guard clause: anything that is not 1-D is passed through untouched.
    if audio.ndim != 1:
        return audio
    return np.stack((audio, audio), axis=1)


from scipy.io import wavfile

def resize_audio(audio: np.array, size: int) -> np.array:
    """Resize ``audio`` to a fixed number of samples along axis 0.

    Giving every clip the same length keeps the data shape consistent.
    The work is delegated to ``librosa.util.fix_length``, which per its
    documentation truncates longer input and pads shorter input (with
    trailing zeros by default).

    Args:
        audio: The audio samples as a numpy array, with samples along axis 0.
        size: Target number of samples along axis 0.

    Returns:
        The array produced by ``librosa.util.fix_length``.
    """
    # ``size`` must be passed by keyword: it is keyword-only in current librosa
    # releases, and the keyword form is also accepted by older ones.
    resized = librosa.util.fix_length(audio, size=size, axis=0)
    print(f"Audio resized to {size} samples")
    return resized
# changed = convert_to_sterio(samples)
# changed.shape  
# resized = resize_audio(changed,200000)
# print(resized.shape)
# print(resized.T.shape)
# wavfile.write(os.path.join(train_changed_wav_location, 'trial.wav'), sample_rate, resized)
# ipd.Audio(resized.T, rate=sample_rate)
# # mfcc = librosa.feature.mfcc(y=changed, sr=sample_rate)
# # mfcc.shape
# samples, sample_rate = librosa.load(train_changed_wav_location+'trial.wav' , sr=44100)
# samples.shape
# wav_roll = np.roll(samples, int(sample_rate/10))

def meta_data(trans, path, sample_rate=44100, size=200000):
    """Preprocess every transcribed recording and collect per-file metadata.

    For each key ``k`` in ``trans`` the file ``path + k + ".wav"`` is looked up.
    Keys without an existing file are skipped. Each existing file is loaded
    with librosa, summarised by a few scalar features, converted with
    ``convert_to_sterio``, resized with ``resize_audio`` and written under
    ``train_changed_wav_location`` with the same base file name.

    Args:
        trans: Mapping of recording name to its transcription text.
        path: Prefix prepended to ``k + ".wav"`` (plain string concatenation,
            so a directory must end with a path separator).
        sample_rate: Rate passed to ``librosa.load`` as ``sr``.
        size: Number of samples each written clip is resized to.

    Returns:
        An 11-tuple of equally long lists, one entry per processed file, in
        this order: filenames, target, duration_of_recordings, mode, rmse,
        spec_cent, spec_bw, rolloff, zcr, mfcc, rate. Every feature entry is
        the ``np.mean`` of the corresponding librosa feature array, and
        durations are computed from the audio as loaded (before resizing).
    """
    target = []
    mode = []
    rmse = []
    spec_cent = []
    spec_bw = []
    rolloff = []
    zcr = []
    mfcc = []
    rate = []
    filenames = []
    duration_of_recordings = []

    for k, lable in trans.items():
        filename = path + k + ".wav"
        if not exists(filename):
            # No recording for this transcript entry: nothing to process.
            continue

        audio, fs = librosa.load(filename, sr=sample_rate)

        # Scalar summaries of the loaded signal.
        rmse.append(np.mean(librosa.feature.rms(y=audio)))
        spec_cent.append(
            np.mean(librosa.feature.spectral_centroid(y=audio, sr=fs)))
        spec_bw.append(
            np.mean(librosa.feature.spectral_bandwidth(y=audio, sr=fs)))
        rolloff.append(
            np.mean(librosa.feature.spectral_rolloff(y=audio, sr=fs)))
        zcr.append(np.mean(librosa.feature.zero_crossing_rate(audio)))
        mfcc.append(np.mean(librosa.feature.mfcc(y=audio, sr=fs)))
        duration_of_recordings.append(float(len(audio) / fs))
        rate.append(fs)

        # Duplicate to two channels, then force a fixed length before saving.
        resized = resize_audio(convert_to_sterio(audio), size)
        base_name = os.path.basename(filename)
        wavfile.write(
            os.path.join(train_changed_wav_location, base_name), fs, resized)

        # The recorded key is a fixed relative location, not the path written above.
        filenames.append('../data/train/changed_wav/' + base_name)
        # NOTE(review): always recorded as 'mono' although the array written
        # above went through convert_to_sterio; presumably this labels the
        # source recording. Confirm.
        mode.append('mono')
        target.append(lable)

    return (filenames, target, duration_of_recordings, mode, rmse, spec_cent,
            spec_bw, rolloff, zcr, mfcc, rate)

In [29]:
def loaderTrans(filename: str):
    """Load a transcription file into a ``{recording name: text}`` dict.

    Every non-blank line is split on ``"</s>"``. The part before the marker,
    with ``"<s>"`` removed, is the text. The part after it, with parentheses,
    spaces and newlines removed, is the recording name. For example the line
    ``"<s> hello </s> (rec_01)"`` yields ``{"rec_01": " hello "}``.

    Args:
        filename: Path to the UTF-8 encoded transcription text file.

    Returns:
        Dict mapping recording name to text. If a name occurs on several
        lines, the last one wins.

    Raises:
        IndexError: If a non-blank line contains no ``"</s>"`` marker.
    """
    name_to_text = {}
    with open(filename, encoding="utf-8") as f:
        for line in f:
            # Blank lines (e.g. a trailing empty line) carry no entry; without
            # this guard they would crash on the split below.
            if not line.strip():
                continue
            parts = line.split("</s>")
            name = parts[1]
            for unwanted in ("(", ")", "\n", " "):
                name = name.replace(unwanted, "")
            name_to_text[name] = parts[0].replace("<s>", "")
    return name_to_text
# Parse the training transcript file into a dict keyed by recording name.
transcription = loaderTrans(train_txt_location)

In [30]:
# Preprocess every transcribed recording; meta_data returns one list per column.
(filenames, target, duration_of_recordings, mode,
 rmse, spec_cent, spec_bw, rolloff, zcr, mfcc, rate) = meta_data(
    transcription, train_wav_location)

# One row per processed recording.
data = pd.DataFrame({
    'key': filenames,
    'text': target,
    'duration': duration_of_recordings,
    'mode': mode,
    'rate': rate,
    'rmse': rmse,
    'spec_cent': spec_cent,
    'spec_bw': spec_bw,
    "rolloff": rolloff,
    "zcr": zcr,
    "mfcc": mfcc,
})
data.head()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Audio resized to 200000 samples
Audio resized to 200000 samples
Audio resized to 200000 samples
Audio resized to 200000 samples
Audio resized to 200000 samples
Audio resized to 200000 samples
Audio resized to 200000 samples
Audio resized to 200000 samples
Audio resized to 200000 samples
Audio resized to 200000 samples
Audio resized to 200000 samples
Audio resized to 200000 samples
Audio resized to 200000 samples
Audio resized to 200000 samples
Audio resized to 200000 samples
Audio resized to 200000 samples
Audio resized to 200000 samples
Audio resized to 200000 samples
Audio resized to 200000 samples
Audio resized to 200000 samples
Audio resized to 200000 samples
Audio resized to 200000 samples
Audio resized to 200000 samples
Audio resized to 200000 samples
Audio resized to 200000 samples
Audio resized to 200000 samples
Audio resized to 200000 samples
Audio resized to 200000 samples
Audio resized to 200000 samples
Audio r

Unnamed: 0,key,text,duration,mode,rate,rmse,spec_cent,spec_bw,rolloff,zcr,mfcc
0,../data/train/changed_wav/tr_1_tr01001.wav,ያንደኛ ደረጃ ትምህርታቸው ን ጐንደር ተ ም ረዋል,4.608005,mono,44100,0.005634,712.137443,1235.842984,1487.201251,0.004273,-18.118677
1,../data/train/changed_wav/tr_2_tr01002.wav,የተ ለቀቁት ምርኮኞች በ አካባቢያቸው ሰላማዊ ኑሮ እንዲ ኖሩ የ ትራንስ...,16.384014,mono,44100,0.00556,769.146959,1257.024045,1619.168774,0.003095,-17.846657
2,../data/train/changed_wav/tr_3_tr01003.wav,በ አዲስ አበባው ስታዲየም በ ተካሄዱ ት ሁለት ግጥሚያ ዎች በ መጀመሪያ...,14.592018,mono,44100,0.005418,784.268292,1287.589639,1656.634798,0.001926,-18.260798
3,../data/train/changed_wav/tr_4_tr01004.wav,ወሬው ን ወሬ ያደረጉ ምስጢረ ኞች ናቸው,4.736009,mono,44100,0.005537,689.18964,1164.392879,1376.119457,0.004953,-18.800745
4,../data/train/changed_wav/tr_5_tr01005.wav,ኢትዮጵያዊ ቷ በ ብሄራዊ ባህላዊ አለባበስ ከ አለም አንደኝነት ን ተቀዳ...,8.192018,mono,44100,0.005631,781.056528,1256.620487,1685.354398,0.003393,-18.187614


In [None]:
# Rows [0, TRAIN_SIZE) form the training corpus and every remaining row goes to
# validation. Both slices use the same boundary so no row is dropped (the
# previous 8000 / 8001 pair silently lost row 8000).
TRAIN_SIZE = 8000

train_json = data[:TRAIN_SIZE].to_dict(orient='records')
valid_json = data[TRAIN_SIZE:].to_dict(orient='records')

# ensure_ascii=False keeps the transcription text readable in the JSON files.
with open("train_corpus.json", "w", encoding='UTF-8') as export_file:
    json.dump(train_json, export_file, indent=4,
              sort_keys=True, ensure_ascii=False)

with open("valid_corpus.json", "w", encoding='UTF-8') as export_file:
    json.dump(valid_json, export_file, indent=4,
              sort_keys=True, ensure_ascii=False)