# Objective:

```
Produce a random selection of 1000 20sec segment pairs (text,mp3) from the data below, to check their quality. Important is 1. these are random from the whole sample, 2. it is easy to inspect the transcript and find the corresponding mp3 file, I suggest you just use the “alphabet ordering” trick, so mp3 files called “samplexxx”, xxx ranging from 000 to 999. One text file can have these file names and transcripts in file-name-alphabetical order.

20sec segments
/home/korzinek/kaldi/exp/ali_all/ali.json

Mapping of recoXXXX ids to actual file wav names
/home/korzinek/kaldi/exp/ali_all/wav.scp

Symbolic links to wavs
/home/nikola/projects/parlaspeech/wav/
```


# File management

In [50]:
# Input locations (absolute paths on the processing machine).
links_to_wavs = "/home/nikola/projects/parlaspeech/wav/"  # directory holding symbolic links to the WAVs
mappingsfile = "/home/korzinek/kaldi/exp/ali_all/wav.scp"  # recoNNN id -> WAV file name
# NOTE(review): the objective text above names ali.json, but this cell reads
# asr.json -- confirm which file is the intended source of the segments.
segmentsfile = "/home/korzinek/kaldi/exp/ali_all/asr.json"

import json

# The segments file is parsed line by line, one JSON document per line.
# - The encoding is given explicitly so that the non-ASCII transcript text
#   does not depend on the machine's default locale.
# - The handle is iterated lazily instead of loading every line up front.
# - Blank lines (e.g. a trailing empty line) are skipped; json.loads would
#   raise on them.
with open(segmentsfile, "r", encoding="utf-8") as segments_handle:
    contents = [json.loads(line) for line in segments_handle if line.strip()]


In [51]:
import pandas as pd
# Build one DataFrame row per parsed JSON record from `contents`.
segments = pd.DataFrame(contents)
# Preview the first rows to check which columns were loaded.
segments.head()

Unnamed: 0,file,start,end,words,word_start_times,word_end_times,asr
0,"01 12 2016 - 1. dio, 2. sjednica, 9. saziv [sP...",1853.82,1873.66,"[konačni, prijedlog, zakona, o, izmjenama, i, ...","[0, 0.47, 0.8, 1.21, 1.26, 1.79, 1.82, 2.34, 2...","[0.47, 0.8, 1.21, 1.26, 1.79, 1.82, 2.34, 2.78...","[konačni, prijedlog, zakona, o, izmjenama, i, ..."
1,"01 12 2016 - 1. dio, 2. sjednica, 9. saziv [sP...",1874.08,1893.99,"[u, svezi, sa, člankom, sto, devedeset, poslov...","[0, 0.11, 0.48, 0.66, 1.12, 1.36, 1.94, 2.53, ...","[0.11, 0.48, 0.66, 1.12, 1.36, 1.94, 2.53, 3.0...","[u, svezi, sa, člankom, sto, devedeset, poslov..."
2,"01 12 2016 - 1. dio, 2. sjednica, 9. saziv [sP...",1893.99,1909.04,"[za, zakonodavstvo, i, odbor, za, financije, i...","[0, 0.17, 1.12, 1.24, 1.49, 1.65, 2.12, 2.15, ...","[0.17, 1.12, 1.24, 1.49, 1.65, 2.12, 2.15, 2.5...","[za, zakonodavstvo, i, odbor, za, financije, d..."
3,"01 12 2016 - 1. dio, 2. sjednica, 9. saziv [sP...",1912.94,1919.47,"[hvala, lijepo, gospodine, potpredsjedniče, hr...","[0, 0.15, 0.37, 0.74, 1.52, 1.9, 2.35, 2.67, 2...","[0.15, 0.37, 0.74, 1.52, 1.9, 2.35, 2.67, 2.93...","[hval, lijepo, gospodine, potpredsjedniče, hrv..."
4,"01 12 2016 - 1. dio, 2. sjednica, 9. saziv [sP...",1920.52,1940.34,"[zakona, o, izmjenama, i, dopunama, zakona, o,...","[0, 0.39, 0.44, 0.75, 0.79, 1.26, 1.65, 1.7, 2...","[0.39, 0.44, 0.75, 0.79, 1.26, 1.65, 1.7, 2.79...","[zakona, o, izmjenama, i, dopunama, zakona, o,..."


In [26]:
# Parse the mapping file: each line holds a recording id followed by the long
# WAV file name. The two columns are collected into parallel lists and then
# turned into the `mappings` DataFrame.
reconame, longname = [], []
with open(mappingsfile, encoding="utf-8") as f:
    for line in f:
        # Remove only the line terminator, so a final line that lacks a
        # trailing newline keeps its last character.
        line = line.rstrip("\r\n")
        if not line.strip():
            continue  # a blank line would otherwise become an empty row
        # Split at the first run of whitespace instead of slicing fixed
        # columns, so ids that are not exactly 7 characters still parse.
        # Whitespace inside the file name itself is preserved.
        parts = line.split(maxsplit=1)
        reconame.append(parts[0])
        longname.append(parts[1] if len(parts) > 1 else "")

mappings = pd.DataFrame(data={"reconame": reconame, "longname": longname})
mappings.shape

(742, 2)

In [52]:
# Attach the recoNNN id to every segment by matching the segment's `file`
# value against `longname` in the mapping table. This is a left join, so
# segments without an exactly matching name are kept with a missing id.
merged = pd.merge(
    segments,
    mappings,
    how="left",
    left_on="file",
    right_on="longname",
).drop(columns=["longname"])
merged.head(3)

Unnamed: 0,file,start,end,words,word_start_times,word_end_times,asr,reconame
0,"01 12 2016 - 1. dio, 2. sjednica, 9. saziv [sP...",1853.82,1873.66,"[konačni, prijedlog, zakona, o, izmjenama, i, ...","[0, 0.47, 0.8, 1.21, 1.26, 1.79, 1.82, 2.34, 2...","[0.47, 0.8, 1.21, 1.26, 1.79, 1.82, 2.34, 2.78...","[konačni, prijedlog, zakona, o, izmjenama, i, ...",reco723
1,"01 12 2016 - 1. dio, 2. sjednica, 9. saziv [sP...",1874.08,1893.99,"[u, svezi, sa, člankom, sto, devedeset, poslov...","[0, 0.11, 0.48, 0.66, 1.12, 1.36, 1.94, 2.53, ...","[0.11, 0.48, 0.66, 1.12, 1.36, 1.94, 2.53, 3.0...","[u, svezi, sa, člankom, sto, devedeset, poslov...",reco723
2,"01 12 2016 - 1. dio, 2. sjednica, 9. saziv [sP...",1893.99,1909.04,"[za, zakonodavstvo, i, odbor, za, financije, i...","[0, 0.17, 1.12, 1.24, 1.49, 1.65, 2.12, 2.15, ...","[0.17, 1.12, 1.24, 1.49, 1.65, 2.12, 2.15, 2.5...","[za, zakonodavstvo, i, odbor, za, financije, d...",reco723


Remarks:
* The long filename and the recoNNN id are now merged into one table. It is not yet clear whether the recoNNN id is needed for this task at all.
* Wav files are present at `links_to_wavs + long filename`.

Plan:
* Generate a sample of 1000 rows from the merged DF.
* For each sampled row, generate the transcript by joining the entries of its `words` column.
* I will trim the audio file to the segment borders.
* Save the transcript, save the audio.

In [95]:
# Draw the random quality-check sample.
# A fixed seed makes the draw repeatable: without it every re-run of this cell
# selects different rows, so the exported sampleNNN files and the transcript
# table could no longer be regenerated or matched up after a kernel restart.
SAMPLE_SIZE = 1000
SAMPLE_SEED = 42
sampledf = merged.sample(SAMPLE_SIZE, random_state=SAMPLE_SEED).reset_index(drop=True)
# After reset_index the row label (0..SAMPLE_SIZE-1) doubles as the sample number.

# Word lists -> space-separated strings, for the reference and the ASR output.
sampledf["transcript"] = sampledf.words.str.join(" ")
sampledf["asr_transcript"] = sampledf.asr.str.join(" ")
# Filled in by the export cells; an empty string marks "not exported yet".
sampledf["samplename"] = ""
sampledf["samplepath"] = ""

In [59]:
# Show the source file name of the row labelled 0 to see what the names look like.
sampledf.loc[0, "file"]

'14 2 2019 - 3. dio, 11. sjednica, 9. saziv [KMGKU5I8aUQ].wav'

In [128]:
from pydub import AudioSegment
import os

# First export pass: cut every sampled segment out of its source WAV and save
# it as sampleNNN.mp3, where NNN is the row's label in `sampledf`.
# On success the row's `samplepath` / `samplename` are filled in; rows whose
# source WAV cannot be found keep their empty strings so a later pass can
# retry them.
output_dir = "/home/rupnik/macocu/task8/sample"
os.makedirs(output_dir, exist_ok=True)  # the export needs an existing target directory

missing_wavs = []  # row labels whose source WAV was not found
for i, row in sampledf.iterrows():
    print(f"i: {i:0004}", end="\r")
    wavfile = row["file"]
    # AudioSegment slicing works in milliseconds. Round instead of truncating
    # so that a float product landing just below a whole number does not move
    # the cut by 1 ms.
    segment_start_ms = int(round(row["start"] * 1000))
    segment_end_ms = int(round(row["end"] * 1000))
    try:
        audio = AudioSegment.from_wav(os.path.join(links_to_wavs, wavfile))
    except FileNotFoundError:
        # Only the WAV lookup is treated as a skippable failure; errors raised
        # while exporting are no longer swallowed together with it.
        missing_wavs.append(i)
        continue
    sample_name = f"sample{i:003}.mp3"
    outpath = os.path.join(output_dir, sample_name)
    audio[segment_start_ms:segment_end_ms].export(outpath, format="mp3")
    sampledf.loc[i, "samplepath"] = outpath
    sampledf.loc[i, "samplename"] = sample_name

# Make the skipped rows visible instead of dropping them silently.
print()
print(f"{len(missing_wavs)} of {len(sampledf)} segments skipped: source WAV not found")



i: 0999

In [131]:
import os

# List the exported files with os.listdir rather than shelling out to `ls`,
# and reuse `output_dir` from the export cell instead of repeating the path.
finished = os.listdir(output_dir)
# A row counts as finished when a file with its sample name exists on disk;
# rows with an empty samplename never match.
sampledf["finished"] = sampledf.samplename.isin(finished)
sampledf.head(10)

Unnamed: 0,file,start,end,words,word_start_times,word_end_times,asr,reconame,transcript,asr_transcript,samplename,samplepath,finished
0,"17 1 2018 - 2. dio, 7. sjednica, 9. saziv [Hla...",3207.7,3227.54,"[predsjedniče, hrvatskoga, sabora, uvaženi, ko...","[0, 0.71, 1.23, 1.69, 2.22, 2.56, 3.1, 3.92, 4...","[0.71, 1.23, 1.69, 2.22, 2.56, 3.1, 3.56, 4.31...","[predsjedniče, hrvatskoga, sabora, uvaženi, ko...",reco348,predsjedniče hrvatskoga sabora uvaženi kolega ...,predsjedniče hrvatskoga sabora uvaženi kolega ...,sample000.mp3,/home/rupnik/macocu/task8/sample/sample000.mp3,True
1,"06 12 2016 - 1. dio, 2. sjednica, 9. saziv [GV...",9453.64,9473.56,"[ulaz, u, sam, sustav, javne, nabave, i, narav...","[0, 0.52, 0.6, 0.87, 1.29, 1.59, 2.06, 2.11, 2...","[0.46, 0.6, 0.87, 1.29, 1.59, 2.06, 2.11, 2.55...","[ulaz, u, sam, sustav, javne, nabave, i, narav...",reco314,ulaz u sam sustav javne nabave i naravno da će...,ulaz u sam sustav javne nabave i naravno da će...,sample001.mp3,/home/rupnik/macocu/task8/sample/sample001.mp3,True
2,"15 09 2017 - 5. sjednica, 9. saziv [wGcZ33lyWU...",21078.47,21081.37,"[projekta, ili, ostvarivanja, tog, projekta]","[0, 0.91, 1.26, 2.2, 2.43]","[0.88, 1.23, 2.17, 2.43, 2.9]","[projekta, ili, ostvarivanja, tog, projekta]",reco153,projekta ili ostvarivanja tog projekta,projekta ili ostvarivanja tog projekta,sample002.mp3,/home/rupnik/macocu/task8/sample/sample002.mp3,True
3,"16 11 2017 - 6. sjednica, 9. saziv [xbgYVKmXxg...",14985.39,15005.18,"[politički, angažira, on, je, priznao, da, je,...","[0, 0.65, 1.76, 2.01, 2.11, 2.57, 2.66, 2.73, ...","[0.65, 1.37, 2.01, 2.11, 2.57, 2.66, 2.73, 2.9...","[politički, angažira, on, je, priznao, da, je,...",reco111,politički angažira on je priznao da je to isti...,politički angažira on je priznao da je to isti...,sample003.mp3,/home/rupnik/macocu/task8/sample/sample003.mp3,True
4,"11 10 2018 - 2. dio, 9. sjednica, 9. saziv [P_...",9967.92,9974.35,"[ja, moram, reći, da, sam, zahvalan, vama, koj...","[0, 0.88, 1.32, 2.18, 2.25, 2.41, 2.95, 3.47, ...","[0.06, 1.09, 1.61, 2.25, 2.41, 2.95, 3.44, 3.6...","[da, sam, zahvala, vama, koji, ste, pokrenuli,...",reco532,ja moram reći da sam zahvalan vama koji ste po...,da sam zahvala vama koji ste pokrenuli moj pos...,sample004.mp3,/home/rupnik/macocu/task8/sample/sample004.mp3,True
5,"23 1 2019 - 2. dio, 11. sjednica, 9. saziv [GW...",11124.44,11143.07,"[propis, kroz, koji, se, usklađujete, sa, svim...","[0, 1.11, 1.33, 1.96, 2.14, 2.9, 3.03, 3.32, 3...","[0.44, 1.33, 1.69, 2.14, 2.9, 3.03, 3.32, 3.52...","[propis, kroz, koju, se, usklađujete, sa, svim...",reco312,propis kroz koji se usklađujete sa svim onim š...,propis kroz koju se usklađujete sa svim onim š...,sample005.mp3,/home/rupnik/macocu/task8/sample/sample005.mp3,True
6,"10 5 2019 - 12. sjednica, 9. saziv [WoEEFk8lcD...",2399.39,2418.67,"[izvolite, gospodine, državni, tajniče, poštov...","[0, 0.49, 0.91, 1.44, 1.72, 2.02, 2.51, 3.64, ...","[0.49, 0.88, 1.44, 1.72, 2.02, 2.48, 3.07, 4.1...","[gospođo, za, naravno, da, nadzor, nad, radom,...",,izvolite gospodine državni tajniče poštovana g...,gospođo za naravno da nadzor nad radom zaštita...,,,False
7,"29 4 2020 - 2. dio, 16. sjednica, 9. saziv [Ya...",4979.44,4998.83,"[mrtvo, slovo, na, papiru, a, onda, se, to, kl...","[0, 0.34, 0.66, 0.77, 1.86, 2.01, 2.34, 2.98, ...","[0.34, 0.66, 0.77, 1.48, 2.01, 2.34, 2.95, 3.1...","[mrtvo, slovo, na, papiru, a, onda, se, toklat...",reco281,mrtvo slovo na papiru a onda se to klatno zalj...,mrtvo slovo na papiru a onda se toklatno zalju...,sample007.mp3,/home/rupnik/macocu/task8/sample/sample007.mp3,True
8,"3 5 2019 - 12. sjednica, 9. saziv [KcVA-9b1E80...",8813.97,8825.61,"[gledanja, na, određeni, problem, tako, da, se...","[0, 0.68, 0.88, 2.03, 2.72, 2.97, 3.33, 3.56, ...","[0.65, 0.88, 1.58, 2.54, 2.97, 3.16, 3.53, 3.6...","[gledanja, na, određenni, problem, tako, da, s...",,gledanja na određeni problem tako da se i suci...,gledanja na određenni problem tako da se i suc...,,,False
9,"3 4 2019 - 1. dio, 11. sjednica, 9. saziv [U8T...",10400.49,10420.02,"[prva, stvar, je, da, zakonom, treba, onemoguć...","[0, 0.35, 0.94, 1.44, 1.64, 2.28, 2.97, 4.75, ...","[0.35, 0.9, 1.41, 1.64, 2.28, 2.94, 4.68, 5.65...","[prva, stvar, je, da, zakonom, treba, onemoguć...",,prva stvar je da zakonom treba onemogućiti pre...,prva stvar je da zakonom treba onemogućiti pre...,,,False


In [140]:
# Write only the rows that already have an exported sample (non-empty
# samplename) to the transcript table.
exported_rows = sampledf[sampledf["samplename"] != ""]
exported_rows.to_csv("transcripts.csv")

In [154]:
from pydub import AudioSegment
import os

# Second export pass: retry only the rows that still have no exported sample,
# this time looking for the WAV under a name in which every "-" is followed by
# an extra space.
# NOTE(review): presumably the names on disk differ from the names in the
# segments file around "-" -- confirm against the directory listing.
output_dir = "/home/rupnik/macocu/task8/sample"
os.makedirs(output_dir, exist_ok=True)  # the export needs an existing target directory

missing_wavs = []  # row labels whose source WAV was still not found
for i, row in sampledf.iterrows():
    if row["samplename"] != "":
        continue  # already exported by an earlier pass
    print(f"i: {i:0004}", end="\r")
    wavfile = row["file"]
    # AudioSegment slicing works in milliseconds. Round instead of truncating
    # so that a float product landing just below a whole number does not move
    # the cut by 1 ms.
    segment_start_ms = int(round(row["start"] * 1000))
    segment_end_ms = int(round(row["end"] * 1000))
    try:
        audio = AudioSegment.from_wav(
            os.path.join(links_to_wavs, wavfile.replace("-", "- "))
        )
    except FileNotFoundError:
        # Only the WAV lookup is treated as a skippable failure; errors raised
        # while exporting are no longer swallowed together with it.
        missing_wavs.append(i)
        continue
    sample_name = f"sample{i:003}.mp3"
    outpath = os.path.join(output_dir, sample_name)
    audio[segment_start_ms:segment_end_ms].export(outpath, format="mp3")
    sampledf.loc[i, "samplepath"] = outpath
    sampledf.loc[i, "samplename"] = sample_name

# Make the remaining failures visible instead of dropping them silently.
print()
print(f"{len(missing_wavs)} segments still without a source WAV after this pass")




i: 0998

In [164]:
# Re-write the transcript table with every row that has an exported sample
# (non-empty samplename) after the retry passes.
has_sample = sampledf["samplename"].ne("")
sampledf[has_sample].to_csv("transcripts.csv")

In [163]:
# Imports come first so this cell does not rely on `os` having been imported
# by an earlier cell.
from pydub import AudioSegment
import os

# Third export pass: retry the rows that still have no exported sample. When a
# row's file name is not present in the WAV directory, fall back to the
# directory entry that consists of the same whitespace-separated tokens (i.e.
# the same name up to differences in spacing).
output_dir = "/home/rupnik/macocu/task8/sample"
os.makedirs(output_dir, exist_ok=True)  # the export needs an existing target directory

available_files = os.listdir(links_to_wavs)
available_names = set(available_files)  # constant-time membership test
# Token tuple -> real file name, built once instead of re-splitting every
# directory entry for every unmatched row. Later entries overwrite earlier
# ones, which matches the previous full scan where the last match won.
name_by_tokens = {tuple(name.split()): name for name in available_files}

missing_wavs = []  # row labels whose source WAV could not be resolved
for i, row in sampledf.iterrows():
    if row["samplename"] != "":
        continue  # already exported by an earlier pass
    print(f"i: {i:0004}", end="\r")
    wavfile = row["file"]
    # AudioSegment slicing works in milliseconds. Round instead of truncating
    # so that a float product landing just below a whole number does not move
    # the cut by 1 ms.
    segment_start_ms = int(round(row["start"] * 1000))
    segment_end_ms = int(round(row["end"] * 1000))

    if wavfile not in available_names:
        # Keep the original name when no whitespace-insensitive match exists;
        # the load below then fails and the row is recorded as missing.
        wavfile = name_by_tokens.get(tuple(wavfile.split()), wavfile)
    try:
        audio = AudioSegment.from_wav(os.path.join(links_to_wavs, wavfile))
    except FileNotFoundError:
        # Only the WAV lookup is treated as a skippable failure; errors raised
        # while exporting are no longer swallowed together with it.
        missing_wavs.append(i)
        continue
    sample_name = f"sample{i:003}.mp3"
    outpath = os.path.join(output_dir, sample_name)
    audio[segment_start_ms:segment_end_ms].export(outpath, format="mp3")
    sampledf.loc[i, "samplepath"] = outpath
    sampledf.loc[i, "samplename"] = sample_name

# Make the remaining failures visible instead of dropping them silently.
print()
print(f"{len(missing_wavs)} segments still without a source WAV after this pass")

i: 0998

In [161]:
# Rows with an empty samplename are the ones that were never exported.
failed_condition = sampledf["samplename"].eq("")
# Distinct source files behind the failed rows and behind the exported rows.
failed_files = set(sampledf[failed_condition]["file"].unique())
ok_files = set(sampledf[~failed_condition]["file"].unique())

# Source files that appear in both groups; an empty set means every source
# file either always failed or always succeeded.
ok_files & failed_files

set()

In [162]:
# Display the source file names for which no sample could be exported.
failed_files

{'10 7 2019 - 1. dio, 12. sjednica, 9. saziv [1DSAO-5loYo].wav',
 '10 7 2019 - 2. dio, 12. sjednica, 9. saziv [00ssxRS5L-4].wav',
 '12 6 2019 - 12. sjednica, 9. saziv [-k1z8behXXg].wav',
 '16 10 2019 - 3. dio, 14. sjednica, 9. saziv [05ONqG97F-0].wav',
 '18 10 2019 - 14. sjednica, 9. saziv [_g-mjsl5lhE].wav',
 '19 6 2017 - 1. dio, 4. sjednica, 9. saziv [w-mXysdhZaU].wav',
 '20 9 2019 - 2. dio, 14. sjednica, 9. saziv [U39UIQwM-lg].wav',
 '21 2 2019 - 11. sjednica, 9. saziv [zszY0-vwr70].wav',
 '27 3 2019 - 11. sjednica, 9. saziv [mGJ-n2xBISA].wav',
 '3 2 2017 - 3. sjednica, 9. saziv [btzBgb-_Skc].wav',
 '3 5 2019 - 12. sjednica, 9. saziv [KcVA-9b1E80].wav',
 '30 1 2019 - 2. dio, 11. sjednica, 9. saziv [eyv7y-kyI1E].wav',
 '30 5 2019 - 1. dio, 12. sjednica, 9. saziv [5yky-shnXCM].wav',
 '30 5 2019 - 2. dio, 12. sjednica, 9. saziv [-NTgCyx5mv0].wav',
 '4 4 2019 - 1. dio, 11. sjednica, 9. saziv [C2P-jPLGFSk].wav',
 '5 4 2019 - 1. dio, 11. sjednica, 9. saziv [T1-MQ-t3Ems].wav',
 '5 4 2019 -

In [165]:
# Display the recoNNN -> long file name table to compare its names with the failed ones.
mappings

Unnamed: 0,reconame,longname
0,reco000,"12 7 2019 - 2. dio, 12. sjednica, 9. saziv [f..."
1,reco001,"30 3 2017 - 2. dio, 3. sjednica, 9. saziv [fb..."
2,reco002,"10 2 2017 - 2. dio, 3. sjednica, 9. saziv [j-..."
3,reco003,"19 9 2018 - 2. dio, 9. sjednica, 9. saziv [fU-..."
4,reco004,"12 12 2018 - 4. dio, 10. sjednica, 9. saziv [g..."
...,...,...
737,reco737,"15 1 2020 - 5. dio, 16. sjednica, 9. saziv [sx..."
738,reco738,"3 5 2017 - 3. dio, 3. sjednica, 9. saziv [sdD..."
739,reco739,"2 7 2018 - 1. dio, 8. sjednica, 9. saziv [tdMB..."
740,reco740,"08 12 2016 - 2. dio, 2. sjednica, 9. saziv [sH..."
