# Objective:

```
Produce a random selection of 1000 20sec segment pairs (text,mp3) from the data below, to check their quality. Important is 1. these are random from the whole sample, 2. it is easy to inspect the transcript and find the corresponding mp3 file, I suggest you just use the “alphabet ordering” trick, so mp3 files called “samplexxx”, xxx ranging from 000 to 999. One text file can have these file names and transcripts in file-name-alphabetical order.

20sec segments
/home/korzinek/kaldi/exp/ali_all/ali.json

Mapping of recoXXXX ids to actual file wav names
/home/korzinek/kaldi/exp/ali_all/wav.scp

Symbolic links to wavs
/home/nikola/projects/parlaspeech/wav/
```


# File management

In [50]:
# Input locations used throughout the notebook.
links_to_wavs = "/home/nikola/projects/parlaspeech/wav/"
mappingsfile = "/home/korzinek/kaldi/exp/ali_all/wav.scp"
# NOTE(review): the objective at the top of the notebook names ali.json as the
# segments file, but this cell reads asr.json -- confirm which one is intended.
segmentsfile = "/home/korzinek/kaldi/exp/ali_all/asr.json"

import json

# The segments file is read as JSON Lines: one JSON document per line.
# The encoding is given explicitly because the transcripts contain non-ASCII
# characters and the platform default encoding is not guaranteed to be UTF-8
# (assumes the file itself is UTF-8 -- TODO confirm).
# Whitespace-only lines are skipped so a trailing blank line does not crash
# json.loads.
with open(segmentsfile, "r", encoding="utf-8") as file:
    contents = [json.loads(line) for line in file if line.strip()]


In [51]:
import pandas as pd

# One row per parsed segment record; the last expression previews the first five rows.
segments = pd.DataFrame(data=contents)
segments.head(n=5)

Unnamed: 0,file,start,end,words,word_start_times,word_end_times,asr
0,"01 12 2016 - 1. dio, 2. sjednica, 9. saziv [sP...",1853.82,1873.66,"[konačni, prijedlog, zakona, o, izmjenama, i, ...","[0, 0.47, 0.8, 1.21, 1.26, 1.79, 1.82, 2.34, 2...","[0.47, 0.8, 1.21, 1.26, 1.79, 1.82, 2.34, 2.78...","[konačni, prijedlog, zakona, o, izmjenama, i, ..."
1,"01 12 2016 - 1. dio, 2. sjednica, 9. saziv [sP...",1874.08,1893.99,"[u, svezi, sa, člankom, sto, devedeset, poslov...","[0, 0.11, 0.48, 0.66, 1.12, 1.36, 1.94, 2.53, ...","[0.11, 0.48, 0.66, 1.12, 1.36, 1.94, 2.53, 3.0...","[u, svezi, sa, člankom, sto, devedeset, poslov..."
2,"01 12 2016 - 1. dio, 2. sjednica, 9. saziv [sP...",1893.99,1909.04,"[za, zakonodavstvo, i, odbor, za, financije, i...","[0, 0.17, 1.12, 1.24, 1.49, 1.65, 2.12, 2.15, ...","[0.17, 1.12, 1.24, 1.49, 1.65, 2.12, 2.15, 2.5...","[za, zakonodavstvo, i, odbor, za, financije, d..."
3,"01 12 2016 - 1. dio, 2. sjednica, 9. saziv [sP...",1912.94,1919.47,"[hvala, lijepo, gospodine, potpredsjedniče, hr...","[0, 0.15, 0.37, 0.74, 1.52, 1.9, 2.35, 2.67, 2...","[0.15, 0.37, 0.74, 1.52, 1.9, 2.35, 2.67, 2.93...","[hval, lijepo, gospodine, potpredsjedniče, hrv..."
4,"01 12 2016 - 1. dio, 2. sjednica, 9. saziv [sP...",1920.52,1940.34,"[zakona, o, izmjenama, i, dopunama, zakona, o,...","[0, 0.39, 0.44, 0.75, 0.79, 1.26, 1.65, 1.7, 2...","[0.39, 0.44, 0.75, 0.79, 1.26, 1.65, 1.7, 2.79...","[zakona, o, izmjenama, i, dopunama, zakona, o,..."


In [26]:
# Parse the id -> wav-name mapping file into two parallel lists.
# Each line is expected to be "<reco id><whitespace><wav name>"; the wav name
# itself may contain spaces, so only the FIRST whitespace run is used as the
# separator. Splitting (instead of slicing at fixed columns) keeps this correct
# if the ids are not all exactly 7 characters long, and stripping only the line
# terminator avoids cutting a real character off a final line that has no
# trailing newline.
reconame, longname = [], []
with open(mappingsfile, encoding="utf-8") as f:
    for line in f:
        fields = line.rstrip("\r\n").split(maxsplit=1)
        if len(fields) != 2:
            # Blank line or a line without a wav name: nothing to map.
            continue
        reco_id, wav_name = fields
        reconame.append(reco_id)
        longname.append(wav_name)

mappings = pd.DataFrame(data={"reconame": reconame, "longname": longname})
mappings.shape

(742, 2)

In [52]:
# Attach the reco id to every segment by matching the segment's file name
# against the long wav name; the redundant join key is dropped afterwards.
# A left join keeps every segment even when no mapping row matches.
segments_with_ids = pd.merge(
    segments,
    mappings,
    how="left",
    left_on="file",
    right_on="longname",
)
merged = segments_with_ids.drop(columns=["longname"])
merged.head(n=3)

Unnamed: 0,file,start,end,words,word_start_times,word_end_times,asr,reconame
0,"01 12 2016 - 1. dio, 2. sjednica, 9. saziv [sP...",1853.82,1873.66,"[konačni, prijedlog, zakona, o, izmjenama, i, ...","[0, 0.47, 0.8, 1.21, 1.26, 1.79, 1.82, 2.34, 2...","[0.47, 0.8, 1.21, 1.26, 1.79, 1.82, 2.34, 2.78...","[konačni, prijedlog, zakona, o, izmjenama, i, ...",reco723
1,"01 12 2016 - 1. dio, 2. sjednica, 9. saziv [sP...",1874.08,1893.99,"[u, svezi, sa, člankom, sto, devedeset, poslov...","[0, 0.11, 0.48, 0.66, 1.12, 1.36, 1.94, 2.53, ...","[0.11, 0.48, 0.66, 1.12, 1.36, 1.94, 2.53, 3.0...","[u, svezi, sa, člankom, sto, devedeset, poslov...",reco723
2,"01 12 2016 - 1. dio, 2. sjednica, 9. saziv [sP...",1893.99,1909.04,"[za, zakonodavstvo, i, odbor, za, financije, i...","[0, 0.17, 1.12, 1.24, 1.49, 1.65, 2.12, 2.15, ...","[0.17, 1.12, 1.24, 1.49, 1.65, 2.12, 2.15, 2.5...","[za, zakonodavstvo, i, odbor, za, financije, d...",reco723


Remarks:
* The long filename and the recoNNN mapping are now merged into one dataframe. I am not sure why I need the latter, though.
* Wav files are present at `links_to_wavs + long filename`.

Plan:
* Generate a sample of 1000 rows from the merged DF.
* Based on the info in the merged dataframe, I will generate the transcript from the `words` column.
* I will trim the audio file to the segment borders.
* Save the transcript, save the audio.

In [95]:
# Draw the 1000-segment quality-check sample.
# A fixed random_state makes the draw reproducible: without it, re-running this
# cell would pick different rows and the dataframe would no longer match the
# mp3 files already exported to disk by the later cell.
sampledf = merged.sample(1000, random_state=42).reset_index(drop=True)
# `words` and `asr` hold lists of tokens; join them into plain-text transcripts.
sampledf["transcript"] = sampledf.words.str.join(" ")
sampledf["asr_transcript"] = sampledf.asr.str.join(" ")
# Placeholders, filled in by the export loop for every successfully written sample.
sampledf["samplename"] = ""
sampledf["samplepath"] = ""

In [59]:
# Show the source wav name of the first sampled segment (row label 0).
sampledf.loc[0, "file"]

'14 2 2019 - 3. dio, 11. sjednica, 9. saziv [KMGKU5I8aUQ].wav'

In [128]:
from pydub import AudioSegment
import os

output_dir = "/home/rupnik/macocu/task8/sample"
# Make sure the target directory exists before exporting anything; otherwise
# every export would fail on the missing directory.
os.makedirs(output_dir, exist_ok=True)

# Source wav names that could not be found, so skips are reported, not hidden.
missing_wavs = []

# Cut every sampled segment out of its source wav and save it as sampleNNN.mp3,
# where NNN is the row's index in sampledf (so names sort in dataframe order).
for i, row in sampledf.iterrows():
    print(f"i: {i:04d}", end="\r")
    wavpath = os.path.join(links_to_wavs, row["file"])
    # Segment borders are in seconds; pydub slices in milliseconds.
    # round() instead of bare int(): a float product such as start * 1000 can
    # land just below the intended integer, and truncation would lose 1 ms.
    segment_start_ms = int(round(row["start"] * 1000))
    segment_end_ms = int(round(row["end"] * 1000))

    # Only a missing SOURCE wav is a tolerated, best-effort skip. The export is
    # deliberately outside the try so that output-side failures surface instead
    # of being silently swallowed as "file not found".
    try:
        audio = AudioSegment.from_wav(wavpath)
    except FileNotFoundError:
        missing_wavs.append(row["file"])
        continue

    samplename = f"sample{i:03d}.mp3"
    outpath = os.path.join(output_dir, samplename)
    audio[segment_start_ms:segment_end_ms].export(outpath, format="mp3")
    sampledf.loc[i, "samplepath"] = outpath
    sampledf.loc[i, "samplename"] = samplename

if missing_wavs:
    print(f"\nSkipped {len(missing_wavs)} of {len(sampledf)} segments: source wav not found.")



i: 0546

In [None]:
# Save sample names and transcripts for manual inspection. Rows are already in
# sample-name order because the sample names are derived from the row index.
# Written as .xlsx rather than .xls: pandas picks the Excel writer from the
# file extension, and the legacy .xls writer (xlwt) is not available in
# pandas >= 2.0, so the .xls target fails there.
sampledf.to_excel("transcripts.xlsx")