In [1]:
import pandas 
import glob
import re
import functools
from pathlib import Path

from os.path import join as pathjoin
from os.path import abspath

In [2]:
# string = """103-1240-0047 AND I KNOW ANOTHER CASE WHERE AN ADOPTED BOY USED TO SUCK THE EGGS THEY COULDN'T BREAK HIM OF IT IF YOU HAD ASKED MY ADVICE IN THE MATTER WHICH YOU DIDN'T DO MARILLA I'D HAVE SAID FOR MERCY'S SAKE NOT TO THINK OF SUCH A THING THAT'S WHAT
# 103-1240-0048 THIS JOB'S COMFORTING SEEMED NEITHER TO OFFEND NOR TO ALARM MARILLA SHE KNITTED STEADILY ON I DON'T DENY THERE'S SOMETHING IN WHAT YOU SAY RACHEL I'VE HAD SOME QUALMS MYSELF BUT MATTHEW WAS TERRIBLE SET ON IT I COULD SEE THAT SO I GAVE IN
# 103-1240-0049 IT'S SO SELDOM MATTHEW SETS HIS MIND ON ANYTHING THAT WHEN HE DOES I ALWAYS FEEL IT'S MY DUTY TO GIVE IN AND AS FOR THE RISK THERE'S RISKS IN PRETTY NEAR EVERYTHING A BODY DOES IN THIS WORLD THERE'S RISKS IN PEOPLE'S HAVING CHILDREN OF THEIR OWN IF IT COMES TO THAT THEY DON'T ALWAYS TURN OUT WELL
# """
# string_lines = string.splitlines()

# for line in string_lines:
#     print(re.match(r"(\d+-\d+-\d+) (\D*)", line).group(3))

In [3]:
def get_transcripts_dataframe(dataset_path: str) -> pandas.core.frame.DataFrame:
    
    ## Convert string path to PosixPath
    dataset_path = Path(dataset_path)
    
    ## Check if path exists
    if dataset_path.exists() == False:
        raise ValueError(f"Please provide a valid path. Input Dataset Path: {dataset_path}")
    
    ## Get all transcript files from the dataset path.
    transcript_files = glob.glob(f'{dataset_path}/**/*trans.txt', recursive=True)
    
    ## Compose a callable function
    read_func = functools.partial(pandas.read_csv, sep = r"(\d+-\d+-\d+) (\D*)", header= None, engine = "python", usecols=[1, 2], names = ["filename", "sentence"])

    ## Read all files then concat them together
    df = pandas.concat(map(read_func, transcript_files))
    
    ## Get the full filepath for each of the files
    df['filepath'] = df['filename'].apply(lambda x:abspath(pathjoin(dataset_path, pathjoin("/".join(x.split('-')[:2]), x) + '.flac')))
    
    ## Get the exact filename for the file
    df['filename'] = df['filename']+ '.flac'
    
    return df

In [4]:
dataset_path = "../../data/external/train-clean-100/LibriSpeech/train-clean-100/"
dataset_path = Path(dataset_path)

df = get_transcripts_dataframe(dataset_path)

In [None]:
df.head()

Unnamed: 0,filename,sentence,filepath
0,6147-34607-0000.flac,ABOVE THIS COUPLE THERE WAS ANNE QUEEN OF ENGL...,/home/ashim/Projects/DeepSpeech/data/external/...
1,6147-34607-0001.flac,HER STOUTNESS WAS BLOATED HER FUN HEAVY HER GO...,/home/ashim/Projects/DeepSpeech/data/external/...
2,6147-34607-0002.flac,AS A CHRISTIAN SHE WAS A HERETIC AND A BIGOT S...,/home/ashim/Projects/DeepSpeech/data/external/...
3,6147-34607-0003.flac,THE REST OF HER PERSON WAS INDIFFERENTLY FORME...,/home/ashim/Projects/DeepSpeech/data/external/...
4,6147-34607-0004.flac,SHE HAD A NARROW FOREHEAD SENSUAL LIPS FLESHY ...,/home/ashim/Projects/DeepSpeech/data/external/...


In [None]:
df.iloc[0]['filename']

'6147-34607-0000.flac'

In [None]:
df.iloc[0]['sentence']

'ABOVE THIS COUPLE THERE WAS ANNE QUEEN OF ENGLAND AN ORDINARY WOMAN WAS QUEEN ANNE SHE WAS GAY KINDLY AUGUST TO A CERTAIN EXTENT NO QUALITY OF HERS ATTAINED TO VIRTUE NONE TO VICE'

In [None]:
df.iloc[0]['filepath']

'/home/ashim/Projects/DeepSpeech/data/external/train-clean-100/6147/34607/6147-34607-0000.flac'

In [9]:
df

Unnamed: 0,filename,sentence,filepath
0,6147-34607-0000.flac,ABOVE THIS COUPLE THERE WAS ANNE QUEEN OF ENGL...,/home/ashim/Projects/DeepSpeech/data/external/...
1,6147-34607-0001.flac,HER STOUTNESS WAS BLOATED HER FUN HEAVY HER GO...,/home/ashim/Projects/DeepSpeech/data/external/...
2,6147-34607-0002.flac,AS A CHRISTIAN SHE WAS A HERETIC AND A BIGOT S...,/home/ashim/Projects/DeepSpeech/data/external/...
3,6147-34607-0003.flac,THE REST OF HER PERSON WAS INDIFFERENTLY FORME...,/home/ashim/Projects/DeepSpeech/data/external/...
4,6147-34607-0004.flac,SHE HAD A NARROW FOREHEAD SENSUAL LIPS FLESHY ...,/home/ashim/Projects/DeepSpeech/data/external/...
...,...,...,...
23,8770-295463-0023.flac,ABOUT NOON ON THE NINETEENTH OF NOVEMBER THE D...,/home/ashim/Projects/DeepSpeech/data/external/...
24,8770-295463-0024.flac,THERE WERE FIFTEEN THOUSAND PEOPLE WAITING,/home/ashim/Projects/DeepSpeech/data/external/...
25,8770-295463-0025.flac,WATCHING HER WHILE THE PRESIDENT OF THE UNITED...,/home/ashim/Projects/DeepSpeech/data/external/...
26,8770-295463-0026.flac,THE FACT THAT THE PRESIDENT WAS SPEAKING WAS S...,/home/ashim/Projects/DeepSpeech/data/external/...


In [11]:
df.to_csv('../../data/external/train-clean-100.csv', index=None)