In [1]:
import os, io, re, sys, time, datetime
import shutil
from glob import glob

import numpy as np
import pandas as pd

from util.creating_directories import create_directories
from util.convert_srt_to_csv import change_encoding
from util.convert_srt_to_csv import convert_srt_to_csv
from util.change_sample_rate import pre_process_audio
#from util.extract_audio import wmv_to_wav
#from util.extract_audio import mp4_to_wav
from util.slice_audio import split_files
from util.create_DS_csv import create_DS_csv
from util.merge_csv import merge_csv
from util.merge_transcripts_and_files import merge_transcripts_and_wav_files
from util.clean import clean_unwanted_characters
from util.split import split_dataset
from util.audio_metrics import audio_metrics
#from util.trans_numbers import translate_numbers

# Wall-clock start of the run; the final cell subtracts it from the end time
# to report how long the whole notebook took.
start_time = time.time()



In [2]:
def ensure_input_dir(path, label, wanted):
    """Make sure an input folder exists, creating it when it is missing.

    Parameters
    ----------
    path : str
        Folder to check, e.g. './srt_files/'.
    label : str
        Short folder name shown in the "is missing" message.
    wanted : str
        Description of the files the user has to put into the folder.

    Returns
    -------
    bool
        True when the folder already existed, False when it was missing
        (whether or not it could be created).
    """
    if os.path.isdir(path):
        print('Folder %s exists.. continuing processing..' %path)
        return True

    print('Folder "%s" is missing' %label)
    try:
        os.mkdir(path)
    except OSError:
        # Best effort only: report the failure and let the notebook continue.
        print('Creation of directory %s failed' %path)
    else:
        print('Successfully created the directory %s' %path)
    print('--> Please add %s to folder %s' %(wanted, path))
    return False


#Check that the srt_files and audio directories exist (create them if not)
srt_path = './srt_files/'
audio_path = './audio/'

ensure_input_dir(srt_path, 'srt_files', 'srt files')
ensure_input_dir(audio_path, 'audio', 'wav or wmv files')

#Warn when there is no srt file to work on; later cells report this count
srt_counter = len(glob(srt_path + '*.srt'))

if srt_counter == 0:
    print('!!! Please add srt_file(s) to %s-folder' %srt_path)

Folder ./srt_files/ exists.. continuing processing..
Folder ./audio/ exists.. continuing processing..


In [3]:
# Project helper from util.creating_directories. Presumably it creates the
# working folders used by the following cells -- TODO confirm in the helper.
create_directories()

In [4]:
#Changing encoding from "cp1252" (a.k.a. Windows-1252) to "utf-8-sig"
#The srt files are listed once and the reported count is taken from that very
#list, so the message always matches the files that were actually processed.
print('Encoding srt_file(s) to utf8...')
srt_files_to_encode = glob('./srt_files/*.srt')
for srt in srt_files_to_encode:
    change_encoding(srt)
print('Encoding of %s-file(s) changed' %len(srt_files_to_encode))
print('---------------------------------------------------------------------')

Encoding srt_file(s) to utf8...
Encoding of 1-file(s) changed
---------------------------------------------------------------------


In [5]:
#Convert every srt file to a csv file via the project helper.
#The count in the summary line comes from the list that was processed here
#rather than from a value computed in an earlier cell.
print('Extracting information from srt_file(s) to csv_files')
srt_files_to_convert = glob('./srt_files/*.srt')
for file in srt_files_to_convert:
    convert_srt_to_csv(file)
print('%s-file(s) converted and saved as csv-files to ./csv' %len(srt_files_to_convert))
print('---------------------------------------------------------------------')

Extracting information from srt_file(s) to csv_files
1-file(s) converted and saved as csv-files to ./csv
---------------------------------------------------------------------


In [6]:
# Hand the audio folder (audio_path, set in the directory-check cell) to the
# project's pre-processing helper. Judging by the recorded output it
# downsamples the wav files -- confirm in util.change_sample_rate.
pre_process_audio(audio_path)
print('Pre-processing of audio files is complete.')
print('---------------------------------------------------------------------')

Downsampling wav files...
File  1  completed: 年轮
Downsampling complete
---------------------------------------------------------------------
The script took  0.27492618560791016  seconds to run
Pre-processing of audio files is complete.
---------------------------------------------------------------------


In [7]:
#now slice audio according to start- and end-times in csv
#A csv is only processed when a wav file with the same base name lies next to
#it; csv files without a matching recording are skipped.
print('Slicing audio according to start- and end_times of transcript_csvs...')
for item in glob('./ready_for_slice/*.csv'):
    # Swap only the extension: str.replace would also rewrite a '.csv' that
    # happens to occur earlier in the path.
    wav_item = os.path.splitext(item)[0] + '.wav'
    if os.path.exists(wav_item):
        split_files(item, wav_item)
wav_counter = len(glob('./sliced_audio/*.wav'))
print('Slicing complete. {} files in dir "sliced_audio"'.format(wav_counter))
print('---------------------------------------------------------------------')

Slicing audio according to start- and end_times of transcript_csvs...
Slicing complete. 46 files in dir "sliced_audio"
---------------------------------------------------------------------


In [8]:
# Project helper from util.create_DS_csv. According to its log output it
# collects the file path and size of every .wav file in ./sliced_audio;
# where the result is written is not visible here -- TODO confirm.
create_DS_csv('./sliced_audio/')
print('DS_csv with Filenames - and sizes created.')
print('---------------------------------------------------------------------')

Extracting filepath and -size for every .wav file in ./sliced_audio
DS_csv with Filenames - and sizes created.
---------------------------------------------------------------------


In [9]:
#now join all separate csv files
# Project helper from util.merge_csv, applied to the per-recording transcript
# csv files in ./ready_for_slice/. Presumably it writes the merged transcript
# that a later cell reads from ./merged_csv/ -- TODO confirm.
merge_csv('./ready_for_slice/')
print('Merged csv with all transcriptions created.')
print('---------------------------------------------------------------------')

Merging csv-files with transcriptions
All csv-files merged
Merged csv with all transcriptions created.
---------------------------------------------------------------------


In [10]:
# Inputs for the final training csv: the merged transcripts and the list of
# sliced wav files. No empty df_final placeholder is created here; df_final is
# assigned further down by the merge.
transcript_path = './merged_csv/Full_Transcript.csv'
DS_csv = './merged_csv/Filepath_Filesize.csv'
df_transcripts = pd.read_csv(transcript_path)
df_files = pd.read_csv(DS_csv)

def remove_path(path):
    """Return the last component (the file name) of ``path``.

    Backslash and forward slash are both treated as separators, so the result
    is the same for Windows-style and POSIX-style paths regardless of the
    platform the notebook runs on. A string without any separator is returned
    unchanged.
    """
    # Normalise to '/' first, then keep everything after the last separator.
    return path.replace('\\', '/').rsplit('/', 1)[-1]

# 'id' starts out as whatever remove_path returns for each wav_filename; the
# '.wav' suffix is stripped further down before 'id' is used as the merge key.
df_files['id'] = df_files['wav_filename'].apply(remove_path)

#convert durations to float, then keep only clips longer than 2 and shorter than 12 seconds
def convert(duration):
    """Return ``duration`` converted to a float."""
    return float(duration)
df_files['duration'] = df_files['duration'].apply(convert)

# Keep only clips strictly longer than 2 and strictly shorter than 12 seconds.
# .copy() yields an independent frame, so the column assignment below does not
# write into a slice of the unfiltered data.
duration_ok = (df_files['duration'] > 2.00) & (df_files['duration'] < 12.00)
df_files = df_files[duration_ok].copy()

#drop unnecessary columns
df_transcripts = df_transcripts.drop(['start_times','end_times'], axis=1)

# Strip only a trailing '.wav'. With regex=True the pattern is a regular
# expression, so the dot must be escaped and the match anchored; an unescaped
# '.wav' would also remove e.g. the 'ywav' inside 'mywav_1.wav'.
df_files['id'] = df_files['id'].replace(r'\.wav$', '', regex=True)

#merge on column id
df_final = pd.merge(df_transcripts, df_files, on='id')
df_final = df_final.drop(['id'], axis=1)
#rearrange columns
df_final = df_final[['wav_filename', 'duration', 'transcript']]

df_final.to_csv('./merged_csv/DS_training_final.csv', header=True, index=False, encoding='utf-8-sig')
print('Final DS csv generated.')
print('---------------------------------------------------------------------')

Final DS csv generated.
---------------------------------------------------------------------


In [11]:
# Only a bare file name is passed here. Presumably clean_unwanted_characters
# resolves it inside the merged_csv folder and writes the
# DS_training_final_pinyin.csv that the next cell reads -- TODO confirm in
# util.clean.
final_csv_path = 'DS_training_final.csv'
clean_unwanted_characters(final_csv_path)
print('Unwanted characters cleaned.')
print('---------------------------------------------------------------------')

Length of ds_final: 44
Final Files cleaned of unwanted characters
Unwanted characters cleaned.
---------------------------------------------------------------------


In [12]:
#write transcript to text-file for language model
# Forward slashes keep the path portable; a back-slash separator only resolves
# on Windows and '\D' is not a valid string escape sequence.
df_text = pd.read_csv('./merged_csv/DS_training_final_pinyin.csv')
# NOTE(review): mode='a' appends, so re-running this cell adds the same rows to
# final.txt again -- confirm that accumulating is intended.
df_text[['wav_filename','transcript']].to_csv('./final_csv/final.txt', header=False, index=False, mode='a')

In [13]:
# Write one lyrics file per clip, based on the collected file names and pinyin
# transcripts: every line of final.txt holds "<wav file name>,<transcript>".
lyrics_dir = './my_lyrics/'
os.makedirs(lyrics_dir, exist_ok=True)

with open('./final_csv/final.txt', 'r', encoding="utf-8-sig") as f:
    for lines in f:
        wavfile = lines.split(',')
        if len(wavfile) < 2:
            # Blank or malformed line: there is no transcript to write.
            continue
        # splitext drops only the extension, so dotted clip names stay intact.
        wavname = os.path.splitext(wavfile[0])[0]
        # Collapse every run of whitespace (including the line break) to one space.
        pinyin = re.sub(r'\s+', ' ', wavfile[1])
        # Explicit encoding so the result does not depend on the OS locale;
        # the with-statement closes the file.
        with open(lyrics_dir + wavname + ".txt", "w", encoding="utf-8") as newfile:
            newfile.write(pinyin)

new_path = './my_wavs/'  # destination folder for the copied clips
wav_path = './sliced_audio/'
os.makedirs(new_path, exist_ok=True)

# Copy the sliced wav that belongs to every lyrics file found under lyrics_dir.
for derName, subfolders, filenames in os.walk(lyrics_dir):
    for filename in filenames:
        if not filename.endswith('.txt'):
            continue
        stem = os.path.splitext(filename)[0]
        file_path = wav_path + stem + ".wav"
        if not os.path.exists(file_path):
            # A lyrics file without a matching slice must not abort the run.
            print('No sliced audio found for %s, skipping' % filename)
            continue
        new_file_path = new_path + stem + ".wav"
        shutil.copy(file_path, new_file_path)

In [None]:
# Working folders used by the earlier cells; each one that exists is removed
# together with its contents.
slice_path = './ready_for_slice'
sliced_audio = './sliced_audio'
merged_csv_files = './merged_csv'
final_csv_files = './final_csv'

for work_dir in (slice_path, sliced_audio, merged_csv_files, final_csv_files):
    if os.path.exists(work_dir):
        shutil.rmtree(work_dir)

# Report the total execution time, measured from start_time.
end_time = time.time()
elapsed_seconds = end_time - start_time
exec_time = str(datetime.timedelta(seconds=elapsed_seconds))

print('The script took {} to run'.format(exec_time))
print('********************************************************************************************************')
