In [1]:
import os
import librosa

def process_audio_directory(directory_path, n_mfcc=20, n_fft=1024, hop_length=512):
    """
    Compute MFCC features for every ".wav" file located directly in a directory.

    Args:
        directory_path: Directory to scan (not recursive). Only entries whose
            name ends with ".wav" are loaded.
        n_mfcc: Number of MFCC coefficients requested from librosa (default: 20).
        n_fft: FFT window size passed to librosa (default: 1024).
        hop_length: Hop length passed to librosa (default: 512).

    Returns:
        A tuple ``(extracted_features, empty_filenames)``:
            - extracted_features: one MFCC result per successfully processed
              file, in directory-listing order. Empty or failing files are
              skipped, so positions do not map back to filenames.
            - empty_filenames: names of files whose loaded signal had length 0.
    """
    mfcc_per_file = []
    zero_length_names = []

    for entry in os.listdir(directory_path):
        if not entry.endswith(".wav"):
            continue

        wav_path = os.path.join(directory_path, entry)
        try:
            # sr=None asks librosa to keep the file's native sampling rate
            samples, sample_rate = librosa.load(wav_path, sr=None)

            if samples is None or len(samples) == 0:
                zero_length_names.append(entry)
                continue

            mfcc_per_file.append(
                librosa.feature.mfcc(
                    y=samples,
                    sr=sample_rate,
                    n_mfcc=n_mfcc,
                    n_fft=n_fft,
                    hop_length=hop_length,
                )
            )
        except (librosa.util.exceptions.ParameterError, OSError) as e:
            # Report the failure and move on to the next file
            print(f"Error processing file '{wav_path}': {e}")

    return mfcc_per_file, zero_length_names

# Example usage: scan the current working directory
directory_path = "."
extracted_features, empty_filenames = process_audio_directory(directory_path)

# Say whether any file produced MFCC features
if not extracted_features:
    print("No valid audio files found in the directory.")
else:
    print("Extracted MFCC features for valid audio files. You can access them using the 'extracted_features' variable.")

# List the files that loaded with zero samples, if there were any
if not empty_filenames:
    print("No files with zero signal length found.")
else:
    print("Filenames with zero signal length:")
    for filename in empty_filenames:
        print(filename)


Extracted MFCC features for valid audio files. You can access them using the 'extracted_features' variable.
No files with zero signal length found.


In [5]:
import os
import random
import shutil

# Paths to the text and wav folders
text_folder = 'D:/THESIS/TTS_40HRS/Telugu_M/Telugu_M/txt'
wav_folder = 'D:/THESIS/TTS_40HRS/Telugu_M/Telugu_M/resample_wav'

# Destination folders for the selected subset.
# They are always created, but only populated when COPY_BEFORE_REMOVE is True.
dest_text_folder = 'D:/THESIS/TTS_40HRS/Telugu_M/Telugu_M/test_100'
dest_wav_folder = 'D:/THESIS/TTS_40HRS/Telugu_M/Telugu_M/wav_100'

# Selection settings
NUM_PAIRS_TO_SELECT = 100   # upper bound; fewer are taken when fewer pairs exist
SELECTION_SEED = 42         # fixed seed: same folder contents -> same selection
COPY_BEFORE_REMOVE = False  # False keeps the earlier behaviour (no copy is made before deleting)

os.makedirs(dest_text_folder, exist_ok=True)
os.makedirs(dest_wav_folder, exist_ok=True)

# List all text and wav files
text_files = [f for f in os.listdir(text_folder) if f.endswith('.txt')]
wav_files = [f for f in os.listdir(wav_folder) if f.endswith('.wav')]

# Pair text and wav files by their base name (excluding extension)
text_base_names = {os.path.splitext(f)[0] for f in text_files}
wav_base_names = {os.path.splitext(f)[0] for f in wav_files}

# Base names that have both a .txt and a .wav file
common_base_names = text_base_names.intersection(wav_base_names)

# Sort before sampling: the iteration order of a set of strings is not stable
# between interpreter runs, so sampling from it directly is not reproducible.
common_base_names_list = sorted(common_base_names)

# Select up to NUM_PAIRS_TO_SELECT base names with a dedicated, seeded generator
num_files_to_select = min(NUM_PAIRS_TO_SELECT, len(common_base_names_list))
selected_base_names = random.Random(SELECTION_SEED).sample(common_base_names_list, num_files_to_select)

# WARNING: the selected files are permanently deleted from the source folders.
# Set COPY_BEFORE_REMOVE = True to keep a copy in the destination folders first.
removed_pairs = 0
for base_name in selected_base_names:
    text_file = os.path.join(text_folder, base_name + '.txt')
    wav_file = os.path.join(wav_folder, base_name + '.wav')

    if COPY_BEFORE_REMOVE:
        shutil.copy2(text_file, os.path.join(dest_text_folder, base_name + '.txt'))
        shutil.copy2(wav_file, os.path.join(dest_wav_folder, base_name + '.wav'))

    removed_from_pair = 0
    for path in (text_file, wav_file):
        if os.path.exists(path):
            os.remove(path)
            removed_from_pair += 1

    # Count a pair only when both of its files were actually deleted
    if removed_from_pair == 2:
        removed_pairs += 1

print(f"Removed {removed_pairs} text and wav file pairs from {text_folder} and {wav_folder}.")


Removed 100 text and wav file pairs from D:/THESIS/TTS_40HRS/Telugu_M/Telugu_M/txt and D:/THESIS/TTS_40HRS/Telugu_M/Telugu_M/resample_wav.


In [None]:
import os
import shutil

def find_and_copy_extra_files(folder1_path, folder2_path, extra_files_folder_path):
    """
    Copy files that exist in ``folder1_path`` but not in ``folder2_path``.

    Entries are compared by name only (contents are not inspected). Only
    regular files of ``folder1_path`` are considered: sub-directories are
    skipped, because ``shutil.copy`` cannot copy a directory.

    Args:
        folder1_path: Folder searched for extra files.
        folder2_path: Reference folder; any name present here is not "extra".
        extra_files_folder_path: Destination folder, created if missing.
    """
    # Regular files of folder1 only
    files_in_folder1 = {
        name
        for name in os.listdir(folder1_path)
        if os.path.isfile(os.path.join(folder1_path, name))
    }
    files_in_folder2 = set(os.listdir(folder2_path))

    # Identify extra files in folder1
    extra_files = files_in_folder1 - files_in_folder2
    print(extra_files)

    # Create the extra_files_folder if it doesn't exist
    os.makedirs(extra_files_folder_path, exist_ok=True)

    # Copy in sorted order so the progress output is deterministic
    for extra_file in sorted(extra_files):
        src_path = os.path.join(folder1_path, extra_file)
        dest_path = os.path.join(extra_files_folder_path, extra_file)
        shutil.copy(src_path, dest_path)
        print(f'Copied {extra_file} to {extra_files_folder_path}')

# Example usage
# folder1 is searched for extra files; folder2 is the reference it is compared against.
folder1_path = 'D:/THESIS/TTS_40HRS/Telugu_M/Telugu_M/txt'
folder2_path = 'D:/Telugu_M/Telugu_M/txt'
extra_files_folder_path = 'D:/THESIS/TTS_40HRS/Telugu_M/Telugu_M/wav_100'

find_and_copy_extra_files(
    folder1_path=folder1_path,
    folder2_path=folder2_path,
    extra_files_folder_path=extra_files_folder_path,
)


In [None]:
import os
import shutil

def copy_matching_text_files(wav_folder_path, text_folder_path, output_folder_path):
    """
    Copy each ".txt" file whose base name matches a ".wav" file.

    Args:
        wav_folder_path: Folder whose ".wav" file names define the wanted base names.
        text_folder_path: Folder searched for ".txt" files with those base names.
        output_folder_path: Destination folder, created if missing.
    """
    # Base names (extension removed) of every .wav file
    wav_stems = set()
    for entry in os.listdir(wav_folder_path):
        if entry.endswith('.wav'):
            wav_stems.add(os.path.splitext(entry)[0])

    os.makedirs(output_folder_path, exist_ok=True)

    for text_file in os.listdir(text_folder_path):
        if not text_file.endswith('.txt'):
            continue

        stem, _extension = os.path.splitext(text_file)
        if stem not in wav_stems:
            continue

        shutil.copy(
            os.path.join(text_folder_path, text_file),
            os.path.join(output_folder_path, text_file),
        )
        print(f'Copied {text_file} to {output_folder_path}')

# Example usage: collect the transcripts that belong to the wav files in wav_100
wav_folder_path = 'D:/THESIS/TTS_40HRS/Telugu_M/Telugu_M/wav_100'
text_folder_path = 'D:/Telugu_M/Telugu_M/txt'
output_folder_path = 'D:/THESIS/TTS_40HRS/Telugu_M/Telugu_M/text_100'

copy_matching_text_files(
    wav_folder_path=wav_folder_path,
    text_folder_path=text_folder_path,
    output_folder_path=output_folder_path,
)


In [None]:
import os
import csv

def text_files_to_csv(folder_path, output_csv_path):
    """
    Write the ".txt" files of a folder into one pipe-delimited CSV file.

    The output starts with the header row ``Filename|Content``, followed by
    one row per text file holding the file name and its full content. Content
    is written as read, so a trailing newline in a text file ends up inside
    the CSV field.

    Args:
        folder_path: Folder containing the ".txt" files (read as UTF-8).
        output_csv_path: Path of the CSV file to create or overwrite (UTF-8).
    """
    # The folder is listed before the output file is opened
    transcript_names = [name for name in os.listdir(folder_path) if name.endswith('.txt')]

    with open(output_csv_path, mode='w', newline='', encoding='utf-8') as csv_file:
        row_writer = csv.writer(csv_file, delimiter='|')
        row_writer.writerow(['Filename', 'Content'])

        for name in transcript_names:
            source_path = os.path.join(folder_path, name)
            with open(source_path, 'r', encoding='utf-8') as source:
                row_writer.writerow([name, source.read()])

# Example usage: build the metadata file for the text_100 transcripts
folder_path = 'D:/THESIS/TTS_40HRS/Telugu_M/Telugu_M/text_100'
output_csv_path = 'D:/THESIS/TTS_40HRS/Telugu_M/Telugu_M/metadata_100.csv'
text_files_to_csv(folder_path=folder_path, output_csv_path=output_csv_path)


In [None]:
## check for numbers

import os
import re

# Folder whose .txt files are scanned for digits
text_folder = 'D:/THESIS/TTS_40HRS/Telugu_M/Telugu_M/txt/'


def contains_digit(text):
    """Return True when at least one character of ``text`` satisfies ``str.isdigit``."""
    for char in text:
        if char.isdigit():
            return True
    return False

# Report every .txt file in text_folder whose content has at least one digit
for text_file in os.listdir(text_folder):
    if not text_file.endswith('.txt'):
        continue

    text_path = os.path.join(text_folder, text_file)

    # Read the whole file as UTF-8 text
    with open(text_path, 'r', encoding='utf-8') as file:
        content = file.read()

    if contains_digit(content):
        print(f"File '{text_file}' contains digit(s).")


In [29]:



import os

# Telugu word for each ASCII digit, keyed by the digit as a one-character string.
single_digit_dict = dict(zip(
    "0123456789",
    [
        "సున్నా",
        "ఒకటి",
        "రెండు",
        "మూడు",
        "నాలుగు",
        "ఐదు",
        "ఆరు",
        "ఏడు",
        "ఎనిమిది",
        "తొమ్మిది",
    ],
))

def _build_two_digits_dict():
    """
    Build the lookup from two-character digit strings "01".."99" to Telugu words.

    Keys are inserted in ascending numeric order. "01".."09" map to the plain
    unit word, "10".."19" have their own words, and every other entry is the
    tens word, followed by a space and the unit word when the unit is not zero.
    There is no entry for "00".
    """
    unit_words = ["ఒకటి", "రెండు", "మూడు", "నాలుగు", "ఐదు", "ఆరు", "ఏడు", "ఎనిమిది", "తొమ్మిది"]
    # Words for 10 through 19
    teen_words = [
        "పది", "పదకొండు", "పన్నెండు", "పదమూడు", "పద్నాలుగు",
        "పదిహేను", "పదహారు", "పదిహేడు", "పద్దెనిమిది", "పందొమ్మిది",
    ]
    # Words for 20, 30, ..., 90
    tens_words = ["ఇరవై", "ముప్పై", "నలభై", "యాభై", "అరవై", "డెబ్బై", "ఎనభై", "తొంభై"]

    table = {}
    for unit, unit_word in enumerate(unit_words, start=1):
        table[f"0{unit}"] = unit_word
    for value, teen_word in enumerate(teen_words, start=10):
        table[str(value)] = teen_word
    for tens, tens_word in enumerate(tens_words, start=2):
        table[f"{tens}0"] = tens_word
        for unit, unit_word in enumerate(unit_words, start=1):
            table[f"{tens}{unit}"] = f"{tens_word} {unit_word}"
    return table


two_digits_dict = _build_two_digits_dict()


def is_integer(s):
    """
    Return True when ``int(s)`` succeeds, False when it raises ValueError.

    Note that ``int`` accepts more than plain ASCII digits: surrounding
    whitespace, a leading sign, and decimal digits from other scripts also
    parse successfully.
    """
    try:
        int(s)
    except ValueError:
        return False
    else:
        return True


def convert_single_digit_nums(word):
    """
    Look up a one-character digit string in ``single_digit_dict``.

    Raises:
        KeyError: if ``word`` is not a key of ``single_digit_dict``.
    """
    spoken_form = single_digit_dict[word]
    return spoken_form

def convert_two_digit_nums(word):
    """
    Look up a two-character digit string in ``two_digits_dict``.

    Raises:
        KeyError: if ``word`` is not a key of ``two_digits_dict``.
    """
    spoken_form = two_digits_dict[word]
    return spoken_form

def convert_three_digit_nums(word):
    """
    Convert a three-character digit string to Telugu words.

    Exact hundreds ("100".."900") come from a fixed table. Any other value is
    a hundreds prefix followed by the two-digit conversion of the last two
    characters. The prefix is "నూట " when the first digit is "1"; otherwise it
    is the single-digit word followed by " వందల ".

    Inputs with a leading "0" are not special-cased. A KeyError from the
    single- or two-digit lookups propagates to the caller.
    """
    exact_hundreds = {
        "100": "వంద",
        "200": "రెండు వందలు",
        "300": "మూడు వందలు",
        "400": "నాలుగు వందలు",
        "500": "ఐదు వందలు",
        "600": "ఆరు వందలు",
        "700": "ఏడు వందలు",
        "800": "ఎనిమిది వందలు",
        "900": "తొమ్మిది వందలు",
    }

    exact = exact_hundreds.get(word)
    if exact is not None:
        return exact

    # The hundreds prefix is resolved before the two-digit remainder
    if word[0] == "1":
        prefix = "నూట "
    else:
        prefix = convert_single_digit_nums(word[0]) + " వందల "

    return prefix + convert_two_digit_nums(word[1:3])

def convert_four_digit_nums(word):
    """
    Convert a four-character digit string to Telugu words.

    Rules, checked in this order:
        - Exact thousands ("1000".."9000") come from a fixed table.
        - "10YY" becomes "వెయ్యి " plus the two-digit word for YY.
        - "1XYY" with X != 0 is read as a count of hundreds: the two-digit
          word for "1X", then " వందల " and the two-digit word for YY. When YY
          is "00" the result ends in " వందలు" instead; previously this case
          raised KeyError because the two-digit table has no "00".
        - Otherwise: the single-digit word, " వేల ", then either the two-digit
          word (second digit "0") or the three-digit conversion of the rest.

    Inputs with a leading "0" are not special-cased. A KeyError from the
    helper lookups propagates to the caller.
    """
    thousands = {
        "1000" : "వెయ్యి",
        "2000" : "రెండు వేలు",
        "3000" : "మూడు వేలు",
        "4000" : "నాలుగు వేలు",
        "5000" : "ఐదు వేలు",
        "6000" : "ఆరు వేలు",
        "7000" : "ఏడు వేలు",
        "8000" : "ఎనిమిది వేలు",
        "9000" : "తొమ్మిది వేలు"
    }

    if word in thousands:
        return thousands[word]

    last_two = word[2:4]

    if word[0:2] == "10":
        return "వెయ్యి " + convert_two_digit_nums(last_two)

    if word[0] == "1":
        hundreds_count = convert_two_digit_nums(word[0:2])
        if last_two == "00":
            # e.g. "1100": a whole number of hundreds, nothing follows
            return hundreds_count + " వందలు"
        return hundreds_count + " వందల " + convert_two_digit_nums(last_two)

    temp = convert_single_digit_nums(word[0])
    temp += " వేల "

    if word[1] == "0":
        return temp + convert_two_digit_nums(last_two)

    return temp + convert_three_digit_nums(word[1:4])
    


def replace_single_digit_words_in_files(folder_path):
    """
    Spell out, in every file of a folder, words that start with exactly one digit.

    Each regular file in ``folder_path`` is read as UTF-8, split on
    whitespace, and written back with the words joined by single spaces, so
    the original whitespace and line breaks are not preserved. For a word
    whose leading run of digit characters has length 1, that digit is replaced
    by ``convert_single_digit_nums``; the rest of the word is kept.

    A digit without a dictionary entry leaves the word unchanged instead of
    aborting the run. This happens for non-ASCII decimal digits, which
    ``is_integer`` accepts.

    Args:
        folder_path: Folder whose files are rewritten in place.
    """
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)

        # Skip sub-directories and anything else that is not a regular file
        if not os.path.isfile(file_path):
            continue

        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        updated_words = []
        for word in content.split():
            # Collect the run of digit characters at the start of the word
            numeric_part = ""
            for char in word:
                if not is_integer(char):
                    break
                numeric_part += char

            replacement = word
            if len(numeric_part) == 1:
                try:
                    replacement = convert_single_digit_nums(numeric_part) + word[len(numeric_part):]
                except KeyError:
                    # No entry for this digit: keep the word as it is
                    replacement = word
            updated_words.append(replacement)

        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(' '.join(updated_words))

def replace_two_digit_words_in_files(folder_path):
    """
    Spell out, in every file of a folder, words that start with exactly two digits.

    Each regular file in ``folder_path`` is read as UTF-8, split on
    whitespace, and written back with the words joined by single spaces, so
    the original whitespace and line breaks are not preserved. For a word
    whose leading run of digit characters has length 2, those digits are
    replaced by ``convert_two_digit_nums``; the rest of the word is kept.

    A number without a dictionary entry (for example "00") leaves the word
    unchanged. Only ``KeyError`` is treated that way; other exceptions
    propagate instead of being silently swallowed.

    Args:
        folder_path: Folder whose files are rewritten in place.
    """
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)

        # Skip sub-directories and anything else that is not a regular file
        if not os.path.isfile(file_path):
            continue

        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        updated_words = []
        for word in content.split():
            # Collect the run of digit characters at the start of the word
            numeric_part = ""
            for char in word:
                if not is_integer(char):
                    break
                numeric_part += char

            replacement = word
            if len(numeric_part) == 2:
                try:
                    replacement = convert_two_digit_nums(numeric_part) + word[len(numeric_part):]
                except KeyError:
                    # No entry for this number: keep the word as it is
                    replacement = word
            updated_words.append(replacement)

        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(' '.join(updated_words))

def replace_three_digit_words_in_files(folder_path):
    """
    Spell out, in every file of a folder, words that start with exactly three digits.

    Each regular file in ``folder_path`` is read as UTF-8, split on
    whitespace, and written back with the words joined by single spaces, so
    the original whitespace and line breaks are not preserved. For a word
    whose leading run of digit characters has length 3, those digits are
    replaced by ``convert_three_digit_nums``; the rest of the word is kept.

    A number the converter has no dictionary entry for (for example "000")
    leaves the word unchanged instead of aborting the run part-way through
    the folder.

    Args:
        folder_path: Folder whose files are rewritten in place.
    """
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)

        # Skip sub-directories and anything else that is not a regular file
        if not os.path.isfile(file_path):
            continue

        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        updated_words = []
        for word in content.split():
            # Collect the run of digit characters at the start of the word
            numeric_part = ""
            for char in word:
                if not is_integer(char):
                    break
                numeric_part += char

            replacement = word
            if len(numeric_part) == 3:
                try:
                    replacement = convert_three_digit_nums(numeric_part) + word[len(numeric_part):]
                except KeyError:
                    # No entry for this number: keep the word as it is
                    replacement = word
            updated_words.append(replacement)

        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(' '.join(updated_words))

def replace_four_digit_words_in_files(folder_path):
    """
    Spell out, in every file of a folder, words that start with exactly four digits.

    Each regular file in ``folder_path`` is read as UTF-8, split on
    whitespace, and written back with the words joined by single spaces, so
    the original whitespace and line breaks are not preserved. For a word
    whose leading run of digit characters has length 4, those digits are
    replaced by ``convert_four_digit_nums``; the rest of the word is kept.

    A number the converter has no dictionary entry for leaves the word
    unchanged. The fallback appends the original word; the earlier code
    appended a conversion result that was either undefined or left over from
    a previous word.

    Args:
        folder_path: Folder whose files are rewritten in place.
    """
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)

        # Skip sub-directories and anything else that is not a regular file
        if not os.path.isfile(file_path):
            continue

        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        updated_words = []
        for word in content.split():
            # Collect the run of digit characters at the start of the word
            numeric_part = ""
            for char in word:
                if not is_integer(char):
                    break
                numeric_part += char

            replacement = word
            if len(numeric_part) == 4:
                try:
                    replacement = convert_four_digit_nums(numeric_part) + word[len(numeric_part):]
                except KeyError:
                    # No entry for this number: keep the word as it is
                    replacement = word
            updated_words.append(replacement)

        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(' '.join(updated_words))




folder_path = 'D:/THESIS/TTS_40HRS/Telugu_M/Telugu_M/txt/'

# Run the four passes in order of digit-run length: one, two, three, then four digits.
for _replace_pass in (
    replace_single_digit_words_in_files,
    replace_two_digit_words_in_files,
    replace_three_digit_words_in_files,
    replace_four_digit_words_in_files,
):
    _replace_pass(folder_path)
