Normalization

In [8]:
import pandas as pd
import re
import inflect  # To convert numbers to words
import dateparser  # To recognize and parse dates
from contractions import fix  # To expand contractions

# Shared inflect engine; num_to_words() and date_to_words() below call it to spell out numbers
inflect_engine = inflect.engine()

# CSV paths for the normalization step. Nothing is loaded here: the input is read
# and the output written by the try-block at the end of this cell.
input_file = 'data_text.csv'  # Replace with your file path
output_file_normalized = 'normalized_text.csv'  # Output file path for normalized text

# Regex pattern -> spoken expansion for common abbreviations.
# The patterns deliberately have no trailing \b: a word boundary after the final
# "\." only exists when a letter/digit follows the period directly, so a trailing
# \b would reject the normal cases "e.g. apples" and a sentence-final "etc.".
abbreviations = {
    r'\be\.g\.': 'for example',
    r'\bi\.e\.': 'that is',
    r'\betc\.': 'and so on'
}

# Function to convert numbers to words
def num_to_words(match):
    """Regex substitution callback: spell out the matched digits as words.

    Args:
        match: ``re.Match`` whose full match is the text to convert.

    Returns:
        The words produced by ``inflect_engine.number_to_words`` for the
        matched integer, or the matched text unchanged if a ``ValueError``
        is raised while converting it.
    """
    digits = match.group(0)
    try:
        value = int(digits)
        spoken = inflect_engine.number_to_words(value)
    except ValueError:
        # Leave anything that cannot be converted exactly as it appeared
        spoken = digits
    return spoken

# Function to convert date to words
def date_to_words(match):
    """Regex substitution callback: spell out a numeric date as "<Month> <ordinal day> <year>".

    Args:
        match: ``re.Match`` whose full match is the candidate date string.

    Returns:
        The spoken form of the date, or the matched text unchanged when
        ``dateparser.parse`` cannot interpret it.
    """
    date_str = match.group()
    parsed_date = dateparser.parse(date_str)
    if parsed_date:
        # inflect's number_to_words() has no ``ordinal`` keyword (passing one
        # raises TypeError); ordinal words come from feeding it ordinal() output,
        # e.g. 4 -> "4th" -> "fourth".
        day = inflect_engine.number_to_words(inflect_engine.ordinal(parsed_date.day))
        month = parsed_date.strftime("%B")
        year = inflect_engine.number_to_words(parsed_date.year)
        return f"{month} {day} {year}"
    return date_str

# Function to handle punctuation and prosody
def handle_punctuation(text):
    """Remove ``! ? , ; :`` and make sure the text ends with a period.

    Args:
        text: String to clean.

    Returns:
        The text without the listed punctuation marks, without trailing
        whitespace, and with a final ``.`` appended if it did not end in one.
    """
    # Remove punctuation except periods
    text = re.sub(r'[!?,;:]', '', text)

    # Trailing whitespace would hide an existing final period and cause a
    # second one to be appended ("done. " -> "done. ."), so drop it first.
    text = text.rstrip()

    # Ensure each line ends with a period
    if not text.endswith('.'):
        text += '.'

    return text

# Function to expand abbreviations
def expand_abbreviations(text):
    """Replace every pattern in the module-level ``abbreviations`` table with its expansion.

    Matching is case-insensitive; patterns are applied in dictionary order.
    """
    for abbr, expansion in abbreviations.items():
        text = re.sub(abbr, expansion, text, flags=re.IGNORECASE)
    return text

# Function for text normalization
def normalize_text(text):
    """Normalize one cell of text; non-string values are returned untouched.

    Steps, in order: lowercase, expand contractions, expand abbreviations,
    remove double quotes and ``--`` dashes, strip punctuation and add a final
    period, verbalize dates, verbalize remaining numbers, collapse whitespace.

    Args:
        text: Cell value. Only ``str`` values are processed.

    Returns:
        The normalized string, or ``text`` itself when it is not a string.
    """
    if not isinstance(text, str):
        return text

    # Convert text to lowercase
    text = text.lower()

    # Expand contractions (e.g., "don't" -> "do not")
    text = fix(text)

    # Expand abbreviations (e.g., "e.g." -> "for example")
    text = expand_abbreviations(text)

    # Remove double quotes and dashes BEFORE the final-period check: a closing
    # quote after the last period ('... "stop."') would otherwise make the
    # check fail and leave a doubled period once the quote is removed.
    text = text.replace('"', '')

    # A double dash separates words, so replace it with a space rather than
    # gluing its neighbours together ("one--two" -> "one two").
    text = text.replace('--', ' ')

    # Handle punctuation (drop ! ? , ; : and guarantee a closing period)
    text = handle_punctuation(text)

    # Replace dates with words (e.g., "2023-10-04" -> "October fourth <year in words>").
    # This must run before the generic number replacement: once the digits are
    # gone the date pattern can never match.
    text = re.sub(r'\b(?:\d{1,2}[\/\-\s]\d{1,2}[\/\-\s]\d{2,4}|\d{4}[\/\-\s]\d{1,2}[\/\-\s]\d{1,2})\b', date_to_words, text)

    # Replace the remaining numbers with words (e.g., "123" -> "one hundred and twenty-three")
    text = re.sub(r'\d+', num_to_words, text)

    # Remove extra whitespace and ensure single spacing between words
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Normalize the entire dataframe
def normalize_dataframe(dataframe):
    """Apply ``normalize_text`` to every cell of ``dataframe`` and return the result.

    Args:
        dataframe: ``pandas.DataFrame`` to normalize; it is not modified.

    Returns:
        A new DataFrame of the same shape with each cell passed through
        ``normalize_text``.
    """
    # DataFrame.applymap is deprecated in recent pandas in favour of
    # DataFrame.map; check the class (not the instance) so a column that
    # happens to be named "map" cannot be mistaken for the method.
    if hasattr(pd.DataFrame, "map"):
        return dataframe.map(normalize_text)
    return dataframe.applymap(normalize_text)

# Driver for the normalization step: read input_file, normalize every cell,
# write the result to output_file_normalized.
try:
    # Read the dataset
    df = pd.read_csv(input_file)

    # Display the original DataFrame (only the first rows, via head())
    print("Original DataFrame:")
    print(df.head())

    # Perform normalization on the DataFrame (applied to every column, not just one)
    df_normalized = normalize_dataframe(df)

    # Save the normalized DataFrame to a new CSV file (row index is not written)
    df_normalized.to_csv(output_file_normalized, index=False)
    print(f"\nNormalized data saved to {output_file_normalized}")

except Exception as e:
    # NOTE(review): every failure is reduced to this one printed line -- the
    # traceback is discarded and nothing is re-raised, so the cell still
    # finishes without error even when no output file was written.
    print(f"An error occurred: {e}")


Original DataFrame:
                            normalized_transcription
0  Printing, in the only sense with which we are ...
1                     in being comparatively modern.
2  For although the Chinese took impressions from...
3  produced the block books, which were the immed...
4  the invention of movable metal letters in the ...


  dataframe = dataframe.applymap(normalize_text)



Normalized data saved to normalized_text.csv


PHONEME CONVERSION

In [2]:
import pandas as pd
import nltk
import string

# Download CMU Pronouncing Dictionary (runs on every execution of this cell)
nltk.download('cmudict')

# Load CMU Pronouncing Dictionary.
# text_to_phonemes() looks words up by their lowercased form and takes entry [0]
# of the value, treating it as a sequence of phoneme symbols.
phoneme_dict = nltk.corpus.cmudict.dict()

# Input: the CSV written by the normalization step; output: the phoneme CSV
input_file_normalized = 'normalized_text.csv'
output_file_phonemes = 'phoneme_text2.csv'

# Function to convert text to ARPAbet phonemes with fallback for unknown words
def text_to_phonemes(text):
    """Convert text to a space-separated string of phoneme symbols.

    Each whitespace-separated word is stripped of surrounding punctuation and
    looked up (lowercased) in ``phoneme_dict``; the first pronunciation variant
    is used. A word that is not found but is a hyphenated compound whose parts
    are all known (e.g. "twenty-three") is pronounced part by part. Anything
    else becomes a single ``<UNK_word>`` token. A ``<PAUSE>`` token follows
    every word that ends in a punctuation character.

    Args:
        text: Text to convert. Non-string values are returned unchanged.

    Returns:
        The phoneme string, or ``text`` itself when it is not a string.
    """
    if not isinstance(text, str):
        return text

    phoneme_sequence = []
    for word in text.split():
        word_clean = word.strip(string.punctuation)
        if word_clean:
            word_lower = word_clean.lower()
            if word_lower in phoneme_dict:
                # Use the first pronunciation variant (stress digits are kept)
                phoneme_sequence.extend(phoneme_dict[word_lower][0])
            else:
                # Hyphenated compounds are rarely listed as one entry; fall
                # back to the individual parts, but only when every part is
                # known so unknown words still yield exactly one <UNK_...>.
                parts = [part for part in word_lower.split('-') if part]
                if len(parts) > 1 and all(part in phoneme_dict for part in parts):
                    for part in parts:
                        phoneme_sequence.extend(phoneme_dict[part][0])
                else:
                    phoneme_sequence.append(f"<UNK_{word_clean}>")  # Unknown word token
        if word[-1] in string.punctuation:  # Add pause for punctuation
            phoneme_sequence.append('<PAUSE>')
    return ' '.join(phoneme_sequence)  # Join phonemes by space

# Apply phoneme conversion
try:
    # Read the CSV named by input_file_normalized
    df_normalized = pd.read_csv(input_file_normalized)
    # Convert each value of the 'normalized_transcription' column to a phoneme string
    df_normalized['phoneme_text'] = df_normalized['normalized_transcription'].apply(text_to_phonemes)
    # Only the new 'phoneme_text' column is written; the source text column is not carried along
    df_normalized[['phoneme_text']].to_csv(output_file_phonemes, index=False)
    print(f"Phoneme-converted data saved to {output_file_phonemes}")
except Exception as e:
    # NOTE(review): any failure is reported as one printed line; no traceback, no re-raise
    print(f"An error occurred: {e}")


[nltk_data] Downloading package cmudict to
[nltk_data]     C:\Users\Albin\AppData\Roaming\nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


Phoneme-converted data saved to phoneme_text2.csv


TOKENIZATION: MAPPING PHONEMES TO IDS

In [4]:
import pandas as pd

# Input: the phoneme CSV (nothing is loaded here; it is read in the try-block below);
# output: the CSV of token-ID lists
input_file_phonemes = 'phoneme_text2.csv'
output_file_tokenized = 'tokenized_phoneme_ids.csv'

# Define a phoneme-to-ID mapping (ensure this aligns with Tacotron2's expected IDs)
# IDs 1-39 are phoneme symbols, 40 is the pause marker, 41 is the unknown marker;
# no symbol is assigned ID 0.
# NOTE(review): the keys carry no stress digits, so a stress-marked symbol such as
# 'AH0' does not match any key directly.
phoneme_to_id = {
    'AA': 1, 'AE': 2, 'AH': 3, 'AO': 4, 'AW': 5, 'AY': 6, 'B': 7, 'CH': 8, 'D': 9, 'DH': 10,
    'EH': 11, 'ER': 12, 'EY': 13, 'F': 14, 'G': 15, 'HH': 16, 'IH': 17, 'IY': 18, 'JH': 19, 
    'K': 20, 'L': 21, 'M': 22, 'N': 23, 'NG': 24, 'OW': 25, 'OY': 26, 'P': 27, 'R': 28, 
    'S': 29, 'SH': 30, 'T': 31, 'TH': 32, 'UH': 33, 'UW': 34, 'V': 35, 'W': 36, 'Y': 37, 
    'Z': 38, 'ZH': 39, '<PAUSE>': 40, '<UNK>': 41
}

# Function to tokenize phoneme sequences
def tokenize_phonemes(phoneme_sequence):
    """Map a space-separated phoneme string to a list of integer IDs.

    Each symbol is looked up in ``phoneme_to_id``. If the exact symbol is not
    present, trailing stress digits (0/1/2, as in "AH0" or "IY1") are removed
    and the bare symbol is looked up instead. Symbols that still do not match
    (including ``<UNK_word>`` tokens) get the ``<UNK>`` ID.

    Args:
        phoneme_sequence: Phoneme string; any non-string value (e.g. NaN from
            an empty CSV cell) is treated as an empty sequence.

    Returns:
        List of integer token IDs, empty for non-string input.
    """
    if not isinstance(phoneme_sequence, str):
        return []

    unk_id = phoneme_to_id['<UNK>']
    token_ids = []
    for symbol in phoneme_sequence.split():  # Split by space
        token_id = phoneme_to_id.get(symbol)
        if token_id is None:
            # The ID table has no stress-marked entries; without this fallback
            # every stress-marked vowel would collapse to <UNK>.
            token_id = phoneme_to_id.get(symbol.rstrip('012'), unk_id)
        token_ids.append(token_id)
    return token_ids

# Apply tokenization
try:
    # Read the CSV named by input_file_phonemes
    df_phonemes = pd.read_csv(input_file_phonemes)
    # One list of integer IDs per row of the 'phoneme_text' column
    df_phonemes['tokenized_phonemes'] = df_phonemes['phoneme_text'].apply(tokenize_phonemes)
    # Only the ID column is written; each list is stored in the CSV as its text form
    df_phonemes[['tokenized_phonemes']].to_csv(output_file_tokenized, index=False)
    print(f"Tokenized phoneme data saved to {output_file_tokenized}")
except Exception as e:
    # NOTE(review): any failure is reported as one printed line; no traceback, no re-raise
    print(f"An error occurred: {e}")


Tokenized phoneme data saved to tokenized_phoneme_ids.csv


PADDING AND SEQUENCE LENGTH HANDLING

In [5]:
import pandas as pd
import torch

# Load the tokenized phoneme data. The 'tokenized_phonemes' column is parsed from
# text by clean_and_convert() below, which expects strings such as "[20, 3, 40]".
phoneme_data = pd.read_csv('tokenized_phoneme_ids.csv')

# Function to pad sequences
def pad_sequences(sequences, pad_value=0):
    """Right-pad every sequence to the length of the longest one.

    Args:
        sequences: Iterable of sequences (lists, tuples, a pandas Series of
            lists, or a one-shot iterator). May be empty.
        pad_value: Value appended to shorter sequences. Defaults to 0.

    Returns:
        Tuple ``(padded_sequences, lengths)``: a list of equal-length lists
        and an int64 ``torch.Tensor`` holding each sequence's original length.
        Empty input yields ``([], tensor([], dtype=torch.int64))``.
    """
    # Materialize once: the input is traversed more than once below, which
    # would silently drop everything if it were a one-shot iterator.
    sequences = [list(seq) for seq in sequences]
    lengths = [len(seq) for seq in sequences]
    # default=0 keeps an empty batch from raising ValueError in max()
    max_length = max(lengths, default=0)
    padded_sequences = [
        seq + [pad_value] * (max_length - len(seq)) for seq in sequences
    ]
    return padded_sequences, torch.tensor(lengths, dtype=torch.int64)

# Clean and convert tokenized sequences
def clean_and_convert(phoneme_str):
    """Parse the text form of an ID list (e.g. "[20, 3, 40]") into a list of ints.

    Surrounding square brackets and any single quotes are removed, the rest is
    split on commas, and every non-blank piece is converted with ``int``.
    """
    inner = phoneme_str.strip("[]")
    inner = inner.replace("'", "")
    ids = []
    for piece in inner.split(","):
        piece = piece.strip()
        if piece:  # skip blanks, e.g. the single empty piece produced by "[]"
            ids.append(int(piece))
    return ids

# Apply cleaning and padding: parse each stored ID list back into a list of ints,
# then pad all of them to a common length (pad_sequences pads with 0 by default)
phoneme_sequences = phoneme_data['tokenized_phonemes'].apply(clean_and_convert)
padded_sequences, sequence_lengths = pad_sequences(phoneme_sequences)

# Save the padded sequences together with their original (unpadded) lengths.
# Each 'padded_phonemes' cell holds a whole Python list, so it is written to the
# CSV as that list's text form.
padded_df = pd.DataFrame({
    'padded_phonemes': padded_sequences,
    'lengths': sequence_lengths.numpy()
})
padded_df.to_csv('padded_phoneme_sequences.csv', index=False)

print("Padded sequences and lengths saved.")


Padded sequences and lengths saved.
