<a href="https://colab.research.google.com/github/Brutusa/HuggingFaceNLP/blob/main/NLP_Phoneme_Predictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
# @title Install Necessary Dependencies
# Third-party packages imported by the cells below (torch is not installed
# here, so it is assumed to be preinstalled in the runtime — TODO confirm
# outside Colab).
!pip install transformers
!pip install huggingface_hub
!pip install keras_preprocessing
!pip install pronouncing
# Using earlier version of numpy for use with 'keras_preprocessing'.
# This pin is installed last, so it replaces whatever numpy is already present.
!pip install numpy==1.26.4

Collecting numpy==1.26.4
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m81.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.23.5
    Uninstalling numpy-1.23.5:
      Successfully uninstalled numpy-1.23.5
Successfully installed numpy-1.26.4


In [16]:
# @title Authenticate Hugging Face API
from google.colab import userdata
from huggingface_hub import login

# Fetch token from Colab's userdata storage.
# userdata.get() raises (instead of returning None) when the secret does not
# exist or this notebook has not been granted access to it. Both cases are
# mapped to "no token" so the guidance message below is actually shown rather
# than an unhandled traceback.
try:
    hf_token = userdata.get("HF_TOKEN")
except (userdata.SecretNotFoundError, userdata.NotebookAccessError):
    hf_token = None

# Authenticate automatically
if hf_token:
    login(token=hf_token)
    print("Hugging Face login successful!")
else:
    print("No token found. Please add your Hugging Face token to userdata.")

Hugging Face login successful!


In [None]:
# @title Run Program
import torch
import torch.nn as nn
import torch.optim as optim
from keras_preprocessing.sequence import pad_sequences
import numpy as np

# Import external files from github
!wget -O helpers.py https://raw.githubusercontent.com/Brutusa/HuggingFaceNLP/main/helpers.py
!wget -O words_list.txt https://raw.githubusercontent.com/Brutusa/HuggingFaceNLP/main/words_list.txt
!wget -O words.txt https://raw.githubusercontent.com/Brutusa/HuggingFaceNLP/main/words.txt

from helpers import Helpers
from huggingface_hub import InferenceClient
import os
import pronouncing
from transformers import AutoTokenizer, AutoModelForCausalLM
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
import pronouncing

class Helpers:
    """File and pronunciation helpers used to build the training data.

    NOTE(review): this definition replaces the ``Helpers`` imported from
    ``helpers.py`` earlier in the same cell.
    """

    def read_words_from_file(self, filename):
        """Read one word per line from a text file.

        Args:
            filename: Path of the text file to read.

        Returns:
            A list with one entry per line, each stripped of leading and
            trailing whitespace. Blank lines are kept as empty strings.
        """
        with open(filename, 'r') as file:
            # Iterate the file object directly; there is no need to build the
            # intermediate list that readlines() would create.
            return [line.strip() for line in file]

    def get_phonetic_representation(self, words):
        """Pair each word with its first pronunciation.

        Args:
            words: Iterable of words to look up with ``pronouncing``.

        Returns:
            A list of ``(word, phones)`` tuples where ``phones`` is the first
            entry returned by ``pronouncing.phones_for_word``. Words for which
            the lookup returns nothing are omitted.
        """
        phonetic_data = []
        for word in words:
            # Look each word up once and reuse the result for both the
            # "has a pronunciation" check and the value that is stored.
            phones = pronouncing.phones_for_word(word)
            if phones:
                phonetic_data.append((word, phones[0]))
        return phonetic_data

# NOTE: `help` shadows the built-in help(); the name is kept unchanged so any
# other code that refers to it keeps working.
help = Helpers()

# Load words and phonetic representations
words = help.read_words_from_file('words_list.txt')
phonetic_data = help.get_phonetic_representation(words)
# Show a summary instead of the full list: the word list is several megabytes,
# and printing every (word, phones) pair floods the notebook output.
print(f"Phonetic Data: {len(phonetic_data)} entries, first 5: {phonetic_data[:5]}")

# Generate phoneme sequences for training
def create_sequences(phonetic_data):
    """Expand each pronunciation into (prefix, next phoneme) training pairs.

    Args:
        phonetic_data: Iterable of ``(word, phonetic)`` pairs where
            ``phonetic`` is a whitespace-separated phoneme string.

    Returns:
        A list of ``(input_seq, target_phoneme)`` tuples. For phonemes
        ``p0 .. pn`` it contains ``([p0], p1)``, ``([p0, p1], p2)`` and so on;
        pronunciations with fewer than two phonemes contribute nothing.
    """
    pairs = []
    for _word, phonetic in phonetic_data:
        tokens = phonetic.split()
        # Every position after the first is a target; everything before it is the input.
        pairs.extend((tokens[:cut], tokens[cut]) for cut in range(1, len(tokens)))
    return pairs

# Get sequences and create phoneme mappings
sequences = create_sequences(phonetic_data)
all_phonemes = set(p for _, phonetic in phonetic_data for p in phonetic.split())
# Assign ids in sorted order. Iterating the raw set yields an order that can
# differ between Python processes (string hashing is randomized), which would
# make the ids — and any weights saved against them — meaningless in a later
# session.
phoneme_to_int = {p: i for i, p in enumerate(sorted(all_phonemes))}
int_to_phoneme = {i: p for p, i in phoneme_to_int.items()}
# NOTE(review): ids start at 0, which is also the fill value pad_sequences uses
# by default, so the phoneme mapped to 0 cannot be told apart from padding.
# Reserving 0 for padding would also require enlarging the embedding/output
# sizes derived from len(phoneme_to_int), so it is not changed here.

# Encode sequences
def encode_sequence(sequence, phoneme_to_int):
    """Translate phoneme symbols into their integer ids.

    Args:
        sequence: Iterable of phoneme symbols.
        phoneme_to_int: Mapping from phoneme symbol to integer id.

    Returns:
        A list of ids in the same order as ``sequence``.

    Raises:
        KeyError: If a symbol is missing from ``phoneme_to_int``.
    """
    encoded = []
    for symbol in sequence:
        encoded.append(phoneme_to_int[symbol])
    return encoded

encoded_sequences = [
    (encode_sequence(prefix, phoneme_to_int), phoneme_to_int[nxt])
    for prefix, nxt in sequences
]
# Width of the padded input matrix: the longest encoded prefix
max_seq_len = max(len(pair[0]) for pair in encoded_sequences)

# Ask PyTorch's CUDA allocator to use expandable segments
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

# Release cached GPU memory, then pick the GPU when one is available
torch.cuda.empty_cache()
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Split the encoded pairs into inputs and targets
X = [pair[0] for pair in encoded_sequences]
y = [pair[1] for pair in encoded_sequences]
# Left-pad every input to max_seq_len
X_padded = pad_sequences(X, maxlen=max_seq_len, padding='pre')

# Integer index tensors, moved to the selected device
X_tensor = torch.tensor(X_padded, dtype=torch.long).to(device)
y_tensor = torch.tensor(y, dtype=torch.long).to(device)

# Define LSTM model
class LSTMPhonemePredictor(nn.Module):
    """Embedding -> LSTM -> linear head that scores the next phoneme.

    Args:
        input_size: Number of rows in the embedding table.
        hidden_size: Width of both the embeddings and the LSTM hidden state.
        output_size: Number of scores produced per input sequence.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=hidden_size)
        self.lstm = nn.LSTM(input_size=hidden_size, hidden_size=hidden_size, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Return one score vector per sequence in the batch.

        Args:
            x: Integer index tensor; the LSTM is batch-first, so the layout is
                (batch, time).

        Returns:
            The linear head applied to the LSTM output at the last time step.
        """
        embedded = self.embedding(x)
        hidden_states, _final_state = self.lstm(embedded)
        last_step = hidden_states[:, -1, :]
        return self.fc(last_step)

# Model dimensions: vocabulary-sized embedding and output, 128-wide hidden state
hidden_size = 128
input_size = output_size = len(phoneme_to_int)

# Build the network and place it on the selected device
model = LSTMPhonemePredictor(input_size, hidden_size, output_size).to(device)

# Loss and optimizer (RMSprop; the Adam alternative is kept for reference)
criterion = nn.CrossEntropyLoss()
#optimizer = optim.Adam(model.parameters(), lr=0.001)
optimizer = optim.RMSprop(params=model.parameters(), lr=0.001)

# Mini-batches of 32 samples, reshuffled every epoch
batch_size = 32
dataset = TensorDataset(X_tensor, y_tensor)
data_loader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)

# Train for a fixed number of epochs
num_epochs = 50
for epoch in tqdm(range(num_epochs)):
    model.train()
    batch_losses = []

    # One optimisation step per mini-batch
    for inputs, targets in data_loader:
        inputs, targets = inputs.to(device), targets.to(device)

        optimizer.zero_grad()
        batch_loss = criterion(model(inputs), targets)
        batch_loss.backward()
        optimizer.step()

        # Keep the scalar loss for the epoch report
        batch_losses.append(batch_loss.item())

    # Every tenth epoch, report the mean batch loss
    if epoch % 10 == 0:
        avg_loss = sum(batch_losses) / len(data_loader)
        print(f'Epoch [{epoch}/{num_epochs}], Loss: {avg_loss:.4f}')

# Persist the trained weights (state_dict only)
model_save_path = 'lstm_phoneme_predictor.pth'
torch.save(model.state_dict(), model_save_path)
print(f'Model saved to {model_save_path}')

# Function to load model for inference
def load_model(model_path, input_size, hidden_size, output_size, map_location="cpu"):
    """Rebuild an LSTMPhonemePredictor and load saved weights for inference.

    Args:
        model_path: Path of a state_dict file written with ``torch.save``.
        input_size: Embedding vocabulary size the model was trained with.
        hidden_size: Hidden width the model was trained with.
        output_size: Output size the model was trained with.
        map_location: Device onto which the saved tensors are deserialised.
            Defaults to ``"cpu"`` so a checkpoint saved from a GPU model can
            still be loaded on a runtime without CUDA.

    Returns:
        The model in eval mode. It is constructed on the CPU; callers move it
        with ``.to(device)`` as needed.
    """
    model = LSTMPhonemePredictor(input_size, hidden_size, output_size)
    # The freshly built model lives on the CPU, so the checkpoint is mapped
    # there as well; load_state_dict then copies values into its parameters.
    state_dict = torch.load(model_path, map_location=map_location, weights_only=True)
    model.load_state_dict(state_dict)
    model.eval()
    return model

# Reload the saved weights and place the inference model on the selected device
loaded_model = load_model(model_save_path, input_size, hidden_size, output_size).to(device)

# Inference function
def predict_phoneme(model, input_seq):
    """Predict the phoneme that follows ``input_seq``.

    Args:
        model: Network that maps a (1, max_seq_len) index tensor to scores.
        input_seq: List of phoneme symbols; each must be a key of the
            module-level ``phoneme_to_int``.

    Returns:
        The phoneme symbol whose score is highest.
    """
    ids = encode_sequence(input_seq, phoneme_to_int)
    # Same left-padding and width as the training inputs
    batch = pad_sequences([ids], maxlen=max_seq_len, padding='pre')
    batch_tensor = torch.tensor(batch, dtype=torch.long).to(device)

    # Inference only: no gradients are needed for the forward pass
    with torch.no_grad():
        scores = model(batch_tensor)
    best_index = torch.argmax(scores, dim=1).item()
    return int_to_phoneme[best_index]


# Phoneme prefixes to complete; each trailing comment names the source word
# and the phoneme expected next.
test_inputs = [
    ['P', 'R', 'IH0', 'D', 'IH1', 'K'],                      # 'predict' -> "T"
    ['AH0', 'T', 'EH1', 'N'],                                # 'attend' -> "D"
    ['K', 'AH0', 'M', 'Y', 'UW1', 'N', 'AH0', 'K', 'EY2'],   # 'communicate' -> "T"
    ['M', 'OW1', 'T', 'IH0', 'V', 'EY2'],                    # 'motivate' -> "T"
    ['D', 'IH0', 'V', 'EH1', 'L', 'AH0'],                    # 'develop' -> "P"
    ['IH0', 'S', 'T', 'AE1', 'B', 'L', 'IH0'],               # 'establish' -> "SH"
    ['IH2', 'N', 'S', 'T', 'R', 'AH1', 'K'],                 # 'instruct' -> "T"
    ['R', 'AH0', 'F', 'AY1'],                                # 'refine' -> "N"
    ['K', 'AA1', 'M', 'P', 'L', 'AH0', 'K', 'EY2'],          # 'complicate' -> "T"
    ['P', 'R', 'AE1', 'K', 'T', 'AH0'],
    ['K', 'OW1', 'D', 'IH0'],
    ['N', 'AE1', 'CH', 'ER0', 'AH0'],
    ['L', 'AE1', 'NG', 'G', 'W', 'AH0'],
    ['K', 'UW1'],
    ['T', 'EH1', 'S', 'T', 'IH0'],
    ['K', 'OW1'],
]

# Run the reloaded model on every prefix and collect one report line per prefix
compiled_results = "".join(
    f"Original: {test_input}, Predicted next phoneme: {predict_phoneme(loaded_model, test_input)}\n"
    for test_input in test_inputs
)

print(compiled_results)

# Source words for the prefixes in test_inputs, in the same order.
# The trailing comments repeat the prefix that test_inputs uses for each word.
words = [
    'predict',      # ['P', 'R', 'IH0', 'D', 'IH1', 'K']
    'attend',       # ['AH0', 'T', 'EH1', 'N']
    'communicate',  # ['K', 'AH0', 'M', 'Y', 'UW1', 'N', 'AH0', 'K', 'EY2']
    'motivate',     # ['M', 'OW1', 'T', 'IH0', 'V', 'EY2']
    'develop',      # ['D', 'IH0', 'V', 'EH1', 'L', 'AH0']
    'establish',    # ['IH0', 'S', 'T', 'AE1', 'B', 'L', 'IH0']
    'instruct',     # ['IH2', 'N', 'S', 'T', 'R', 'AH1', 'K']
    'refine',       # ['R', 'AH0', 'F', 'AY1']
    'complicate',   # ['K', 'AA1', 'M', 'P', 'L', 'AH0', 'K', 'EY2']
    'practice',     # ['P', 'R', 'AE1', 'K', 'T', 'AH0']
    'coding',       # ['K', 'OW1', 'D', 'IH0']
    'natural',      # ['N', 'AE1', 'CH', 'ER0', 'AH0']
    'language',     # ['L', 'AE1', 'NG', 'G', 'W', 'AH0']
    'cool',         # ['K', 'UW1']
    'testing',      # ['T', 'EH1', 'S', 'T', 'IH0']
    'code',         # ['K', 'OW1']
]

# Show the first pronunciation the `pronouncing` lookup returns for each word
for word in words:
    pronunciations = pronouncing.phones_for_word(word)
    if not pronunciations:
        # Lookup returned nothing for this word
        print(f"No phonemic spelling found for '{word}'")
        continue
    print(f"The full phonemic spelling for '{word}' is: {pronunciations[0]}")

# Labelled evaluation set: (phoneme prefix, phoneme expected to come next)
test_cases = [
    (['P', 'R', 'IH0', 'D', 'IH1', 'K'], 'T'),
    (['AH0', 'T', 'EH1', 'N'], 'D'),
    (['K', 'AH0', 'M', 'Y', 'UW1', 'N', 'AH0', 'K', 'EY2'], 'T'),
    (['M', 'OW1', 'T', 'IH0', 'V', 'EY2'], 'T'),
    (['D', 'IH0', 'V', 'EH1', 'L', 'AH0'], 'P'),
    (['IH0', 'S', 'T', 'AE1', 'B', 'L', 'IH0'], 'SH'),
    (['IH2', 'N', 'S', 'T', 'R', 'AH1', 'K'], 'T'),
    (['R', 'AH0', 'F', 'AY1'], 'N'),
    (['K', 'AA1', 'M', 'P', 'L', 'AH0', 'K', 'EY2'], 'T'),
    (['P', 'R', 'AE1', 'K', 'T', 'AH0'], 'S'),
    (['K', 'OW1', 'D', 'IH0'], 'NG'),
    (['N', 'AE1', 'CH', 'ER0', 'AH0'], 'L'),
    (['L', 'AE1', 'NG', 'G', 'W', 'AH0'], 'JH'),
    (['K', 'UW1'], 'L'),
    (['T', 'EH1', 'S', 'T', 'IH0'], 'NG'),
    (['K', 'OW1'], 'D'),
]

# Function to evaluate accuracy
def evaluate_accuracy(model, test_cases):
    """Report how often the model predicts the expected next phoneme.

    Prints one line per test case followed by the overall accuracy.

    Args:
        model: Model passed through to ``predict_phoneme``.
        test_cases: Sequence of ``(input_seq, expected_phoneme)`` pairs.

    Returns:
        Accuracy as a percentage between 0 and 100. An empty ``test_cases``
        yields 0.0 instead of raising ZeroDivisionError.
    """
    total_count = len(test_cases)
    if total_count == 0:
        # Nothing to score; bail out before the division below.
        print("Accuracy: 0.00% (no test cases)")
        return 0.0

    correct_count = 0
    for input_seq, expected_phoneme in test_cases:
        # Use prediction function
        predicted_phoneme = predict_phoneme(model, input_seq)
        print(f"Input: {input_seq}, Predicted: {predicted_phoneme}, Expected: {expected_phoneme}")

        if predicted_phoneme == expected_phoneme:
            correct_count += 1

    accuracy = (correct_count / total_count) * 100
    print(f"Accuracy: {accuracy:.2f}%")
    return accuracy

# Score the reloaded model on the labelled prefixes; the result is a percentage (0-100)
accuracy = evaluate_accuracy(loaded_model, test_cases)

--2025-03-20 02:18:34--  https://raw.githubusercontent.com/Brutusa/HuggingFaceNLP/main/helpers.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 632 [text/plain]
Saving to: ‘helpers.py’


2025-03-20 02:18:35 (33.0 MB/s) - ‘helpers.py’ saved [632/632]

--2025-03-20 02:18:35--  https://raw.githubusercontent.com/Brutusa/HuggingFaceNLP/main/words_list.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5328920 (5.1M) [text/plain]
Saving to: ‘words_list.txt’


2025-03-20 02:18:35 (230 MB/s) - ‘words_list.txt’ saved [5328920/5328920]



  2%|▏         | 1/50 [00:36<29:52, 36.58s/it]

Epoch [0/50], Loss: 2.4778


 10%|█         | 5/50 [02:59<26:35, 35.45s/it]