# 1. Data Preparation


In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split, Subset
import pandas as pd
import numpy as np
import torch.optim as optim


# Vocabulary: the 26 lowercase letters followed by the 26 uppercase letters.
# A character's position in this string is its slot in the one-hot vector.
ALPHABET = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
# Lookup table: character -> position in ALPHABET.
char_to_idx = dict(zip(ALPHABET, range(len(ALPHABET))))

def one_hot_encode(letter):
    """Encode a single character as a one-hot float32 vector over ALPHABET.

    Args:
        letter: The character to encode.

    Returns:
        A NumPy array of length ``len(ALPHABET)`` with a 1.0 at the letter's
        position. Characters that are not in ``char_to_idx`` yield an
        all-zero vector.
    """
    encoding = np.zeros(len(ALPHABET), dtype=np.float32)
    position = char_to_idx.get(letter)
    if position is not None:
        encoding[position] = 1.0
    return encoding

class WordDataset(Dataset):
    """Dataset of (one-hot character sequence, label) pairs read from a CSV.

    The CSV must provide a ``word`` column and an ``isEnlgish`` column. The
    misspelling is the exact column name this class looks up, so it has to
    match the file header.
    """

    def __init__(self, csv_file):
        """Read the whole CSV at ``csv_file`` into an in-memory DataFrame."""
        self.data = pd.read_csv(csv_file)  # CSV with columns: word, isEnlgish

    def __len__(self):
        """Return the number of rows in the CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(word_seq, label)`` for the row at position ``idx``.

        Returns:
            word_seq: float32 tensor of shape ``(seq_length, len(char_to_idx))``
                holding one one-hot row per character of the word that is in
                ``char_to_idx``; other characters are dropped. A word with no
                such characters yields shape ``(0, len(char_to_idx))`` so that
                every item has the same trailing dimension.
            label: float32 tensor of shape ``(1,)``; 1 for English, 0 for Korean.
        """
        row = self.data.iloc[idx]
        word = str(row['word'])  # Convert to string in case it's a float
        label = float(row['isEnlgish'])  # 1 for English, 0 for Korean

        # Map characters to vocabulary positions, skipping unknown characters.
        indices = [char_to_idx[ch] for ch in word if ch in char_to_idx]

        # Build the one-hot matrix directly in torch. This keeps the 2-D shape
        # even when `indices` is empty and avoids converting a Python list of
        # NumPy arrays into a tensor, which is slow.
        word_seq = torch.zeros((len(indices), len(char_to_idx)), dtype=torch.float32)
        if indices:
            rows = torch.arange(len(indices))
            cols = torch.tensor(indices, dtype=torch.long)
            word_seq[rows, cols] = 1.0

        label = torch.tensor([label], dtype=torch.float32)  # shape: (1,)
        return word_seq, label


In [2]:
# Sanity check: load 'words.csv' and display the first (word_seq, label) pair.
WordDataset('words.csv')[0]


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "c:\ProgramData\Anaconda3\envs\torchenv\Lib\site-packages\ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "c:\ProgramData\Anaconda3\envs\torchenv\Lib\site-packages\traitlets\config\application.py", line 992, in launch_instance
    app.start()
  File "c:\ProgramData\Anaconda3\envs\torchenv\Lib\site-packages\ipykernel\kernelapp.py", line 711, in start
    self.io_loop.start()
  Fil

(tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]),
 tensor([0.]))

In [3]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    """Collate variable-length samples into one padded batch.

    Args:
        batch: Iterable of ``(sequence, label)`` pairs, where each sequence is
            a tensor whose first dimension is its length.

    Returns:
        A tuple ``(padded_seqs, labels, lengths)``: the sequences zero-padded
        to the longest one with the batch dimension first, the labels stacked
        along a new leading dimension, and a tensor of the original lengths.
    """
    seqs, targets = zip(*batch)
    # Record the true lengths before padding hides them.
    seq_lens = torch.tensor([s.size(0) for s in seqs])
    # batch_first=True -> (batch, max_length, features)
    padded = pad_sequence(seqs, batch_first=True)
    return padded, torch.stack(targets), seq_lens


# 2. Model Definition


In [4]:
class LanguageClassifier(nn.Module):
    """LSTM binary classifier over padded batches of per-character vectors.

    Args:
        input_size: Size of each per-character input vector.
        hidden_size: Number of hidden units in each LSTM layer.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, input_size=52, hidden_size=128, num_layers=1):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x, lengths):
        """Return one value in (0, 1) per sequence.

        Args:
            x: Padded batch of shape ``(batch, max_length, input_size)``.
            lengths: True (unpadded) length of each sequence, as a tensor or
                a list of ints.

        Returns:
            Tensor of shape ``(batch, 1)`` with the sigmoid of the linear
            head applied to the last layer's final hidden state.
        """
        # pack_padded_sequence needs the lengths as a CPU int64 tensor, even
        # when `x` lives on another device; normalise here so callers may
        # move the whole batch to a GPU.
        lengths = torch.as_tensor(lengths, dtype=torch.int64).cpu()
        # Packing makes the LSTM stop at each sequence's true length, so the
        # padding never influences the final hidden state.
        packed = nn.utils.rnn.pack_padded_sequence(
            x, lengths, batch_first=True, enforce_sorted=False
        )
        _, (h_n, _) = self.lstm(packed)
        # h_n is of shape (num_layers, batch, hidden_size); use the last layer.
        h_last = h_n[-1]  # shape: (batch, hidden_size)
        out = self.fc(h_last)  # shape: (batch, 1)
        return self.sigmoid(out)  # Output between 0 and 1


# 3. Training Setup: Data Splits, Model, and Optimizer


In [5]:
# Dataset and DataLoader
dataset = WordDataset("words.csv") # Replace 'words.csv' with your dataset path

# Split sizes: 70% train; the remainder is halved into validation and test,
# with the test split absorbing any odd leftover sample.
total_size = len(dataset)
train_size = int(0.7 * total_size)
remaining_size = total_size - train_size
val_size = remaining_size // 2
test_size = remaining_size - val_size

# Seed the split so the train/validation/test membership is identical on every
# run; otherwise reported validation/test losses are not comparable across runs.
SPLIT_SEED = 42
split_generator = torch.Generator().manual_seed(SPLIT_SEED)

# Use random_split to create training, validation, and test datasets
train_dataset, val_dataset, test_dataset = random_split(
    dataset, [train_size, val_size, test_size], generator=split_generator
)

# Only the training data is shuffled; evaluation order does not affect the loss.
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn) # No need to shuffle val/test
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn) # No need to shuffle val/test


In [None]:
# --- Training configuration: epochs, model, loss, optimizer ---
num_epochs = 20  # number of full passes over the training split

# Single-layer LSTM with 128 hidden units over 52-dimensional inputs.
model = LanguageClassifier(num_layers=1, hidden_size=128, input_size=52)

# LanguageClassifier.forward already ends in a sigmoid, so the loss is plain
# binary cross-entropy on those outputs.
criterion = nn.BCELoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

# 4. Bringing It All Together: Training, Validation, and Test Evaluation


In [7]:
def _evaluate(model, dataloader, criterion):
    """Return the mean per-sample loss of ``model`` over ``dataloader``.

    Puts the model in eval mode and runs without gradient tracking. The model
    is left in eval mode; callers that keep training must call
    ``model.train()`` again. Returns NaN if the loader yields no samples.
    """
    model.eval()
    loss_sum, n_samples = 0.0, 0
    with torch.no_grad():
        for inputs, labels, lengths in dataloader:
            outputs = model(inputs, lengths)
            batch_size = labels.size(0)
            # Assumes `criterion` returns the batch mean (the nn.BCELoss
            # default); re-weight by batch size so a smaller final batch does
            # not count as much as a full one.
            loss_sum += criterion(outputs, labels).item() * batch_size
            n_samples += batch_size
    return loss_sum / n_samples if n_samples else float("nan")


# --- Training and Validation loop ---
for epoch in range(num_epochs):
    model.train()  # validation below switches to eval mode, so reset each epoch
    total_loss, n_train = 0.0, 0
    for inputs, labels, lengths in train_dataloader:
        optimizer.zero_grad()
        outputs = model(inputs, lengths)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        batch_size = labels.size(0)
        total_loss += loss.item() * batch_size
        n_train += batch_size

    avg_train_loss = total_loss / n_train if n_train else float("nan")
    print(f"Epoch {epoch+1}, Training Loss: {avg_train_loss:.4f}", end=" ")

    # --- Validation ---
    avg_val_loss = _evaluate(model, val_dataloader, criterion)
    print(f"Validation Loss: {avg_val_loss:.4f}")


# --- Evaluation on Test set ---
# Leaves the model in eval mode, ready for inference.
avg_test_loss = _evaluate(model, test_dataloader, criterion)
print(f"Test Loss: {avg_test_loss:.4f}")

Epoch 1, Training Loss: 0.3398 Validation Loss: 0.2054
Epoch 2, Training Loss: 0.1840 Validation Loss: 0.1721
Epoch 3, Training Loss: 0.1529 Validation Loss: 0.1712
Epoch 4, Training Loss: 0.1295 Validation Loss: 0.1421
Epoch 5, Training Loss: 0.1140 Validation Loss: 0.1234
Epoch 6, Training Loss: 0.1050 Validation Loss: 0.1162
Epoch 7, Training Loss: 0.0860 Validation Loss: 0.0948
Epoch 8, Training Loss: 0.0715 Validation Loss: 0.0865
Epoch 9, Training Loss: 0.0601 Validation Loss: 0.0721
Epoch 10, Training Loss: 0.0518 Validation Loss: 0.0584
Epoch 11, Training Loss: 0.0441 Validation Loss: 0.0518
Epoch 12, Training Loss: 0.0395 Validation Loss: 0.0473
Epoch 13, Training Loss: 0.0385 Validation Loss: 0.0575
Epoch 14, Training Loss: 0.0328 Validation Loss: 0.0506
Epoch 15, Training Loss: 0.0280 Validation Loss: 0.0472
Epoch 16, Training Loss: 0.0287 Validation Loss: 0.0481
Epoch 17, Training Loss: 0.0249 Validation Loss: 0.0517
Epoch 18, Training Loss: 0.0230 Validation Loss: 0.0450
E

In [27]:
# Example of real-time prediction for a new word
model.eval()
test_word = "hello"

# Keep only the characters the vocabulary knows, as their one-hot positions.
char_indices = [char_to_idx[ch] for ch in test_word if ch in char_to_idx]
if not char_indices:
    # An empty sequence cannot be packed for the LSTM; fail with a clear message.
    raise ValueError(f"'{test_word}' contains no characters from ALPHABET; nothing to classify")

# Build the one-hot matrix directly in torch instead of converting a Python
# list of NumPy arrays, which is slow.
test_seq = torch.zeros((len(char_indices), len(char_to_idx)), dtype=torch.float32)
test_seq[torch.arange(len(char_indices)), torch.tensor(char_indices, dtype=torch.long)] = 1.0
test_seq = test_seq.unsqueeze(0)  # shape: (1, seq_length, 52)
length = torch.tensor([test_seq.shape[1]])

with torch.no_grad():
    prediction = model(test_seq, length)
print(f"Prediction for '{test_word}': {prediction.item():.4f}")

Prediction for 'hello': 0.9997
