In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
import pandas as pd

LSTM MODEL

In [None]:
class WordleLSTM(nn.Module):
    """
    LSTM-based network that predicts the five letters of a Wordle target word
    at every step of a game.

    Architecture:
    - Embedding: maps each integer feature of a timestep to a dense vector.
      One embedding table is shared by all features of the timestep.
    - LSTM: consumes the concatenated feature embeddings of each timestep, so
      the prediction at a step can depend on all earlier steps of the game.
    - Dropout: applied after the LSTM and after the first linear layer.
    - Two linear layers: turn each LSTM output into 5 * vocab_size logits,
      i.e. one distribution over the vocabulary per letter position.
    - Packed sequences: padded timesteps are skipped by the LSTM.

    Args:
        vocab_size (int): Number of distinct integer codes / output classes.
        embedding_dim (int): Size of each embedding vector.
        input_dim (int): Number of integer features per timestep.
        hidden_dim (int): LSTM hidden size and width of the first linear layer.
        num_layers (int): Number of stacked LSTM layers.
        dropout (float): Dropout probability.
    """
    def __init__(
        self,
        vocab_size=29,
        embedding_dim=16,
        input_dim=10,
        hidden_dim=128,
        num_layers=2,
        dropout=0.3
    ):

        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.output_dim = 5 * vocab_size  # 5 letter positions * vocab_size classes

        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # All feature embeddings of a timestep are concatenated into one vector.
        self.lstm_input_dim = input_dim * embedding_dim

        self.lstm = nn.LSTM(
            self.lstm_input_dim,
            hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            # nn.LSTM only applies dropout between stacked layers; with a
            # single layer it has no effect and merely triggers a warning.
            dropout=dropout if num_layers > 1 else 0.0
        )

        self.post_lstm_dropout = nn.Dropout(p=dropout)

        self.fc1 = nn.Linear(hidden_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, self.output_dim)

    def forward(self, x, lengths):
        """
        Compute per-step letter logits for a padded batch of games.

        Args:
            x (torch.Tensor): Integer tensor of shape (batch, seq_len, input_dim).
            lengths (torch.Tensor): Number of real (non-padded) steps per game.

        Returns:
            torch.Tensor: Logits of shape (batch, seq_len, 5, vocab_size). The
            time axis always equals the input's seq_len, so the output lines up
            with the padded inputs/labels even when the longest sequence in the
            batch is shorter than seq_len.
        """
        batch_size, seq_len, _ = x.size()

        # (batch, seq_len, input_dim, embedding_dim) -> (batch, seq_len, input_dim * embedding_dim)
        embedded = self.embedding(x)
        embedded = embedded.reshape(batch_size, seq_len, -1)

        # pack_padded_sequence requires the lengths to live on the CPU.
        packed_x = nn.utils.rnn.pack_padded_sequence(
            embedded,
            lengths.cpu(),
            batch_first=True,
            enforce_sorted=False
        )
        packed_out, _ = self.lstm(packed_x)
        # total_length keeps the padded time axis at seq_len; without it the
        # output would be trimmed to max(lengths).
        out, _ = nn.utils.rnn.pad_packed_sequence(
            packed_out,
            batch_first=True,
            total_length=seq_len
        )

        out = self.post_lstm_dropout(out)

        out = F.relu(self.fc1(out))
        out = self.post_lstm_dropout(out)
        out = self.fc2(out)

        # Split the flat 5 * vocab_size logits into one row per letter position.
        return out.view(batch_size, seq_len, 5, -1)

TRAIN AND TEST FUNCTIONS

In [None]:
def train(model, dataloader, optimizer, criterion, device):
    """
    Trains the given model for one epoch using the provided DataLoader.

    For every batch this function:
    - Moves the data to the target device (CPU/GPU)
    - Computes model predictions (logits)
    - Keeps only the real timesteps of each sequence (padding is excluded
      from both the loss and the accuracy)
    - Calculates the loss, backpropagates and updates the weights
    - Accumulates the loss and the number of correct letter predictions

    Args:
        model (torch.nn.Module): Model called as ``model(X, lengths)`` and
            returning logits of shape (batch, steps, letters, num_classes)
        dataloader (iterable): Yields ``(X, Y, lengths)`` batches, with Y of
            shape (batch, steps, letters) and lengths of shape (batch,)
        optimizer (torch.optim.Optimizer): Optimization algorithm (e.g., Adam, SGD)
        criterion (torch.nn.Module): Loss returning a scalar (e.g., CrossEntropyLoss)
        device (torch.device): Target device ("cpu" or "cuda")

    Returns:
        avg_loss (float): Mean of the per-batch losses (0.0 if there were no batches)
        accuracy (float): Per-letter accuracy over all real timesteps (in %)
    """
    model.train()
    total_loss = 0.0
    correct_predictions = 0
    total_predictions = 0
    num_batches = 0

    for X, Y, lengths in dataloader:
        X, Y, lengths = X.to(device), Y.to(device), lengths.to(device)
        optimizer.zero_grad()

        logits = model(X, lengths)

        # Boolean (batch, steps) mask that is True only for real timesteps.
        # Padded label rows are zero-filled by the collate step and would
        # otherwise be scored as genuine "class 0" targets.
        steps = logits.size(1)
        step_index = torch.arange(steps, device=logits.device).unsqueeze(0)
        valid_steps = step_index < lengths.to(logits.device).unsqueeze(1)

        # Class count is read from the logits instead of being hard-coded.
        logits_flat = logits[valid_steps].reshape(-1, logits.size(-1))
        Y_flat = Y[:, :steps][valid_steps].reshape(-1)

        loss = criterion(logits_flat, Y_flat)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1
        predicted = logits_flat.argmax(dim=1)
        correct_predictions += (predicted == Y_flat).sum().item()
        total_predictions += Y_flat.size(0)

    avg_loss = total_loss / num_batches if num_batches > 0 else 0.0
    accuracy = 100.0 * correct_predictions / total_predictions if total_predictions > 0 else 0.0
    return avg_loss, accuracy

In [None]:
def test(model, dataloader, criterion, device):
    """
    Evaluates the given model on a validation or test dataset.

    This function performs an evaluation loop:
    - Sets the model to evaluation mode (disables dropout, batchnorm updates)
    - Iterates over all batches in the DataLoader without computing gradients
    - Computes model predictions (logits)
    - Calculates loss and overall accuracy
    - Returns average loss and accuracy across the dataset

    Args:
        model (torch.nn.Module): The model to be evaluated
        dataloader (torch.utils.data.DataLoader): Provides test/validation batches
        criterion (torch.nn.Module): Loss function (e.g., CrossEntropyLoss)
        device (torch.device): Target device ("cpu" or "cuda")

    Returns:
        avg_loss (float): Average loss over all batches
        accuracy (float): Overall prediction accuracy (in %)
    """
    model.eval()
    total_loss = 0.0
    correct_predictions = 0
    total_predictions = 0

    with torch.no_grad():
        for X, Y, lengths in dataloader:
            X, Y, lengths = X.to(device), Y.to(device), lengths.to(device)

            logits = model(X, lengths)
            logits_flat = logits.view(-1, 29)
            Y_flat = Y.view(-1)

            loss = criterion(logits_flat, Y_flat)
            total_loss += loss.item()

            _, predicted = torch.max(logits_flat, dim=1)
            correct_predictions += (predicted == Y_flat).sum().item()
            total_predictions += Y_flat.size(0)

    avg_loss = total_loss / len(dataloader) if len(dataloader) > 0 else 0.0
    accuracy = 100.0 * correct_predictions / total_predictions if total_predictions > 0 else 0.0
    return avg_loss, accuracy

Sequence Dataset 

In [None]:
class WordleSequenceDataset(Dataset):
    """
    PyTorch Dataset that turns a flat table of Wordle attempts into one
    sequence per game.

    Rows are grouped by 'gameid' and ordered by 'attempt_index'. Each game
    becomes a pair of long tensors:
    - inputs of shape (num_attempts, 10), read from pg1..pg5 (previous guess
      letters) and l1..l5 (per-letter feedback)
    - targets of shape (num_attempts, 5), read from t1..t5 (target word letters)
    """
    def __init__(self, data_frame):
        super().__init__()

        feature_columns = [
            "pg1", "pg2", "pg3", "pg4", "pg5",
            "l1", "l2", "l3", "l4", "l5",
        ]
        target_columns = ["t1", "t2", "t3", "t4", "t5"]

        self.sequences = []
        self.labels = []
        self.lengths = []

        # One iteration per game; the group key itself is not needed.
        for _, game_rows in data_frame.groupby("gameid"):
            ordered = game_rows.sort_values(by="attempt_index")

            features = torch.tensor(ordered[feature_columns].values, dtype=torch.long)
            targets = torch.tensor(ordered[target_columns].values, dtype=torch.long)

            self.sequences.append(features)
            self.labels.append(targets)
            # Number of attempts in this game, used later for padding/packing.
            self.lengths.append(features.size(0))

    def __len__(self):
        # One item per game.
        return len(self.sequences)

    def __getitem__(self, idx):
        # (inputs, targets, number of attempts) for the idx-th game.
        return self.sequences[idx], self.labels[idx], self.lengths[idx]


def collate_fn(batch):
    """
    Collate function for DataLoader that batches variable-length games.

    Args:
        batch: List of samples, each indexable as (X_seq, Y_seq, seq_len).

    Returns:
        Tuple (padded_X, padded_Y, lengths): the input and label sequences
        zero-padded along the time axis to the longest game in the batch
        (batch dimension first), and a long tensor with the original lengths.
    """
    feature_seqs, target_seqs, step_counts = [], [], []
    for sample in batch:
        feature_seqs.append(sample[0])
        target_seqs.append(sample[1])
        step_counts.append(sample[2])

    # pad_sequence fills the missing timesteps with zeros by default.
    return (
        pad_sequence(feature_seqs, batch_first=True),
        pad_sequence(target_seqs, batch_first=True),
        torch.tensor(step_counts, dtype=torch.long),
    )

In [None]:
def prepare_input_dataLoader(csv_path, batch_size, test_ratio, shuffle, random_state=None):
    """
    Load the game log CSV and build train/test DataLoaders split by game.

    The split is made on unique 'gameid' values, so all attempts of one game
    end up on the same side of the split.

    Args:
        csv_path (str): Path of the CSV file to read.
        batch_size (int): Number of games per batch for both loaders.
        test_ratio (float): Passed to train_test_split as ``test_size``.
        shuffle (bool): Whether train_test_split shuffles the game IDs before
            splitting. The training DataLoader itself always shuffles.
        random_state (int | None): Seed for the split. Pass an integer to get
            the same train/test partition on every run; the default (None)
            keeps the previous unseeded behavior.

    Returns:
        (train_loader, test_loader): DataLoaders over WordleSequenceDataset.

    Raises:
        ValueError: If the CSV contains any missing value.
    """
    # Read the CSV file containing the dataset
    df = pd.read_csv(csv_path)

    if df.isnull().any().any():
        # Message is Turkish for "There are missing values in the CSV file!"
        raise ValueError("CSV dosyasında eksik değerler var!")
    # Get all unique game IDs (each game corresponds to a sequence of guesses)
    all_gameids = df['gameid'].unique()
    # Split the game IDs into training and test sets
    train_ids, test_ids = train_test_split(
        all_gameids,
        test_size=test_ratio,
        shuffle=shuffle,
        random_state=random_state,
    )
    train_df = df[df['gameid'].isin(train_ids)]
    test_df = df[df['gameid'].isin(test_ids)]
    # Create dataset objects for training and testing
    train_dataset = WordleSequenceDataset(train_df)
    test_dataset = WordleSequenceDataset(test_df)
    # Training batches are reshuffled every epoch; test order stays fixed
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
    return train_loader, test_loader

In [None]:
def main(csv_path="",
         epochs=1,
         batch_size=1,
         lr=1e-3,
         test_ratio=0.2,
         shuffle=True,
         save_path="tr_LSTMmodel_100epoch.pth"):
    """
    Train a WordleLSTM on the game log at ``csv_path`` and save its weights.

    Args:
        csv_path (str): Path of the CSV game log.
        epochs (int): Number of passes over the training set.
        batch_size (int): Number of games per batch.
        lr (float): Learning rate for the Adam optimizer.
        test_ratio (float): Fraction of games held out for evaluation.
        shuffle (bool): Whether the game IDs are shuffled before splitting.
        save_path (str): File the trained ``state_dict`` is written to. The
            default keeps the original file name.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Running on: {device}")
    train_loader, test_loader = prepare_input_dataLoader(csv_path=csv_path,batch_size=batch_size,test_ratio=test_ratio,shuffle=shuffle)
    # Model, optimizer, loss
    model = WordleLSTM(
        vocab_size=29,      # number of letter codes / output classes
        embedding_dim=16,   # embedding dimension
        input_dim=10,       # integer features per attempt
        hidden_dim=256,     # LSTM hidden dimension
        num_layers=4,       # number of stacked LSTM layers
        dropout=0.3         # dropout probability
    ).to(device)
    lstm_optimizer = optim.Adam(model.parameters(), lr=lr)
    lstm_criterion = nn.CrossEntropyLoss()
    # The model is evaluated on the held-out games after every epoch.
    for epoch in range(epochs):
        lstm_train_loss, lstm_train_acc = train(model, train_loader, lstm_optimizer, lstm_criterion, device)
        lstm_test_loss, lstm_test_acc = test(model, test_loader, lstm_criterion, device)
        print("-" * 50)
        print(f"Epoch [{epoch+1}/{epochs}]")
        print(f"LSTM  Train Loss: {lstm_train_loss:.4f}, Train Acc: {lstm_train_acc:.2f}%")
        print(f"LSTM  Test  Loss: {lstm_test_loss:.4f}, Test  Acc: {lstm_test_acc:.2f}%")

    torch.save(model.state_dict(), save_path)

In [None]:
if __name__ == "__main__":
    # Full training run on the game log.
    main(
        # data
        csv_path="/content/turkishgamelog1000000.csv",
        test_ratio=0.2,
        shuffle=True,
        # optimisation
        batch_size=256,
        lr=1e-4,
        epochs=100,
    )
