# Creation of a dataset.

We create a dataset from the 220k given words to mirror the Hangman game.
For each word we replace all occurrences of a few letters with '_' and use the
letters of the original word as the target (a 26-element multi-hot vector).
Then we save X and y as .npy files, which are used later for training.

In [None]:
import numpy as np
import string
import random
import pandas as pd
import pickle

# Fixed length every encoded word is left-padded to.
MAX_LEN = 35
# Token id used for the padding positions.
PAD_VAL = 0
# Token ids: 'a'..'z' -> 1..26, and the mask character '_' -> 27.
CHAR_MAP = dict(zip(string.ascii_lowercase, range(1, len(string.ascii_lowercase) + 1)))
CHAR_MAP['_'] = 27
# Letters treated as vowels by the word filter and the vowel-only masking.
VOWELS = set("aeiou")

def encode_masked_word(word):
    """Turn a (masked) word into a left-padded list of token ids.

    Each character is looked up in ``CHAR_MAP``; a character that is not a
    key of ``CHAR_MAP`` raises ``KeyError``. The ids are preceded by enough
    ``PAD_VAL`` entries to reach ``MAX_LEN`` items. A word longer than
    ``MAX_LEN`` gets no padding and is returned at its own length.
    """
    token_ids = list(map(CHAR_MAP.__getitem__, word))
    padding = [PAD_VAL] * (MAX_LEN - len(token_ids))
    return padding + token_ids

def encode_target(word):
    """Return a 26-element 0/1 list marking which letters occur in ``word``.

    Index ``CHAR_MAP[letter] - 1`` is set to 1 for every character of
    ``word`` that is a key of ``CHAR_MAP``, except the mask character '_'.
    Characters missing from ``CHAR_MAP`` are ignored.
    """
    presence = [0] * 26
    for letter in word:
        # Skip the mask symbol and anything without a token id.
        if letter == '_' or letter not in CHAR_MAP:
            continue
        presence[CHAR_MAP[letter] - 1] = 1
    return presence

def load_words(filepath):
    """Read one word per line from ``filepath`` and keep the usable ones.

    Lines are stripped and lower-cased; blank lines are dropped. A word is
    kept when its length is between 4 and 29 inclusive, it has more than two
    distinct characters, and it contains at least one vowel from ``VOWELS``.
    """
    kept = []
    with open(filepath, 'r') as handle:
        for raw_line in handle:
            candidate = raw_line.strip().lower()
            if not candidate:
                continue
            if len(candidate) <= 3 or len(candidate) > 29:
                continue
            if len(set(candidate)) <= 2:
                continue
            if VOWELS.isdisjoint(candidate):
                continue
            kept.append(candidate)
    return kept

def permute_all(word, vowel_mode=False):
    """Return masked variants of ``word`` with whole letters replaced by '_'.

    One variant is drawn for each mask size ``k = 1 .. max_mask``: ``k``
    distinct letters are sampled at random and every occurrence of each is
    replaced by '_'. ``max_mask`` is the number of distinct letters minus 2,
    or minus 1 when ``vowel_mode`` is true, so at least two (respectively
    one) distinct letters always stay visible.

    Args:
        word: The string to mask.
        vowel_mode: When true, allow one more letter to be hidden.

    Returns:
        A sorted list of the distinct masked strings; empty when ``word``
        has too few distinct letters for any mask size.
    """
    # Sort the distinct letters: plain set iteration order of strings varies
    # between interpreter runs (hash randomization), which would make the
    # sampled masks irreproducible even after random.seed().
    unique_letters = sorted(set(word))
    max_mask = len(unique_letters) - (1 if vowel_mode else 2)

    masked_words = set()
    for mask_size in range(1, max_mask + 1):
        hidden_letters = random.sample(unique_letters, mask_size)
        # Hide every occurrence of the sampled letters in a single pass.
        mask_table = str.maketrans(dict.fromkeys(hidden_letters, '_'))
        masked_words.add(word.translate(mask_table))

    # Sorted for the same reason as above: a stable, reproducible output order.
    return sorted(masked_words)

def permute_consonants(word):
    """Return variants of ``word`` with every non-vowel hidden.

    The vowels of ``word`` (characters in ``VOWELS``) are extracted, masked
    variants of that vowel string are obtained from
    ``permute_all(..., vowel_mode=True)``, and each variant is written back
    to the original vowel positions. All other positions are '_'.
    """
    vowel_indices = [pos for pos, ch in enumerate(word) if ch in VOWELS]
    vowel_string = "".join(word[pos] for pos in vowel_indices)

    masked_words = []
    for pattern in permute_all(vowel_string, vowel_mode=True):
        # Start fully masked, then restore whatever the pattern still shows.
        slots = ['_'] * len(word)
        for pos, ch in zip(vowel_indices, pattern):
            slots[pos] = ch
        masked_words.append("".join(slots))
    return masked_words

# === Vowel Prior Support ===
def get_vowel_prob(df_vowel, vowel):
    """Return the fraction of words in ``df_vowel`` that contain ``vowel``.

    Args:
        df_vowel: DataFrame whose column ``0`` holds the words.
        vowel: The character to look for.

    Returns:
        The share of rows whose word contains ``vowel``; 0 when the frame
        has no rows.
    """
    words = df_vowel[0]
    # An empty frame has no defined fraction; report 0 explicitly instead of
    # relying on a catch-all exception handler.
    if words.empty:
        return 0
    # The mean of a boolean column is the fraction of True entries, and is
    # simply 0.0 when no word contains the vowel.
    return words.apply(lambda w: vowel in w).mean()

def get_vowel_prior(df_aug):
    """Build a per-word-length table of vowel frequencies.

    ``df_aug`` holds words in column ``0`` and their lengths in column ``1``.
    For every length from 1 up to the largest length present, the result maps
    that length to a two-column DataFrame (vowel, fraction of words of that
    length containing it) sorted by the fraction in descending order.
    """
    vowel_order = ['a', 'e', 'i', 'o', 'u']
    longest = df_aug[1].max()

    priors = {}
    for length in range(1, longest + 1):
        same_length = df_aug[df_aug[1] == length]
        likelihoods = [get_vowel_prob(same_length, v) for v in vowel_order]
        # Two rows (vowels, fractions) transposed into one row per vowel.
        table = pd.DataFrame([
            pd.Series(vowel_order),
            pd.Series(likelihoods)
        ]).T
        priors[length] = table.sort_values(by=1, ascending=False)
    return priors

def save_vowel_prior(vowel_prior):
    """Pickle ``vowel_prior`` to ``prior_probabilities.pkl`` in the working directory."""
    # The context manager guarantees the file is flushed and closed even if
    # pickling fails part-way through.
    with open("prior_probabilities.pkl", "wb") as handle:
        pickle.dump(vowel_prior, handle)

def generate_dataset(words):
    """Build the training arrays from ``words`` and store the vowel priors.

    For each word, the masked variants from ``permute_all`` and
    ``permute_consonants`` are de-duplicated and encoded with
    ``encode_masked_word``. Every variant is paired with the same label, the
    ``encode_target`` vector of the full word. As a side effect the vowel
    prior computed over all words is written via ``save_vowel_prior``.

    Returns:
        A tuple ``(X, y)`` of NumPy arrays built from the encoded inputs and
        their labels.
    """
    inputs = []
    labels = []
    word_lengths = []  # (word, length) rows for the vowel prior calculation

    for word in words:
        label = encode_target(word)
        word_lengths.append((word, len(word)))
        letter_masks = permute_all(word)
        vowel_only_masks = permute_consonants(word)
        for variant in set(letter_masks + vowel_only_masks):
            inputs.append(encode_masked_word(variant))
            labels.append(label)

    save_vowel_prior(get_vowel_prior(pd.DataFrame(word_lengths)))
    print("Saved vowel priors to prior_probabilities.pkl")
    return np.array(inputs), np.array(labels)

def save_dataset(X, y, prefix='hangman'):
    """Write ``X`` and ``y`` to ``<prefix>_X.npy`` and ``<prefix>_y.npy``.

    Prints a one-line summary with both file names and array shapes.
    """
    x_file = f"{prefix}_X.npy"
    y_file = f"{prefix}_y.npy"
    for target, array in ((x_file, X), (y_file, y)):
        np.save(target, array)
    print(f"Saved dataset: {x_file} shape={X.shape}, {y_file} shape={y.shape}")

# === Execute ===
# Read and filter the raw word list.
words = load_words("words_250000_train.txt")
# Build the encoded inputs/labels; this call also writes prior_probabilities.pkl.
X, y = generate_dataset(words)
# Persist the arrays as hangman_X.npy / hangman_y.npy (the default prefix).
save_dataset(X, y)


# Model Training

We build a stacked bidirectional LSTM model and train it on the dataset saved above.

In [None]:
import tensorflow as tf
import numpy as np
import os
from sklearn.model_selection import train_test_split

def create_model():
    """Build the (uncompiled) Keras network for hangman letter prediction.

    Layers, in order: an embedding over integer token ids (sequence length
    35, 64-dimensional vectors, max-norm constrained), two bidirectional LSTM
    layers (128 then 64 units, the second returning only its final output), a
    48-unit ReLU layer with dropout, and a 26-unit sigmoid output, one score
    per letter.
    """
    layers = tf.keras.layers
    stack = [
        layers.Embedding(
            input_dim=64,
            output_dim=64,
            input_length=35,
            embeddings_constraint=tf.keras.constraints.MaxNorm(1),
        ),
        layers.Bidirectional(layers.LSTM(128, return_sequences=True, dropout=0.2)),
        layers.Bidirectional(layers.LSTM(64, return_sequences=False, dropout=0.2)),
        layers.Dense(48, activation='relu'),
        layers.Dropout(0.20),
        layers.Dense(26, activation='sigmoid'),
    ]
    return tf.keras.Sequential(stack)

def train_model(x_path="hangman_X.npy", y_path="hangman_y.npy", epochs=8, batch_size=128,
                val_split=0.02, model_path="bi_lstm.weights.h5"):
    """Train the model from ``create_model`` on the saved dataset.

    Args:
        x_path: Path of the ``.npy`` file holding the encoded inputs.
        y_path: Path of the ``.npy`` file holding the labels.
        epochs: Number of training epochs.
        batch_size: Mini-batch size passed to ``fit``.
        val_split: Fraction of the samples held out for validation.
        model_path: File the weights are loaded from (if it exists) and
            saved to after training.

    Returns:
        A tuple ``(model, history)`` with the trained model and the object
        returned by ``model.fit``.
    """
    X = np.load(x_path)
    y = np.load(y_path)

    # Fixed random_state keeps the train/validation split the same across runs.
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=val_split, random_state=42)

    model = create_model()

    # Resume from previously saved weights when available.
    if os.path.exists(model_path):
        print("Loading existing model weights...")
        model.load_weights(model_path)

    model.compile(
        optimizer=tf.keras.optimizers.SGD(learning_rate=1e-3),
        loss='binary_crossentropy',
        metrics=[tf.keras.metrics.BinaryAccuracy()]
    )

    history = model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        batch_size=batch_size,
        epochs=epochs,
        shuffle=True
    )

    model.save_weights(model_path)
    print("Training complete. Model weights saved!")

    return model, history


In [None]:
# Run training for 10 epochs using the default dataset paths; the weights are
# written to disk by train_model itself.
train_model(epochs = 10)

In [None]:
# Ask TensorFlow to allocate GPU memory on demand for every visible GPU.
# NOTE(review): TensorFlow documents that memory growth must be configured
# before the GPUs are initialized, so this cell probably has to run before
# the training cell; otherwise the RuntimeError branch below is hit - confirm.
gpus = tf.config.list_physical_devices('GPU')
if not gpus:
    print("No GPU detected.")
else:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print(f"Enabled memory growth on {len(gpus)} GPU(s).")
    except RuntimeError as e:
        print(f"Could not set memory growth: {e}")