We want to generate a large number of fake three letter words which sound like they could be real words. We will do this using the CMU of phonetic words and a machine learning model

In [15]:
import numpy as np
import pandas as pd
from sklearn.svm import OneClassSVM
from sklearn.preprocessing import OneHotEncoder
import random

# Open list of scrabble words
with open("C:\\Users\\joshn\\Documents\\Coding\\3 Letter Words\\Collins Scrabble Words (2019).txt", 'r') as file:
    scrabble_words = file.read().splitlines()

# Convert to DataFrame and preprocess
df = pd.DataFrame(scrabble_words, columns=['Word'])
df = df[df['Word'].str.len() == 3]
df['Word'] = df['Word'].str.lower()
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

real_words = df['Word'].tolist()

# One-hot encode the real words
alphabet = list('abcdefghijklmnopqrstuvwxyz')
encoder = OneHotEncoder(categories=[alphabet]*3)
X = np.array([list(word) for word in real_words])
X_encoded = encoder.fit_transform(X).toarray()

# Train a One-Class SVM
svm = OneClassSVM(kernel='linear')
svm.fit(X_encoded)

# Potential Plausible Words must be 3 letters long, not in the list of real words, and contain at least one vowel
from itertools import product

# Define vowels and consonants
vowels = 'aeiouy'
consonants = 'bcdfghjklmnpqrstvwxz'

# Generate all possible 3-letter combinations with at least one vowel
potential_words = set()
for i in range(3):  # Position of the vowel
    for letters in product(*[vowels if position == i else consonants+vowels for position in range(3)]):
        potential_words.add(''.join(letters))
potential_words = list(potential_words.difference(set(real_words)))
# Encode the random words
X_test = np.array([list(word) for word in potential_words])
X_test_encoded = encoder.transform(X_test).toarray()

# Predict on the test set
y_pred = svm.predict(X_test_encoded)

# Find words that are predicted as similar to real words
predicted_as_real = [word for word, pred in zip(potential_words, y_pred) if pred == 1]

print(len(predicted_as_real))
print(predicted_as_real)

with open('C:\\Users\\joshn\\Documents\\Coding\\3 Letter Words\\predicted_as_real.txt', 'w') as file:
    for word in predicted_as_real:
        file.write(word + '\n')

8242
960
['saa', 'aud', 'ouu', 'aam', 'pih', 'mox', 'ioe', 'buo', 'ioy', 'coe', 'fep', 'aer', 'oap', 'oip', 'eow', 'iaw', 'uar', 'aat', 'iud', 'eiy', 'aik', 'aot', 'iep', 'oou', 'aui', 'uon', 'fai', 'pai', 'woy', 'aar', 'tob', 'rer', 'kua', 'kee', 'iek', 'ooz', 'fiy', 'loa', 'uew', 'vao', 'gia', 'yoo', 'uoy', 'eec', 'tib', 'fuy', 'hiy', 'mah', 'iag', 'oae', 'aop', 'dem', 'tue', 'eiu', 'oiv', 'ara', 'eue', 'ouo', 'eiv', 'arz', 'hio', 'kot', 'ooy', 'toi', 'jao', 'uom', 'uou', 'aog', 'doa', 'mex', 'eal', 'iug', 'ieo', 'ior', 'ael', 'yoe', 'eac', 'uea', 'eia', 'rio', 'iuy', 'uii', 'aie', 'aek', 'lon', 'oug', 'oac', 'aur', 'oiw', 'ioo', 'eog', 'aea', 'eun', 'arn', 'teh', 'uem', 'iok', 'uac', 'uek', 'geb', 'cet', 'eof', 'aug', 'uot', 'ceo', 'oad', 'luy', 'cuo', 'gom', 'ead', 'pem', 'euh', 'aab', 'iar', 'zaa', 'lue', 'aib', 'euz', 'euu', 'yeg', 'iob', 'euf', 'eus', 'nop', 'kuo', 'eur', 'aes', 'auz', 'uog', 'ruo', 'uum', 'aaq', 'aoc', 'oii', 'iip', 'uee', 'wao', 'oay', 'tud', 'jes', 'yot', 'ca