# Data Processing/Cleaning

In [1]:
import glob

raw_dir = '../raw/dictionaries/'
out_dir = '../webapp/data/languages/'

In [3]:
# for every folder in raw/ (except en), create a folder in webapp/data/

import os
import glob
import random

# problematic: tlh, vi, sr, ne, el, rw  

def dict_to_5words(lang):
    """
    filters a word dictionary to 5-letter words and saves them to a file
    """
    words = []
    with open(raw_dir + lang + '/index.dic', 'r') as f:
        i = 0
        for line in f:
            i += 1
            if line[0] == '/':
                continue
            try:
                word = line.split('/')[0]
                word = word.strip().lower()
                forbidden_charset = "0123456789/-"
                forbidden_charset += ""
                if any(c in forbidden_charset for c in word):
                    continue
                if len(word) == 5 and word not in words:
                    words.append(word.lower())
            except Exception as e:
                print(f"Error at line {i}: {e} in {lang}")
                pass
            
    # if '_characters.txt' file exists in out_dir, load it and filter by it
    try:
        with open(out_dir + lang + '/' + lang + '_characters.txt', 'r') as f:
            characters = [line.strip() for line in f]
    except FileNotFoundError:
        characters = []
    
    # filter words by characters
    if characters != []:
        words = [word for word in words if all([char in characters for char in word])]

    random.seed(42)
    random.shuffle(words)

    with open(out_dir + lang + '/' + lang + '_5words.txt', 'w') as f:
        for word in words:
            f.write(word + '\n')

    # make list of all appearing characters
    characters = []
    for word in words:
        for char in word:
            if char not in characters:
                characters.append(char)
    
    # order characters
    characters.sort()
    # write characters to file if not already there
    if not os.path.exists(out_dir + lang + '/' + lang + '_characters.txt'):
        with open(out_dir + lang + '/' + lang + '_characters.txt', 'w') as f:
            for char in characters:
                f.write(char + '\n')
    
    print(f"{lang} words: {len(words)} characters: {len(characters)}")  

    return words

In [4]:
folders = [folder for folder in glob.glob(raw_dir + '*') if "-" not in folder]

for folder in folders:
    lang = folder.split('/')[-1]
    if lang == 'en':  # use official word list for english
        continue
    if not glob.glob(out_dir + lang):
        os.mkdir(out_dir + lang)
    words = dict_to_5words(lang)

ie words: 2146 characters: 33
ko words: 8921 characters: 65
ne words: 2196 characters: 48
sl words: 11730 characters: 25
nl words: 7440 characters: 36
lb words: 1751 characters: 29
hu words: 6046 characters: 31
fr words: 4481 characters: 40
oc words: 4203 characters: 38
ia words: 2475 characters: 26
mk words: 5997 characters: 31
ga words: 5081 characters: 23
tr words: 9223 characters: 29
is words: 8284 characters: 32
it words: 2782 characters: 32
ru words: 4687 characters: 32
pt words: 9015 characters: 38
es words: 3601 characters: 33
lv words: 2774 characters: 33
br words: 7142 characters: 28
ca words: 9078 characters: 37
hr words: 3590 characters: 27
et words: 9458 characters: 28
ltg words: 387 characters: 34
sr words: 17967 characters: 30
pl words: 10183 characters: 32
he words: 64539 characters: 27
vi words: 738 characters: 88
fa words: 11252 characters: 37
eu words: 7519 characters: 27
uk words: 9588 characters: 33
ro words: 8617 characters: 30
fur words: 3568 characters: 36
fy wo

In [5]:
# Custom arabic word source
import pandas as pd
df = pd.read_excel(raw_dir + 'ar/Top-50000-Arabic-Words-Masterlist_ModernStandardArabic.com_.xlsx')

Unnamed: 0,Word Number,Frequency,Arabic,English,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8
0,1,2285403,لا,No,,,,,
1,2,2229495,من,Of,,,,,
2,3,1789391,في,In,,,,,
3,4,1761748,أن,That,,,,,
4,5,1624794,هذا,This is,,,,,
...,...,...,...,...,...,...,...,...,...
4995,4996,2304,احتفظ,Keep a,,,,,
4996,4997,2303,الصخور,Rock,,,,,
4997,4998,2303,أخبروني,They told me,,,,,
4998,4999,2303,اخرجوا,Get out,,,,,


In [40]:
raw_words = df[["Arabic"]].values.tolist()
words = []
for word in raw_words:
    word = word[0]
    # ـ
    forbidden_charset = 'abcdefghijklmnopqrstuvwxyz -0123456789'
    try:
        if any (char.isupper() for char in word) or any (char in forbidden_charset for char in word) or any (char.isdigit() for char in word):
            continue
        if len(word) == 5 and word not in words:
            words.append(word.lower())
    except Exception as e:
        print(f"{e}")
        pass

# make list of all appearing characters
characters = []
for word in words:
    for char in word:
        if char not in characters:
            characters.append(char)

print(f"Arabic 5 words: {len(words)}")
print(f"Arabic characters: {len(characters)}")
print(" ".join(characters))

'bool' object is not iterable
Arabic 5 words: 14157
Arabic characters: 37
ع ن د م ا ل ذ أ ر ت ق و ي ك ج س ح ة ب ه ش ء ؤ خ ص ظ ث آ ئ ض ط ى غ ز ف إ ـ


In [59]:
# Let's do a more reusable version of all this
import random

def process_wordlist(wordlist, language_code, desired_word_length=5, forbidden_charset=None, acceptable_charset=None):
    """Takes in a wordlist and processes it for wordle consumption"""
    # EDIT: KINDA OUTDATED
    words = []
    for word in wordlist:
        word = word.strip().lower()
        if forbidden_charset:
            if any (char in forbidden_charset for char in word):
                continue
        if acceptable_charset:
            if not any (char in acceptable_charset for char in word):
                continue
        if len(word) == desired_word_length and word not in words:
            words.append(word)
    
    # mix up the words in case they were in a certain order
    random.seed(42)
    random.shuffle(words)
    
    characters = []
    for word in words:
        for char in word:
            if char not in characters:
                characters.append(char)
    
    # write to file
    with open(out_dir + 'languages/' + language_code + '/' + language_code + '_' + str(desired_word_length) + 'words.txt', 'w') as f:
        for word in words:
            f.write(word + '\n')
    
    with open(out_dir + 'languages/' + language_code + '/' + language_code + '_characters.txt', 'w') as f:
        for char in characters:
            f.write(char + '\n')
    
    print(f"{language_code} words: {len(words)} characters: {len(characters)}")
    print(f"characterset for {language_code}: {' '.join(characters)}")
    return words, characters

In [60]:
words, characters = process_wordlist(words, "ar", 5, forbidden_charset="abcdefghijklmnopqrstuvwxyz -0123456789 ـ")

ar words: 13882 characters: 36
characterset for ar: و ا ل س ق ب ه م ح أ ج ت ض ي ر ف ة ذ ع ك ن ز ش ص إ د ئ خ ط ظ ء ث غ آ ى ؤ


In [16]:
# let's do Tolkien's Quenya for funsies

url = "https://folk.uib.no/hnohf/quen-eng.htm"
import requests
from bs4 import BeautifulSoup

response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

# the words are the first bolded text in each top level paragraph
# <p><b>WORD</b>...</p>

# extract words
words = []
for p in soup.find_all('p'):
    try:
        word = p.find('b').text
        word = word.strip().lower()
        if len(word) == 5 and word not in words:
            words.append(word)
    except:
        pass

# shuffle the words
random.seed(42)
random.shuffle(words)

words[:10]


['intye',
 'palme',
 'callo',
 'ormal',
 'nosse',
 'rúnya',
 'pasta',
 'remba',
 'rambe',
 'lauca']

In [18]:

# save to '../webapp/data/languages/qya/qya_5words.txt'
with open(out_dir + 'qya/qya_5words.txt', 'w') as f:
    for word in words:
        f.write(word + '\n')

# save to '../webapp/data/languages/qya/qya_characters.txt'
characters = []
for word in words:
    for char in word:
        if char not in characters:
            characters.append(char)
characters.sort()
with open(out_dir + 'qya/qya_characters.txt', 'w') as f:
    for char in characters:
        f.write(char + '\n')

In [26]:
# let's try a better arabic wordlist
# using file: '../raw/ar/hans-wehr.wordlist.txt' and acceptable characters: '../webapp/data/languages/ar/ar_characters.txt'

with open('../raw/ar/hans-wehr.wordlist.txt', 'r') as f:
    words = f.read().split('\n')

with open(out_dir + 'ar/ar_characters.txt', 'r') as f:
    characters = f.read().split('\n')

print(f"Arabic 5 words: {len(words)}")
words_clean = []
for word in words:
    if len(word) == 5 and word not in words_clean and all(char in characters for char in word):
        words_clean.append(word)

print(len(words_clean))
    

# shuffle the words
random.seed(42)
random.shuffle(words_clean)

# save to '../webapp/data/languages/ar/ar_5words.txt'
with open(out_dir + 'ar/ar_5words.txt', 'w') as f:
    for word in words_clean:
        f.write(word + '\n')


Arabic 5 words: 34553
10174


In [25]:
words
len('مستميت')

6

In [16]:
# gather all external wordles from 'https://rwmpelstilzchen.gitlab.io/wordles/'

import requests
from bs4 import BeautifulSoup

response = requests.get('https://rwmpelstilzchen.gitlab.io/wordles/')
soup = BeautifulSoup(response.text, 'html.parser')

In [18]:

# find table that has a child caption with text ' Multilingual Wordle-like games; 402 entries'
tables = soup.find_all('table')
for t in tables:
    if t.caption:
        if t.caption.text == ' Multilingual Wordle-like games; 402 entries':
            break

table = t

<table class="pure-table mediatable">
<caption>📰 Other related media articles; 6 entries</caption>
<thead>
<tr>
<th>Publication</th>
<th>Title</th>
<th>Type</th>
<th>Date</th>
</tr>
</thead>
<tbody>
<tr>
<td>
<a href="https://pudding-entertainment.medium.com/">Pudding Entertainment</a>
</td>
<td>
<a href="https://pudding-entertainment.medium.com/unity-create-your-own-wordle-in-under-24-hours-a030024b2a4e">Unity: Create your own Wordle in under 24 hours</a>
</td>
<td>tutorial</td>
<td>2022-02-22</td>
</tr>
<tr>
<td>
<a href="https://www.afp.com/">AFP</a>
</td>
<td>
<a href="https://youtu.be/xpXyrreW4Zw">Wordle goes global: the race is on to translate viral game</a>
</td>
<td>video</td>
<td>2022-02-08</td>
</tr>
<tr>
<td>
<a href="https://www.theguardian.com/">The Guardian</a>
</td>
<td>
<a href="https://www.theguardian.com/games/2022/feb/06/worried-about-losing-wordle-here-are-some-alternatives-just-in-case">Worried about losing Wordle? Here are some alternatives, just in case</a>
</td>

In [4]:
soup

<!DOCTYPE html>

<html>
<head>
<title>Wordles of the World</title><title>Wordles of the World</title>
<link href="gfx/logo.png" rel="icon"/>
<link href="https://fonts.googleapis.com" rel="preconnect"/>
<link crossorigin="" href="https://fonts.gstatic.com" rel="preconnect"/>
<link href="https://fonts.googleapis.com/css2?family=Gentium+Basic:ital,wght@0,400;0,700;1,400;1,700&amp;display=swap" rel="stylesheet"/>
<link crossorigin="anonymous" href="https://unpkg.com/purecss@2.0.6/build/pure-min.css" integrity="sha384-Uu6IeWbM+gzNVXJcM9XV3SohHtmWE+3VGi496jvgX1jyvDTXfdK+rfZc8C1Aehk5" rel="stylesheet"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<link href="style.css" rel="stylesheet"/>
<script crossorigin="anonymous" src="https://twemoji.maxcdn.com/v/latest/twemoji.min.js"></script>
<script src="https://code.jquery.com/jquery-git.js"></script>
<meta content="An ever-growing multilingual list of Wordle-like games from all over the world 🟩🟩🟩🟩🟩" name="description"/>
<