In [1]:
%pip install morphemes

Note: you may need to restart the kernel to use updated packages.


In [3]:
import re
import pandas as pd
import requests
from tinydb import TinyDB, where
import os

In [4]:
import json
import os
import re
import pandas as pd
import requests
from tinydb import TinyDB, where
import warnings
from tqdm import tqdm

# Silence UserWarning messages attributed to openpyxl modules
# (presumably emitted while the Excel workbook is read — confirm which warning this targets).
warnings.filterwarnings('ignore', category=UserWarning, module='openpyxl')

# Default URL of the MorphoLEX_en.xlsx workbook; it is the default `url`
# argument of MorphemeDatabase.download_morpholex_dictionary.
default_morpholex_git_location = "https://github.com/hugomailhot/MorphoLex-en/raw/master/MorphoLEX_en.xlsx"


def process_df(df, db):
    """Insert every row of ``df`` into the ``WORDS`` table of ``db``.

    The frame's index is turned into a regular column with ``reset_index``.
    The rows are then serialised to JSON by pandas and parsed back, so that
    what gets inserted is a list of plain dicts (one per row).

    Args:
        df: pandas DataFrame holding the rows to store.
        db: object exposing ``table(name)``; the returned table must provide
            ``insert_multiple(rows)``.
    """
    words_table = db.table("WORDS")
    records_json = df.reset_index().to_json(orient='records')
    words_table.insert_multiple(json.loads(records_json))


class MorphemeDatabase:
    """File-backed word table built from the MorphoLEX-en Excel workbook.

    Both the downloaded workbook (``MorphoLEX_en.xlsx``) and the TinyDB file
    (``db.json``) are stored directly inside ``data_path``.
    """

    def __init__(self, data_path):
        """Remember ``data_path``; nothing is read or written at this point."""
        self.data_path = data_path

    def _ensure_data_dir(self):
        """Create ``data_path`` when it is missing so files can be written there."""
        if self.data_path:
            os.makedirs(self.data_path, exist_ok=True)

    def get_excel_dictionary_path(self):
        """Return the path of the MorphoLEX workbook inside ``data_path``."""
        return self.data_path + "/MorphoLEX_en.xlsx"

    def get_db_path(self):
        """Return the path of the TinyDB JSON file inside ``data_path``."""
        return self.data_path + "/db.json"

    def create_db(self):
        """Delete any existing database file and return a fresh ``TinyDB``.

        The caller owns the returned handle and should close it when done.
        """
        path = self.get_db_path()
        if os.path.exists(path):
            os.remove(path)
        # A first run may start without the data directory.
        self._ensure_data_dir()
        return TinyDB(path)

    def load_db(self):
        """Return a ``TinyDB`` for the database file, building it first if absent.

        The caller owns the returned handle and should close it when done.
        """
        path = self.get_db_path()
        if not os.path.exists(path):
            self.refresh()
        return TinyDB(path)

    def get_excel(self):
        """Return a ``pd.ExcelFile`` for the workbook, downloading it if absent."""
        path = self.get_excel_dictionary_path()
        if not os.path.exists(path):
            self.download_morpholex_dictionary()
        return pd.ExcelFile(path)

    def download_morpholex_dictionary(self, url=default_morpholex_git_location):
        """Download the workbook from ``url`` into ``data_path``.

        Raises:
            requests.HTTPError: if the server answers with an error status.
                Nothing is written in that case, so an error page is never
                saved under the workbook's name.
        """
        response = requests.get(url)
        response.raise_for_status()
        self._ensure_data_dir()
        # The context manager closes the file even if the write fails.
        with open(self.get_excel_dictionary_path(), "wb") as workbook:
            workbook.write(response.content)

    def refresh(self):
        """Rebuild the database file from the workbook.

        Every sheet whose name looks like ``<digit>-<digit>-<digit>`` is parsed
        and its rows are inserted through ``process_df``; other sheets are
        skipped.
        """
        print("---- Downloading Morpheme Database ----")
        # Obtain the workbook first: if the download fails, an existing
        # database file is left in place instead of being deleted by create_db().
        with self.get_excel() as xl:
            db = self.create_db()
            try:
                for sheet_name in tqdm(xl.sheet_names):
                    if re.match("^[0-9]-[0-9]-[0-9]$", sheet_name):
                        process_df(xl.parse(sheet_name), db)
            finally:
                db.close()

    def lookup(self, word):
        """Return the ``WORDS`` rows whose ``Word`` equals ``word``, ignoring case.

        ``word`` is matched literally: characters that are special in regular
        expressions (``.``, ``(``, ``]``, ...) are escaped before the pattern
        is built.
        """
        db = self.load_db()
        try:
            pattern = "^" + re.escape(word) + "$"
            return db.table("WORDS").search(
                where("Word").matches(pattern, flags=re.IGNORECASE)
            )
        finally:
            # Release the file handle opened by load_db(); the rows found
            # have already been collected by search().
            db.close()

In [9]:
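# Optional one-time setup: uncomment to rebuild the local database with
# MorphemeDatabase.refresh() (the recorded run below took about 1.5 minutes).
# mb = MorphemeDatabase("data")
# mb.refresh()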

---- Downloading Morpheme Database ----


100%|██████████| 34/34 [01:21<00:00,  2.41s/it]


In [5]:
from morphemes import Morphemes

# Directory handed to Morphemes (presumably where the library keeps its data files — confirm).
path = "./data"

# Parser instance reused by the cells below through m.parse(...).
m = Morphemes(path)

In [6]:
# Smoke test: parse one word and check the type of its "morpheme_count" entry.
test = m.parse("milkshake")
print(type(test["morpheme_count"]))
# A made-up string: the recorded output shows status 'NOT_FOUND' with morpheme_count 1.
print(m.parse("mafoirja"))

<class 'int'>
{'status': 'NOT_FOUND', 'word': 'mafoirja', 'morpheme_count': 1}


In [11]:
%pip install nltk




In [8]:
import nltk # pip install nltk

In [39]:
# Download NLTK's 'popular' collection; the recorded log lists the gutenberg and
# inaugural corpora used in later cells among its packages.
nltk.download('popular')

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     C:\Users\Brad\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\cmudict.zip.
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     C:\Users\Brad\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\gazetteers.zip.
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     C:\Users\Brad\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\genesis.zip.
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     C:\Users\Brad\AppData\Roaming\nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     C:\Users\Brad\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\inaugural.zip.
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     C:\Users\Brad\AppData\Roaming\

True

In [41]:
# List the file ids available in the Gutenberg and inaugural-address corpora.
print(nltk.corpus.gutenberg.fileids())
print(nltk.corpus.inaugural.fileids())

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']
['1789-Washington.txt', '1793-Washington.txt', '1797-Adams.txt', '1801-Jefferson.txt', '1805-Jefferson.txt', '1809-Madison.txt', '1813-Madison.txt', '1817-Monroe.txt', '1821-Monroe.txt', '1825-Adams.txt', '1829-Jackson.txt', '1833-Jackson.txt', '1837-VanBuren.txt', '1841-Harrison.txt', '1845-Polk.txt', '1849-Taylor.txt', '1853-Pierce.txt', '1857-Buchanan.txt', '1861-Lincoln.txt', '1865-Lincoln.txt', '1869-Grant.txt', '1873-Grant.txt', '1877-Hayes.txt', '1881-Garfield.txt', '1885-Cleveland.txt', '1889-Harrison.txt', '1893-Cleveland.txt', '1897-McKinley.txt', '1901-McKinley.t

In [42]:
# Words of the Gutenberg corpus file for Moby Dick.
words = nltk.corpus.gutenberg.words('melville-moby_dick.txt')
# Words of the inaugural corpus file '2021-Biden.txt'.
currWords = nltk.corpus.inaugural.words('2021-Biden.txt')


In [21]:
# Tokens for which m.parse reported NOT_FOUND. getMPW appends to this list and
# never clears it, so re-run this cell to reset it between experiments.
lostWords = []

In [43]:
def getMPW(words):
    """Print morphemes-per-word statistics for an iterable of tokens.

    Each token is passed to the notebook-level parser ``m.parse``. A token
    whose result has status ``"NOT_FOUND"`` is counted as a miss and appended
    to the notebook-level ``lostWords`` list. That list is not cleared here,
    so it accumulates across calls. For every other token the reported
    ``morpheme_count`` is added to the running total.

    Tokens whose processing raises an ordinary exception are skipped and
    counted nowhere. ``KeyboardInterrupt`` and ``SystemExit`` are not caught,
    so a long run can still be interrupted.

    Prints, one per line: total morphemes, number of words found, morphemes
    per found word (``0.0`` when no word was found), number of words not
    found. Returns ``None``.
    """
    total_morphemes = 0
    found_words = 0
    not_found = 0
    for token in tqdm(words):
        try:
            parsed = m.parse(token)
            if parsed["status"] == "NOT_FOUND":
                not_found += 1
                lostWords.append(token)
            else:
                total_morphemes += parsed["morpheme_count"]
                found_words += 1
        except Exception:
            # Best effort: one token that cannot be parsed must not abort the run.
            continue
    # Guard the ratio: empty input, or input with no known word, would
    # otherwise raise ZeroDivisionError.
    average = total_morphemes / found_words if found_words else 0.0
    print(total_morphemes)
    print(found_words)
    print(average)
    print(not_found)
        

In [44]:
# old: 1.115 (presumably the morphemes-per-word figure from an earlier run — confirm)
# Only the first 1000 tokens are analysed; the recorded run took about 10.5 minutes.
getMPW(currWords[:1000])

100%|██████████| 1000/1000 [10:39<00:00,  1.56it/s]

983
862
1.140371229698376
138





In [45]:
# Show every token collected so far; the list is long, so consider a slice such as lostWords[:20].
lostWords

['Moby',
 '1851',
 ']',
 'Moby',
 '1851',
 ']',
 '--',
 ',',
 ',',
 ',',
 ';',
 ',',
 ',',
 ';',
 '"',
 ',',
 '-',
 ',',
 ',',
 'H',
 ',',
 'maketh',
 ',',
 'true',
 '."',
 '--',
 'HACKLUYT',
 '"',
 'Sw',
 'HVAL',
 ';',
 'HVALT',
 '."',
 '--',
 "'",
 'S',
 '"',
 'Dut',
 'Ger',
 'WALLEN',
 ';',
 'S',
 'WALW',
 '-',
 'IAN',
 ',',
 ',',
 '."',
 '--',
 "'",
 'S',
 'KETOS',
 ',',
 'CETUS',
 ',',
 'WHOEL',
 ',',
 'ANGLO',
 '-',
 'HVALT',
 ',',
 'WAL',
 ',',
 'HWAL',
 ',',
 ',',
 ',',
 'BALEINE',
 ',',
 'BALLENA',
 ',',
 'PEKEE',
 '-',
 'NUEE',
 '-',
 'NUEE',
 ',',
 'FEGEE',
 'PEKEE',
 '-',
 'NUEE',
 '-',
 'NUEE',
 ',',
 'ERROMANGOAN',
 '-',
 '-',
 'burrower',
 '-',
 '-',
 'Vaticans',
 '-',
 ',',
 ',',
 ',',
 ',',
 'higgledy',
 '-',
 'piggledy',
 ',',
 ',',
 ',',
 'cetology',
 ',',
 ',',
 ',',
 "'",
 's',
 ',',
 ',',
 ',',
 ',',
 ',',
 ',',
 '-',
 ',',
 'belongest',
 ',',
 ';',
 '-',
 ';',
 ',',
 '-',
 ',',
 ';',
 ';',
 ',',
 ',',
 '--',
 ',',
 '-',
 '!',
 ',',
 '!',
 'Tuileries',
 '!',
 'hie