## Helper Functions

#### Syllable Count (adjusts NLTK tokenizer for the *magic "e"* rule)

In [None]:
# Single shared syllable tokenizer instance; magic_e() calls SSP.tokenize().
SSP = SyllableTokenizer()
def magic_e(word):
    """Count the syllables of ``word``, folding in a trailing "magic e".

    The word is split with the shared ``SSP`` tokenizer.  When the split gives
    something other than exactly one syllable and the last syllable ends in
    "e", that last syllable is treated as part of the one before it, so the
    reported count is one lower than the tokenizer's.

    Returns:
        The (possibly adjusted) number of syllables as an int.
    """
    syllables = SSP.tokenize(word)
    count = len(syllables)

    # A lone syllable has no predecessor to merge into.
    if count == 1:
        return count

    # Merging the final syllable into the previous one == one syllable fewer.
    if re.search('e$', syllables[-1]):
        return count - 1
    return count

#### POS tagger

In [None]:
def get_POS(row):
    """Return the part-of-speech tag of every token in ``row``, in order.

    ``row`` is handed to ``nltk.pos_tag``; from each item it returns, only the
    element at index 1 (the tag) is kept.
    """
    return [tagged[1] for tagged in nltk.pos_tag(row)]

#### Get IPA translations (using provided translations from https://github.com/open-dict-data/ipa-dict)

In [None]:
# Load the pronunciation table, then rebuild it with just two columns:
# 'word' (unchanged) and 'ipa' (the CSV's 'phon' column under a new name).
df = pd.read_csv('phoneticDictionary.csv')
df = pd.DataFrame(list(zip(df['word'], df['phon'])), columns=['word', 'ipa'])

def ipa(section):
    """Translate every row of ``section.text`` into IPA notation.

    Each word present in the module-level ``df`` table is replaced by its IPA
    transcription with the stress marks "ˈ" and "ˌ" removed.  A word for which
    ``word in punctuation`` holds is kept unchanged, and any other word becomes
    a single space, so the output row has one entry per input word.

    Args:
        section: object whose ``text`` attribute iterates over rows, each row
            being an iterable of words.

    Returns:
        A list containing one list of strings per row.
    """
    # Built once per call; later duplicates of a word override earlier ones.
    ipa_dict = dict(zip(df['word'], df['ipa']))

    new_sent = []
    for row in section.text:
        sent = []
        for word in row:
            if word in ipa_dict:
                sent.append(ipa_dict[word].replace("ˈ", "").replace("ˌ", ""))
            elif word in punctuation:
                sent.append(word)
            else:
                # Placeholder keeps positions aligned with the source row.
                sent.append(' ')
        new_sent.append(sent)
    return new_sent

#### Functions to retrieve Suffix Tree Language Model for Greek/Latin roots

In [None]:
def get_STLM(train_text):
    """Build a suffix-tree language model from ``train_text``.

    Args:
        train_text: iterable of sentences, each an iterable of words.

    Returns:
        An ``STLM`` constructed from a ``SuffixTree`` to which every word of
        every sentence has been added.
    """
    trie = SuffixTree()
    for sent in train_text:
        for word in sent:
            trie.add(word)

    # Counts are refreshed once, after all insertions are done.
    trie.update_all_counts()
    return STLM(trie)

def get_STLM_prob(stlm, test):
    """Return ``stlm.prob`` for the sequence built from the items of ``test``.

    Args:
        stlm: model object exposing ``prob(sequence)``.
        test: iterable whose items are pushed, in order, onto a fresh
            ``Sequence``.  The root checks in this file pass a single word
            (presumably a string, so the items would be its characters).

    Returns:
        Whatever ``stlm.prob`` returns for that sequence.
    """
    seq = Sequence()
    for item in test:
        seq.push_back(item)
    return stlm.prob(seq)

## Letter-Name Alphabetic Stage Functions

#### Check for words with consonant-vowel-consonant short vowel pattern

In [None]:
def check_CVC_short(dataset):
    """Per row, the fraction of IPA words matching the CVC short-vowel pattern.

    A word matches when it starts with a consonant from the onset set, has any
    number of further consonants, exactly one of the vowels ``ɪɑæəʊɛ``, and
    then one or more consonants up to the end of the word.

    Args:
        dataset: mapping/frame whose ``'ipa'`` entry iterates over rows, each
            row being an iterable of IPA strings.

    Returns:
        A list with one ratio per row.  A row without words yields ``0.0``
        instead of raising ``ZeroDivisionError``.
    """
    onset = '[btkzɹsjmfgndɫwpθvhʃð]'
    consonant = '[btkzɹsjmfgndɫwpθvhʃðʒŋ]'
    short_vowel = '[ɪɑæəʊɛ]'
    # Compiled once: the pattern does not change between words.
    cvc_pattern = re.compile(
        '^' + onset + consonant + '*' + short_vowel
        + consonant + '*' + consonant + '$'
    )

    CVC_short = []
    for row in tqdm(dataset['ipa']):
        total_words = 0
        cvc = 0
        for word in row:
            total_words += 1
            if cvc_pattern.search(word):
                cvc += 1

        CVC_short.append(cvc / total_words if total_words else 0.0)

    return CVC_short

## Within-Word Pattern Stage

#### Check for basic inflectionals

In [None]:
def check_basic_inflectional(dataset):
    """Per row, the fraction of words that are verbs ending in "s".

    A word counts when its POS tag is one of ``VBD``, ``VBG``, ``VBN`` or
    ``VBZ`` and it ends in "s".  Each word is counted at most once: the
    previous version tested ``'es$'`` and ``'s$'`` separately, so every word
    ending in "es" was counted twice.

    Args:
        dataset: mapping/frame with parallel ``'text'`` and ``'POS'`` entries;
            each text row is a sequence of words and the matching POS row
            holds one tag per word.

    Returns:
        A list with one ratio per row.  A row without words yields ``0.0``
        instead of raising ``ZeroDivisionError``.
    """
    verb_tags = ('VBD', 'VBG', 'VBN', 'VBZ')
    inflectional = []

    for words, tags in tqdm(list(zip(dataset['text'], dataset['POS']))):
        total_words = 0
        inf_end = 0
        for i, word in enumerate(words):
            total_words += 1
            # 's$' also covers the "-es" ending, so a single test is enough.
            if tags[i] in verb_tags and re.search('s$', word):
                inf_end += 1

        inflectional.append(inf_end / total_words if total_words else 0.0)

    return inflectional

#### Check for complex consonants

In [None]:
def check_complex_cons(dataset):
    """Per row, complex-consonant hits in one-syllable words over word count.

    Only words for which ``magic_e`` reports exactly one syllable are
    inspected.  Every rule below that fires adds one hit, so a single word may
    contribute several hits:

    * spelling matches a pattern whose sound is missing from the IPA form
      (``g``/``g``, leading ``w``/``w``, ``c``/``k``, ``k``/``k``, final
      ``b``/``b``);
    * the IPA form ends in ``dʒ``;
    * spelling ends in ``se`` with IPA ending in ``z``, or spelling ends in
      ``ce`` with IPA ending in ``s``;
    * spelling ends in consonant + ``ch`` or consonant + ``ge``.

    Args:
        dataset: mapping/frame with parallel ``'text'`` and ``'ipa'`` entries;
            each text row is a sequence of words and the matching IPA row
            holds one transcription per word.

    Returns:
        A list with one ratio per row.  A row without words yields ``0.0``
        instead of raising ``ZeroDivisionError``.
    """
    # (spelling pattern, IPA pattern that must NOT be found)
    silent_rules = (
        ('g', 'g'),
        ('^w', 'w'),
        ('c', 'k'),
        ('k', 'k'),
        ('b$', 'b'),
    )
    # (spelling pattern, IPA pattern that must be found)
    sounded_rules = (
        ('se$', 'z$'),
        ('ce$', 's$'),
    )
    # Spelling-only patterns.
    spelling_rules = (
        '[btkzrsjmfndlwpvhg]ch$',
        '[btkzrsjmfndlwpvhg]ge$',
    )

    _complex = []
    for text_row, ipa_row in tqdm(list(zip(dataset['text'], dataset['ipa']))):
        total_words = 0
        com_cons = 0
        for i, word in enumerate(text_row):
            total_words += 1
            if magic_e(word) != 1:
                continue

            sound = ipa_row[i]
            for spelled, spoken in silent_rules:
                if re.search(spelled, word) and not re.search(spoken, sound):
                    com_cons += 1
            if re.search('dʒ$', sound):
                com_cons += 1
            for spelled, spoken in sounded_rules:
                if re.search(spelled, word) and re.search(spoken, sound):
                    com_cons += 1
            for spelled in spelling_rules:
                if re.search(spelled, word):
                    com_cons += 1

        _complex.append(com_cons / total_words if total_words else 0.0)

    return _complex

## Syllables & Affixes Stage

#### Check type of syllable juncture

In [None]:
def VV(dataset):
    """Per row, vowel-vowel juncture hits divided by the number of IPA words.

    For each word, the first applicable case decides the contribution:

    1. three vowel symbols in a row -> one hit;
    2. a vowel symbol followed by ``ɫ`` or ``ɝ`` -> one hit;
    3. otherwise, one hit for every non-overlapping vowel pair that is not
       listed in ``DIPHTHONGS``.

    Args:
        dataset: mapping/frame whose ``'ipa'`` entry iterates over rows, each
            row being an iterable of IPA strings.

    Returns:
        A list with one ratio per row.  A row without words yields ``0.0``
        instead of raising ``ZeroDivisionError``.
    """
    DIPHTHONGS = ['aɪ', 'eɪ', 'ɪə', 'ɔɪ', 'aʊ', 'oʊ', 'ʊə', 'eə']

    vowel = '[ɪɑæəʊɛiuɔaoe]'
    triple = re.compile(vowel * 3)
    vowel_liquid = re.compile(vowel + '[ɫɝ]')
    pair = re.compile(vowel * 2)

    all_vv = []
    for row in tqdm(dataset['ipa']):
        total_words = 0
        vv = 0
        for word in row:
            total_words += 1
            if triple.search(word) or vowel_liquid.search(word):
                vv += 1
            else:
                # findall returns nothing when there is no pair at all.
                for res in pair.findall(word):
                    if res not in DIPHTHONGS:
                        vv += 1
        all_vv.append(vv / total_words if total_words else 0.0)

    return all_vv
    
    
def VCCV_doublet(dataset):
    """Per row, the fraction of words with a doubled-consonant VCCV juncture.

    A word counts when ``magic_e`` reports two or more syllables, its spelling
    contains vowel-consonant-consonant-vowel, the first item returned by
    ``magic_e_result`` ends in a member of ``CONSONANTS_TEXT`` and the second
    item starts with that same letter.  (``magic_e_result`` and
    ``CONSONANTS_TEXT`` are defined outside this section; presumably the
    former returns the word's syllable list.)

    Words with fewer than two syllables are skipped.  The previous version
    used ``break`` here, which abandoned the rest of the row at the first
    such word and under-counted the row length.

    Args:
        dataset: mapping/frame whose ``'text'`` entry iterates over rows, each
            row being an iterable of words.

    Returns:
        A list with one ratio per row.  A row without words yields ``0.0``
        instead of raising ``ZeroDivisionError``.
    """
    vccv_doublet = []
    for row in tqdm(dataset['text']):
        total_words = 0
        vccv = 0
        for word in row:
            total_words += 1
            if magic_e(word) < 2:
                continue
            if not re.search('[aeiou][btkzrsjmfndlwpvhg][btkzrsjmfndlwpvhg][aeiou]', word):
                continue
            result = magic_e_result(word)
            closing = result[0][-1]
            if closing in CONSONANTS_TEXT and closing == result[1][0]:
                vccv += 1

        vccv_doublet.append(vccv / total_words if total_words else 0.0)

    return vccv_doublet
    

def VCCV(dataset):
    """Per row, the fraction of words with a non-doubled VCCV juncture.

    A word counts when ``magic_e`` reports two or more syllables, its spelling
    contains vowel-consonant-consonant-vowel, the first item returned by
    ``magic_e_result`` ends in a member of ``CONSONANTS_TEXT`` and the second
    item starts with a different letter.  (``magic_e_result`` and
    ``CONSONANTS_TEXT`` are defined outside this section; presumably the
    former returns the word's syllable list.)

    Words with fewer than two syllables are skipped.  The previous version
    used ``break`` here, which abandoned the rest of the row at the first
    such word and under-counted the row length.

    Args:
        dataset: mapping/frame whose ``'text'`` entry iterates over rows, each
            row being an iterable of words.

    Returns:
        A list with one ratio per row.  A row without words yields ``0.0``
        instead of raising ``ZeroDivisionError``.
    """
    vccv_all = []

    for row in tqdm(dataset['text']):
        total_words = 0
        vccv = 0
        for word in row:
            total_words += 1
            if magic_e(word) < 2:
                continue
            if not re.search('[aeiou][btkzrsjmfndlwpvhg][btkzrsjmfndlwpvhg][aeiou]', word):
                continue
            result = magic_e_result(word)
            closing = result[0][-1]
            if closing in CONSONANTS_TEXT and closing != result[1][0]:
                vccv += 1

        vccv_all.append(vccv / total_words if total_words else 0.0)

    return vccv_all


def VCCCV(dataset):
    """Per row, the fraction of two-syllable words spelled with a VCCCV run.

    A word counts when ``magic_e`` reports exactly two syllables and its
    spelling contains a vowel, three consonants and a vowel in sequence.

    Args:
        dataset: mapping/frame whose ``'text'`` entry iterates over rows, each
            row being an iterable of words.

    Returns:
        A list with one ratio per row.  A row without words yields ``0.0``
        instead of raising ``ZeroDivisionError``.
    """
    pattern = re.compile(
        '[aeiou][btkzrsjmfndlwpvhg][btkzrsjmfndlwpvhg][btkzrsjmfndlwpvhg][aeiou]'
    )
    vcccv_all = []

    for row in tqdm(dataset['text']):
        total_words = 0
        vcccv = 0
        for word in row:
            total_words += 1
            if magic_e(word) == 2 and pattern.search(word):
                vcccv += 1

        vcccv_all.append(vcccv / total_words if total_words else 0.0)

    return vcccv_all


def VVCV(dataset):
    """Per row, the fraction of two-syllable words spelled with a VVCV run.

    A word counts when ``magic_e`` reports exactly two syllables and its
    spelling contains two vowels, a consonant and a vowel in sequence.

    Args:
        dataset: mapping/frame whose ``'text'`` entry iterates over rows, each
            row being an iterable of words.

    Returns:
        A list with one ratio per row.  A row without words yields ``0.0``
        instead of raising ``ZeroDivisionError``.
    """
    pattern = re.compile('[aeiou][aeiou][btkzrsjmfndlwpvhg][aeiou]')
    vvcv_all = []

    for row in tqdm(dataset['text']):
        total_words = 0
        vvcv = 0
        for word in row:
            total_words += 1
            if magic_e(word) == 2 and pattern.search(word):
                vvcv += 1

        vvcv_all.append(vvcv / total_words if total_words else 0.0)

    return vvcv_all

#### Check for compound words

In [None]:
def check_compound_words_greedy(dataset):
    """Per row, compound-word hits divided by the number of words.

    For a word that ``magic_e`` reports as multi-syllable, every split point
    from 2 to 6 characters is tried; a split scores one hit when both halves
    appear in the ``'word'`` column of the module-level ``df``.  Splits at 2,
    3 and 4 need a word longer than 4 characters, the split at 5 a word longer
    than 5, and the split at 6 a word longer than 6.  One word can therefore
    score several hits.  Any other word scores one hit when it is in
    ``COMPOUND_WORDS``.

    Args:
        dataset: mapping/frame whose ``'text'`` entry iterates over rows, each
            row being an iterable of words.

    Returns:
        A list with one ratio per row.  A row without words yields ``0.0``
        instead of raising ``ZeroDivisionError``.
    """
    # A set gives O(1) lookups; the list used before was scanned on each test.
    known_words = set(df['word'])
    split_points = (2, 3, 4, 5, 6)

    cmp_words = []
    for row in tqdm(dataset['text']):
        compound = 0
        total_words = 0
        for word in row:
            total_words += 1
            if magic_e(word) > 1:
                length = len(word)
                for cut in split_points:
                    if (length > max(cut, 4)
                            and word[:cut] in known_words
                            and word[cut:] in known_words):
                        compound += 1
            elif word in COMPOUND_WORDS:
                compound += 1

        cmp_words.append(compound / total_words if total_words else 0.0)

    return cmp_words

#### Check for advanced inflectional endings

In [None]:
def check_adv_inflectional(dataset):
    """Per row, the fraction of words that are verbs ending in "ing" or "ed".

    A word counts when its POS tag is one of ``VBD``, ``VBG``, ``VBN`` or
    ``VBZ`` and it ends in "ing" or "ed".  The two endings cannot both match,
    so each word counts at most once.

    Args:
        dataset: mapping/frame with parallel ``'text'`` and ``'POS'`` entries;
            each text row is a sequence of words and the matching POS row
            holds one tag per word.

    Returns:
        A list with one ratio per row.  A row without words yields ``0.0``
        instead of raising ``ZeroDivisionError``.
    """
    verb_tags = ('VBD', 'VBG', 'VBN', 'VBZ')
    inflectional = []

    for words, tags in tqdm(list(zip(dataset['text'], dataset['POS']))):
        total_words = 0
        inf_end = 0
        for i, word in enumerate(words):
            total_words += 1
            if tags[i] not in verb_tags:
                continue
            if re.search('ing$', word) or re.search('ed$', word):
                inf_end += 1

        inflectional.append(inf_end / total_words if total_words else 0.0)

    return inflectional

#### Check for inflectional endings for adjectives

In [None]:
def check_adv_inflectional_adj(dataset):
    """Per row, the fraction of words carrying an adjective/adverb inflection.

    Words tagged ``JJR``, ``JJS``, ``RBR`` or ``RBS`` always count.  Words
    tagged ``JJ`` count only when they end in "ful", "ness", "less" or "ily".

    Args:
        dataset: mapping/frame with parallel ``'text'`` and ``'POS'`` entries;
            each text row is a sequence of words and the matching POS row
            holds one tag per word.

    Returns:
        A list with one ratio per row.  A row without words yields ``0.0``
        instead of raising ``ZeroDivisionError``.
    """
    graded_tags = ('JJR', 'JJS', 'RBR', 'RBS')
    jj_suffixes = ('ful$', 'ness$', 'less$', 'ily$')
    inflectional = []

    for words, tags in tqdm(list(zip(dataset['text'], dataset['POS']))):
        total_words = 0
        inf_end = 0
        for i, word in enumerate(words):
            total_words += 1
            tag = tags[i]
            if tag == 'JJ':
                # The suffixes are mutually exclusive, so this adds 0 or 1.
                inf_end += sum(1 for suf in jj_suffixes if re.search(suf, word))
            elif tag in graded_tags:
                inf_end += 1

        inflectional.append(inf_end / total_words if total_words else 0.0)

    return inflectional

## Derivational Relations Stage

#### Check for advanced suffixes

In [None]:
def check_adv_suffix(dataset):
    """Per row, advanced-suffix hits divided by the number of non-empty IPA words.

    Only positions whose IPA string is non-empty are counted as words.  For
    such a position, if ``magic_e`` reports more than one syllable for the
    spelled word, one hit is added for every suffix pattern found in the IPA
    string: the adjective/noun patterns when the POS tag is in ``adjs_nouns``,
    the verb patterns when it is in ``verbs``.

    Args:
        dataset: mapping/frame with parallel ``'ipa'``, ``'text'`` and
            ``'POS'`` entries, index-aligned word by word within each row.

    Returns:
        A list with one ratio per row.  A row with no non-empty IPA word
        yields ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    adjs_nouns = ('JJR', 'JJS', 'JJ', 'NN', 'NNP', 'NNS')
    verbs = ('VBD', 'VBG', 'VBN', 'VBZ')
    adj_n_suffix = ('ɛɹi$', 'ɔɹi$', 'ənsi$', 'əns$', 'ʒən', 'ʃən', 'əbəɫ$', 'əbɫi$')
    v_suffix = ('aɪz', 'ɪfaɪ', 'əfaɪ')

    adv_suf = []
    rows = list(zip(dataset['ipa'], dataset['text'], dataset['POS']))
    for ipa_row, text_row, pos_row in tqdm(rows):
        total_words = 0
        adv = 0
        for i, word in enumerate(ipa_row):
            if len(word) == 0:
                continue
            total_words += 1
            if magic_e(text_row[i]) <= 1:
                continue

            tag = pos_row[i]
            if tag in adjs_nouns:
                adv += sum(1 for suf in adj_n_suffix if re.search(suf, word))
            if tag in verbs:
                adv += sum(1 for suf in v_suffix if re.search(suf, word))

        adv_suf.append(adv / total_words if total_words else 0.0)

    return adv_suf

#### Check for assimilated prefixes

In [None]:
def check_assimilated(dataset):
    """Per row, the fraction of words that start with an assimilated prefix.

    Only words for which ``magic_e_result`` returns more than one item are
    inspected (that helper is defined outside this section; presumably it
    returns the word's syllable list).  A word counts when it starts with one
    of the listed prefixes.  The ``app`` prefix is ignored for any word that
    contains "apples".  All prefixes are anchored at the start and mutually
    exclusive, so a word counts at most once.

    Args:
        dataset: mapping/frame whose ``'text'`` entry iterates over rows, each
            row being an iterable of words.

    Returns:
        A list with one ratio per row.  A row without words yields ``0.0``
        instead of raising ``ZeroDivisionError``.
    """
    prefixes = (
        '^ill', '^imm[aeiou]', '^imp', '^irr[aeiou]',
        '^suff', '^supp', '^succ', '^surr',
        '^coll', '^corr',
        '^att', '^aff', '^agg', '^all', '^ann', '^ass', '^arr',
        '^diff', '^eff',
        '^opp', '^off', '^occ',
    )

    assim_prefix = []
    for row in tqdm(dataset['text']):
        assimilated = 0
        total_words = 0
        for word in row:
            total_words += 1
            if len(magic_e_result(word)) <= 1:
                continue

            for prefix in prefixes:
                if re.search(prefix, word):
                    assimilated += 1
            # 'app' is handled separately because of its "apples" exclusion.
            if re.search('^app', word) and not re.search('apples', word):
                assimilated += 1

        assim_prefix.append(assimilated / total_words if total_words else 0.0)

    return assim_prefix

#### Check for Greek roots

In [None]:
def check_greek_roots(row):
    """Average Greek-root language-model probability over the words of ``row``.

    For each word that ``magic_e`` reports as multi-syllable, every key of
    ``GREEK_ROOTS`` is used as a regular expression against the word.  The
    word's score is ``get_STLM_prob(greek_stlms[root], word)`` for the last
    key that matches, or 0 when none does.  Other words score 0.

    Args:
        row: iterable of words.

    Returns:
        The sum of word scores divided by the number of words.  An empty row
        yields ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    greek_list = GREEK_ROOTS.keys()

    total_prob = 0
    total_words = 0

    for word in row:
        total_words += 1
        if magic_e(word) <= 1:
            continue

        prob = 0
        for root in greek_list:
            if re.search(root, word):
                # A later matching root overrides an earlier one.
                prob = get_STLM_prob(greek_stlms[root], word)
        total_prob += prob

    return total_prob / total_words if total_words else 0.0

#### Check for Latin roots

In [None]:
def check_latin_roots(row):
    """Average Latin-root language-model probability over the words of ``row``.

    Only purely alphabetic words that ``magic_e`` reports as multi-syllable
    are scored.  Every key of ``LATIN_ROOTS`` is used as a regular expression
    against the word.  The word's score is
    ``get_STLM_prob(latin_stlms[root], word)`` for the last key that matches,
    or 0 when none does.  All other words score 0 but still count towards the
    row length.

    Args:
        row: iterable of words.

    Returns:
        The sum of word scores divided by the number of words.  An empty row
        yields ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    latin_list = LATIN_ROOTS.keys()

    total_prob = 0
    total_words = 0

    for word in row:
        total_words += 1
        if not word.isalpha() or magic_e(word) <= 1:
            continue

        prob = 0
        for root in latin_list:
            if re.search(root, word):
                # A later matching root overrides an earlier one.
                prob = get_STLM_prob(latin_stlms[root], word)
        total_prob += prob

    return total_prob / total_words if total_words else 0.0