# Requirements

In [34]:
import pandas as pd
import numpy as np
import json

import mlconjug3

import warnings
warnings.filterwarnings('ignore') 

# Load idioms

## Load idioms

In [35]:
# https://github.com/prateeksaxena2809/EPIE_Corpus - English idioms
# I have cleaned them a little

file = open('../raw_data/Formal_idioms.txt', 'r') # flexible idioms
formal_idioms = file.readlines()
file.close()

file = open('../raw_data/Static_idioms.txt', 'r') # static idioms
static_idioms = file.readlines()
file.close()

# split idiom strings into lists of idiom words
for i in range(len(formal_idioms)):
    formal_idioms[i] = formal_idioms[i].replace("\n", '').split(' ')
for i in range(len(static_idioms)):
    static_idioms[i] = static_idioms[i].replace("\n", '').split(' ')


In [36]:
print(formal_idioms[0:5])
print()
print(static_idioms[0:5])

[['get', 'to', '[pron]', 'point'], ['bring', '(somebody)', 'to', '([pron])', 'knees'], ['make', 'up', '[pron]', 'mind'], ['build', 'bridges'], ['raise', 'eyebrows']]

[['just', 'in', 'case'], ['sorry', 'sight'], ['rule', 'of', 'thumb'], ['carpe', 'diem'], ['salad', 'days']]


## Conjugate formal idioms

In [37]:
# prepare conjugates

conjugator = mlconjug3.Conjugator(language="en")

conjugated_formal_idioms = []
for idiom in formal_idioms:
    conjugated_idiom = []

    conjugated_verbs = []
    raw_conjugated_verbs = conjugator.conjugate(idiom[0])
    raw_conjugated_verbs = raw_conjugated_verbs.iterate()
    for verb in raw_conjugated_verbs:
        if "to " not in verb[-1] and "/" not in verb[-1]:
            conjugated_verbs += [verb[-1]]
    
    # remove duplicates
    conjugated_verbs = [*set(conjugated_verbs)]


    for verb in conjugated_verbs:
        raw_conjugated_idiom = [verb] + idiom[1:]
        conjugated_idiom += [raw_conjugated_idiom]
    
    conjugated_formal_idioms += [conjugated_idiom]


In [38]:
formal_idioms = conjugated_formal_idioms
print(formal_idioms[0:5])

[[['gets', 'to', '[pron]', 'point'], ['got', 'to', '[pron]', 'point'], ['getting', 'to', '[pron]', 'point'], ['gotten', 'to', '[pron]', 'point'], ['get', 'to', '[pron]', 'point']], [['brought', '(somebody)', 'to', '([pron])', 'knees'], ['bringing', '(somebody)', 'to', '([pron])', 'knees'], ['brings', '(somebody)', 'to', '([pron])', 'knees'], ['bring', '(somebody)', 'to', '([pron])', 'knees']], [['makes', 'up', '[pron]', 'mind'], ['made', 'up', '[pron]', 'mind'], ['make', 'up', '[pron]', 'mind'], ['making', 'up', '[pron]', 'mind']], [['build', 'bridges'], ['builds', 'bridges'], ['building', 'bridges'], ['built', 'bridges']], [['raising', 'eyebrows'], ['raise', 'eyebrows'], ['raises', 'eyebrows'], ['raised', 'eyebrows']]]


# Load phrasal verbs

## Load phrasal verbs

In [39]:
# https://github.com/WithEnglishWeCan/generated-english-phrasal-verbs
# I've fixed it a little too
file = open('../raw_data/phrasal_verbs_clean.json', 'r')
raw_phrasal_verbs = json.load(file)
keys = list(raw_phrasal_verbs.keys())

phrasal_verbs = []
for i in range(len(keys)):
    # split phrasal verb
    phrasal_verb = [keys[i].split(' ')]

    phrasal_verbs += phrasal_verb

In [40]:
print(phrasal_verbs[0:5])

[['abide', 'by'], ['accord', 'with'], ['account', 'for'], ['ache', 'for'], ['act', 'as']]


## Conjugate phrasal verbs

In [41]:
# prepare conjugates

conjugator = mlconjug3.Conjugator(language="en")

conj_phrasal_verbs = []
for phrasal_verb in phrasal_verbs:
    conj_phrasal_verb = []

    conj_verbs = []
    raw_conj_verbs = conjugator.conjugate(phrasal_verb[0])
    raw_conj_verbs = raw_conj_verbs.iterate()
    for verb in raw_conj_verbs:
        if "to " not in verb[-1] and "/" not in verb[-1]:
            conj_verbs += [verb[-1]]
    
    # remove duplicates
    conj_verbs = [*set(conj_verbs)]


    for verb in conj_verbs:
        raw_conj_phrasal_verb = [verb] + phrasal_verb[1:]
        conj_phrasal_verb += [raw_conj_phrasal_verb]
    
    conj_phrasal_verbs += [conj_phrasal_verb]

In [42]:
phrasal_verbs = conj_phrasal_verbs
print(phrasal_verbs[0:5])

[[['abided', 'by'], ['abide', 'by'], ['abides', 'by'], ['abiding', 'by']], [['according', 'with'], ['accord', 'with'], ['accorded', 'with'], ['accords', 'with']], [['accounts', 'for'], ['accounting', 'for'], ['account', 'for'], ['accounted', 'for']], [['ache', 'for'], ['aches', 'for'], ['ached', 'for'], ['aching', 'for']], [['act', 'as'], ['acts', 'as'], ['acted', 'as'], ['acting', 'as']]]


# Save processed corpuses

In [43]:
formal_idioms = np.array(formal_idioms)
static_idioms = np.array(static_idioms)
phrasal_verbs = np.array(phrasal_verbs)

# save processed corpuses
np.savez(
    "../preprocessed_data/corpuses.npz",
    formal_idioms=formal_idioms,
    static_idioms=static_idioms,
    phrasal_verbs=phrasal_verbs,
)

In [44]:
# load corpuses
np_data = np.load("../preprocessed_data/corpuses.npz", allow_pickle=True)

formal_idioms = np_data['formal_idioms']
static_idioms = np_data['static_idioms']
phrasal_verbs = np_data['phrasal_verbs']

In [45]:
print(formal_idioms[0:5])
print()
print(static_idioms[0:5])
print()
print(phrasal_verbs[0:5])

[list([['gets', 'to', '[pron]', 'point'], ['got', 'to', '[pron]', 'point'], ['getting', 'to', '[pron]', 'point'], ['gotten', 'to', '[pron]', 'point'], ['get', 'to', '[pron]', 'point']])
 list([['brought', '(somebody)', 'to', '([pron])', 'knees'], ['bringing', '(somebody)', 'to', '([pron])', 'knees'], ['brings', '(somebody)', 'to', '([pron])', 'knees'], ['bring', '(somebody)', 'to', '([pron])', 'knees']])
 list([['makes', 'up', '[pron]', 'mind'], ['made', 'up', '[pron]', 'mind'], ['make', 'up', '[pron]', 'mind'], ['making', 'up', '[pron]', 'mind']])
 list([['build', 'bridges'], ['builds', 'bridges'], ['building', 'bridges'], ['built', 'bridges']])
 list([['raising', 'eyebrows'], ['raise', 'eyebrows'], ['raises', 'eyebrows'], ['raised', 'eyebrows']])]

[list(['just', 'in', 'case']) list(['sorry', 'sight'])
 list(['rule', 'of', 'thumb']) list(['carpe', 'diem'])
 list(['salad', 'days'])]

[list([['abided', 'by'], ['abide', 'by'], ['abides', 'by'], ['abiding', 'by']])
 list([['according', '