# Requirements

In [69]:
import pandas as pd
import numpy as np

# Load data and corpuses

In [70]:
# load data
# The .npz archive is opened as a context manager so the underlying file
# handle is closed as soon as the two arrays have been read.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python
# objects -- presumably needed because the arrays hold Python objects
# (TODO confirm). Only load archives you produced yourself.
with np.load("../preprocessed_data/phraseology.npz", allow_pickle=True) as np_data:
    data = np_data['data']
    labels = np_data['scores']

In [71]:
# load corpuses
# The .npz archive is opened as a context manager so the underlying file
# handle is closed as soon as the three corpora have been read.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python
# objects -- presumably needed because the corpora are stored as object
# arrays (TODO confirm). Only load archives you produced yourself.
with np.load("../preprocessed_data/corpuses.npz", allow_pickle=True) as np_data:
    formal_idioms = np_data['formal_idioms']
    static_idioms = np_data['static_idioms']
    phrasal_verbs = np_data['phrasal_verbs']

# Extract features

In [72]:
def found_match(text, position, phrasal_verb, max_gap=2):
    """Check whether the pattern ``phrasal_verb`` occurs in ``text`` from ``position``.

    The pattern is matched word by word, left to right. Pattern entries that
    contain ``"["`` or ``"("`` (e.g. ``"[pron]"``) are treated as placeholders
    and skipped without consuming a token. Every other pattern word must be
    found either at the current token or at most ``max_gap`` tokens further on,
    which allows a few arbitrary words between the parts of the pattern.

    Parameters
    ----------
    text : sequence of str
        Tokenised text, indexable by integer position.
    position : int
        Index in ``text`` at which matching starts.
    phrasal_verb : sequence of str
        The pattern words (may include placeholder entries).
    max_gap : int, optional
        Maximum number of tokens that may be skipped before each pattern word.
        The default of 2 reproduces the original hard-coded look-ahead.

    Returns
    -------
    bool
        ``True`` if every non-placeholder pattern word was found in order,
        ``False`` otherwise (including when the text ends before the pattern
        is complete). A pattern without any non-placeholder word yields ``True``.
    """
    offset = 0  # number of tokens consumed so far, relative to `position`
    for word in phrasal_verb:
        # Placeholders stand for arbitrary words; the look-ahead below absorbs them.
        if "[" in word or "(" in word:
            continue
        try:
            if text[position + offset] == word:
                offset += 1
                continue

            # No direct hit: look a few tokens ahead for the expected word.
            for gap in range(1, max_gap + 1):
                if text[position + offset + gap] == word:
                    offset += 1 + gap
                    break
            else:
                # Not found within the allowed gap.
                return False
        except IndexError:
            # Ran past the end of the text before completing the pattern.
            return False

    return True

## Extract formal idioms

In [None]:
# Count formal-idiom occurrences: one row per text, one column per idiom.

n_texts, n_formal_idioms = data.shape[0], formal_idioms.shape[0]
formal_idioms_vector = np.zeros((n_texts, n_formal_idioms))
print(formal_idioms_vector.shape)

for i, text in enumerate(data):
    for pos, word in enumerate(text):
        for idiom_idx, idiom in enumerate(formal_idioms):
            # An idiom is a collection of variants; it counts as soon as
            # one variant starts with the current word and matches here.
            hit = any(
                word == variant[0] and found_match(text, pos, variant)
                for variant in idiom
            )
            if hit:
                formal_idioms_vector[i, idiom_idx] += 1
                # At most one idiom is credited per token position.
                break

    # Lightweight progress report for the long-running scan.
    if i % 500 == 0:
        print(f"{i} texts scanned")
print("Done!")

In [7]:
# Persist the formal-idiom counts to disk, since recomputing them is slow.
formal_idioms_vector_path = "../preprocessed_data/formal_idioms_vector.npy"
np.save(formal_idioms_vector_path, formal_idioms_vector)

In [85]:
# load formal_idioms_vector
# The vector is a plain numeric array (created with np.zeros and saved with
# np.save), so pickle support is not needed; leaving it off keeps np.load
# from unpickling arbitrary objects out of a tampered file.
formal_idioms_vector = np.load("../preprocessed_data/formal_idioms_vector.npy")
print(formal_idioms_vector.shape)

(3911, 290)


In [86]:
# a bit of analysis: total occurrences of every formal idiom over all texts
summed_texts = np.sum(formal_idioms_vector, axis=0)
print(summed_texts.shape)

# Columns (idioms) that never occur in any text.
zero_indices = np.flatnonzero(summed_texts == 0)
count = zero_indices.size
print("Number of zero features: ", count)

# Columns that occur at least once, in ascending index order.
nonzero_indices = np.flatnonzero(summed_texts != 0)
nonzero_idioms = summed_texts[nonzero_indices]
nonzero_corpus = formal_idioms[nonzero_indices]

# Order the occurring idioms from least to most frequent.
nonzero_sorted_indices = np.argsort(nonzero_idioms)
nonzero_sorted_idioms = nonzero_idioms[nonzero_sorted_indices]
nonzero_sorted_corpus = nonzero_corpus[nonzero_sorted_indices]

print("Sorted non-zero idioms:")
for i in range(nonzero_sorted_idioms.shape[0]):
    # Each corpus entry holds several variants; the first one is shown.
    print(f"{nonzero_sorted_idioms[i]}: {nonzero_sorted_corpus[i][0]}")


(290,)
Number of zero features:  246
Sorted non-zero idioms:
1.0: ['caught', '[pron]', 'eye']
1.0: ['gets', 'on', '[pron]', 'nerves']
1.0: ['come', 'to', 'terms', 'with']
1.0: ['taken', '[pron]', 'hike']
1.0: ['watches', '[pron]', 'clock']
1.0: ['loses', '[pron]', 'head']
1.0: ['keeping', 'at', 'bay']
1.0: ['have', '[pron]', 'guts']
1.0: ['thrown', 'in', '[pron]', 'towel']
1.0: ['come', 'to', '[pron]', 'head']
1.0: ['gone', 'with', '[pron]', 'flow']
1.0: ['keeping', 'in', 'touch']
1.0: ['dropping', '[pron]', 'ball']
1.0: ['played', '[pron]', 'joke']
1.0: ['clocked', 'out']
1.0: ['gets', 'out', 'of', 'hand']
1.0: ['were', '[pron]', 'item']
1.0: ['come', 'to', 'fruition']
2.0: ['played', 'safe']
2.0: ['come', 'to', '[pron]', 'attention']
2.0: ['were', 'on', '[pron]', 'ball']
2.0: ['come', 'clean']
2.0: ['brought', 'to', '[pron]', 'table']
3.0: ['loses', '[pron]', 'touch']
3.0: ['gives', '[pron]', '[pron]', 'hand']
3.0: ['do', '[pron]', 'trick']
4.0: ['keeping', '[pron]', 'eye', 'on']
4.0

In [87]:
# save nonzero corpus and feature vector
# `nonzero_indices` is recomputed here from the formal-idiom vector itself.
# The same global name is overwritten by the static-idiom and phrasal-verb
# sections, so relying on the leftover value would silently crop the wrong
# columns when cells are run out of order.
assert formal_idioms_vector.shape[1] == formal_idioms.shape[0], \
    "formal_idioms_vector columns do not line up with the formal_idioms corpus"
nonzero_indices = np.flatnonzero(np.sum(formal_idioms_vector, axis=0) != 0)

nonzero_corpus = formal_idioms[nonzero_indices]
nonzero_formal_idioms_vector = formal_idioms_vector[:, nonzero_indices]

print(nonzero_corpus.shape)
print(nonzero_formal_idioms_vector.shape)

np.save(
    "../cropped_data/formal_idioms_corpus.npy",
    nonzero_corpus
)
np.save(
    "../cropped_data/formal_idioms_vector.npy",
    nonzero_formal_idioms_vector
)
np.save(
    "../cropped_data/nonzero_formal_idioms_indices.npy",
    nonzero_indices
)

(44,)
(3911, 44)


## Extract static idioms

In [None]:
# static idioms
# Count static-idiom occurrences: one row per text, one column per idiom.
# Unlike formal idioms, each static idiom is matched directly as a single
# word sequence.

static_idioms_vector = np.zeros((data.shape[0], static_idioms.shape[0]))
print(static_idioms_vector.shape)

for i, text in enumerate(data):
    for j, word in enumerate(text):
        for k, idiom in enumerate(static_idioms):
            # Cheap first-word check before the full sequence match.
            if word == idiom[0] and found_match(text, j, idiom):
                static_idioms_vector[i, k] += 1
                # At most one idiom is credited per token position.
                break

    # Lightweight progress report for the long-running scan.
    if i % 500 == 0:
        print(f"{i} texts scanned")
print("Done!")

In [30]:
# Persist the static-idiom counts to disk, since recomputing them is slow.
static_idioms_vector_path = "../preprocessed_data/static_idioms_vector.npy"
np.save(static_idioms_vector_path, static_idioms_vector)

In [88]:
# load static_idioms_vector
# The vector is a plain numeric array (created with np.zeros and saved with
# np.save), so pickle support is not needed; leaving it off keeps np.load
# from unpickling arbitrary objects out of a tampered file.
static_idioms_vector = np.load("../preprocessed_data/static_idioms_vector.npy")
print(static_idioms_vector.shape)

(3911, 425)


In [89]:
# a bit of analysis: total occurrences of every static idiom over all texts
summed_texts = np.sum(static_idioms_vector, axis=0)
print(summed_texts.shape)

# Columns (idioms) that never occur in any text.
zero_indices = np.flatnonzero(summed_texts == 0)
count = zero_indices.size
print("Number of zero features: ", count)

# Columns that occur at least once, in ascending index order.
nonzero_indices = np.flatnonzero(summed_texts != 0)
nonzero_idioms = summed_texts[nonzero_indices]
nonzero_corpus = static_idioms[nonzero_indices]

# Order the occurring idioms from least to most frequent.
nonzero_sorted_indices = np.argsort(nonzero_idioms)
nonzero_sorted_idioms = nonzero_idioms[nonzero_sorted_indices]
nonzero_sorted_corpus = nonzero_corpus[nonzero_sorted_indices]

print("Sorted non-zero idioms:")
for i in range(nonzero_sorted_idioms.shape[0]):
    print(f"{nonzero_sorted_idioms[i]}: {nonzero_sorted_corpus[i]}")


(425,)
Number of zero features:  362
Sorted non-zero idioms:
1.0: ['black', 'sheep']
1.0: ['pull', 'yourself', 'together']
1.0: ['per', 'se']
1.0: ['piece', 'of', 'cake']
1.0: ['big', 'head']
1.0: ['keep', 'up', 'the', 'good', 'work']
1.0: ['through', 'thick', 'and', 'thin']
1.0: ['blessing', 'in', 'disguise']
1.0: ['nine', 'to', 'five']
1.0: ['god', 'knows']
1.0: ['safe', 'and', 'sound']
1.0: ['by', 'and', 'large']
1.0: ['time', 'flies']
1.0: ['up', 'in', 'the', 'air']
1.0: ['on', 'thin', 'ice']
1.0: ['on', 'the', 'rocks']
1.0: ['about', 'face']
1.0: ['fall', 'for', 'it']
1.0: ['behind', 'bars']
1.0: ['big', 'bucks']
1.0: ['out', 'of', 'sight']
1.0: ['dog', 'days']
1.0: ['the', 'best', 'of', 'both', 'worlds']
1.0: ['bottom', 'line']
1.0: ['home', 'truth']
1.0: ['middle', 'of', 'the', 'road']
1.0: ['in', 'bad', 'shape']
2.0: ['you', 'bet']
2.0: ['down', 'to', 'earth']
2.0: ['beyond', '[pron]', 'wildest', 'dreams']
2.0: ['rest', 'up']
2.0: ['at', 'hand']
2.0: ['time', 'is', 'money']
2.0

In [90]:
# save nonzero corpus and feature vector
# `nonzero_indices` is recomputed here from the static-idiom vector itself.
# The same global name is overwritten by the formal-idiom and phrasal-verb
# sections, so relying on the leftover value would silently crop the wrong
# columns when cells are run out of order.
assert static_idioms_vector.shape[1] == static_idioms.shape[0], \
    "static_idioms_vector columns do not line up with the static_idioms corpus"
nonzero_indices = np.flatnonzero(np.sum(static_idioms_vector, axis=0) != 0)

nonzero_corpus = static_idioms[nonzero_indices]
nonzero_static_idioms_vector = static_idioms_vector[:, nonzero_indices]

print(nonzero_corpus.shape)
print(nonzero_static_idioms_vector.shape)

np.save(
    "../cropped_data/static_idioms_corpus.npy",
    nonzero_corpus
)
np.save(
    "../cropped_data/static_idioms_vector.npy",
    nonzero_static_idioms_vector
)
np.save(
    "../cropped_data/nonzero_static_idioms_indices.npy",
    nonzero_indices
)

(63,)
(3911, 63)


## Extract phrasal verbs

In [None]:
# Count phrasal-verb occurrences: one row per text, one column per verb.

n_texts, n_phrasal_verbs = data.shape[0], phrasal_verbs.shape[0]
phrasal_verbs_vector = np.zeros((n_texts, n_phrasal_verbs))
print(phrasal_verbs_vector.shape)

for i, text in enumerate(data):
    for pos, word in enumerate(text):
        for verb_idx, phrasal_verb in enumerate(phrasal_verbs):
            # A phrasal verb is a collection of variants; it counts as soon
            # as one variant starts with the current word and matches here.
            hit = any(
                word == variant[0] and found_match(text, pos, variant)
                for variant in phrasal_verb
            )
            if hit:
                phrasal_verbs_vector[i, verb_idx] += 1
                # At most one phrasal verb is credited per token position.
                break

    # Lightweight progress report for the long-running scan.
    if i % 500 == 0:
        print(f"{i} texts scanned")
print("Done!")

In [37]:
# Persist the phrasal-verb counts to disk, since recomputing them is slow.
phrasal_verbs_vector_path = "../preprocessed_data/phrasal_verbs_vector.npy"
np.save(phrasal_verbs_vector_path, phrasal_verbs_vector)

In [91]:
# load phrasal_verbs_vector
# The vector is a plain numeric array (created with np.zeros and saved with
# np.save), so pickle support is not needed; leaving it off keeps np.load
# from unpickling arbitrary objects out of a tampered file.
phrasal_verbs_vector = np.load("../preprocessed_data/phrasal_verbs_vector.npy")
print(phrasal_verbs_vector.shape)

(3911, 3348)


In [92]:
# a bit of analysis: total occurrences of every phrasal verb over all texts
summed_texts = np.sum(phrasal_verbs_vector, axis=0)
print(summed_texts.shape)

# Columns (phrasal verbs) that never occur in any text.
zero_indices = np.flatnonzero(summed_texts == 0)
count = zero_indices.size
print("Number of zero features: ", count)

# Columns that occur at least once, in ascending index order.
# (The `*_idioms` names are kept for parity with the idiom sections; here
# they hold phrasal-verb totals.)
nonzero_indices = np.flatnonzero(summed_texts != 0)
nonzero_idioms = summed_texts[nonzero_indices]
nonzero_corpus = phrasal_verbs[nonzero_indices]

# Order the occurring phrasal verbs from least to most frequent.
nonzero_sorted_indices = np.argsort(nonzero_idioms)
nonzero_sorted_idioms = nonzero_idioms[nonzero_sorted_indices]
nonzero_sorted_corpus = nonzero_corpus[nonzero_sorted_indices]

print("Sorted non-zero phrasal verbs:")
for i in range(nonzero_sorted_idioms.shape[0]):
    # Each corpus entry holds several variants; the first one is shown.
    print(f"{nonzero_sorted_idioms[i]}: {nonzero_sorted_corpus[i][0]}")


(3348,)
Number of zero features:  2248
Sorted non-zero phrasal verbs:
1.0: ['abided', 'by']
1.0: ['drawing', 'into']
1.0: ['reads', 'through']
1.0: ['drawing', 'away']
1.0: ['drag', 'out']
1.0: ['dozed', 'off']
1.0: ['reports', 'to']
1.0: ['doubling', 'as']
1.0: ['drawing', 'out']
1.0: ['rings', 'up']
1.0: ['rose', 'above']
1.0: ['dividing', 'up']
1.0: ['dividing', 'into']
1.0: ['dive', 'in']
1.0: ['dish', 'out']
1.0: ['rolls', 'off']
1.0: ['dipped', 'in']
1.0: ['rings', 'with']
1.0: ['drawing', 'up']
1.0: ['reads', 'off']
1.0: ['drilled', 'down']
1.0: ['pricing', 'up']
1.0: ['pulling', 'back']
1.0: ['pulling', 'in']
1.0: ['pulling', 'off']
1.0: ['pulling', 'on']
1.0: ['pulling', 'through']
1.0: ['pulling', 'together']
1.0: ['pump', 'up']
1.0: ['pushed', 'ahead']
1.0: ['pushed', 'around']
1.0: ['pushed', 'out']
1.0: ['drug', 'up']
1.0: ['put', 'around']
1.0: ['dropping', 'by']
1.0: ['drove', 'up']
1.0: ['drove', 'down']
1.0: ['drank', 'down']
1.0: ['digging', 'out']
1.0: ['power', 'off

In [93]:
# save nonzero corpus and feature vector
# `nonzero_indices` is recomputed here from the phrasal-verb vector itself.
# The same global name is overwritten by the formal-idiom and static-idiom
# sections, so relying on the leftover value would silently crop the wrong
# columns when cells are run out of order.
assert phrasal_verbs_vector.shape[1] == phrasal_verbs.shape[0], \
    "phrasal_verbs_vector columns do not line up with the phrasal_verbs corpus"
nonzero_indices = np.flatnonzero(np.sum(phrasal_verbs_vector, axis=0) != 0)

nonzero_corpus = phrasal_verbs[nonzero_indices]
nonzero_phrasal_verbs_vector = phrasal_verbs_vector[:, nonzero_indices]

print(nonzero_corpus.shape)
print(nonzero_phrasal_verbs_vector.shape)

np.save(
    "../cropped_data/phrasal_verbs_corpus.npy",
    nonzero_corpus
)
np.save(
    "../cropped_data/phrasal_verbs_vector.npy",
    nonzero_phrasal_verbs_vector
)
np.save(
    "../cropped_data/nonzero_phrasal_verbs_indices.npy",
    nonzero_indices
)

(1100,)
(3911, 1100)


# Other

In [68]:
# Write the same label array next to both the cropped and the full data sets.
for labels_path in (
    "../cropped_data/labels.npy",
    "../preprocessed_data/labels.npy",
):
    np.save(labels_path, labels)

In [94]:
# test load data: read the cropped feature vectors back and check their shapes
# NOTE: these assignments replace the full-size vectors held under the same
# names; reload them from ../preprocessed_data before re-running the
# analysis or cropping cells.
# The vectors are plain numeric arrays, so pickle support is not needed.
formal_idioms_vector = np.load("../cropped_data/formal_idioms_vector.npy")
static_idioms_vector = np.load("../cropped_data/static_idioms_vector.npy")
phrasal_verbs_vector = np.load("../cropped_data/phrasal_verbs_vector.npy")

print(formal_idioms_vector.shape)
print(static_idioms_vector.shape)
print(phrasal_verbs_vector.shape)

(3911, 44)
(3911, 63)
(3911, 1100)


In [97]:
# Bundle the three cropped corpora into a single .npz archive.
# Note: the corpus names are rebound to the cropped versions here.

formal_idioms = np.load("../cropped_data/formal_idioms_corpus.npy", allow_pickle=True)
static_idioms = np.load("../cropped_data/static_idioms_corpus.npy", allow_pickle=True)
phrasal_verbs = np.load("../cropped_data/phrasal_verbs_corpus.npy", allow_pickle=True)

cropped_corpora = {
    "formal_idioms": formal_idioms,
    "static_idioms": static_idioms,
    "phrasal_verbs": phrasal_verbs,
}

# Report the size of each corpus before writing the archive.
for corpus in cropped_corpora.values():
    print(corpus.shape)

np.savez("../cropped_data/corpuses.npz", **cropped_corpora)


(44,)
(63,)
(1100,)
