## Subset of Glossary for HEROKU

In [2]:
# Based on the larger glossary, pick only those SRC terms that are in the tf-idf feature names. At the same
# time, TRG translations must occur in the small corpus to be eligible for inclusion.

In [3]:
import os

# Directory holding the preprocessed data files. It defaults to the original local path;
# set the WORDALIGN_DATA_DIR environment variable to run the notebook on another machine
# without editing this cell.
PROC_DATA_PREFIX = os.environ.get(
    'WORDALIGN_DATA_DIR',
    '/Users/alexskrn/Documents/NLP/WordAlign/wordalign_notebooks/data',
)

In [4]:
!wc -l {PROC_DATA_PREFIX}/lex_preproc40_cleaned

   56460 /Users/alexskrn/Documents/NLP/WordAlign/wordalign_notebooks/data/lex_preproc40_cleaned


In [5]:
# Build the eng-rus dictionary: each SRC term maps to the list of all its TRG translations,
# in the order they appear in the tab-separated glossary file.
en_ru_dict = dict()
with open(PROC_DATA_PREFIX + '/' + 'lex_preproc40_cleaned', 'r', encoding='utf8') as inF:
    for line in inF:
        # Every line must hold exactly two tab-separated fields, otherwise unpacking fails.
        src, trg = (field.strip() for field in line.split('\t'))
        # setdefault starts a new list the first time a term is seen.
        en_ru_dict.setdefault(src, []).append(trg)

print(len(en_ru_dict))

8419


In [6]:
# Collect vocabulary sets for small datasets
import string
import nltk

# Tokens that preprocess() discards from every sentence. Only the English articles and 'of'
# are active; the commented-out entries below are further candidates that are currently
# NOT filtered.
stopwords = ['the', 'a', 'an', 'of',
#              's', 'and', 'и', 'or', 'или',
#              'been', 'being', 'by'
            ]

def preprocess(text):
    """Normalise a sentence and return it as a single space-separated string.

    Steps, in order:
      1. lowercase the text;
      2. split it with nltk's wordpunct tokenizer, discarding tokens that consist
         only of digits and tokens listed in the module-level ``stopwords``;
      3. delete every ASCII punctuation character plus a few extra symbols and
         control whitespace characters;
      4. collapse runs of whitespace into single spaces.
    """
    kept_tokens = []
    for token in nltk.wordpunct_tokenize(text.lower()):
        # Skip pure-number tokens and stopwords.
        if token.isdigit() or token in stopwords:
            continue
        kept_tokens.append(token)
    joined = ' '.join(kept_tokens)

    # Characters removed one by one from the re-joined token string.
    drop_chars = set(string.punctuation) | {'−', '\t', '\n', '\r', '\x0b', '\x0c', '◦', '°'}
    without_punct = ''.join(filter(lambda ch: ch not in drop_chars, joined))

    # Removing punctuation-only tokens leaves doubled spaces; squeeze them out.
    return ' '.join(without_punct.split())

# Vocabulary of the small parallel corpus: one set of preprocessed tokens per language side.
voc_set_src = set()
voc_set_trg = set()
with open(PROC_DATA_PREFIX + '/' + 'en_ru_heroku_1000', 'r', encoding='utf8') as inF:
    for line in inF:
        fields = line.split('\t')
        # Field 0 is the SRC sentence, field 1 the TRG sentence; clean both before
        # either vocabulary set is touched.
        src_str = preprocess(fields[0].strip())
        trg_str = preprocess(fields[1].strip())
        voc_set_src.update(src_str.split())
        voc_set_trg.update(trg_str.split())
print('Unique words in SRC and TRG:')
print(len(voc_set_src))
print(len(voc_set_trg))

Unique words in SRC and TRG:
2452
3638


In [7]:
# Number of lines in the small raw and tokenized files
!wc -l {PROC_DATA_PREFIX}/'en_ru_heroku_1000'
!wc -l {PROC_DATA_PREFIX}/'en_ru_heroku_tok_1000'

     994 /Users/alexskrn/Documents/NLP/WordAlign/wordalign_notebooks/data/en_ru_heroku_1000
     980 /Users/alexskrn/Documents/NLP/WordAlign/wordalign_notebooks/data/en_ru_heroku_tok_1000


In [8]:
# Collect a vocabulary of the most important words.

# First step: build the list of preprocessed SRC sentences from the raw text file.
# Each line must contain exactly two tab-separated fields (SRC, TRG); only SRC is kept.
with open(PROC_DATA_PREFIX + '/' + 'en_ru_heroku_1000', 'r', encoding='utf8') as inF:
    field_pairs = (line.split('\t') for line in inF)
    corpus_src = [preprocess(src).strip() for src, _ in field_pairs]
print(len(corpus_src))

994


In [9]:
# Score vocab items with tf-idf (in iterations, starting from 1000 features and going up).
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    sublinear_tf=True,
    max_features=2300,  # Keep increasing until the target of 9000 lines in Glossary is reached
)
# Only the fitted vocabulary is needed here, so the tf-idf matrix itself is not computed.
tfidf_vectorizer.fit(corpus_src)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2; prefer
# get_feature_names_out() and fall back to the old method on older installations.
# Either way feature_names stays a plain list of str, as later cells expect.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    feature_names = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    feature_names = tfidf_vectorizer.get_feature_names()

print('Nuber of features in SRC corpus:\n', len(feature_names))
print('First 50 features:\n', feature_names[:50])

Nuber of features in SRC corpus:
 2291
First 50 features:
 ['7328th', 'aa', 'abdallah', 'abdelrahman', 'abdulkader', 'abdullah', 'abdulqader', 'absentia', 'abu', 'abubakar', 'abuses', 'abzar', 'academy', 'acceptable', 'acceptance', 'access', 'accommodations', 'accordance', 'according', 'accordingly', 'account', 'accountability', 'accounts', 'accuracy', 'accused', 'achievable', 'achieve', 'achieved', 'achievements', 'acknowledge', 'acordos', 'acoustic', 'act', 'acting', 'action', 'actions', 'active', 'actively', 'activities', 'activity', 'acts', 'adam', 'add', 'additional', 'address', 'addressing', 'administration', 'admiral', 'adolphus', 'adopt']


In [10]:
# Based on the large glossary, pick only those SRC terms that are in the feature names. At the
# same time, TRG translations must occur in the small corpus to be eligible for inclusion.
print('current number of important words:\n', len(feature_names))
heroku_glossary = [
    '{}\t{}\n'.format(src, trg)
    for src in feature_names
    if src in en_ru_dict            # feature has no glossary entry -> skipped
    for trg in en_ru_dict[src]
    if trg in voc_set_trg           # translation never seen in the small TRG corpus -> skipped
]

print('resulting heroku glossary size in lines:\n', len(heroku_glossary))

current number of important words:
 2291
resulting heroku glossary size in lines:
 6827


In [11]:
# Write the Heroku glossary to file; every entry already ends with a newline.
with open(PROC_DATA_PREFIX + '/' + 'heroku_glossary', 'w', encoding='utf8') as toF:
    toF.writelines(heroku_glossary)

In [12]:
!wc -l {PROC_DATA_PREFIX}/heroku_glossary

    6827 /Users/alexskrn/Documents/NLP/WordAlign/wordalign_notebooks/data/heroku_glossary


In [1]:
# %history -g -f history