In [1]:
import nltk
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.lm import MLE
from nltk.corpus import brown
import re
from nltk import WordNetLemmatizer, pos_tag, word_tokenize
from nltk.corpus import wordnet
import spacy
import numpy as np
import collections

In [2]:
with open('corpus.txt', 'r') as f:
    corpus = []
    for line in f: # loops over all the lines in the corpus
        line = line.strip() # strips off \n \r from the ends 
        if line: # Take only non empty lines
            line = re.sub(r'[^a-zA-Z0-9\s]', '', line) 
            line = line.lower()
            line = re.sub(' +',' ', line) # Removes consecutive spaces
            # add more pre-processing steps
            corpus.append(line)
print("\n".join(corpus[:5])) # Shows the first 5 lines of the corpus

deep learning also known as deep structured learning or hierarchical learning is part of a broader family of machine learning methods based on learning data representations as opposed to taskspecific algorithms learning can be supervised partially supervised or unsupervised
some representations are loosely based on interpretation of information processing and communication patterns in a biological nervous system such as neural coding that attempts to define a relationship between various stimuli and associated neuronal responses in the brain research attempts to create efficient systems to learn these representations from largescale unlabeled data sets
deep learning architectures such as deep neural networks deep belief networks and recurrent neural networks have been applied to fields including computer vision speech recognition natural language processing audio recognition social network filtering machine translation and bioinformatics where they produced results comparable to and in

In [3]:
# Convert treebank tags to Wordnet tag
def get_wordnet_pos(treebank_tag):

    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None # for easy if-statement

In [4]:
def ngram_generator(s, n, Stemming):
    
    if(len(s.split()) < n):
        return "ERROR, NUMBER OF GRAM EXCEED TEXT!"

    # Break sentence in the token, remove empty tokens
    token = [token for token in s.split(" ") if token != ""]
    token.insert(0,'<BOS>')
    token.insert(-1,'<EOS>')
    if(Stemming == True):
        lemmatizer = WordNetLemmatizer()
        tagged = nltk.pos_tag(token)
        token = []
        for word, tag in tagged:
            wntag = get_wordnet_pos(tag)
            if wntag is None:# not supply tag in case of None
                lemma = lemmatizer.lemmatize(word) 
                token.append(lemma)
            else:
                lemma = lemmatizer.lemmatize(word, pos = wntag) 
                token.append(lemma)
    
    # Use the zip function to help us generate n-grams
    # Concatentate the tokens into ngrams and return
    ngrams = zip(*[token[i:] for i in range(n)])
    return [' '.join(ngram) for ngram in ngrams]

In [5]:
# Constructiong vocabulary list
vocabulary_list = list(str(corpus).split())
vocabulary_list[:10]

["['deep",
 'learning',
 'also',
 'known',
 'as',
 'deep',
 'structured',
 'learning',
 'or',
 'hierarchical']

In [6]:
# Counting word frequency
word_counter = collections.Counter()
for term in str(corpus).split():
    word_counter.update({term: 1})
word_counter.most_common(10)

[('the', 19062),
 ('of', 13563),
 ('and', 9444),
 ('in', 7130),
 ('to', 6965),
 ('a', 6740),
 ('that', 3544),
 ('development', 3292),
 ('is', 3089),
 ('for', 2891)]

In [7]:
vocabulary_list.append(('UNK', 1))
Idx = range(1, len(vocabulary_list)+1)
vocab = [t[0] for t in vocabulary_list]

Word2Idx = dict(zip(vocab, Idx))
Idx2Word = dict(zip(Idx, vocab))

Word2Idx['PAD'] = 0
Idx2Word[0] = 'PAD'
VOCAB_SIZE = len(Word2Idx)
print('Word2Idx Size: {}'.format(len(Word2Idx)))
print('Idx2Word Size: {}'.format(len(Idx2Word)))

Word2Idx Size: 41
Idx2Word Size: 314168


In [8]:
# Spliting Train Validation Test
from sklearn.model_selection import train_test_split

train_validation, test = train_test_split(corpus ,test_size = 0.2,train_size = 0.8)
train, validation = train_test_split(train_validation ,test_size = 0.25,train_size =0.75)

In [9]:
tri_gram = ngram_generator(str(corpus), 3 , Stemming = False)
tri_gram[:10]

["<BOS> ['deep learning",
 "['deep learning also",
 'learning also known',
 'also known as',
 'known as deep',
 'as deep structured',
 'deep structured learning',
 'structured learning or',
 'learning or hierarchical',
 'or hierarchical learning']

In [10]:
# def get_sequence_prob(in_string, n, model):
#     in_tokens, in_lengths = preprocess_corpus(in_string)
#     in_ids = word2idseq(in_tokens, Word2Idx)
#     X, Y_, Y = prepare_data(in_ids, n)
#     preds = model.predict(X)
#     log_prob = 0.0
#     for y_i, y in enumerate(Y):
#         log_prob += np.log(preds[y_i, y])

#     log_prob = log_prob/len(Y)
#     return log_prob

# in_strings = ['hello I am science', 'blah blah blah', 'deep learning', 'answer',
#               'Boltzman', 'from the previous layer as input', 'ahcblheb eDHLHW SLcA']
# for in_string in in_strings:
#     log_prob = get_sequence_prob(in_string, 5, model)
#     print(log_prob)