In [None]:
from google.colab import drive
drive.mount("/content/drive")

## Installing Libraries

In [None]:
!pip install inltk

In [None]:
import nltk
import numpy
import math

from nltk.util import ngrams
from matplotlib import pyplot
from inltk.inltk import tokenize
from nltk.stem import PorterStemmer

nltk.download('punkt')
nltk.download('wordnet')

## Loading The Corpus

In [None]:
# Update these paths to match where the corpus files live on the mounted drive.
en_path = '/content/drive/My Drive/Data/en_wiki.txt'
hi_path = '/content/drive/My Drive/Data/hi_wiki.txt'

# Note: Only the leading 20% (by characters) of each corpus is used, catering
#       to the memory/time constraints of Google Colab Notebook.
CORPUS_FRACTION = 0.2

# Context managers close the file handles as soon as the text is read, and the
# explicit encoding keeps the decoding independent of the runtime's locale.
with open(en_path, 'r', encoding='utf-8') as corpus_file:
    en_wiki = corpus_file.read()
en_wiki = en_wiki[:round(CORPUS_FRACTION * len(en_wiki))]

with open(hi_path, 'r', encoding='utf-8') as corpus_file:
    hi_wiki = corpus_file.read()
hi_wiki = hi_wiki[:round(CORPUS_FRACTION * len(hi_wiki))]

# 1.3.1

## Sentence Segmentation

### **English**

In [None]:
#Installation for using SpaCy library
!pip install -U spacy
!pip install -U spacy-lookups-data
!python -m spacy download en_core_web_sm

In [None]:
# Using SpaCy Sentence Tokenizer
import spacy

def read_in_chunks(en_path, chunk_size=1000000):
  """Lazily yield successive chunks of a text file.

  Args:
    en_path: Path of the text file to read.
    chunk_size: Maximum number of characters per chunk (default 1,000,000).

  Yields:
    Non-empty strings of at most `chunk_size` characters, in file order.
  """
  # The context manager closes the file when the generator is exhausted or
  # discarded early (e.g. when the consuming loop breaks out).
  with open(en_path, 'r', encoding='utf-8') as fo:
    while True:
      data = fo.read(chunk_size)
      if not data:
        break
      yield data

# Small English spaCy pipeline; its sentence boundaries are exposed through `Doc.sents`.
nlp = spacy.load('en_core_web_sm')

# Only as many chunks are processed as fit inside the 20% slice held in
# `en_wiki`: the loop stops once one more full-size chunk would overshoot it.
total_length = len(en_wiki)
length = 0
en_sent = []
for data in read_in_chunks(en_path):
    next_chunk_overshoots = length + 1000000 > total_length
    if next_chunk_overshoots:
        break
    length += len(data)
    doc = nlp(data)
    en_sent.extend(span.text for span in doc.sents)

print(len(en_sent))
print(en_sent[:5])

180770
['The word "atom" was coined by ancient Greek philosophers.', 'However, these ideas were founded in philosophical and theological reasoning rather than evidence and experimentation.', 'As a result, their views on what atoms look like and how they behave were incorrect.', 'They also could not convince everybody, so atomism was but one of a number of competing theories on the nature of matter.', 'It was not until the 19th century that the idea was embraced and refined by scientists, when the blossoming science of chemistry produced discoveries that only the concept of atoms could explain.\n\n']


In [None]:
# Using NLTK Sentence Tokenizer
en_sent = nltk.sent_tokenize(en_wiki)

print(len(en_sent))
print(en_sent[0 : 5])

176321
['The word "atom" was coined by ancient Greek philosophers.', 'However, these ideas were founded in philosophical and theological reasoning rather than evidence and experimentation.', 'As a result, their views on what atoms look like and how they behave were incorrect.', 'They also could not convince everybody, so atomism was but one of a number of competing theories on the nature of matter.', 'It was not until the 19th century that the idea was embraced and refined by scientists, when the blossoming science of chemistry produced discoveries that only the concept of atoms could explain.']


### **Hindi**

In [None]:
!pip install stanza
import stanza
stanza.download('hi')

In [None]:
# USING Stanza LIBRARY
sent_tokenizer = stanza.Pipeline(lang='hi', processors='tokenize')

hi_sent = sent_tokenizer(hi_wiki).sentences
hi_sent = [sentence.text for sentence in hi_sent]

print(len(hi_sent))
print(hi_sent[0 : 5])

2020-10-01 10:13:35 INFO: Loading these models for language: hi (Hindi):
| Processor | Package |
-----------------------
| tokenize  | hdtb    |

2020-10-01 10:13:35 INFO: Use device: cpu
2020-10-01 10:13:35 INFO: Loading: tokenize
2020-10-01 10:13:35 INFO: Done loading processors!


72888
['मास्टर ऑफ़ हेल्थ एडमिनिस्ट्रेशन या मास्टर ऑफ हेल्थकेयर एडमिनिस्ट्रेशन (एमएचए या एम.एच.ए) स्नातकोत्तर (पोस्ट ग्रेजुएशन) की एक पेशेवर डिग्री है जो स्वास्थ्य प्रशासन के क्षेत्र में दी जाती हैं।', 'यह उन छात्रों को प्रदान की जाती हैं जिन्होंने स्वास्थ्य प्रशासन, अस्पताल प्रबंधन एवं अन्य स्वास्थ्य सेवा संगठनों के क्षेत्र में जरूरी ज्ञान और दक्षता हासिल की हैं।', 'इन पाठ्यक्रमो में परिस्थितियों के अनुसार इनके सरंचना में अंतर हो सकता हैं', 'हालांकि व्यवसायी-शिक्षक मॉडल कार्यक्रम आमतौर पर चिकित्सा, स्वास्थ्य व्यवसायों या संबद्ध स्वास्थ्य के कॉलेजों में पाए जाते हैं, कक्षा-आधारित कार्यक्रम व्यवसाय या सार्वजनिक स्वास्थ्य के कॉलेजों में होते हैं।', 'इस पाठ्यक्रम के अध्ययन के दौरान आम तौर पर विद्यार्थियों को जनसंख्या स्वास्थ्य, स्वास्थ्य देखभाल अर्थशास्त्र, स्वास्थ्य नीति, संगठनात्मक व्यवहार, स्वास्थ्य से जुड़े संगठनों के प्रबंधन, स्वास्थ्य विपणन और संचार, मानव संसाधन प्रबंधन, सूचना प्रणाली प्रबंधन के अध्ययन एवं अन्य क्षेत्रों में व्यावहारिक अनुभव की भी आवश्यकता होती है।']


In [None]:
!pip install indic-nlp-library

from indicnlp.tokenize import sentence_tokenize

In [None]:
#USING IndicNLP LIBRARY
hi_sent = sentence_tokenize.sentence_split(hi_wiki, lang='hi')

print(len(hi_sent))
print(hi_sent[0 : 5])

68862
['मास्टर ऑफ़ हेल्थ एडमिनिस्ट्रेशन या मास्टर ऑफ हेल्थकेयर एडमिनिस्ट्रेशन (एमएचए या एम. एच. ए) स्नातकोत्तर (पोस्ट ग्रेजुएशन) की एक पेशेवर डिग्री है जो स्वास्थ्य प्रशासन के क्षेत्र में दी जाती हैं।', 'यह उन छात्रों को प्रदान की जाती हैं जिन्होंने स्वास्थ्य प्रशासन, अस्पताल प्रबंधन एवं अन्य स्वास्थ्य सेवा संगठनों के क्षेत्र में जरूरी ज्ञान और दक्षता हासिल की हैं।', 'इन पाठ्यक्रमो में परिस्थितियों के अनुसार इनके सरंचना में अंतर हो सकता हैं हालांकि व्यवसायी-शिक्षक मॉडल कार्यक्रम आमतौर पर चिकित्सा, स्वास्थ्य व्यवसायों या संबद्ध स्वास्थ्य के कॉलेजों में पाए जाते हैं, कक्षा-आधारित कार्यक्रम व्यवसाय या सार्वजनिक स्वास्थ्य के कॉलेजों में होते हैं।', 'इस पाठ्यक्रम के अध्ययन के दौरान आम तौर पर विद्यार्थियों को जनसंख्या स्वास्थ्य, स्वास्थ्य देखभाल अर्थशास्त्र, स्वास्थ्य नीति, संगठनात्मक व्यवहार, स्वास्थ्य से जुड़े संगठनों के प्रबंधन, स्वास्थ्य विपणन और संचार, मानव संसाधन प्रबंधन, सूचना प्रणाली प्रबंधन के अध्ययन एवं अन्य क्षेत्रों में व्यावहारिक अनुभव की भी आवश्यकता होती है।', 'यह डिग्री प्रोग्

## Word Tokenization

### **English**

In [None]:
# USING NLTK LIBRARY - default method
en_words = nltk.word_tokenize(en_wiki)

print(len(en_words))
print(en_words[0 : 10])

4086209
['The', 'word', '``', 'atom', "''", 'was', 'coined', 'by', 'ancient', 'Greek']


In [None]:
# USING NLTK LIBRARY - Treebank method
from nltk.tokenize import TreebankWordTokenizer

word_tokenizer = TreebankWordTokenizer()
en_words = word_tokenizer.tokenize(en_wiki)

print(len(en_words))
print(en_words[0 : 10])

3927506
['The', 'word', '``', 'atom', "''", 'was', 'coined', 'by', 'ancient', 'Greek']


In [None]:
# USING NLTK LIBRARY - Word Punctuation method
from nltk.tokenize import WordPunctTokenizer

word_tokenizer = WordPunctTokenizer()
en_words = word_tokenizer.tokenize(en_wiki)

print(len(en_words))
print(en_words[0 : 10])

4458087
['The', 'word', '"', 'atom', '"', 'was', 'coined', 'by', 'ancient', 'Greek']


### **Hindi**

In [None]:
# USING NLTK LIBRARY
hi_words = nltk.word_tokenize(hi_wiki)

print(len(hi_words))
print(hi_words[0 : 10])

1633824
['मास्टर', 'ऑफ़', 'हेल्थ', 'एडमिनिस्ट्रेशन', 'या', 'मास्टर', 'ऑफ', 'हेल्थकेयर', 'एडमिनिस्ट्रेशन', '(']


In [None]:
# USING Stanza LIBRARY
# Tokenize-only Stanza pipeline with sentence splitting switched off
# (tokenize_no_ssplit=True); only the tokens are of interest here.
hi_word_tokenizer = stanza.Pipeline(lang='hi', processors='tokenize', tokenize_no_ssplit=True)

hi_sentences = hi_word_tokenizer(hi_wiki).sentences
# Flatten the per-sentence token lists into one list of token strings.
hi_words = [token.text for sentence in hi_sentences for token in sentence.tokens]

print(len(hi_words))
print(hi_words[:10])

2020-10-01 10:19:56 INFO: Loading these models for language: hi (Hindi):
| Processor | Package |
-----------------------
| tokenize  | hdtb    |

2020-10-01 10:19:56 INFO: Use device: cpu
2020-10-01 10:19:56 INFO: Loading: tokenize
2020-10-01 10:19:56 INFO: Done loading processors!


1681604
['मास्टर', 'ऑफ़', 'हेल्थ', 'एडमिनिस्ट्रेशन', 'या', 'मास्टर', 'ऑफ', 'हेल्थकेयर', 'एडमिनिस्ट्रेशन', '(']


In [None]:
from indicnlp.tokenize import indic_tokenize

In [None]:
#USING IndicNLP LIBRARY
hi_words = indic_tokenize.trivial_tokenize(hi_wiki)

print(len(hi_words))
print(hi_words[0 : 10])

1729146
['मास्टर', 'ऑफ़', 'हेल्थ', 'एडमिनिस्ट्रेशन', 'या', 'मास्टर', 'ऑफ', 'हेल्थकेयर', 'एडमिनिस्ट्रेशन', '(']


## Plotting N-Grams

In [None]:
def freq_plot(freq_dist, n, lang):
    """Save a bar chart of the most frequent n-grams ordered by rank.

    Args:
        freq_dist: Frequency distribution exposing `most_common`
            (e.g. an `nltk.FreqDist`).
        n: Order of the n-grams. 1, 2 and 3 are labelled "unigrams",
            "bigrams" and "trigrams"; any other value is labelled "<n>-grams".
        lang: Language tag used as the prefix of the output file name.

    The figure is passed to `Figure.savefig` under the name
    "<lang>_<label>" and closed afterwards; nothing is returned.
    """
    # At most the 100 top-ranked samples are plotted; the x positions follow
    # the number actually available so small distributions plot correctly.
    frequencies = [count for _, count in freq_dist.most_common(100)]
    pos = numpy.arange(len(frequencies))

    named_orders = {1: "unigrams", 2: "bigrams", 3: "trigrams"}
    ngram_label = named_orders.get(n, "{}-grams".format(n))

    axes = pyplot.axes()
    axes.set_xlabel("Rank")
    axes.set_ylabel("Frequency")
    axes.set_title("Frequency plot of {} by rank".format(ngram_label))
    axes.grid(True)

    # `default=0` keeps an empty distribution from raising in max().
    pyplot.ylim(0, max(frequencies, default=0) + 5)
    pyplot.bar(pos, frequencies, 1.0, color='grey', edgecolor='black')

    figure = pyplot.gcf()
    figure.savefig(lang + '_' + ngram_label, dpi=figure.dpi)
    pyplot.close()

In [None]:
# Frequency distributions for both languages. The Hindi distributions are
# built here as well because later cells (collocations, morphological
# analysis, sub-word tokenization) read hi_unigram_dist / hi_bigram_dist.
# FreqDist consumes the n-gram iterators directly, so the n-gram lists are
# never materialised in memory.
en_unigram_dist = nltk.FreqDist(en_words)
en_bigram_dist  = nltk.FreqDist(ngrams(en_words, 2))
en_trigram_dist = nltk.FreqDist(ngrams(en_words, 3))

hi_unigram_dist = nltk.FreqDist(hi_words)
hi_bigram_dist  = nltk.FreqDist(ngrams(hi_words, 2))
hi_trigram_dist = nltk.FreqDist(ngrams(hi_words, 3))

ngram_dists = (
    ("en", (en_unigram_dist, en_bigram_dist, en_trigram_dist)),
    ("hi", (hi_unigram_dist, hi_bigram_dist, hi_trigram_dist)),
)
for lang, dists in ngram_dists:
    for order, dist in enumerate(dists, start=1):
        freq_plot(dist, order, lang)

In [None]:
print(len(en_unigram_dist))
print(en_unigram_dist.most_common(10))

91903
[('.', 291371), (',', 231646), ('the', 207921), ('of', 164211), ('and', 108203), ('%', 85282), ('was', 68873), ('in', 65428), ('a', 64016), ('to', 59192)]


# 1.3.2

## Stemming Tokens

### English

In [None]:
ps = PorterStemmer()
en_stemmed_words = [ps.stem(word) for word in en_words]

### Hindi

In [None]:
def StemHindi(word):
    """Strip the longest matching inflectional suffix from a Hindi word.

    Suffixes are grouped by their length in code points and tried from the
    longest group (5) down to the shortest (1). A suffix is only removed
    when at least one character of the word remains afterwards.

    Args:
        word: The token to stem.

    Returns:
        The word without its longest matching suffix, or the word unchanged
        when no suffix applies.
    """
    suffixes = {
      1: [u"ो",u"े",u"ू",u"ु",u"ी",u"ि",u"ा"],
      2: [u"कर",u"ाओ",u"िए",u"ाई",u"ाए",u"ने",u"नी",u"ना",u"ते",u"ीं",u"ती",u"ता",u"ाँ",u"ां",u"ों",u"ें"],
      3: [u"ाकर",u"ाइए",u"ाईं",u"ाया",u"ेगी",u"ेगा",u"ोगी",u"ोगे",u"ाने",u"ाना",u"ाते",u"ाती",u"ाता",u"तीं",u"ाओं",u"ाएं",u"ुओं",u"ुएं",u"ुआं"],
      4: [u"ाएगी",u"ाएगा",u"ाओगी",u"ाओगे",u"एंगी",u"ेंगी",u"एंगे",u"ेंगे",u"ूंगी",u"ूंगा",u"ातीं",u"नाओं",u"नाएं",u"ताओं",u"ताएं",u"ियाँ",u"ियों",u"ियां"],
      5: [u"ाएंगी",u"ाएंगे",u"ाऊंगी",u"ाऊंगा",u"ाइयाँ",u"ाइयों",u"ाइयां"],
    }

    # The range must run down to 1 inclusive: stopping at 2 would leave the
    # single-character suffix group unused.
    for suffix_len in range(5, 0, -1):
        if len(word) > suffix_len and word.endswith(tuple(suffixes[suffix_len])):
            return word[:-suffix_len]
    return word

In [None]:
hi_stemmed_words = [StemHindi(word) for word in hi_words]

## Coverage Analysis

In [None]:
def find_coverage(words, n, coverage):
    """Count the most frequent n-grams needed to reach a coverage level.

    Args:
        words: Sequence of tokens.
        n: Order of the n-grams to build from `words`.
        coverage: Fraction (0..1) of all n-gram occurrences to cover.

    Returns:
        The smallest number of distinct n-grams, taken in decreasing order of
        frequency, whose occurrences sum to at least `coverage` times the
        total number of n-gram occurrences. Returns 0 when there are no
        n-grams or `coverage` is not positive, and the number of distinct
        n-grams when the target cannot be reached.
    """
    # FreqDist consumes the n-gram iterator directly; no intermediate list.
    freq_dist = nltk.FreqDist(ngrams(words, n))
    counts = numpy.sort(numpy.fromiter(freq_dist.values(), dtype=numpy.int64))[::-1]

    target = freq_dist.N() * coverage
    if counts.size == 0 or target <= 0:
        return 0

    # Index of the first cumulative sum that reaches the target. The number
    # of n-grams used is that index plus one.
    first_hit = numpy.searchsorted(counts.cumsum(), target, side='left')
    return int(min(first_hit + 1, counts.size))

In [None]:
# This can be repeated for Hindi words
# Coverage targets per n-gram order: 90% of unigrams, 80% of bigrams, 70% of
# trigrams. Each pair is (stemmed tokens, unstemmed tokens), in that order.
num_stem1, num_unstem1 = (
    find_coverage(tokens, 1, 0.9) for tokens in (en_stemmed_words, en_words)
)

num_stem2, num_unstem2 = (
    find_coverage(tokens, 2, 0.8) for tokens in (en_stemmed_words, en_words)
)

num_stem3, num_unstem3 = (
    find_coverage(tokens, 3, 0.7) for tokens in (en_stemmed_words, en_words)
)

# 1.3.3

## Applying Heuristics

In [None]:
def en_regex_tokenizer(text):
    """Tokenize English text with a heuristic regex pass followed by NLTK.

    The text is first split into fragments by an `nltk` RegexpTokenizer built
    from the verbose pattern below; every fragment is then passed through
    `nltk.word_tokenize` and the resulting tokens are concatenated in order.

    Args:
        text: The raw text to tokenize.

    Returns:
        A flat list of token strings.
    """
    pattern = r'''(?x)         
          (?:[A-Z]\.)+        # abbreviations, e.g. U.N.O.
        | (?:\s\w\w\.)+       # titles, e.g. Dr., Ms.
        | \w+(?:-\w+)*        # words with hyphens, e.g. zig-zag
        | \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $5, 25%
        | \.\.\.              # ellipsis, e.g. a,b,c...
        | [][.,;"'?():_`-]    # separate tokens, e.g. ], [
    '''

    regex_tokenizer = nltk.tokenize.RegexpTokenizer(pattern)
    return [
        token
        for fragment in regex_tokenizer.tokenize(text)
        for token in nltk.word_tokenize(fragment)
    ]

In [None]:
new_en_words = en_regex_tokenizer(en_wiki)
new_en_words = [word.lower() for word in new_en_words]

new_en_stemmed_words = [ps.stem(word) for word in new_en_words]

In [None]:
# Same coverage targets as before (90% / 80% / 70% for uni-/bi-/trigrams),
# now on the heuristically tokenized, lower-cased words. Each pair is
# (stemmed tokens, unstemmed tokens), in that order.
num_stem1, num_unstem1 = (
    find_coverage(tokens, 1, 0.9) for tokens in (new_en_stemmed_words, new_en_words)
)

num_stem2, num_unstem2 = (
    find_coverage(tokens, 2, 0.8) for tokens in (new_en_stemmed_words, new_en_words)
)

num_stem3, num_unstem3 = (
    find_coverage(tokens, 3, 0.7) for tokens in (new_en_stemmed_words, new_en_words)
)

## Likelihood Ratio Testing

In [None]:
from math import log10

def get_value(k, n, x):
  """Log-likelihood of k successes in n Bernoulli trials with probability x.

  The binomial coefficient is left out because it cancels in the likelihood
  ratio. The natural logarithm is used so that -2*log(lambda) is on the scale
  of the chi-square distribution. Requires 0 < x < 1.
  """
  return (math.log(x)*k + math.log(1-x)*(n-k))

def constr_collocations(bigram_dist, unigram_dist, n, threshold=7.88):
    """Select bigrams that pass a likelihood-ratio test for collocations.

    For a bigram (w1, w2) the hypothesis "w2 is independent of w1" is
    compared with "w2 depends on w1" through -2*log(lambda).

    Args:
        bigram_dist: Mapping from bigram tuples to their counts.
        unigram_dist: Mapping from words to their counts.
        n: Total number of tokens in the corpus.
        threshold: Minimum value of -2*log(lambda) for a bigram to be kept.
            The default 7.88 is the chi-square critical value for one degree
            of freedom at significance level 0.005.

    Returns:
        List of the bigram tuples whose statistic is at least `threshold`, in
        the iteration order of `bigram_dist`. Bigrams for which any of the
        estimated probabilities is not strictly between 0 and 1 are skipped,
        since the log-likelihood is undefined there.
    """
    collocation = []
    for bigram, c12 in bigram_dist.items():
        c1 = unigram_dist[bigram[0]]
        c2 = unigram_dist[bigram[1]]

        # Degenerate counts would divide by zero in the estimates below.
        if n == 0 or c1 == 0 or n == c1:
            continue

        p = c2/n
        p1 = c12/c1
        p2 = (c2-c12)/(n-c1)

        if not (0 < p < 1 and 0 < p1 < 1 and 0 < p2 < 1):
            continue

        log_lambda = (get_value(c12, c1, p) + get_value(c2-c12, n-c1, p)
                      - get_value(c12, c1, p1) - get_value(c2-c12, n-c1, p2))

        if -2 * log_lambda >= threshold:
            # Append the tuple itself; `+=` would splice its words into the
            # list as separate entries.
            collocation.append(bigram)

    return collocation

In [None]:
from math import log10


def getVal(k, n, x):
  """Log-likelihood of k successes in n Bernoulli trials with probability x.

  The binomial coefficient is left out because it cancels in the likelihood
  ratio. The natural logarithm is used so that -2*log(lambda) is on the scale
  of the chi-square distribution. Requires 0 < x < 1.
  """
  return math.log(x) * k + math.log(1-x) * (n-k)

def construct_collocations(bigram_dist, unigram_dist, number_tokens, threshold=7.88):
    """Select bigrams that pass a likelihood-ratio test for collocations.

    Args:
        bigram_dist: Mapping from bigram tuples to their counts.
        unigram_dist: Mapping from words to their counts.
        number_tokens: Total number of tokens in the corpus.
        threshold: Minimum value of -2*log(lambda) for a bigram to be kept.
            The default 7.88 is the chi-square critical value for one degree
            of freedom at significance level 0.005.

    Returns:
        List of the bigram tuples whose statistic is at least `threshold`, in
        the iteration order of `bigram_dist`. Bigrams for which any of the
        estimated probabilities is not strictly between 0 and 1 are skipped,
        since the log-likelihood is undefined there.
    """
    n = number_tokens
    collocation = []
    for bigram, c12 in bigram_dist.items():
        c1 = unigram_dist[bigram[0]]
        c2 = unigram_dist[bigram[1]]

        # Degenerate counts would divide by zero in the estimates below.
        if n == 0 or c1 == 0 or n == c1:
            continue

        p = c2/n
        p1 = c12/c1
        p2 = (c2 - c12)/(n-c1)

        if not (0 < p < 1 and 0 < p1 < 1 and 0 < p2 < 1):
            continue

        log_lambda = (getVal(c12, c1, p) + getVal(c2 - c12, n-c1, p)
                      - getVal(c12, c1, p1) - getVal(c2 - c12, n-c1, p2))

        if -2 * log_lambda >= threshold:
            collocation.append(bigram)

    return collocation

In [None]:
en_collocations = constr_collocations(en_bigram_dist, en_unigram_dist,len(en_words))
hi_collocations = constr_collocations(hi_bigram_dist, hi_unigram_dist,len(hi_words))

# 1.3.4

## Morphological Analysis

In [None]:
import random

def random_freq_unigrams(freq_dist, count, set_count):
  """Draw `count` entries, with replacement, from the `set_count` most common.

  Each entry is whatever `freq_dist.most_common` yields, i.e. a
  (sample, frequency) pair.
  """
  top_entries = freq_dist.most_common(set_count)
  return random.choices(population=top_entries, k=count)

def random_least_freq_unigrams(freq_dist, count, set_count):
  """Draw `count` entries, with replacement, from the `set_count` least common.

  The candidates are the last `set_count` items of the full
  `freq_dist.most_common()` ranking, i.e. (sample, frequency) pairs.
  """
  ranked_entries = freq_dist.most_common()
  rarest_entries = ranked_entries[-set_count:]
  return random.choices(population=rarest_entries, k=count)

### English

In [None]:
!sudo apt-get install python-numpy libicu-dev
!pip install PyICU polyglot pycld2 Morfessor

In [None]:
from polyglot.downloader import downloader
!polyglot download morph2.en
from polyglot.text import Word

In [None]:
def morph_analysis(unigrams):
  """Print the polyglot morpheme segmentation of each English unigram.

  Args:
    unigrams: Iterable of (word, frequency) pairs; only the word is used.
  """
  for token in [entry for entry, _ in unigrams]:
    analysed = Word(token, language='en')
    print("{} -> {}".format(analysed, analysed.morphemes))

In [None]:
morph_analysis(random_freq_unigrams(en_unigram_dist, 5, 1000))

contains -> ['contain', 's']
each -> ['e', 'ach']
various -> ['vari', 'ous']
as -> ['a', 's']
being -> ['be', 'ing']


In [None]:
morph_analysis(random_least_freq_unigrams(en_unigram_dist, 5, 1000))

unscientific -> ['un', 'scientific']
Mabhouh -> ['Ma', 'b', 'ho', 'u', 'h']
Artery -> ['Arte', 'ry']
incurring -> ['incur', 'ring']
Qumsiyeh -> ['Qu', 'm', 's', 'i', 'y', 'eh']


### Hindi

In [None]:
!git clone https://github.com/anoopkunchukuttan/indic_nlp_resources.git
!pip install indic-nlp-library

In [None]:
from indicnlp.morph import unsupervised_morph 
from indicnlp import common
common.INDIC_RESOURCES_PATH="/content/indic_nlp_resources"
analyzer = unsupervised_morph.UnsupervisedMorphAnalyzer('hi')

In [None]:
def morph_analysis(unigrams):
  """Print the IndicNLP morpheme segmentation of each Hindi unigram.

  NOTE: this redefines `morph_analysis`, replacing the English (polyglot)
  version defined earlier in the notebook.

  Args:
    unigrams: Iterable of (word, frequency) pairs; only the word is used.
  """
  for token in [entry for entry, _ in unigrams]:
    print("{} -> {}".format(token, analyzer.morph_analyze(token)))


In [None]:
morph_analysis(random_freq_unigrams(hi_unigram_dist, 5, 1000))

निर्धारित -> ['निर्धारित']
करना -> ['कर', 'ना']
सैन्य -> ['सैन्य']
प्रसिद्ध -> ['प्रसिद्ध']
राष्ट्रपति -> ['राष्ट्रपति']


In [None]:
morph_analysis(random_least_freq_unigrams(hi_unigram_dist, 5, 1000))

उपास्यदेव -> ['उपास्य', 'देव']
'खैरागढ़ -> ["'खैरागढ़"]
ख्+ -> ['ख्+']
फ़िल्मी-हस्तियां -> ['फ़िल्मी-हस्तियां']
रैंडर -> ['रैंड', 'र']


# 1.3.5

## Sub-Word Tokenization

### Training on Corpus

In [None]:
import re
from collections import defaultdict

def build_vocab(freq_dist):
  """Build the initial sub-word vocabulary from word frequencies.

  Every word is rewritten as its characters separated by single spaces and
  terminated with the end-of-word marker '$'; the word's frequency is kept.

  Args:
    freq_dist: Mapping from words to frequencies.

  Returns:
    An `nltk.FreqDist` keyed by the space-separated spellings.
  """
  vocab = nltk.FreqDist([])
  for token, count in freq_dist.items():
    spelled_out = ''.join(char + ' ' for char in token)
    vocab[spelled_out + '$'] = count
  return vocab

def get_stats(vocab):
    """Count adjacent symbol pairs across the vocabulary.

    Args:
        vocab: Mapping from space-separated symbol sequences to frequencies.

    Returns:
        A `defaultdict(int)` mapping each (left, right) pair of neighbouring
        symbols to the summed frequency of the entries it occurs in.
    """
    pair_counts = defaultdict(int)
    for entry, weight in vocab.items():
        parts = entry.split()
        for left, right in zip(parts, parts[1:]):
            pair_counts[left, right] += weight

    return pair_counts

In [None]:
def merge_vocab(pair, vocab):
    """Merge every occurrence of a symbol pair in the vocabulary.

    Args:
        pair: Two symbols (left, right) to be fused into one symbol.
        vocab: Mapping from space-separated symbol sequences to frequencies.

    Returns:
        A new `nltk.FreqDist` in which each whole-symbol occurrence of
        "left right" is replaced by "leftright"; frequencies are unchanged.
    """
    new_vocab =  nltk.FreqDist([])
    merged = ''.join(pair)
    bigram = re.escape(' '.join(pair))
    # The look-arounds make the match start and end on symbol boundaries.
    pattern = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')

    for word, freq in vocab.items():
        # A callable replacement inserts `merged` verbatim. A plain string
        # would be parsed as a template, so symbols containing backslashes
        # would be altered or rejected.
        w_out = pattern.sub(lambda _match: merged, word)
        new_vocab[w_out] = freq

    return new_vocab

In [None]:
def train_corpus(freq_dist, num_merges=500):
  """Learn sub-word merge operations from word frequencies.

  Starting from the character-level vocabulary, the most frequent adjacent
  symbol pair is merged repeatedly.

  Args:
    freq_dist: Mapping from words to frequencies.
    num_merges: Maximum number of merge operations to learn (default 500).

  Returns:
    A tuple (vocab, fin_pairs): the vocabulary after all merges and the list
    of merged pairs in the order they were learned.
  """
  vocab = build_vocab(freq_dist)
  fin_pairs = []

  for _ in range(num_merges):
    pairs = get_stats(vocab)

    # Nothing left to merge: every entry is a single symbol.
    if not pairs:
        break

    freq_pair = max(pairs, key=pairs.get)
    fin_pairs.append(freq_pair)
    vocab = merge_vocab(freq_pair, vocab)

  return (vocab, fin_pairs)

### Testing on New Data

In [None]:
def merge_vocab_tokenize(pair, vocab):
    """Apply one merge operation to a list of symbol sequences.

    Args:
        pair: Two symbols (left, right) to be fused into one symbol.
        vocab: Iterable of space-separated symbol sequences.

    Returns:
        A list with each whole-symbol occurrence of "left right" replaced by
        "leftright", in the same order as the input.
    """
    merged = ''.join(pair)
    bigram = re.escape(' '.join(pair))
    # The look-arounds make the match start and end on symbol boundaries.
    pattern = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')

    # A callable replacement inserts `merged` verbatim. A plain string would
    # be parsed as a template, so symbols containing backslashes would be
    # altered or rejected.
    return [pattern.sub(lambda _match: merged, word) for word in vocab]

def tokenize_corpus(corpus, fin_pairs):
    """Segment new text into sub-words using learned merge operations.

    Args:
        corpus: Text whose whitespace-separated words are to be segmented.
        fin_pairs: Merge pairs in the order they should be applied.

    Returns:
        One string per word: its symbols separated by spaces, ending with
        the end-of-word marker '$'.
    """
    segmented = [" ".join(token) + " $" for token in corpus.split()]
    for merge in fin_pairs:
      segmented = merge_vocab_tokenize(merge, segmented)
    return segmented

### English

In [None]:
vocab, fin_pairs = train_corpus(en_unigram_dist)
print(vocab.most_common(20))
print(vocab.most_common()[-20:])

[('.$', 291371), (',$', 231646), ('the$', 207921), ('of$', 164211), ('and$', 108203), ('%$', 85282), ('was$', 68873), ('in$', 65428), ('a$', 64016), ('to$', 59192), ('were$', 54127), ('The$', 52467), ('"$', 35371), ('is$', 34277), ('age$', 30634), ('for$', 29821), ('from$', 28787), ('-$', 27435), ('($', 26440), ('with$', 26127)]
[('S z u c k o$', 1), ('C Q T s$', 1), ('S um mar iz ing$', 1), ('pre t ens e$', 1), ('P s y ch ol og ist s$', 1), ('P ol y g ra ph s$', 1), ('P T S D $', 1), ('h y p og l y c em ia$', 1), ('di sh on est y$', 1), ('un sc i ent if ic$', 1), ('vi ol at or s$', 1), ('V al id ity$', 1), ('M o y n i h an$', 1), ('S ec rec y$', 1), ('. . " .$', 1), ('A s k ed$', 1), ('" ` $', 1), ('pr ic k ing$', 1), ('B é land$', 1), ('su b j ec $', 1)]


In [None]:
new_corpus = 'friendly president makes statement capture meaning behind different emotions present'
sub_word_tokens = tokenize_corpus(new_corpus, fin_pairs)
print(sub_word_tokens)

['fr i en d ly$', 'pre si d ent$', 'ma k es$', 'st at em ent$', 'cap tur e$', 'me an ing$', 'be h in d$', 'di ff er ent$', 'em ot ions$', 'present$']


In [None]:
new_corpus_words = new_corpus.split()
for word in new_corpus_words:
  word = Word(word, language="en")
  sub_word_tokens = ' + '.join(word.morphemes)
  print("{} -> {}".format(word, sub_word_tokens))

friendly -> friend + ly
president -> president
makes -> make + s
statement -> state + ment
capture -> capture
meaning -> mean + ing
behind -> be + hind
different -> different
emotions -> e + motion + s
present -> present


### Hindi

In [None]:
vocab, fin_pairs = train_corpus(hi_unigram_dist)
print(vocab.most_common(20))
print(vocab.most_common()[-20:])

[('के$', 71492), ('में$', 54162), (',$', 53807), ('की$', 33545), ('और$', 31227), ('से$', 26883), ('का$', 24191), ('को$', 23815), ('है।$', 23771), ('है$', 21197), ('एक$', 15831), (')$', 13725), ('($', 13671), ('पर$', 12717), ('ने$', 11363), ('लिए$', 10580), ('भी$', 9482), ('हैं।$', 9384), ('किया$', 8923), ("''$", 8847)]
[('ए ड ो आ र् ड$', 1), ('से गु इन$', 1), ('9 - वर् ष ीय$', 1), ('है । ज न् म जा त$', 1), ('क्र ी ज$', 1), ('। इन$', 1), ('वा यु मार् गों$', 1), ('ए प न िया$', 1), ('पै ट र् न । अ ट ला ं ट ै क् स िय ल$', 1), ('है ं - पु रु ष ों$', 1), ('5 0 - 6 9$', 1), ('20 - 3 5$', 1), ('1 0 - 3 0$', 1), ('है । 1 0$', 1), ('स्ट टर$', 1), ('है ं । वे$', 1), ('स् पै स् म$', 1), ('5 0 - 7 0$', 1), ('स् ट्र ै बि स् म स$', 1), ('के रा ट ो को न स$', 1)]


In [None]:
new_corpus = 'वर्तमान भविष्य विभाजित अद्भुत रचना खिलौने किताब विभिन्न भावनाएँ सिखाने'
sub_word_tokens = tokenize_corpus(new_corpus, fin_pairs)
print(sub_word_tokens)

['वर्त मान$', 'भ वि ष ्य$', 'वि भा ज ित$', 'अ द् भ ु त$', 'र च ना$', 'खि ल ौ ने$', 'कि ता ब$', 'वि भि न्न$', 'भा व ना ए ँ$', 'सि खा ने$']


In [None]:
new_corpus_words = new_corpus.split()
for word in new_corpus_words:
  sub_word_tokens = ' + '.join(analyzer.morph_analyze(word))
  print("{} -> {}".format(word, sub_word_tokens))

वर्तमान -> वर्तमान
भविष्य -> भविष्य
विभाजित -> विभाजित
अद्भुत -> अद्भुत
रचना -> रचना
खिलौने -> खिलौने
किताब -> किताब
विभिन्न -> विभिन्न
भावनाएँ -> भावना + एँ
सिखाने -> सिखा + ने
