In [1]:
"""
@author: Eric Tsai <eric492718@gmail.com>
@brief: utils for ngram

"""

'\n@author: Eric Tsai <eric492718@gmail.com>\n@brief: utils for ngram\n\n'

## Table of Contents
* [1. N-gram model](#1.-N-gram-model)
    * [1.1 unigrams](#1.1-unigrams)
    * [1.2 bigrams](#1.2-bigrams)
    * [1.3 trigrams](#1.3-trigrams)
    * [1.4 fourgrams](#1.4-fourgrams)
* [2. N-terms](#2.-N-terms)
    * [2.1 uniterms](#2.1-uniterms) 
    * [2.2 biterms](#2.2-biterms)
    * [2.3 triterms](#2.3-triterms)    
    * [2.4 fourterms](#2.4-fourterms)       
* [3. Dictionary for N-grams and N-terms method](#3.-Dictionary-for-N-grams-and-N-iterms-method)
* [4. N-grams function](#4.-N-grams-functon)
* [5. N-term function](#5.-N-term-function)
* [6. Test Process](#6.-Test-Process)

## 1. N-gram model

**N-gram description:**<br>
**Objective:**<br>
Extract potentially meaningful compound words from a sentence.
>**words**<br> 
['a', 'b', 'c', 'd', 'e', 'f', 'g']<br>
**function**<br> 
\_trigrams(words,&emsp;join_string,&emsp;skip)<br>
**return**<br>
['a_b_c', 'a_b_d', 'a_b_e', 'a_c_d', 'a_c_e', 'a_c_f', 'a_d_e', 'a_d_f', 'a_d_g']

**\_trigrams function:** In this example it is called with `join_string='_'` and `skip=2`. <br>
It extracts candidate features, i.e. compound words built from the words of the sentence. For brevity, the example lists only the compound words whose first element is `a`.<br>
<br>
### **Algorithm:**
#### **Structure**
**first stage&ensp;&ensp;&ensp;&ensp;&ensp;second stage&ensp;&ensp;&ensp;&ensp;&ensp;&ensp;&ensp;&ensp;&ensp;&ensp;&ensp;third stage**
_____________________________________________________________________________________________________________________
                               c (skip=0) => a_b_c
                             ↗ 
                b           |→ d (skip=1) => a_b_d
             ↗               ↘   
             | (skip=0)        e (skip=2) => a_b_e
             |                 
             |    
             |                 d (skip=0) => a_c_d
             |               ↗ 
    a        |→ c           |→ e (skip=1) => a_c_e
             | (skip=1)      ↘ 
             |                 f (skip=2) => a_c_f
             |    
             |    
             |                 e (skip=0) => a_d_e
             ↘               ↗ 
                d           |→ f (skip=1) => a_d_f
               (skip=2)      ↘
                               g (skip=2) => a_d_g

#### **Code**
```python
def _trigrams(words, join_string='_', skip=2):               
    assert type(words) == list                           
    L = len(words)
    if L > 2:
        lst = []
        for i in range(L-2): # index of the first-stage word
            for k1 in range(1,skip+2): # offset from the first-stage word to the second-stage word (at most skip+1)
                for k2 in range(1,skip+2): # offset from the second-stage word to the third-stage word (at most skip+1)
                    if i+k1 < L and i+k1+k2 < L:
                        lst.append( join_string.join([words[i], words[i+k1], words[i+k1+k2]]) )
    else:
        # set it as bigram
        lst = _bigrams(words, join_string, skip)
    return lst
```

### 1.1 unigrams

In [2]:
def _unigrams(words):
    """
        Input: a list of words, e.g., ['I', 'am', 'Denny']
        Output: a list of unigram
    """
    assert type(words) == list # Assertion Testing
    return words

### 1.2 bigrams

In [3]:
# if skip = 5, implies we will skip up to five words
# and combine all combinations(bigrams)
def _bigrams(words, join_string, skip=0):
    """
       Input: a list of words, e.g., ["I", "am", "Denny"]
       Output: a list of bigram, e.g., ["I_am", "am_Denny"]
       I use _ as join_string for this example.
    """
    assert type(words) == list
    L = len(words)
    if L > 1:
        lst = []
        for i in range(L-1):
            for k in range(1,skip+2):
                if i+k < L:
                    lst.append( join_string.join([words[i], words[i+k]]) )
    else:
        # set it as unigram
        lst = _unigrams(words)
    return lst

### 1.3 trigrams

In [4]:
def _trigrams(words, join_string, skip=0):
    """
       Input: a list of words, e.g., ["I", "am", "Denny"]
       Output: a list of trigram, e.g., ["I_am_Denny"]
       I use _ as join_string for this example.
    """
    assert type(words) == list
    L = len(words)
    if L > 2:
        lst = []
        for i in range(L-2):
            for k1 in range(1,skip+2):
                for k2 in range(1,skip+2):
                    if i+k1 < L and i+k1+k2 < L:
                        lst.append( join_string.join([words[i], words[i+k1], words[i+k1+k2]]) )
    else:
        # set it as bigram
        lst = _bigrams(words, join_string, skip)
    return lst


### 1.4 fourgrams

In [5]:
def _fourgrams(words, join_string):
    """
        Input: a list of words, e.g., ["I", "am", "Denny", "boy"]
        Output: a list of trigram, e.g., ["I_am_Denny_boy"]
        I use _ as join_string for this example.
    """
    assert type(words) == list
    L = len(words)
    if L > 3:
        lst = []
        for i in range(L-3):
            lst.append( join_string.join([words[i], words[i+1], words[i+2], words[i+3]]) )
    else:
        # set it as trigram
        lst = _trigrams(words, join_string)
    return lst
 

## 2. N-terms 

### 2.1 uniterms

In [6]:
def _uniterms(words):
    return _unigrams(words)

### 2.2 biterms

In [7]:
def _biterms(words, join_string):
    """
        Input: a list of words, e.g., ["I", "am", "Denny", "boy"]
        Output: a list of biterm, e.g., ["I_am", "I_Denny", "I_boy", "am_Denny", "am_boy", "Denny_boy"]
        I use _ as join_string for this example.
    """
    assert type(words) == list
    L = len(words)
    if L > 1:
        lst = []
        for i in range(L-1):
            for j in range(i+1,L):
                lst.append( join_string.join([words[i], words[j]]) )
    else:
        # set it as uniterm
        lst = _uniterms(words)
    return lst

### 2.3 triterms

In [8]:
def _triterms(words, join_string):
    """
        Input: a list of words, e.g., ["I", "am", "Denny", "boy"]
        Output: a list of triterm, e.g., ["I_am_Denny", "I_am_boy", "I_Denny_boy", "am_Denny_boy"]
        I use _ as join_string for this example.
    """
    assert type(words) == list
    L = len(words)
    if L > 2:
        lst = []
        for i in range(L-2):
            for j in range(i+1,L-1):
                for k in range(j+1,L):
                    lst.append( join_string.join([words[i], words[j], words[k]]) )
    else:
        # set it as biterm
        lst = _biterms(words, join_string)
    return lst

### 2.4 fourterms

In [9]:
def _fourterms(words, join_string):
    """
        Input: a list of words, e.g., ["I", "am", "Denny", "boy", "ha"]
        Output: a list of fourterm, e.g., ["I_am_Denny_boy", "I_am_Denny_ha", "I_am_boy_ha", "I_Denny_boy_ha", "am_Denny_boy_ha"]
        I use _ as join_string for this example.
    """
    assert type(words) == list
    L = len(words)
    if L > 3:
        lst = []
        for i in range(L-3):
            for j in range(i+1,L-2):
                for k in range(j+1,L-1):
                    for l in range(k+1,L):
                        lst.append( join_string.join([words[i], words[j], words[k], words[l]]) )
    else:
        # set it as triterm
        lst = _triterms(words, join_string)
    return lst

## 3. Dictionary for N-grams and N-iterms method

In [10]:
# Human-readable label for each n-gram mode code.
# 12 and 123 are the combined modes handled by _ngrams: unigram+bigram and
# unigram+bigram+trigram respectively.
# NOTE(review): 5 ("Fivegram") has no matching branch in _ngrams and no
# _fivegrams helper in this notebook — confirm whether it is implemented
# elsewhere or is a leftover.
_ngram_str_map = {
    1: "Unigram",
    2: "Bigram",
    3: "Trigram",
    4: "Fourgram",
    5: "Fivegram",
    12: "UBgram",
    123: "UBTgram",
}

## 4. N-grams functon

In [11]:
def _ngrams(words, ngram, join_string=" "):
    """wrapper for ngram"""
    if ngram == 1:
        return _unigrams(words)
    elif ngram == 2:
        return _bigrams(words, join_string)
    elif ngram == 3:
        return _trigrams(words, join_string)
    elif ngram == 4:
        return _fourgrams(words, join_string)
    elif ngram == 12:
        unigram = _unigrams(words)
        bigram = [x for x in _bigrams(words, join_string) if len(x.split(join_string)) == 2]
        return unigram + bigram
    elif ngram == 123:
        unigram = _unigrams(words)
        bigram = [x for x in _bigrams(words, join_string) if len(x.split(join_string)) == 2]
        trigram = [x for x in _trigrams(words, join_string) if len(x.split(join_string)) == 3]
        return unigram + bigram + trigram

In [12]:
# Human-readable label for each n-term mode code.
# NOTE(review): 5 ("Fiveterm") has no matching branch in _nterms and no
# _fiveterms helper in this notebook — confirm whether it is implemented
# elsewhere or is a leftover.
_nterm_str_map = {
    1: "Uniterm",
    2: "Biterm",
    3: "Triterm",
    4: "Fourterm",
    5: "Fiveterm",
}

## 5. N-term function

In [13]:
def _nterms(words, nterm, join_string=" "):
    """wrapper for nterm"""
    if nterm == 1:
        return _uniterms(words)
    elif nterm == 2:
        return _biterms(words, join_string)
    elif nterm == 3:
        return _triterms(words, join_string)
    elif nterm == 4:
        return _fourterms(words, join_string)

## 6. Test Process

In [14]:
if __name__ == "__main__":

    # Self-test: each assert pins the exact output of one wrapper mode.
    text = "I am Denny boy ha"
    words = text.split(" ")

    # contiguous n-grams
    assert _ngrams(words, 1) == ["I", "am", "Denny", "boy", "ha"]
    assert _ngrams(words, 2) == ["I am", "am Denny", "Denny boy", "boy ha"]
    assert _ngrams(words, 3) == ["I am Denny", "am Denny boy", "Denny boy ha"]
    assert _ngrams(words, 4) == ["I am Denny boy", "am Denny boy ha"]

    # combined modes: unigrams first, then bigrams (then trigrams)
    assert _ngrams(words, 12) == ["I", "am", "Denny", "boy", "ha",
                                  "I am", "am Denny", "Denny boy", "boy ha"]
    assert _ngrams(words, 123) == ["I", "am", "Denny", "boy", "ha",
                                   "I am", "am Denny", "Denny boy", "boy ha",
                                   "I am Denny", "am Denny boy", "Denny boy ha"]

    # short inputs: single modes fall back to the lower order,
    # combined modes must not list a lower-order gram twice
    assert _ngrams(["I", "am"], 3) == ["I am"]
    assert _ngrams(["I"], 12) == ["I"]
    assert _ngrams(["I", "am"], 123) == ["I", "am", "I am"]

    # order-preserving n-terms (members need not be adjacent)
    assert _nterms(words, 1) == ["I", "am", "Denny", "boy", "ha"]
    assert _nterms(words, 2) == ["I am", "I Denny", "I boy", "I ha", "am Denny", "am boy", "am ha", "Denny boy", "Denny ha", "boy ha"]
    assert _nterms(words, 3) == ["I am Denny", "I am boy", "I am ha", "I Denny boy", "I Denny ha", "I boy ha", "am Denny boy", "am Denny ha", "am boy ha", "Denny boy ha"]
    assert _nterms(words, 4) == ["I am Denny boy", "I am Denny ha", "I am boy ha", "I Denny boy ha", "am Denny boy ha"]

    # short input: n-terms fall back to the lower order as well
    assert _nterms(["I", "am"], 4) == ["I am"]


In [15]:
# Export this notebook as ngram_utils.py with jupytext (replaces an existing ngram_utils.py)
!jupytext --to py ngram_utils.ipynb

[jupytext] Reading ngram_utils.ipynb in format ipynb
[jupytext] Writing ngram_utils.py (destination file replaced)
