# N-Grams

In [1]:
from nltk import sent_tokenize, word_tokenize, pos_tag
from nltk.util import ngrams

In [4]:
# Function to generate n-grams from sentences.
def extract_ngrams(data, num):
    n_grams = ngrams(word_tokenize(data), num)
    return [ ' '.join(grams) for grams in n_grams]

In [2]:
data = 'Mary had a little Lamb.'

In [5]:
print("1-gram: ", extract_ngrams(data, 1))
print("2-gram: ", extract_ngrams(data, 2))
print("3-gram: ", extract_ngrams(data, 3))
print("4-gram: ", extract_ngrams(data, 4))

1-gram:  ['Mary', 'had', 'a', 'little', 'Lamb', '.']
2-gram:  ['Mary had', 'had a', 'a little', 'little Lamb', 'Lamb .']
3-gram:  ['Mary had a', 'had a little', 'a little Lamb', 'little Lamb .']
4-gram:  ['Mary had a little', 'had a little Lamb', 'a little Lamb .']


In [8]:
import nltk

In [14]:
help(nltk.util)

Help on module nltk.stem.util in nltk.stem:

NAME
    nltk.stem.util

DESCRIPTION
    # Natural Language Toolkit: Stemmer Utilities
    #
    # Copyright (C) 2001-2022 NLTK Project
    # Author: Helder <he7d3r@gmail.com>
    # URL: <https://www.nltk.org/>
    # For license information, see LICENSE.TXT

FUNCTIONS
    prefix_replace(original, old, new)
        Replaces the old prefix of the original string by a new suffix
        
        :param original: string
        :param old: string
        :param new: string
        :return: string
    
    suffix_replace(original, old, new)
        Replaces the old suffix of the original string by a new suffix

FILE
    c:\users\ajey kumar\.conda\envs\ta\lib\site-packages\nltk\stem\util.py




In [11]:
help(ngrams)

Help on function ngrams in module nltk.util:

ngrams(sequence, n, **kwargs)
    Return the ngrams generated from a sequence of items, as an iterator.
    For example:
    
        >>> from nltk.util import ngrams
        >>> list(ngrams([1,2,3,4,5], 3))
        [(1, 2, 3), (2, 3, 4), (3, 4, 5)]
    
    Wrap with list for a list version of this function.  Set pad_left
    or pad_right to true in order to get additional ngrams:
    
        >>> list(ngrams([1,2,3,4,5], 2, pad_right=True))
        [(1, 2), (2, 3), (3, 4), (4, 5), (5, None)]
        >>> list(ngrams([1,2,3,4,5], 2, pad_right=True, right_pad_symbol='</s>'))
        [(1, 2), (2, 3), (3, 4), (4, 5), (5, '</s>')]
        >>> list(ngrams([1,2,3,4,5], 2, pad_left=True, left_pad_symbol='<s>'))
        [('<s>', 1), (1, 2), (2, 3), (3, 4), (4, 5)]
        >>> list(ngrams([1,2,3,4,5], 2, pad_left=True, pad_right=True, left_pad_symbol='<s>', right_pad_symbol='</s>'))
        [('<s>', 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, '</s>')]
 

In [16]:
help(bigrams)

Help on function bigrams in module nltk.util:

bigrams(sequence, **kwargs)
    Return the bigrams generated from a sequence of items, as an iterator.
    For example:
    
        >>> from nltk.util import bigrams
        >>> list(bigrams([1,2,3,4,5]))
        [(1, 2), (2, 3), (3, 4), (4, 5)]
    
    Use bigrams for a list version of this function.
    
    :param sequence: the source data to be converted into bigrams
    :type sequence: sequence or iter
    :rtype: iter(tuple)



In [17]:
from nltk.util import bigrams
from nltk.util import everygrams

In [18]:
list(bigrams(word_tokenize(data)))

[('Mary', 'had'),
 ('had', 'a'),
 ('a', 'little'),
 ('little', 'Lamb'),
 ('Lamb', '.')]

In [19]:
list(everygrams(word_tokenize(data)))

[('Mary',),
 ('Mary', 'had'),
 ('Mary', 'had', 'a'),
 ('Mary', 'had', 'a', 'little'),
 ('Mary', 'had', 'a', 'little', 'Lamb'),
 ('Mary', 'had', 'a', 'little', 'Lamb', '.'),
 ('had',),
 ('had', 'a'),
 ('had', 'a', 'little'),
 ('had', 'a', 'little', 'Lamb'),
 ('had', 'a', 'little', 'Lamb', '.'),
 ('a',),
 ('a', 'little'),
 ('a', 'little', 'Lamb'),
 ('a', 'little', 'Lamb', '.'),
 ('little',),
 ('little', 'Lamb'),
 ('little', 'Lamb', '.'),
 ('Lamb',),
 ('Lamb', '.'),
 ('.',)]

Using TextBlob

In [20]:
from textblob import TextBlob

In [21]:
# Function to generate n-grams from sentences.
def tb_extract_ngrams(data, num):
    n_grams = TextBlob(data).ngrams(num)
    return [ ' '.join(grams) for grams in n_grams]

In [22]:
print("1-gram: ", tb_extract_ngrams(data, 1))
print("2-gram: ", tb_extract_ngrams(data, 2))
print("3-gram: ", tb_extract_ngrams(data, 3))
print("4-gram: ", tb_extract_ngrams(data, 4))

1-gram:  ['Mary', 'had', 'a', 'little', 'Lamb']
2-gram:  ['Mary had', 'had a', 'a little', 'little Lamb']
3-gram:  ['Mary had a', 'had a little', 'a little Lamb']
4-gram:  ['Mary had a little', 'had a little Lamb']


Using SpaCy

In [11]:
# do yourself

In [23]:
import spacy

In [24]:
import en_core_web_sm
nlp = en_core_web_sm.load()

In [25]:
doc = nlp(data)
doc

Mary had a little Lamb.

In [26]:
!pip3 install textacy

Collecting textacy
  Downloading textacy-0.12.0-py3-none-any.whl (208 kB)
     -------------------------------------- 208.4/208.4 kB 2.1 MB/s eta 0:00:00
Collecting cytoolz>=0.10.1
  Downloading cytoolz-0.11.2.tar.gz (481 kB)
     -------------------------------------- 481.0/481.0 kB 3.8 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting jellyfish>=0.8.0
  Downloading jellyfish-0.9.0-cp39-cp39-win_amd64.whl (26 kB)
Collecting pyphen>=0.10.0
  Downloading pyphen-0.12.0-py3-none-any.whl (2.0 MB)
     ---------------------------------------- 2.0/2.0 MB 5.0 MB/s eta 0:00:00
Collecting cachetools>=4.0.0
  Downloading cachetools-5.2.0-py3-none-any.whl (9.3 kB)
Collecting toolz>=0.8.0
  Downloading toolz-0.11.2-py3-none-any.whl (55 kB)
     ---------------------------------------- 55.8/55.8 kB 3.0 MB/s eta 0:00:00
Building wheels for collected packages: cytoolz
  Building wheel for cytoolz (setup.py): started
  Bui

In [27]:
import textacy

In [29]:
help(textacy.extract.ngrams)

Help on function ngrams in module textacy.extract.basics:

ngrams(doclike: 'types.DocLike', n: 'int | Collection[int]', *, filter_stops: 'bool' = True, filter_punct: 'bool' = True, filter_nums: 'bool' = False, include_pos: 'Optional[str | Collection[str]]' = None, exclude_pos: 'Optional[str | Collection[str]]' = None, min_freq: 'int' = 1) -> 'Iterable[Span]'
    Extract an ordered sequence of n-grams (``n`` consecutive tokens) from a spaCy
    ``Doc`` or ``Span``, for one or multiple ``n`` values, optionally filtering n-grams
    by the types and parts-of-speech of the constituent tokens.
    
    Args:
        doclike
        n: Number of tokens included per n-gram; for example, ``2`` yields bigrams
            and ``3`` yields trigrams. If multiple values are specified, then the
            collections of n-grams are concatenated together; for example, ``(2, 3)``
            yields bigrams and then trigrams.
        filter_stops: If True, remove ngrams that start or end with a stop w

In [32]:
Ngrams = list(textacy.extract.ngrams(doc, 2, min_freq=1))
print(Ngrams)

[little Lamb]


In [None]:
# General

In [34]:
from nltk.util import ngrams

In [35]:
import re
 
s = "Natural-language processing (NLP) is an area of computer science " \
    "and artificial intelligence concerned with the interactions " \
    "between computers and human (natural) languages."
 
s = s.lower()
s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s)
tokens = [token for token in s.split(" ") if token != ""]
output = list(ngrams(tokens, 5))
output

[('natural', 'language', 'processing', 'nlp', 'is'),
 ('language', 'processing', 'nlp', 'is', 'an'),
 ('processing', 'nlp', 'is', 'an', 'area'),
 ('nlp', 'is', 'an', 'area', 'of'),
 ('is', 'an', 'area', 'of', 'computer'),
 ('an', 'area', 'of', 'computer', 'science'),
 ('area', 'of', 'computer', 'science', 'and'),
 ('of', 'computer', 'science', 'and', 'artificial'),
 ('computer', 'science', 'and', 'artificial', 'intelligence'),
 ('science', 'and', 'artificial', 'intelligence', 'concerned'),
 ('and', 'artificial', 'intelligence', 'concerned', 'with'),
 ('artificial', 'intelligence', 'concerned', 'with', 'the'),
 ('intelligence', 'concerned', 'with', 'the', 'interactions'),
 ('concerned', 'with', 'the', 'interactions', 'between'),
 ('with', 'the', 'interactions', 'between', 'computers'),
 ('the', 'interactions', 'between', 'computers', 'and'),
 ('interactions', 'between', 'computers', 'and', 'human'),
 ('between', 'computers', 'and', 'human', 'natural'),
 ('computers', 'and', 'human', 'na

In [36]:
data2="Machine learning is the science of getting computers to act without being explicitly programmed. In the past decade, machine learning has given us self-driving cars, practical speech recognition, effective web search, and a vastly improved understanding of the human genome. Machine learning is so pervasive today that you probably use it dozens of times a day without knowing it. Many researchers also think it is the best way to make progress towards human-level AI. In this class, you will learn about the most effective machine learning techniques, and gain practice implementing them and getting them to work for yourself. More importantly, you'll learn about not only the theoretical underpinnings of learning, but also gain the practical know-how needed to quickly and powerfully apply these techniques to new problems. Finally, you'll learn about some of Silicon Valley's best practices in innovation as it pertains to machine learning and AI."

data2

"Machine learning is the science of getting computers to act without being explicitly programmed. In the past decade, machine learning has given us self-driving cars, practical speech recognition, effective web search, and a vastly improved understanding of the human genome. Machine learning is so pervasive today that you probably use it dozens of times a day without knowing it. Many researchers also think it is the best way to make progress towards human-level AI. In this class, you will learn about the most effective machine learning techniques, and gain practice implementing them and getting them to work for yourself. More importantly, you'll learn about not only the theoretical underpinnings of learning, but also gain the practical know-how needed to quickly and powerfully apply these techniques to new problems. Finally, you'll learn about some of Silicon Valley's best practices in innovation as it pertains to machine learning and AI."

In [27]:
print("1-gram: ", extract_ngrams(data2, 1))
print("\n")
print("2-gram: ", extract_ngrams(data2, 2))
print("\n")
print("3-gram: ", extract_ngrams(data2, 3))
print("\n")
print("4-gram: ", extract_ngrams(data2, 4))

1-gram:  ['Machine', 'learning', 'is', 'the', 'science', 'of', 'getting', 'computers', 'to', 'act', 'without', 'being', 'explicitly', 'programmed', '.', 'In', 'the', 'past', 'decade', ',', 'machine', 'learning', 'has', 'given', 'us', 'self-driving', 'cars', ',', 'practical', 'speech', 'recognition', ',', 'effective', 'web', 'search', ',', 'and', 'a', 'vastly', 'improved', 'understanding', 'of', 'the', 'human', 'genome', '.', 'Machine', 'learning', 'is', 'so', 'pervasive', 'today', 'that', 'you', 'probably', 'use', 'it', 'dozens', 'of', 'times', 'a', 'day', 'without', 'knowing', 'it', '.', 'Many', 'researchers', 'also', 'think', 'it', 'is', 'the', 'best', 'way', 'to', 'make', 'progress', 'towards', 'human-level', 'AI', '.', 'In', 'this', 'class', ',', 'you', 'will', 'learn', 'about', 'the', 'most', 'effective', 'machine', 'learning', 'techniques', ',', 'and', 'gain', 'practice', 'implementing', 'them', 'and', 'getting', 'them', 'to', 'work', 'for', 'yourself', '.', 'More', 'importantly'