In [2]:
# Rolling window size, i.e. the n in "n-gram"
n = 3

# Example sentence / sequence
doc = "The quick brown fox jumps over the lazy dog"
# split() with no separator splits on any run of whitespace, so repeated
# spaces, tabs or newlines in doc never produce empty-string tokens
# (split(' ') would).
tokens = doc.split()
print(tokens)

['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']


In [3]:
# Explicit-loop version: take a slice of up to n tokens starting at every
# token position. Slices that would run past the end are simply truncated, so
# windows starting fewer than n tokens from the end come out shorter than n.
out = []
for start, _ in enumerate(tokens):
    window = tokens[start:start + n]
    out.append(window)

out

[['The', 'quick', 'brown'],
 ['quick', 'brown', 'fox'],
 ['brown', 'fox', 'jumps'],
 ['fox', 'jumps', 'over'],
 ['jumps', 'over', 'the'],
 ['over', 'the', 'lazy'],
 ['the', 'lazy', 'dog'],
 ['lazy', 'dog'],
 ['dog']]

In [4]:
# Same sliding slice as the explicit loop, written as a comprehension.
# Windows near the end of the sequence are truncated (shorter than n).
[tokens[start:start + n] for start, _ in enumerate(tokens)]

[['The', 'quick', 'brown'],
 ['quick', 'brown', 'fox'],
 ['brown', 'fox', 'jumps'],
 ['fox', 'jumps', 'over'],
 ['jumps', 'over', 'the'],
 ['over', 'the', 'lazy'],
 ['the', 'lazy', 'dog'],
 ['lazy', 'dog'],
 ['dog']]

In [5]:
# Build n copies of the token list, the k-th one shifted left by k, and zip
# them together. zip() stops at the shortest copy, so only full-length
# n-tuples are produced.
list(zip(*(tokens[offset:] for offset in range(n))))

[('The', 'quick', 'brown'),
 ('quick', 'brown', 'fox'),
 ('brown', 'fox', 'jumps'),
 ('fox', 'jumps', 'over'),
 ('jumps', 'over', 'the'),
 ('over', 'the', 'lazy'),
 ('the', 'lazy', 'dog')]

In [6]:
import pandas as pd
# Iterate the Rolling object and turn each window into a plain list.
# As the output shows, the first windows are still "filling up" and hold
# fewer than n tokens.
[list(window) for window in pd.Series(tokens).rolling(n)]

[['The'],
 ['The', 'quick'],
 ['The', 'quick', 'brown'],
 ['quick', 'brown', 'fox'],
 ['brown', 'fox', 'jumps'],
 ['fox', 'jumps', 'over'],
 ['jumps', 'over', 'the'],
 ['over', 'the', 'lazy'],
 ['the', 'lazy', 'dog']]

In [7]:
import pandas as pd

# Forward-looking windows: each window begins at the current position rather
# than ending there. As the output shows, the windows at the tail of the
# series hold fewer than n tokens.
indexer = pd.api.indexers.FixedForwardWindowIndexer(window_size=n)
[list(window) for window in pd.Series(tokens).rolling(indexer)]

[['The', 'quick', 'brown'],
 ['quick', 'brown', 'fox'],
 ['brown', 'fox', 'jumps'],
 ['fox', 'jumps', 'over'],
 ['jumps', 'over', 'the'],
 ['over', 'the', 'lazy'],
 ['the', 'lazy', 'dog'],
 ['lazy', 'dog'],
 ['dog']]

In [8]:
import numpy as np

arr = np.array(tokens)
# One row per full window. Clamp at 0 so that an input with fewer than n
# tokens gives an empty result instead of a negative-dimension error.
shape = (max(arr.shape[0] - n + 1, 0), n)    # (7, 3)
# Moving one row down and moving one column right both advance by exactly
# one element of arr, which is what makes consecutive rows overlap.
strides = (arr.strides[0], arr.strides[0])   # (20, 20) bytes

# The rows share memory with each other and with arr, so ask for a read-only
# view: a write through it would silently alter several windows at once.
np.lib.stride_tricks.as_strided(arr, shape=shape, strides=strides, writeable=False)

array([['The', 'quick', 'brown'],
       ['quick', 'brown', 'fox'],
       ['brown', 'fox', 'jumps'],
       ['fox', 'jumps', 'over'],
       ['jumps', 'over', 'the'],
       ['over', 'the', 'lazy'],
       ['the', 'lazy', 'dog']], dtype='<U5')

In [9]:
from nltk import ngrams
# nltk's ngrams() returns a lazy iterator; unpack it into a list to display
# the n-tuples.
[*ngrams(tokens, n)]

[('The', 'quick', 'brown'),
 ('quick', 'brown', 'fox'),
 ('brown', 'fox', 'jumps'),
 ('fox', 'jumps', 'over'),
 ('jumps', 'over', 'the'),
 ('over', 'the', 'lazy'),
 ('the', 'lazy', 'dog')]

In [10]:
#pip install more_itertools
import more_itertools
# windowed() yields tuples of n consecutive tokens; collect them for display.
[window for window in more_itertools.windowed(tokens, n)]

[('The', 'quick', 'brown'),
 ('quick', 'brown', 'fox'),
 ('brown', 'fox', 'jumps'),
 ('fox', 'jumps', 'over'),
 ('jumps', 'over', 'the'),
 ('over', 'the', 'lazy'),
 ('the', 'lazy', 'dog')]

In [11]:
#pip install toolz
import toolz
# Note the argument order: toolz takes the window size first, then the sequence.
[*toolz.sliding_window(n, tokens)]

[('The', 'quick', 'brown'),
 ('quick', 'brown', 'fox'),
 ('brown', 'fox', 'jumps'),
 ('fox', 'jumps', 'over'),
 ('jumps', 'over', 'the'),
 ('over', 'the', 'lazy'),
 ('the', 'lazy', 'dog')]

In [12]:
from itertools import islice, tee
# tee() creates n independent iterators over tokens; the k-th one skips its
# first k items via islice(), and zip() lines them up into consecutive
# n-tuples, stopping as soon as the most-advanced iterator is exhausted.
list(
    zip(
        *(
            islice(stream, offset, None)
            for offset, stream in enumerate(tee(tokens, n))
        )
    )
)

[('The', 'quick', 'brown'),
 ('quick', 'brown', 'fox'),
 ('brown', 'fox', 'jumps'),
 ('fox', 'jumps', 'over'),
 ('jumps', 'over', 'the'),
 ('over', 'the', 'lazy'),
 ('the', 'lazy', 'dog')]

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

# Restrict the vectorizer to exactly n-word n-grams (min == max == n).
cv = CountVectorizer(ngram_range=(n, n))
# The analyzer is the callable the vectorizer uses to turn raw text into
# features. It is applied to the raw string `doc`, not to `tokens`; as the
# output shows, it lowercases the text and joins each n-gram with spaces.
analyzer = cv.build_analyzer()
analyzer(doc)

['the quick brown',
 'quick brown fox',
 'brown fox jumps',
 'fox jumps over',
 'jumps over the',
 'over the lazy',
 'the lazy dog']