In [1]:
import gensim
from gensim import corpora

### To work with text documents, Gensim requires that the words (aka tokens) be converted to unique ids. To achieve that, Gensim lets you create a Dictionary object that maps each word to a unique id.

The dictionary object is typically used to create a ‘bag of words’ Corpus. It is this Dictionary and the bag-of-words (Corpus) that are used as inputs to topic modeling and other models that Gensim specializes in.

Alright, what sort of text inputs can gensim handle? The input text typically comes in 3 different forms:

- As sentences stored in python’s native list object
- As one single text file, small or large.
- In multiple text files.

A ‘token’ typically means a ‘word’. A ‘document’ typically refers to a ‘sentence’ or ‘paragraph’, and a ‘corpus’ is typically a ‘collection of documents as a bag of words’. That is, for each document, a corpus contains each word’s id and its frequency count in that document.

In [2]:
# Build a Dictionary from sentences held in a plain Python list.
documents_1 = [
    "Your childhood teacher did not wrong you when they taught you that there should be three, or four, or five sentences in a paragraph.",
    "It is important to understand, however, that the aim in teaching this was not to impart a hard-and-fast rule of grammar",
    "drawn from an authoritative-but-dusty book.",
    "The true aim of this strategy was to teach you that your ideas must be well supported to be persuasive and effective.",
]

documents_2 = [
    "Recent research has provided a wealth of insight about how dogs came to be domesticated by humans and the roles they played in Native American culture.",
    "DNA studies on archaeological finds suggest that dogs may have been domesticated by humans as long as 40,000 years ago.",
    "When the first humans came to North America from Eurasia, at least 12,000 years ago, domesticated dogs came with them.",
    " They appear to have been highly prized by early North American hunter-gatherers and were their only animal companions for centuries",
    "since there were no horses on the continent until the 16th century.",
]

# Both groups of sentences form one collection of documents.
documents = [*documents_1, *documents_2]

# Tokenize: split each sentence on whitespace and lowercase every piece.
# Punctuation stays attached to its word, so entries such as 'four,' and
# 'paragraph.' end up in the vocabulary as-is.
texts = [
    [token.lower() for token in sentence.split()]
    for sentence in documents
]

# Assign every distinct token a unique integer id.
dictionary = corpora.Dictionary(texts)


In [3]:
# Show the dictionary summary: the unique-token count and a preview of the first tokens.
print(dictionary)

Dictionary(114 unique tokens: ['a', 'be', 'childhood', 'did', 'five']...)


In [4]:
# token2id is the dictionary's word -> integer id mapping.
print(dictionary.token2id)

{'a': 0, 'be': 1, 'childhood': 2, 'did': 3, 'five': 4, 'four,': 5, 'in': 6, 'not': 7, 'or': 8, 'paragraph.': 9, 'sentences': 10, 'should': 11, 'taught': 12, 'teacher': 13, 'that': 14, 'there': 15, 'they': 16, 'three,': 17, 'when': 18, 'wrong': 19, 'you': 20, 'your': 21, 'aim': 22, 'grammar': 23, 'hard-and-fast': 24, 'however,': 25, 'impart': 26, 'important': 27, 'is': 28, 'it': 29, 'of': 30, 'rule': 31, 'teaching': 32, 'the': 33, 'this': 34, 'to': 35, 'understand,': 36, 'was': 37, 'an': 38, 'authoritative-but-dusty': 39, 'book.': 40, 'drawn': 41, 'from': 42, 'and': 43, 'effective.': 44, 'ideas': 45, 'must': 46, 'persuasive': 47, 'strategy': 48, 'supported': 49, 'teach': 50, 'true': 51, 'well': 52, 'about': 53, 'american': 54, 'by': 55, 'came': 56, 'culture.': 57, 'dogs': 58, 'domesticated': 59, 'has': 60, 'how': 61, 'humans': 62, 'insight': 63, 'native': 64, 'played': 65, 'provided': 66, 'recent': 67, 'research': 68, 'roles': 69, 'wealth': 70, '40,000': 71, 'ago.': 72, 'archaeological'

## Let's create a corpus object that contains each word's id and its frequency in each document.

In [5]:
# Convert each tokenized document into its bag-of-words form: a list of
# (token_id, count) pairs.
# `dictionary` was built from these same `texts`, so every token already has
# an id. allow_update is left at its default (False): passing True here would
# add each document to the dictionary's statistics a second time (document
# count and per-token document frequencies) without changing the result.
corpus = [dictionary.doc2bow(doc) for doc in texts]
print(corpus)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 2), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 2), (21, 1)], [(0, 1), (6, 1), (7, 1), (14, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 2), (36, 1), (37, 1)], [(38, 1), (39, 1), (40, 1), (41, 1), (42, 1)], [(1, 2), (14, 1), (20, 1), (21, 1), (22, 1), (30, 1), (33, 1), (34, 1), (35, 2), (37, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1)], [(0, 1), (1, 1), (6, 1), (16, 1), (30, 1), (33, 1), (35, 1), (43, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 1), (67, 1), (68, 1), (69, 1), (70, 1)], [(14, 1), (55, 1), (58, 1), (59, 1), (62, 1), (71, 1), (72, 1), (73, 1), (74, 2), (75, 1), (76, 1), (77, 1), (78, 1), (79, 1), (80, 1), (81, 1), (82, 1), (83, 1), (84, 1

In [6]:
# Swap each token id back to its word so the bag-of-words is human-readable.
word_counts = [
    [(dictionary[token_id], freq) for token_id, freq in bow]
    for bow in corpus
]
print(word_counts)

[[('a', 1), ('be', 1), ('childhood', 1), ('did', 1), ('five', 1), ('four,', 1), ('in', 1), ('not', 1), ('or', 2), ('paragraph.', 1), ('sentences', 1), ('should', 1), ('taught', 1), ('teacher', 1), ('that', 1), ('there', 1), ('they', 1), ('three,', 1), ('when', 1), ('wrong', 1), ('you', 2), ('your', 1)], [('a', 1), ('in', 1), ('not', 1), ('that', 1), ('aim', 1), ('grammar', 1), ('hard-and-fast', 1), ('however,', 1), ('impart', 1), ('important', 1), ('is', 1), ('it', 1), ('of', 1), ('rule', 1), ('teaching', 1), ('the', 1), ('this', 1), ('to', 2), ('understand,', 1), ('was', 1)], [('an', 1), ('authoritative-but-dusty', 1), ('book.', 1), ('drawn', 1), ('from', 1)], [('be', 2), ('that', 1), ('you', 1), ('your', 1), ('aim', 1), ('of', 1), ('the', 1), ('this', 1), ('to', 2), ('was', 1), ('and', 1), ('effective.', 1), ('ideas', 1), ('must', 1), ('persuasive', 1), ('strategy', 1), ('supported', 1), ('teach', 1), ('true', 1), ('well', 1)], [('a', 1), ('be', 1), ('in', 1), ('they', 1), ('of', 1),

## In paragraphs, certain words tend to occur in pairs (bigrams) or in groups of three (trigrams), because the words combined together form the actual entity. For example: the word ‘machine’ refers to a device, gadget, or apparatus, and the word ‘learning’ can refer to the acquisition of knowledge or skills. But combined, ‘Machine Learning’ refers to something completely different.



In [11]:
import gensim.downloader as api

In [13]:
# Load the text8 dataset (a 31.68MB file will be downloaded) and materialize
# it into a list, since a later cell indexes into it.
dataset = list(api.load("text8"))

# Map tokens to ids, then encode each document as a bag of words.
# NOTE: this rebinds `corpus`, replacing the small-example corpus built earlier.
dct = corpora.Dictionary(dataset)
corpus = [dct.doc2bow(doc) for doc in dataset]

# Build the bigram model.
#   min_count: words below this frequency are ignored.
#   threshold: cutoff for forming phrases (higher means fewer phrases).
# A phrase of words a and b is accepted if
#   (cnt(a, b) - min_count) * N / (cnt(a) * cnt(b)) > threshold,
# where N is the total vocabulary size.
bigram = gensim.models.phrases.Phrases(dataset, min_count=3, threshold=10)

In [17]:
# Apply the bigram model to the document at index 10 and show its first 50 tokens;
# detected phrases come back as single tokens joined by "_" (e.g. 'generally_thought').
bigram[dataset[10]][:50]

['study',
 'the',
 'surface',
 'of',
 'the',
 'moon',
 'astronomy',
 'is',
 'generally_thought',
 'to',
 'have_begun',
 'in',
 'ancient',
 'babylon',
 'by',
 'the',
 'persian',
 'zoroastrian',
 'priests',
 'the',
 'magi',
 'recent_studies',
 'of',
 'babylonian',
 'records',
 'have',
 'shown',
 'them',
 'to',
 'be',
 'extremely_accurate',
 'for',
 'the',
 'ancient',
 'night_sky',
 'following',
 'the',
 'babylonians',
 'the',
 'egyptians',
 'also',
 'had',
 'an',
 'emphasis_on',
 'observations',
 'of',
 'the',
 'sky',
 'mixtures',
 'of']