What is Gensim?
Gensim is a popular open-source natural language processing (NLP) library specialising in unsupervised topic modeling. Topic modeling is a technique to extract hidden topics from large volumes of text.

The Gensim library is designed to handle large amounts of text data and provides efficient, scalable algorithms for topic modeling and similarity detection. Older releases (3.x) also included a text summarization module; it was removed in Gensim 4.0.

Gensim makes it easy to perform these tasks by providing efficient implementations of popular algorithms such as Latent Dirichlet Allocation (LDA).



In [6]:
from gensim.corpora import Dictionary

# Two toy documents, each already split into a list of tokens
documents = [
    ['this', 'is', 'a', 'document'],
    ['this', 'is', 'another', 'document'],
]

# Build the vocabulary: every distinct token across the documents gets an integer id
dictionary = Dictionary(documents)
print(dictionary)

Dictionary<5 unique tokens: ['a', 'document', 'is', 'this', 'another']>


In [3]:
# Turn each tokenized document into its bag-of-words form: a list of (token_id, count) pairs
corpus = list(dictionary.doc2bow(tokens) for tokens in documents)
corpus

[[(0, 1), (1, 1), (2, 1), (3, 1)], [(1, 1), (2, 1), (3, 1), (4, 1)]]

In [5]:
from gensim.models import Word2Vec
import numpy as np

# Create a list of tokenized documents
documents = [['this', 'is', 'a', 'document'], ['this', 'is', 'another', 'document']]

# Train a Word2Vec model on the documents.
# Training is stochastic, so pin the seed and use a single worker thread to get
# the same vectors on every run. (The gensim docs note that reproducibility across
# interpreter launches additionally requires a fixed PYTHONHASHSEED.)
model = Word2Vec(documents, window=5, min_count=1, seed=42, workers=1)

# Get the vector for a word
word_vector = model.wv['document']

# Get the mean vector for a document.
# Words the model never saw have no vector and would raise KeyError, so skip them.
new_document = ['this', 'is', 'another', 'document']
known_words = [word for word in new_document if word in model.wv]
if known_words:
    document_vector = np.mean([model.wv[word] for word in known_words], axis=0)
else:
    # No known word at all: fall back to a zero vector instead of averaging an empty list
    document_vector = np.zeros(model.vector_size, dtype=np.float32)

In [8]:
# Data pre-processing. Steps performed in this cell:
#   1. clean the text (lowercase it and strip punctuation),
#   2. tokenize it into individual words,
#   3. convert the tokens to a numerical representation (integer ids in a gensim Dictionary),
#   4. remove stop words and overly common tokens from that Dictionary.
import string

import nltk
from gensim import corpora
from nltk.corpus import stopwords

# Make sure the NLTK stop-word list is available locally
nltk.download("stopwords")

# Sample documents
documents = ["This is the first document.", "This is the second document.", "This is the third document."]

# Clean and tokenize: lowercase so that "This" matches the lowercase stop-word list,
# and strip punctuation so that "document." and "document" become the same token.
punctuation_table = str.maketrans("", "", string.punctuation)
tokenized_documents = [doc.lower().translate(punctuation_table).split() for doc in documents]

# Create a dictionary from the tokenized documents
dictionary = corpora.Dictionary(tokenized_documents)

# Remove stopwords from the dictionary: stopwords are commonly used words such as "the", "and", "is", "in", etc.,
# that frequently occur in a language but do not add much meaning to the text.
stop_words = set(stopwords.words('english'))
stop_ids = [dictionary.token2id[stopword] for stopword in stop_words if stopword in dictionary.token2id]
dictionary.filter_tokens(bad_ids=stop_ids)

# Prune by document frequency. no_below=1 keeps every token that appears in at least one
# document (so nothing is dropped for being rare); no_above=0.5 drops tokens that appear in
# more than half of the documents (here: "document"). 0.5 is the library default, spelled
# out because it is what actually does the filtering in this example.
dictionary.filter_extremes(no_below=1, no_above=0.5)

print(dictionary)

Dictionary<3 unique tokens: ['first', 'second', 'third']>


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\saksh\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [None]:
import gensim
from gensim import corpora

# Three small documents, already tokenized
documents = [
    ["apple", "banana", "orange"],
    ["orange", "juice"],
    ["banana", "apple", "juice", "orange"],
]

# Assign an integer id to every distinct token
dictionary = corpora.Dictionary(documents)

# Bag-of-words corpus: one doc2bow result per document
corpus = list(map(dictionary.doc2bow, documents))

# Show the bag-of-words representation of the first document
print(corpus[0])

In [18]:
# The gensim downloader API gives access to a catalogue of popular text datasets
# and word embedding models that can be fetched by name.
import gensim.downloader as api

# Print the name of every dataset listed under the 'corpora' section of the catalogue
corpora_catalogue = api.info()['corpora']
for dataset_name in corpora_catalogue:
    print(dataset_name)

semeval-2016-2017-task3-subtaskBC
semeval-2016-2017-task3-subtaskA-unannotated
patent-2017
quora-duplicate-questions
wiki-english-20171001
text8
fake-news
20-newsgroups
__testing_matrix-synopsis
__testing_multipart-matrix-synopsis


In [24]:
# 20 Newsgroups: approximately 20,000 newsgroup documents across 20 topics,
# such as sports, politics, and technology.

dataset = api.load('20-newsgroups')

# Take only the first record from the dataset and print its 'data' field as a sample
text = next(iter(dataset), None)
if text is not None:
    print(text['data'])

['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against', 'early', 'working', 'class', 'radicals', 'including', 'the', 'diggers', 'of', 'the', 'english', 'revolution', 'and', 'the', 'sans', 'culottes', 'of', 'the', 'french', 'revolution', 'whilst', 'the', 'term', 'is', 'still', 'used', 'in', 'a', 'pejorative', 'way', 'to', 'describe', 'any', 'act', 'that', 'used', 'violent', 'means', 'to', 'destroy', 'the', 'organization', 'of', 'society', 'it', 'has', 'also', 'been', 'taken', 'up', 'as', 'a', 'positive', 'label', 'by', 'self', 'defined', 'anarchists', 'the', 'word', 'anarchism', 'is', 'derived', 'from', 'the', 'greek', 'without', 'archons', 'ruler', 'chief', 'king', 'anarchism', 'as', 'a', 'political', 'philosophy', 'is', 'the', 'belief', 'that', 'rulers', 'are', 'unnecessary', 'and', 'should', 'be', 'abolished', 'although', 'there', 'are', 'differing', 'interpretations', 'of', 'what', 'this', 'means', 'anarchism', 'also', 'refers', 'to', 'related', 'so

In [22]:
# Fetch the pre-trained Google News Word2Vec vectors through the downloader API.
# NOTE: this is a large download (the progress bar reports roughly 1.6 GB).
w2v_model = api.load('word2vec-google-news-300')

# Look up the embedding of a single word and show it
word_vector = w2v_model['computer']
print(word_vector)

# Ask the model for the words closest to 'computer' and show them
similar_words = w2v_model.most_similar('computer')
print(similar_words)

[=-------------------------------------------------] 3.9% 64.5/1662.8MB downloaded

KeyboardInterrupt: 

In [23]:
# Fetch the Text8 dataset (a small subset of Wikipedia) via the downloader API
corpus = api.load('text8')

# Materialise the dataset: each document becomes a plain list of tokens
tokenized_corpus = list(map(list, corpus))

# Show the first document
print(tokenized_corpus[:1])


[['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against', 'early', 'working', 'class', 'radicals', 'including', 'the', 'diggers', 'of', 'the', 'english', 'revolution', 'and', 'the', 'sans', 'culottes', 'of', 'the', 'french', 'revolution', 'whilst', 'the', 'term', 'is', 'still', 'used', 'in', 'a', 'pejorative', 'way', 'to', 'describe', 'any', 'act', 'that', 'used', 'violent', 'means', 'to', 'destroy', 'the', 'organization', 'of', 'society', 'it', 'has', 'also', 'been', 'taken', 'up', 'as', 'a', 'positive', 'label', 'by', 'self', 'defined', 'anarchists', 'the', 'word', 'anarchism', 'is', 'derived', 'from', 'the', 'greek', 'without', 'archons', 'ruler', 'chief', 'king', 'anarchism', 'as', 'a', 'political', 'philosophy', 'is', 'the', 'belief', 'that', 'rulers', 'are', 'unnecessary', 'and', 'should', 'be', 'abolished', 'although', 'there', 'are', 'differing', 'interpretations', 'of', 'what', 'this', 'means', 'anarchism', 'also', 'refers', 'to', 'related', 's

In [3]:
from gensim import corpora
from pprint import pprint

dic



OverflowError: Python int too large to convert to C long