<a href="https://colab.research.google.com/github/Aravind8281/Natural_language_Processing/blob/main/GenSim.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install gensim




# Vector space model

In [12]:
from gensim import corpora
from gensim.models import TfidfModel
corpus = [
    "Gensim is an open-source library for topic modeling.",
    "Vector space models are essential in natural language processing.",
    "The TfidfModel in Gensim calculates the term-document matrix.",
    "Inverse document frequency (IDF) is crucial for information retrieval.",
]


tokenized_corpus=[doc.split() for doc in corpus]
print(tokenized_corpus)
dictionary=corpora.Dictionary(tokenized_corpus)
print("Dictionary :",dictionary)
term_document=[dictionary.doc2bow(doc) for doc in tokenized_corpus]
tfid_model=TfidfModel(term_document)
tfid_matrix=tfid_model[term_document]
print("Term-Document Matrix:")
for doc in tfidf_matrix:
    print(doc)
print("\nInverse Document Frequency (IDF):")
for term, idf_value in tfidf_model.idfs.items():
    print(f"{dictionary[term]}: {idf_value}")


[['Gensim', 'is', 'an', 'open-source', 'library', 'for', 'topic', 'modeling.'], ['Vector', 'space', 'models', 'are', 'essential', 'in', 'natural', 'language', 'processing.'], ['The', 'TfidfModel', 'in', 'Gensim', 'calculates', 'the', 'term-document', 'matrix.'], ['Inverse', 'document', 'frequency', '(IDF)', 'is', 'crucial', 'for', 'information', 'retrieval.']]
Dictionary : Dictionary<30 unique tokens: ['Gensim', 'an', 'for', 'is', 'library']...>
Term-Document Matrix:
[(0, 0.20851441405707477), (1, 0.41702882811414954), (2, 0.20851441405707477), (3, 0.20851441405707477), (4, 0.41702882811414954), (5, 0.41702882811414954), (6, 0.41702882811414954), (7, 0.41702882811414954)]
[(8, 0.3481553119113957), (9, 0.3481553119113957), (10, 0.3481553119113957), (11, 0.17407765595569785), (12, 0.3481553119113957), (13, 0.3481553119113957), (14, 0.3481553119113957), (15, 0.3481553119113957), (16, 0.3481553119113957)]
[(0, 0.19611613513818404), (11, 0.19611613513818404), (17, 0.3922322702763681), (18, 

# Latent Semantic Analysis

In [14]:
from gensim import corpora
from gensim.models import TfidfModel
from gensim.models import LsiModel
from pprint import pprint
documents = [
    "Gensim is a Python library for topic modeling and document similarity analysis.",
    "Latent Semantic Analysis (LSA) is a technique used for extracting hidden semantic structures in a document collection.",
    "The Vector Space Model (VSM) is fundamental for tasks like document similarity and topic modeling.",
]

tokenized_corpus=[doc.lower().split() for doc in documents]
dictionary=corpora.Dictionary(tokenized_corpus)
corpus=[dictionary.doc2bow(doc) for doc in tokenized_corpus]
tfidf_model = TfidfModel(corpus)
tfidf_corpus = tfidf_model[corpus]
num_topics = 2
lsa_model = LsiModel(tfidf_corpus, num_topics=num_topics)
pprint(lsa_model.print_topics())
term_document_matrix = lsa_model[tfidf_corpus]
for doc, as_text in zip(term_document_matrix, tokenized_docs):
    print(as_text)
    print(doc)
    print()
idf = tfidf_model.idfs
print("Inverse Document Frequency (IDF):")
for term, value in zip(dictionary.token2id.keys(), idf):
    print(f"{term}: {value}")




[(0,
  '0.291*"1" + 0.291*"5" + 0.291*"8" + 0.291*"7" + 0.291*"9" + 0.197*"24" + '
  '0.197*"27" + 0.197*"23" + 0.197*"25" + 0.197*"26"'),
 (1,
  '0.462*"19" + 0.231*"18" + 0.231*"14" + 0.231*"17" + 0.231*"12" + 0.231*"22" '
  '+ 0.231*"21" + 0.231*"20" + 0.231*"13" + 0.231*"16"')]
['gensim', 'is', 'a', 'python', 'library', 'for', 'topic', 'modeling', 'and', 'document', 'similarity', 'analysis.']
[(0, 0.7294502910023443)]

['latent', 'semantic', 'analysis', '(lsa)', 'is', 'a', 'technique', 'used', 'for', 'extracting', 'hidden', 'semantic', 'structures', 'in', 'a', 'document', 'collection.']
[(0, 0.3446988960651882), (1, 0.8813059489870925)]

['the', 'vector', 'space', 'model', '(vsm)', 'is', 'fundamental', 'for', 'tasks', 'like', 'document', 'similarity', 'and', 'topic', 'modeling.']
[(0, 0.6428688809507336), (1, -0.47254610809947484)]

Inverse Document Frequency (IDF):
a: 0
analysis.: 1
and: 2
document: 3
for: 4
gensim: 5
is: 6
library: 7
modeling: 8
python: 9
similarity: 10
topic: 11

# Latent Dirichlet Allocation

In [16]:
from gensim import corpora
from gensim.models import LdaModel
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string
documents = [
    "Topic modeling is an interesting area of natural language processing.",
    "Latent Dirichlet Allocation is a popular technique for topic modeling.",
    "Gensim provides an implementation of Latent Dirichlet Allocation.",
    "The Python programming language is commonly used in data science.",
    "Stopwords are common words that are often removed in text processing.",
]
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    tokens = [stemmer.stem(token) for token in tokens if token.isalnum() and token not in stop_words]
    return tokens
processed_documents = [preprocess_text(doc) for doc in documents]
dictionary = corpora.Dictionary(processed_documents)
corpus = [dictionary.doc2bow(doc) for doc in processed_documents]
lda_model = LdaModel(corpus, num_topics=2, id2word=dictionary, passes=10)
topics = lda_model.print_topics(num_words=5)
for topic in topics:
    print(topic)


(0, '0.056*"process" + 0.056*"languag" + 0.054*"stopword" + 0.054*"common" + 0.054*"word"')
(1, '0.075*"dirichlet" + 0.075*"latent" + 0.075*"alloc" + 0.075*"model" + 0.075*"topic"')


#Word2vec

In [17]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
sentences = [
    "Word embeddings are powerful tools in NLP.",
    "They capture semantic relationships between words.",
    "Word2Vec is a popular technique for creating word embeddings."
]
tokenized_sentences = [word_tokenize(sentence.lower()) for sentence in sentences]
model = Word2Vec(sentences=tokenized_sentences, vector_size=100, window=5, min_count=1, workers=4)
vector = model.wv['word']
similar_words = model.wv.most_similar('word', topn=3)
print("Vector for 'word':", vector)
print("Similar words to 'word':", similar_words)


Vector for 'word': [-8.6196875e-03  3.6657380e-03  5.1898835e-03  5.7419385e-03
  7.4669183e-03 -6.1676754e-03  1.1056137e-03  6.0472824e-03
 -2.8400505e-03 -6.1735227e-03 -4.1022300e-04 -8.3689485e-03
 -5.6000124e-03  7.1045388e-03  3.3525396e-03  7.2256695e-03
  6.8002474e-03  7.5307419e-03 -3.7891543e-03 -5.6180597e-04
  2.3483764e-03 -4.5190323e-03  8.3887316e-03 -9.8581640e-03
  6.7646410e-03  2.9144168e-03 -4.9328315e-03  4.3981876e-03
 -1.7395747e-03  6.7113843e-03  9.9648498e-03 -4.3624435e-03
 -5.9933780e-04 -5.6956373e-03  3.8508223e-03  2.7866268e-03
  6.8910765e-03  6.1010956e-03  9.5384968e-03  9.2734173e-03
  7.8980681e-03 -6.9895042e-03 -9.1558648e-03 -3.5575271e-04
 -3.0998408e-03  7.8943167e-03  5.9385742e-03 -1.5456629e-03
  1.5109634e-03  1.7900408e-03  7.8175711e-03 -9.5101865e-03
 -2.0553112e-04  3.4691966e-03 -9.3897223e-04  8.3817719e-03
  9.0107834e-03  6.5365066e-03 -7.1162102e-04  7.7104042e-03
 -8.5343346e-03  3.2071066e-03 -4.6379971e-03 -5.0889552e-03
  3.5

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Doc2Vec

In [19]:
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
documents = [
    "Doc2Vec is an extension of Word2Vec.",
    "It represents entire documents as vectors.",
    "Doc2Vec is used for document similarity and categorization."
]
tagged_data = [TaggedDocument(words=word_tokenize(doc.lower()), tags=[str(i)]) for i, doc in enumerate(documents)]
model = Doc2Vec(vector_size=100, window=5, min_count=1, workers=4, epochs=100)
model.build_vocab(tagged_data)
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)
vector = model.dv['0']
similar_documents = model.dv.most_similar('0', topn=2)
print("Vector for document 0:", vector)
print("Similar documents to document 0:", similar_documents)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Vector for document 0: [-7.9716919e-03 -8.3736442e-03 -1.0944714e-02  9.9241920e-03
  4.5422944e-03 -5.4450217e-04 -1.0094763e-02 -5.2044634e-03
 -1.0782715e-02  2.9547068e-03  2.1002844e-03  5.0075110e-03
 -6.5667988e-03 -4.4701481e-03 -2.2102748e-03 -9.9799922e-03
  1.1813106e-03  1.0580492e-02 -1.0827275e-02 -4.1885907e-03
 -4.3763341e-03  2.8289685e-03 -5.8091190e-03  4.9553094e-03
  5.1590684e-03 -8.7494971e-03 -1.0693724e-02 -1.1138376e-02
  4.8447046e-03 -1.0996391e-02  6.8827504e-03  6.9135614e-03
 -6.2759575e-03 -6.1971759e-03 -2.4070984e-03  2.4437967e-03
 -2.3207248e-03 -1.0111362e-02 -5.1758345e-03  3.7550332e-04
 -1.5949557e-04 -7.6236208e-03  5.5744550e-03 -1.0257954e-02
  2.5768082e-03 -5.1730750e-03  2.6390146e-04 -5.9319602e-04
  5.8112508e-03 -1.0814729e-02 -4.2317528e-03 -4.0683089e-04
 -7.5302068e-03 -8.7972917e-03 -3.1240392e-03  1.1477897e-02
 -6.2208658e-04  5.0746379e-03 -7.2606523e-03  1.0341367e-02
  5.0619435e-03  1.2346759e-02  7.1806652e-03 -5.0291573e-03
 