<a href="https://colab.research.google.com/github/DeepthiTabithaBennet/NaturalLanguageProcessing/blob/main/TextRep_Doc2Vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install gensim



In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import simple_preprocess

# Fixed seed so that training and inference can be repeated with the same results.
SEED = 42

# Sample data: one sentence per document.
documents = [
    "I love programming in Python.",
    "Natural language processing is a fascinating field.",
    "Machine learning and deep learning are subsets of artificial intelligence.",
    "Doc2Vec is an extension of Word2Vec.",
    "Gensim provides efficient implementations of popular NLP algorithms."
]

# Preprocess data: tokenize each document and tag it with its position in
# `documents` (stored as a string so it can be mapped back later).
tagged_data = [
    TaggedDocument(words=simple_preprocess(doc), tags=[str(i)])
    for i, doc in enumerate(documents)
]

# Initialize and train the Doc2Vec model.
# A fixed `seed` together with a single worker thread is what gensim documents
# as necessary for a repeatable run; with several workers, thread scheduling
# changes the order of updates. (Across interpreter launches PYTHONHASHSEED
# must be fixed as well.)
model = Doc2Vec(
    vector_size=50,
    window=2,
    min_count=1,
    epochs=100,
    seed=SEED,
    workers=1,
)
model.build_vocab(tagged_data)
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

# Infer a vector for a sentence that was not part of the training corpus.
new_sentence = "I am learning about Doc2Vec."
new_sentence_vector = model.infer_vector(simple_preprocess(new_sentence))

print("Vector for the new sentence:", new_sentence_vector)

# Rank the training documents by similarity to the inferred vector.
similar_docs = model.dv.most_similar([new_sentence_vector])
print("Most similar documents:")
for doc_id, similarity in similar_docs:
    # Tags are the stringified indices assigned above, so convert back to int
    # to look up the original sentence.
    print("Document ID:", doc_id, "Similarity:", similarity, "Document:", documents[int(doc_id)])

Vector for the new sentence: [ 4.72261105e-03  6.72688102e-03  9.27181169e-03  5.47953695e-03
  5.16829081e-03  5.49017265e-03 -2.18511466e-03  1.05680479e-02
 -3.92303150e-03  3.92060960e-03  1.00852028e-02  4.72586798e-05
 -1.16804289e-03  3.24215461e-03 -7.09559955e-03  3.66169470e-03
 -4.36637551e-04 -8.31920840e-03 -4.52698953e-03  3.96481575e-03
 -4.42491379e-03  1.04362154e-02 -1.17365026e-03 -3.19059705e-03
 -8.82375147e-03  9.66457243e-04 -5.86954970e-03 -1.13134263e-02
 -8.34899489e-03  1.42220454e-03  1.05380500e-02  5.43269969e-04
 -9.09433607e-03 -7.33674271e-03 -2.86207534e-03  3.38522368e-03
  1.10703462e-03 -9.76164732e-03  1.04220435e-02 -7.25680916e-03
 -2.38288753e-03  1.31552545e-02  2.09682016e-03 -9.28900950e-03
  1.94098288e-03  4.48812405e-03  6.01902790e-03  1.34276669e-03
  1.16752665e-02 -8.37264161e-05]
Most similar documents:
Document ID: 1 Similarity: 0.4606287479400635 Document: Natural language processing is a fascinating field.
Document ID: 3 Similarity

In [None]:
import nltk
# Fetch the tokenizer data needed by nltk.tokenize.word_tokenize.
# Recent NLTK releases (3.9+) load the 'punkt_tab' resource instead of the
# pickled 'punkt' models, so download both to keep the notebook working
# regardless of which NLTK version the environment provides.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

# Fixed seed so that training and inference can be repeated with the same results.
SEED = 42

# Define a list of documents (one sentence each).
data = [
    "I love programming in Python.",
    "Natural language processing is a fascinating field.",
    "Machine learning and deep learning are subsets of artificial intelligence.",
    "Doc2Vec is an extension of Word2Vec.",
    "Gensim provides efficient implementations of popular NLP algorithms."
]

# Preprocess the documents (lowercase + NLTK tokenization) and wrap each one in
# a TaggedDocument whose tag is the document's index as a string.
tagged_data = [
    TaggedDocument(words=word_tokenize(doc.lower()), tags=[str(i)])
    for i, doc in enumerate(data)
]

# Train the Doc2Vec model.
# min_count=1 keeps every token: almost every word in this five-sentence corpus
# occurs only once, so a threshold of 2 would discard nearly the whole
# vocabulary and leave the document vectors with almost nothing to encode.
# seed + workers=1 make the run repeatable (gensim additionally requires a
# fixed PYTHONHASHSEED for reproducibility across interpreter launches).
model = Doc2Vec(
    vector_size=20,
    min_count=1,
    epochs=50,
    seed=SEED,
    workers=1,
)

model.build_vocab(tagged_data)

model.train(
    tagged_data,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

# Get the document vectors by running inference on each document's tokens.
document_vectors = [
    model.infer_vector(word_tokenize(doc.lower())) for doc in data
]

# Print each document followed by its vector (documents are numbered from 1).
for i, doc in enumerate(data):
    print("Document", i+1, ":", doc)
    print("Vector:", document_vectors[i])
    print()


Document 1 : I love programming in Python.
Vector: [ 0.00468896 -0.00651363 -0.00624672 -0.01846151 -0.00022137 -0.02485958
  0.01064255  0.02412496 -0.02449686 -0.00298595  0.01080469  0.02459338
 -0.01282512 -0.00505272  0.00191159 -0.01186773  0.01923111 -0.00104116
  0.02097891 -0.01662402]

Document 2 : Natural language processing is a fascinating field.
Vector: [-0.01388139  0.01655041 -0.00023787  0.00953289 -0.02298607  0.01190146
 -0.01248948 -0.01475126  0.00889673  0.02166019 -0.01868207 -0.00650447
 -0.0083094  -0.00708576  0.00116256  0.00897374  0.02481955 -0.00533592
 -0.01768001  0.01152035]

Document 3 : Machine learning and deep learning are subsets of artificial intelligence.
Vector: [-0.00878998  0.0170193   0.0158102  -0.00970508  0.02302436  0.01707866
  0.00563933 -0.02454659 -0.01118153 -0.02517018  0.01681244 -0.01689572
  0.01943532  0.00386697 -0.02514085 -0.01512298 -0.00418437 -0.00468899
  0.00149293 -0.01320517]

Document 4 : Doc2Vec is an extension of Wo