In [1]:
!pip install nltk gensim scikit-learn

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [2]:
# Imports: third-party libraries used by the cells below.
import nltk
import numpy as np
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Fetch the 'punkt' tokenizer package. The call is left as the cell's last
# expression, so its return value is displayed as the cell output.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
# Sample corpus: six short movie-review sentences, one per line.
# The backslash after the opening quotes suppresses the leading newline, and
# splitlines() drops the trailing one, so the result is a list of six strings.
documents = """\
The movie was fantastic with brilliant acting
I did not like the movie it was boring
The film had a great story and excellent visuals
Terrible movie with poor direction and weak plot
Amazing performance by the actors and a good script
The movie was average but the music was nice
""".splitlines()

# Echo the corpus so the reader sees exactly what the later cells consume.
print(documents)

['The movie was fantastic with brilliant acting', 'I did not like the movie it was boring', 'The film had a great story and excellent visuals', 'Terrible movie with poor direction and weak plot', 'Amazing performance by the actors and a good script', 'The movie was average but the music was nice']


In [4]:
# Bag-of-Words (Count Occurrence)
# fit_transform learns the vocabulary from `documents` and returns the
# document-term count matrix in a single step.
count_vectorizer = CountVectorizer()
bow_counts = count_vectorizer.fit_transform(documents)

# Learned vocabulary terms, printed under their heading.
print("Vocabulary:", count_vectorizer.get_feature_names_out(), sep="\n")

# Convert to a dense array for display: one row per document.
print("\nBag-of-Words (Count Occurrence):", bow_counts.toarray(), sep="\n")

Vocabulary:
['acting' 'actors' 'amazing' 'and' 'average' 'boring' 'brilliant' 'but'
 'by' 'did' 'direction' 'excellent' 'fantastic' 'film' 'good' 'great'
 'had' 'it' 'like' 'movie' 'music' 'nice' 'not' 'performance' 'plot'
 'poor' 'script' 'story' 'terrible' 'the' 'visuals' 'was' 'weak' 'with']

Bag-of-Words (Count Occurrence):
[[1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0 1]
 [0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 1 1 0 0 1 0 0 0 0 0 0 1 0 1 0 0]
 [0 0 0 1 0 0 0 0 0 0 0 1 0 1 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0]
 [0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 1 0 0 0 1 1]
 [0 1 1 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 0 0 0 0]
 [0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 2 0 2 0 0]]


In [5]:
# Bag-of-Words (Normalized Count Occurrence) (L2 Normalization)
# Each document's count vector is rescaled with the L2 norm along axis 1.
from sklearn.preprocessing import normalize

# A separate vectorizer instance is fitted for this representation.
# NOTE(review): this rebinds `bow_counts`, which the count-occurrence cell
# also assigns from the same `documents`.
count_vectorizer_norm = CountVectorizer()
bow_counts = count_vectorizer_norm.fit_transform(documents)

# axis=1 applies the normalization per row, i.e. per document.
bow_normalized = normalize(bow_counts, axis=1, norm='l2')

print("Vocabulary:", count_vectorizer_norm.get_feature_names_out(), sep="\n")
print(
    "\nNormalized Bag-of-Words (L2 Normalization):",
    bow_normalized.toarray(),
    sep="\n",
)

Vocabulary:
['acting' 'actors' 'amazing' 'and' 'average' 'boring' 'brilliant' 'but'
 'by' 'did' 'direction' 'excellent' 'fantastic' 'film' 'good' 'great'
 'had' 'it' 'like' 'movie' 'music' 'nice' 'not' 'performance' 'plot'
 'poor' 'script' 'story' 'terrible' 'the' 'visuals' 'was' 'weak' 'with']

Normalized Bag-of-Words (L2 Normalization):
[[0.37796447 0.         0.         0.         0.         0.
  0.37796447 0.         0.         0.         0.         0.
  0.37796447 0.         0.         0.         0.         0.
  0.         0.37796447 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.37796447
  0.         0.37796447 0.         0.37796447]
 [0.         0.         0.         0.         0.         0.35355339
  0.         0.         0.         0.35355339 0.         0.
  0.         0.         0.         0.         0.         0.35355339
  0.35355339 0.35355339 0.         0.         0.35355339 0.
  0.         0.         0.         0.         0.

In [6]:
# TF-IDF Representation
# Fit a TfidfVectorizer on the corpus and keep the resulting weighted
# document-term matrix.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# Vocabulary first, then the matrix converted to a dense array for display.
print("Vocabulary:", tfidf_vectorizer.get_feature_names_out(), sep="\n")
print("\nTF-IDF Matrix:", tfidf_matrix.toarray(), sep="\n")

Vocabulary:
['acting' 'actors' 'amazing' 'and' 'average' 'boring' 'brilliant' 'but'
 'by' 'did' 'direction' 'excellent' 'fantastic' 'film' 'good' 'great'
 'had' 'it' 'like' 'movie' 'music' 'nice' 'not' 'performance' 'plot'
 'poor' 'script' 'story' 'terrible' 'the' 'visuals' 'was' 'weak' 'with']

TF-IDF Matrix:
[[0.4580532  0.         0.         0.         0.         0.
  0.4580532  0.         0.         0.         0.         0.
  0.4580532  0.         0.         0.         0.         0.
  0.         0.27174425 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.23467289
  0.         0.31711592 0.         0.37561017]
 [0.         0.         0.         0.         0.         0.40509636
  0.         0.         0.         0.40509636 0.         0.
  0.         0.         0.         0.         0.         0.40509636
  0.40509636 0.24032712 0.         0.         0.40509636 0.
  0.         0.         0.         0.         0.         0.20754169
  0.     

In [7]:
# Tokenize Sentences for Word2Vec
# The model is trained on lists of tokens, so each document is lower-cased
# and split into word tokens here.
import nltk
from nltk.tokenize import word_tokenize

# Fetch the 'punkt_tab' tokenizer package before calling word_tokenize.
nltk.download('punkt_tab')

# Lower-case every document, then tokenize it; yields one token list per document.
tokenized_docs = list(map(word_tokenize, map(str.lower, documents)))
print(tokenized_docs)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


[['the', 'movie', 'was', 'fantastic', 'with', 'brilliant', 'acting'], ['i', 'did', 'not', 'like', 'the', 'movie', 'it', 'was', 'boring'], ['the', 'film', 'had', 'a', 'great', 'story', 'and', 'excellent', 'visuals'], ['terrible', 'movie', 'with', 'poor', 'direction', 'and', 'weak', 'plot'], ['amazing', 'performance', 'by', 'the', 'actors', 'and', 'a', 'good', 'script'], ['the', 'movie', 'was', 'average', 'but', 'the', 'music', 'was', 'nice']]


In [8]:
# Train Word2Vec Model
# Hyper-parameters:
#   vector_size=100 -> dimensionality of each learned word vector
#   window=5        -> context window size passed to the model
#   min_count=1     -> keep every token, since the sample corpus is tiny
#
# Reproducibility: the seed is stated explicitly (1 is gensim's documented
# default) and training is limited to a single worker thread. gensim's docs
# note that multi-threaded training introduces ordering jitter from OS thread
# scheduling, so workers > 1 can give different vectors on every run. They
# also note that reproducibility across interpreter launches additionally
# requires fixing the PYTHONHASHSEED environment variable.
w2v_model = Word2Vec(
    sentences=tokenized_docs,
    vector_size=100,
    window=5,
    min_count=1,
    workers=1,
    seed=1,
)

In [10]:
# Word Embeddings (Vector Representation)
# Show the learned vector for one word, followed by the embedding dimensionality.
print("Vector for word 'movie':", w2v_model.wv['movie'], sep="\n")

print(f"\nVector size: {w2v_model.wv.vector_size}")

Vector for word 'movie':
[ 9.4563962e-05  3.0773198e-03 -6.8126451e-03 -1.3754654e-03
  7.6685809e-03  7.3464094e-03 -3.6732971e-03  2.6427018e-03
 -8.3171297e-03  6.2054861e-03 -4.6373224e-03 -3.1641065e-03
  9.3113566e-03  8.7338570e-04  7.4907029e-03 -6.0740625e-03
  5.1605068e-03  9.9228229e-03 -8.4573915e-03 -5.1356913e-03
 -7.0648370e-03 -4.8626517e-03 -3.7785638e-03 -8.5361991e-03
  7.9556061e-03 -4.8439382e-03  8.4236134e-03  5.2625705e-03
 -6.5500261e-03  3.9578713e-03  5.4701497e-03 -7.4265362e-03
 -7.4057197e-03 -2.4752307e-03 -8.6257253e-03 -1.5815723e-03
 -4.0343284e-04  3.2996845e-03  1.4418805e-03 -8.8142155e-04
 -5.5940580e-03  1.7303658e-03 -8.9737179e-04  6.7936908e-03
  3.9735902e-03  4.5294715e-03  1.4343059e-03 -2.6998555e-03
 -4.3668128e-03 -1.0320747e-03  1.4370275e-03 -2.6460087e-03
 -7.0737829e-03 -7.8053069e-03 -9.1217868e-03 -5.9351693e-03
 -1.8474245e-03 -4.3238713e-03 -6.4606704e-03 -3.7173224e-03
  4.2891586e-03 -3.7390434e-03  8.3781751e-03  1.5339935e-03

In [11]:
# Similar Words using Word2Vec
# Query the trained vectors for the entries closest to 'movie'; the result is
# a list of (word, score) pairs.
similar_words = w2v_model.wv.most_similar('movie')
print("Words similar to 'movie':", similar_words, sep="\n")

Words similar to 'movie':
[('but', 0.19912627339363098), ('good', 0.17276281118392944), ('like', 0.1711830496788025), ('a', 0.17020903527736664), ('direction', 0.1528114229440689), ('terrible', 0.14860160648822784), ('nice', 0.14595982432365417), ('i', 0.0805894061923027), ('acting', 0.06437695026397705), ('with', 0.06408977508544922)]
