In [1]:
!pip install nltk gensim scikit-learn


Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m42.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [2]:
import nltk
nltk.download('punkt')

from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
import pandas as pd


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [5]:
documents = [
    "Natural language processing is an important field According to me ",
    "Machine learning is used in natural language processing",
    "Word embeddings capture semantic meaning"
    "Adhija did this"
]


In [6]:
count_vectorizer = CountVectorizer()
bow_count = count_vectorizer.fit_transform(documents)

bow_count_df = pd.DataFrame(
    bow_count.toarray(),
    columns=count_vectorizer.get_feature_names_out()
)

print("Bag of Words (Count Occurrence)")
print(bow_count_df)


Bag of Words (Count Occurrence)
   according  an  capture  did  embeddings  field  important  in  is  \
0          1   1        0    0           0      1          1   0   1   
1          0   0        0    0           0      0          0   1   1   
2          0   0        1    1           1      0          0   0   0   

   language  ...  machine  me  meaningadhija  natural  processing  semantic  \
0         1  ...        0   1              0        1           1         0   
1         1  ...        1   0              0        1           1         0   
2         0  ...        0   0              1        0           0         1   

   this  to  used  word  
0     0   1     0     0  
1     0   0     1     0  
2     1   0     0     1  

[3 rows x 21 columns]


In [8]:
count_vectorizer_for_normalization = CountVectorizer()
bow_counts_raw = count_vectorizer_for_normalization.fit_transform(documents)

feature_names = count_vectorizer_for_normalization.get_feature_names_out()

bow_counts_array = bow_counts_raw.toarray()

row_sums = bow_counts_array.sum(axis=1, keepdims=True)
row_sums[row_sums == 0] = 1  # Avoid division by zero for empty documents

bow_normalized_array = bow_counts_array / row_sums

bow_normalized_df = pd.DataFrame(
    bow_normalized_array,
    columns=feature_names
)

print("Bag of Words (Normalized Count)")
print(bow_normalized_df)

Bag of Words (Normalized Count)
   according   an   capture       did  embeddings  field  important     in  \
0        0.1  0.1  0.000000  0.000000    0.000000    0.1        0.1  0.000   
1        0.0  0.0  0.000000  0.000000    0.000000    0.0        0.0  0.125   
2        0.0  0.0  0.142857  0.142857    0.142857    0.0        0.0  0.000   

      is  language  ...  machine   me  meaningadhija  natural  processing  \
0  0.100     0.100  ...    0.000  0.1       0.000000    0.100       0.100   
1  0.125     0.125  ...    0.125  0.0       0.000000    0.125       0.125   
2  0.000     0.000  ...    0.000  0.0       0.142857    0.000       0.000   

   semantic      this   to   used      word  
0  0.000000  0.000000  0.1  0.000  0.000000  
1  0.000000  0.000000  0.0  0.125  0.000000  
2  0.142857  0.142857  0.0  0.000  0.142857  

[3 rows x 21 columns]


In [9]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out()
)

print("TF-IDF Representation")
print(tfidf_df)


TF-IDF Representation
   according        an   capture       did  embeddings     field  important  \
0   0.346821  0.346821  0.000000  0.000000    0.000000  0.346821   0.346821   
1   0.000000  0.000000  0.000000  0.000000    0.000000  0.000000   0.000000   
2   0.000000  0.000000  0.377964  0.377964    0.377964  0.000000   0.000000   

        in        is  language  ...  machine        me  meaningadhija  \
0  0.00000  0.263766  0.263766  ...  0.00000  0.346821       0.000000   
1  0.39798  0.302674  0.302674  ...  0.39798  0.000000       0.000000   
2  0.00000  0.000000  0.000000  ...  0.00000  0.000000       0.377964   

    natural  processing  semantic      this        to     used      word  
0  0.263766    0.263766  0.000000  0.000000  0.346821  0.00000  0.000000  
1  0.302674    0.302674  0.000000  0.000000  0.000000  0.39798  0.000000  
2  0.000000    0.000000  0.377964  0.377964  0.000000  0.00000  0.377964  

[3 rows x 21 columns]


In [11]:
import nltk
nltk.download('punkt_tab')

tokenized_docs = [word_tokenize(doc.lower()) for doc in documents]

word2vec_model = Word2Vec(
    sentences=tokenized_docs,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4
)

print("Word2Vec Embedding for 'language'")
print(word2vec_model.wv["language"])

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Word2Vec Embedding for 'language'
[ 9.4563962e-05  3.0773198e-03 -6.8126451e-03 -1.3754654e-03
  7.6685809e-03  7.3464094e-03 -3.6732971e-03  2.6427018e-03
 -8.3171297e-03  6.2054861e-03 -4.6373224e-03 -3.1641065e-03
  9.3113566e-03  8.7338570e-04  7.4907029e-03 -6.0740625e-03
  5.1605068e-03  9.9228229e-03 -8.4573915e-03 -5.1356913e-03
 -7.0648370e-03 -4.8626517e-03 -3.7785638e-03 -8.5361991e-03
  7.9556061e-03 -4.8439382e-03  8.4236134e-03  5.2625705e-03
 -6.5500261e-03  3.9578713e-03  5.4701497e-03 -7.4265362e-03
 -7.4057197e-03 -2.4752307e-03 -8.6257253e-03 -1.5815723e-03
 -4.0343284e-04  3.2996845e-03  1.4418805e-03 -8.8142155e-04
 -5.5940580e-03  1.7303658e-03 -8.9737179e-04  6.7936908e-03
  3.9735902e-03  4.5294715e-03  1.4343059e-03 -2.6998555e-03
 -4.3668128e-03 -1.0320747e-03  1.4370275e-03 -2.6460087e-03
 -7.0737829e-03 -7.8053069e-03 -9.1217868e-03 -5.9351693e-03
 -1.8474245e-03 -4.3238713e-03 -6.4606704e-03 -3.7173224e-03
  4.2891586e-03 -3.7390434e-03  8.3781751e-03  1.53