In [1]:
!pip install nltk scikit-learn gensim




In [5]:
import nltk
import numpy as np
from gensim.models import Word2Vec
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import normalize


# nltk.word_tokenize (used further down) needs the Punkt tokenizer data.
# Recent NLTK releases load it from the 'punkt_tab' resource instead of the
# older pickled 'punkt' package, so request both to work on either version.
# A resource name the installed NLTK does not know is reported by download()
# (it returns False) rather than raising, so the loop is safe on old versions.
for punkt_resource in ('punkt', 'punkt_tab'):
    nltk.download(punkt_resource)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
# Load the training split of the 20 Newsgroups corpus.
data = fetch_20newsgroups(subset="train")

# Work on the first 200 posts only, so the remaining cells execute quickly.
documents = data.data[0:200]

print("Number of documents:", len(documents))


Number of documents: 200


In [7]:
# Preview the first post (at most 1000 characters) under a heading.
# sep="\n" reproduces the line breaks of two consecutive print() calls.
print("Sample Document:\n", documents[0][:1000], sep="\n")


Sample Document:

From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

Thanks,
- IL
   ---- brought to you by your neighborhood Lerxst ----







In [8]:
# Bag-of-words model: English stop words removed, vocabulary capped at 1000 terms.
count_vectorizer = CountVectorizer(
    stop_words='english',
    max_features=1000,
)

# Learn the vocabulary and build the document-term count matrix in one pass.
bow_matrix = count_vectorizer.fit_transform(documents)

vocabulary_terms = count_vectorizer.get_feature_names_out()
print("Vocabulary Size:", len(vocabulary_terms))
print("BoW Matrix Shape:", bow_matrix.shape)


Vocabulary Size: 1000
BoW Matrix Shape: (200, 1000)


In [9]:
# Convert only the first five rows to a dense array for display.
print("Bag of Words Matrix (first 5 documents):", bow_matrix[:5].toarray(), sep="\n")


Bag of Words Matrix (first 5 documents):
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [11]:
# L1-normalise the count matrix so each document's term counts are rescaled
# by that document's L1 norm (relative term frequencies instead of raw counts).
bow_norm_matrix = normalize(bow_matrix, norm='l1')

# Show only the first five rows, converted to a dense array for display.
print("Normalized BoW Matrix (first 5 documents):")
print(bow_norm_matrix[:5].toarray())


Normalized BoW Matrix (first 5 documents):
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [13]:
# TF-IDF model with the same settings as the bag-of-words vectorizer:
# English stop words removed, vocabulary capped at 1000 terms.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)

# Fit on the documents and produce their TF-IDF weighted matrix.
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

print("TF-IDF Matrix Shape:", tfidf_matrix.shape)



TF-IDF Matrix Shape: (200, 1000)


In [14]:
# Convert only the first five rows to a dense array for display.
print("TF-IDF Matrix (first 5 documents):", tfidf_matrix[:5].toarray(), sep="\n")


TF-IDF Matrix (first 5 documents):
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [15]:
# Lower-case every post, then split it into word/punctuation tokens with NLTK.
tokenized_docs = [
    nltk.word_tokenize(lowered_doc)
    for lowered_doc in map(str.lower, documents)
]

# Show the first 30 tokens of the first post as a sanity check.
print("Tokenized Sample:", tokenized_docs[0][:30], sep="\n")


Tokenized Sample:
['from', ':', 'lerxst', '@', 'wam.umd.edu', '(', 'where', "'s", 'my', 'thing', ')', 'subject', ':', 'what', 'car', 'is', 'this', '!', '?', 'nntp-posting-host', ':', 'rac3.wam.umd.edu', 'organization', ':', 'university', 'of', 'maryland', ',', 'college', 'park']


In [16]:
# Train Word2Vec embeddings on the tokenised posts.
#
# Training is stochastic, so the run is pinned for repeatability:
#   * seed fixes the random number generator used for initialisation/sampling;
#   * workers=1 avoids the ordering jitter of multi-threaded training, which
#     gensim documents as a requirement for a deterministic run.
# Across separate interpreter launches gensim additionally requires a fixed
# PYTHONHASHSEED environment variable for fully identical vectors.
word2vec_model = Word2Vec(
    sentences=tokenized_docs,
    vector_size=100,  # dimensionality of each word vector
    window=5,         # context window size
    min_count=5,      # ignore words occurring fewer than 5 times
    workers=1,
    seed=42,
)



In [17]:
# Look up and display the learned vector for the word 'computer'.
print("Embedding for word 'computer':")
computer_embedding = word2vec_model.wv['computer']
print(computer_embedding)


Embedding for word 'computer':
[-8.4389988e-03  2.0750697e-01 -1.6187280e-01  5.7690009e-02
  1.3881016e-01 -6.4306951e-01  3.6425608e-01  8.0951655e-01
 -3.4699607e-01 -3.0969405e-01  3.9612908e-02 -4.3319610e-01
 -3.2460812e-02  1.4265198e-01  3.5686022e-01 -3.3237728e-01
 -6.5476343e-02 -5.6264386e-02 -4.1712560e-02 -3.4480768e-01
  1.6168292e-01 -3.5561968e-02  2.6941568e-01  1.6365443e-01
 -3.1138973e-02 -1.7730813e-01 -2.9514918e-01 -3.6047302e-02
 -3.2940161e-01 -1.8487844e-01  1.5671846e-01 -2.5505146e-02
  2.3683523e-01 -2.0809671e-01  7.4777557e-03  4.6688542e-01
 -1.7832953e-02 -1.3996325e-01 -2.9677325e-01 -5.0111419e-01
  6.6205958e-04 -1.2484885e-01  6.4427830e-02  1.3924873e-01
  3.9282244e-01 -1.2897237e-01 -2.2194076e-01 -9.5699653e-02
  2.6398894e-01  8.9673154e-02  8.5124977e-02 -4.7678670e-01
 -1.6975669e-02 -1.5707450e-01  1.4365081e-01 -2.2609833e-01
 -7.9880534e-03 -1.7302977e-01 -3.2180369e-01 -9.4598010e-02
  1.2762365e-01  5.2694038e-02  1.4923738e-01 -8.53800

In [18]:
# List the words whose vectors are closest to 'computer', with similarity scores.
print("Most similar words to 'computer':")
computer_neighbours = word2vec_model.wv.most_similar('computer')
print(computer_neighbours)


Most similar words to 'computer':
[('department', 0.9966564178466797), ('science', 0.994379460811615), ('state', 0.9940309524536133), ('chicago', 0.9919490218162537), ('hi', 0.99053555727005), ('corporation', 0.9904353022575378), ('laboratory', 0.9904119372367859), ('institute', 0.9900234937667847), ('engineering', 0.9899882078170776), ('world', 0.9891650676727295)]
