In [1]:
# -------- TEXT PREPROCESSING + TF-IDF ---------

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the NLTK data used below only when it is not installed yet, so a fresh
# kernel can "Restart & Run All" without hand-editing this cell and repeat runs
# do not touch the network.
# Keys are nltk.data lookup paths; values are the matching downloader ids.
NLTK_RESOURCES = {
    "corpora/stopwords": "stopwords",      # stopwords.words("english")
    "tokenizers/punkt": "punkt",           # nltk.word_tokenize (older NLTK releases)
    "tokenizers/punkt_tab": "punkt_tab",   # nltk.word_tokenize (newer NLTK releases)
    "corpora/wordnet": "wordnet",          # WordNetLemmatizer
}
for resource_path, package_id in NLTK_RESOURCES.items():
    try:
        nltk.data.find(resource_path)
    except LookupError:
        # Missing locally: download once; later runs take the branch above.
        nltk.download(package_id, quiet=True)

# Sample documents for the TF-IDF walkthrough.
texts = [
    "I love Artificial Intelligence!",
    "AI is changing the world rapidly.",
    "This NLP course is amazing and very helpful."
]

# ---------- Cleaning + Preprocessing ----------
# A set gives O(1) membership tests when filtering tokens in preprocess().
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    """Normalize one raw document into a space-joined string of lemmas.

    Steps: lowercase, turn every character that is not a-z or whitespace
    into a space, tokenize with ``nltk.word_tokenize``, drop tokens found in
    the module-level ``stop_words`` set, and lemmatize the rest with the
    module-level ``lemmatizer``.

    Args:
        text: The raw input string.

    Returns:
        The surviving lemmas joined by single spaces ("" if none survive).
    """
    text = text.lower()
    # Substitute a space instead of deleting the character: deleting would
    # fuse "state-of-the-art" into "stateoftheart" and "don't" into "dont",
    # creating bogus vocabulary entries that also slip past stop_words.
    text = re.sub(r'[^a-z\s]', ' ', text)
    tokens = nltk.word_tokenize(text)
    # Filter stop words first so only the kept tokens are lemmatized.
    return " ".join(
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    )

# Run every sample sentence through the cleaning pipeline.
clean_texts = list(map(preprocess, texts))
print("Cleaned Texts:", clean_texts, sep="\n")

# ---------- TF-IDF ----------
# Fit on the cleaned documents; X is the document-term matrix returned by
# fit_transform (densified below with .toarray() for display).
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(clean_texts)

print("\nVocabulary / Features:", tfidf.get_feature_names_out(), sep="\n")

print(
    "\nTF-IDF Matrix (rows = documents, columns = words):",
    X.toarray(),
    sep="\n",
)

# Pair each vocabulary term with its fitted IDF weight.
idf_values = {
    term: weight
    for term, weight in zip(tfidf.get_feature_names_out(), tfidf.idf_)
}
print("\nIDF Values:", idf_values, sep="\n")


Cleaned Texts:
['love artificial intelligence', 'ai changing world rapidly', 'nlp course amazing helpful']

Vocabulary / Features:
['ai' 'amazing' 'artificial' 'changing' 'course' 'helpful' 'intelligence'
 'love' 'nlp' 'rapidly' 'world']

TF-IDF Matrix (rows = documents, columns = words):
[[0.         0.         0.57735027 0.         0.         0.
  0.57735027 0.57735027 0.         0.         0.        ]
 [0.5        0.         0.         0.5        0.         0.
  0.         0.         0.         0.5        0.5       ]
 [0.         0.5        0.         0.         0.5        0.5
  0.         0.         0.5        0.         0.        ]]

IDF Values:
{'ai': np.float64(1.6931471805599454), 'amazing': np.float64(1.6931471805599454), 'artificial': np.float64(1.6931471805599454), 'changing': np.float64(1.6931471805599454), 'course': np.float64(1.6931471805599454), 'helpful': np.float64(1.6931471805599454), 'intelligence': np.float64(1.6931471805599454), 'love': np.float64(1.693147180559945

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

# Raw (uncleaned) sentences for the bag-of-words demo.
# NOTE: this rebinds `texts`, and `X` below, which the TF-IDF cell also defines.
texts = [
    "I love Artificial Intelligence",
    "AI is changing the world rapidly",
    "This NLP course is amazing and very helpful",
]

# Default CountVectorizer: no stop-word removal is configured here, so words
# such as "is" and "the" remain in the vocabulary.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(texts)

print("Vocabulary / Features:", vectorizer.get_feature_names_out(), sep="\n")

# One row per sentence, one column per vocabulary term, values are raw counts.
print("\nBag of Words Matrix:", X.toarray(), sep="\n")


Vocabulary / Features:
['ai' 'amazing' 'and' 'artificial' 'changing' 'course' 'helpful'
 'intelligence' 'is' 'love' 'nlp' 'rapidly' 'the' 'this' 'very' 'world']

Bag of Words Matrix:
[[0 0 0 1 0 0 0 1 0 1 0 0 0 0 0 0]
 [1 0 0 0 1 0 0 0 1 0 0 1 1 0 0 1]
 [0 1 1 0 0 1 1 0 1 0 1 0 0 1 1 0]]
