In [None]:
import nltk
# Fetch the NLTK resources used by the cells below up front, so the notebook
# runs top to bottom on a fresh kernel / fresh machine.
nltk.download('punkt')      # tokenizer models used by word_tokenize
nltk.download('stopwords')  # stop-word lists read via stopwords.words('english')
# WordNetLemmatizer (used in the custom TF-IDF cell) needs the WordNet corpus;
# without this download that cell raises LookupError on a clean environment.
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
import re
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Toy corpus used for the hand-rolled Bag-of-Words demonstration.
sentences = [
    "The quick brown fox jumps over the lazy dog.",
    "A quick brown dog outpaces a swift fox.",
    "The dog and the fox are both brown.",
]

# Lowercased copies of the sentences, in the same order.
sentences_lower = list(map(str.lower, sentences))

def preprocess_text(sentence):
    """Strip non-letters from a sentence, tokenize it, and drop stop words.

    Args:
        sentence: Text to clean. Tokens are compared to the stop-word list
            as-is (no case folding happens here), so pass lowercased text if
            capitalised stop words should be removed too.

    Returns:
        list[str]: The remaining tokens, in their original order.
    """
    # Keep only ASCII letters and whitespace (drops punctuation and digits).
    processed_sentence = re.sub(r'[^a-zA-Z\s]', '', sentence)
    tokens = word_tokenize(processed_sentence)
    # Load the stop-word list once per call and keep it as a set; evaluating
    # stopwords.words('english') inside the filter re-reads the list for every
    # single token and searches it linearly.
    stop_words = set(stopwords.words('english'))
    return [word for word in tokens if word not in stop_words]

# Clean every lowercased sentence into its list of remaining tokens.
preprocessed_sentences = list(map(preprocess_text, sentences_lower))

# The vocabulary is the union of the tokens of all sentences.
vocabulary = set().union(*preprocessed_sentences)

print("Vocabulary:", vocabulary)

# One Counter per sentence: token -> number of occurrences in that sentence.
bow_vectors = [Counter(tokens) for tokens in preprocessed_sentences]

for sentence_number, token_counts in enumerate(bow_vectors, start=1):
    print("BoW Vector for Sentence", sentence_number, ":", token_counts)


Vocabulary: {'swift', 'outpaces', 'lazy', 'jumps', 'quick', 'brown', 'dog', 'fox'}
BoW Vector for Sentence 1 : Counter({'quick': 1, 'brown': 1, 'fox': 1, 'jumps': 1, 'lazy': 1, 'dog': 1})
BoW Vector for Sentence 2 : Counter({'quick': 1, 'brown': 1, 'dog': 1, 'outpaces': 1, 'swift': 1, 'fox': 1})
BoW Vector for Sentence 3 : Counter({'dog': 1, 'fox': 1, 'brown': 1})


In [None]:
 # Download the 'shakespeare' package so the corpus can be imported from nltk.corpus.
 nltk.download('shakespeare')

[nltk_data] Downloading package shakespeare to /root/nltk_data...
[nltk_data]   Unzipping corpora/shakespeare.zip.


True

In [None]:
from nltk.corpus import shakespeare
from sklearn.feature_extraction.text import TfidfVectorizer


# shakespeare.raw() returns the XML source of each play, so element names and
# other non-dialogue text get vectorized as if they were words (the numeric
# tokens at the head of the vocabulary presumably come from such boilerplate).
# Build each document from the text of the <LINE> elements instead, i.e. the
# verse/prose lines of the play. itertext() is used because an element's
# .text alone misses text that follows a nested child element.
corpus = [
    "\n".join(
        "".join(line.itertext())
        for line in shakespeare.xml(file_id).iter("LINE")
    )
    for file_id in shakespeare.fileids()
]

# Fail loudly if a play yielded no text rather than silently fitting on blanks.
assert corpus and all(corpus), "no <LINE> text extracted from the corpus"

tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

print("TF-IDF Matrix:")
print(tfidf_matrix.toarray())

# Displaying Vocabulary
print("Vocabulary:")
print(tfidf_vectorizer.get_feature_names_out())


TF-IDF Matrix:
[[0.00012503 0.00012503 0.00012503 ... 0.         0.         0.        ]
 [0.00021722 0.00021722 0.00021722 ... 0.         0.         0.        ]
 [0.00011343 0.00011343 0.00011343 ... 0.         0.00028404 0.        ]
 ...
 [0.00017566 0.00017566 0.00017566 ... 0.00043987 0.         0.        ]
 [0.00012426 0.00012426 0.00012426 ... 0.         0.         0.00078231]
 [0.0001477  0.0001477  0.0001477  ... 0.         0.         0.00061994]]
Vocabulary:
['1992' '1996' '1998' ... 'zeal' 'zone' 'zounds']


In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from collections import Counter
import string
import math

def preprocess_text(text):
    """Turn raw text into a list of lemmatized, stop-word-free tokens.

    Pipeline: lowercase -> delete every character in ``string.punctuation``
    -> ``word_tokenize`` -> drop English stop words -> lemmatize each token
    with ``WordNetLemmatizer``.

    Note: this reuses the name of the earlier ``preprocess_text`` cell and
    replaces it once this cell has run.

    Args:
        text: The raw input string.

    Returns:
        list[str]: Lemmatized tokens in their original order.
    """
    punctuation_remover = str.maketrans('', '', string.punctuation)
    cleaned = text.lower().translate(punctuation_remover)
    candidates = word_tokenize(cleaned)
    english_stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    return [
        lemmatizer.lemmatize(token)
        for token in candidates
        if token not in english_stop_words
    ]

def bag_of_words(texts):
    """Collect the vocabulary (distinct tokens) of a tokenized corpus.

    Despite the name, this returns the vocabulary only, not per-document
    counts.

    Args:
        texts: Iterable of token iterables, one per document. Tokens must be
            mutually orderable (e.g. all strings) so they can be sorted.

    Returns:
        list: Every distinct token exactly once, in sorted order.
    """
    vocabulary = set()
    for text in texts:
        vocabulary.update(text)
    # Iteration order of a set of strings changes between interpreter runs
    # (hash randomization); sorting makes the result reproducible.
    return sorted(vocabulary)

def tf_idf(corpus):
    """Compute TF-IDF weights for every term of every tokenized document.

    For a term ``t`` in document ``d``:
        tf(t, d) = occurrences of t in d / number of tokens in d
        idf(t)   = ln(number of documents / number of documents containing t)
    and the score is ``tf * idf``. A term present in every document
    therefore scores 0.

    Args:
        corpus: Sequence of documents, each a sequence of tokens.

    Returns:
        list[dict]: One ``{term: score}`` dict per document, in corpus order;
        keys appear in order of each term's first occurrence in the document.
    """
    num_documents = len(corpus)

    # Term frequencies, one mapping per document.
    tf_corpus = []
    for document in corpus:
        doc_length = len(document)
        tf_corpus.append({
            term: occurrences / doc_length
            for term, occurrences in Counter(document).items()
        })

    # Document frequency: in how many documents each term occurs at least once.
    document_frequency = Counter()
    for document in corpus:
        document_frequency.update(set(document))

    idf = {
        term: math.log(num_documents / containing_docs)
        for term, containing_docs in document_frequency.items()
    }

    return [
        {term: tf * idf[term] for term, tf in tf_document.items()}
        for tf_document in tf_corpus
    ]

# Small example corpus for the hand-written BoW / TF-IDF functions.
texts = [
    "This is a sample sentence.",
    "Another example sentence.",
    "Yet another example of text data.",
]

# Tokenized and normalized form of each text, in the same order.
preprocessed_texts = list(map(preprocess_text, texts))

# Bag of Words model
vocabulary = bag_of_words(preprocessed_texts)
print("Vocabulary:", vocabulary)

# TF-IDF model
tf_idf_scores = tf_idf(preprocessed_texts)
print("\nTF-IDF Scores:")
for doc_number, scores in enumerate(tf_idf_scores, start=1):
    print("Document", doc_number, ":", scores)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Vocabulary: ['text', 'data', 'yet', 'another', 'sample', 'example', 'sentence']

TF-IDF Scores:
Document 1 : {'sample': 0.5493061443340549, 'sentence': 0.2027325540540822}
Document 2 : {'another': 0.13515503603605478, 'example': 0.13515503603605478, 'sentence': 0.13515503603605478}
Document 3 : {'yet': 0.21972245773362198, 'another': 0.08109302162163289, 'example': 0.08109302162163289, 'text': 0.21972245773362198, 'data': 0.21972245773362198}


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Same toy sentences as in the Bag-of-Words cell, this time vectorized with
# scikit-learn's TfidfVectorizer.
sentences = [
    "The quick brown fox jumps over the lazy dog.",
    "A quick brown dog outpaces a swift fox.",
    "The dog and the fox are both brown.",
]

sentences_lower = list(map(str.lower, sentences))

# Default settings: no stop-word removal is requested here.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(sentences_lower)

# Rows are sentences; columns follow the order of the vocabulary printed below.
print("TF-IDF Matrix:", tfidf_matrix.toarray(), sep="\n")
print("Vocabulary:", tfidf_vectorizer.get_feature_names_out(), sep="\n")


TF-IDF Matrix:
[[0.         0.         0.         0.22421937 0.22421937 0.22421937
  0.3796364  0.3796364  0.         0.3796364  0.28872327 0.
  0.57744654]
 [0.         0.         0.         0.31021184 0.31021184 0.31021184
  0.         0.         0.52523431 0.         0.39945423 0.52523431
  0.        ]
 [0.3965233  0.3965233  0.3965233  0.23419305 0.23419305 0.23419305
  0.         0.         0.         0.         0.         0.
  0.60313238]]
Vocabulary:
['and' 'are' 'both' 'brown' 'dog' 'fox' 'jumps' 'lazy' 'outpaces' 'over'
 'quick' 'swift' 'the']
