In [33]:
import math
import string
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [34]:
# Sample corpus: five short English sentences about NLTK, one string per document.
# They are combined into a list and used as the input to the TF-IDF computation
# in the following cells.
Doc1 = "Natural Language Toolkit (NLTK) is a powerful library in Python for natural language processing tasks."
Doc2 = "With NLTK, you can tokenize text, perform stemming, lemmatization, and much more to preprocess your data."
Doc3 = "One of NLTK's key features is its extensive collection of corpora and lexical resources for training and testing NLP models."
Doc4 = "NLTK provides easy-to-use interfaces to perform sentiment analysis, part-of-speech tagging, and named entity recognition."
Doc5 = "By leveraging NLTK's functionalities, implementing TF-IDF for text analysis becomes straightforward, allowing you to extract important features from your corpus."

In [35]:
# Gather the individual documents into a single ordered collection.
Corpus = [
    Doc1,
    Doc2,
    Doc3,
    Doc4,
    Doc5,
]
# Number of documents in the corpus; the IDF computation divides by this.
total_doc_num = len(Corpus)
# Trailing bare expression so the notebook displays the corpus.
Corpus

['Natural Language Toolkit (NLTK) is a powerful library in Python for natural language processing tasks.',
 'With NLTK, you can tokenize text, perform stemming, lemmatization, and much more to preprocess your data.',
 "One of NLTK's key features is its extensive collection of corpora and lexical resources for training and testing NLP models.",
 'NLTK provides easy-to-use interfaces to perform sentiment analysis, part-of-speech tagging, and named entity recognition.',
 "By leveraging NLTK's functionalities, implementing TF-IDF for text analysis becomes straightforward, allowing you to extract important features from your corpus."]

In [36]:
# Accumulators filled by later cells:
#   global_tf_corpus - one {term: term frequency} dict per document
#   word_frequencies - {term: number of documents containing the term}
global_tf_corpus = list()
word_frequencies = dict()

# English stop words as a set for fast membership tests while filtering tokens.
english_stop_list = stopwords.words('english')
stop_words = set(english_stop_list)

In [37]:
# Build one term-frequency (TF) table per document and store them, in corpus
# order, in global_tf_corpus.
#
# The list is emptied first so that re-running this cell does not append a
# second copy of every document's table (which would also inflate the document
# frequencies computed from it later).
global_tf_corpus.clear()
for each in Corpus:
    tokens = word_tokenize(each)
    # Lower-case each token once instead of once per filter condition.
    lowered = [word.lower() for word in tokens]
    # Drop stop words and tokens made up entirely of punctuation characters.
    # Testing character by character matters: `token in string.punctuation`
    # is a substring test, so multi-character punctuation tokens such as
    # "..." or "--" would not match and would be kept as terms.
    filtered_tokens = [
        word
        for word in lowered
        if word not in stop_words
        and not all(char in string.punctuation for char in word)
    ]
    word_freq = Counter(filtered_tokens)
    word_corpus = dict(word_freq)
    length = len(filtered_tokens)
    # TF = occurrences of the term / number of kept tokens in this document.
    global_tf_corpus.append({key: value / length for key, value in word_corpus.items()})

In [38]:
def idf(word_frequencies, total_docs=None):
    """Compute the inverse document frequency (IDF) of every term.

    Each term's IDF is ``math.log(total_docs / freq)`` (natural logarithm),
    where ``freq`` is the value stored for that term in ``word_frequencies``.

    Args:
        word_frequencies: Mapping of term -> count used as the divisor
            (in this notebook: the number of documents containing the term).
        total_docs: Total number of documents. When omitted (``None``) the
            notebook-level ``total_doc_num`` is used, which keeps existing
            ``idf(word_frequencies)`` calls working unchanged.

    Returns:
        A new dict mapping each term to its IDF value, in the same key order
        as ``word_frequencies``.

    Raises:
        ZeroDivisionError: If any count in ``word_frequencies`` is 0.
    """
    if total_docs is None:
        # Backward-compatible fallback to the global defined alongside Corpus.
        total_docs = total_doc_num
    return {
        word: math.log(total_docs / freq)
        for word, freq in word_frequencies.items()
    }

In [39]:
# Document frequency: for every term, the number of documents whose TF table
# contains it. Iterating a TF dict yields each of its terms exactly once, so
# counting terms across all tables counts documents per term.
#
# The mapping is rebuilt from scratch rather than incremented in place, so
# re-running this cell yields the same counts instead of adding to the
# previous run's totals.
word_frequencies = dict(
    Counter(word for doc_set in global_tf_corpus for word in doc_set)
)

In [40]:
# Compute the IDF of every term from the per-term counts in word_frequencies;
# the result is looked up by term when the TF-IDF scores are built.
idf_values = idf(word_frequencies)

In [41]:
# TF-IDF: scale each document's term frequencies by the corpus-wide IDF of the
# term. The result is a list with one {term: score} dict per document, in the
# same order as global_tf_corpus.
tfidf_scores = [
    {term: tf * idf_values[term] for term, tf in tf_doc.items()}
    for tf_doc in global_tf_corpus
]


In [43]:
# Print each document's TF-IDF mapping on its own line.
for doc_scores in tfidf_scores:
    print(doc_scores)

{'natural': 0.29262507498801826, 'language': 0.29262507498801826, 'toolkit': 0.14631253749400913, 'nltk': 0.0, 'powerful': 0.14631253749400913, 'library': 0.14631253749400913, 'python': 0.14631253749400913, 'processing': 0.14631253749400913, 'tasks': 0.14631253749400913}
{'nltk': 0.0, 'tokenize': 0.17882643471490003, 'text': 0.10181008131935056, 'perform': 0.10181008131935056, 'stemming': 0.17882643471490003, 'lemmatization': 0.17882643471490003, 'much': 0.17882643471490003, 'preprocess': 0.17882643471490003, 'data': 0.17882643471490003}
{'one': 0.11495985088815001, 'nltk': 0.0, "'s": 0.06544933799101108, 'key': 0.11495985088815001, 'features': 0.06544933799101108, 'extensive': 0.11495985088815001, 'collection': 0.11495985088815001, 'corpora': 0.11495985088815001, 'lexical': 0.11495985088815001, 'resources': 0.11495985088815001, 'training': 0.11495985088815001, 'testing': 0.11495985088815001, 'nlp': 0.11495985088815001, 'models': 0.11495985088815001}
{'nltk': 0.0, 'provides': 0.1341198