In [None]:
# Text Analytics

# 1. Extract Sample document and apply following document preprocessing methods:
#    Tokenization, POS Tagging, stop words removal, Stemming and Lemmatization.
# 2. Create representation of document by calculating Term Frequency and Inverse Document
#    Frequency.


In [1]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')  # Optional for WordNet lemmatizer


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Anuj\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Anuj\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Anuj\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Anuj\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Anuj\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

# Word Tokenizer

In [13]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

# Sample text
text = "AI is coined in the year 1956, but it gained popularity recently."

# Word Tokenization
word_tokens = word_tokenize(text)
print("Tokenized Words:", word_tokens)

# Sentence Tokenization
sent_tokens = sent_tokenize(text)
print("Tokenized Sentences:", sent_tokens)


Tokenized Words: ['AI', 'is', 'coined', 'in', 'the', 'year', '1956', ',', 'but', 'it', 'gained', 'popularity', 'recently', '.']
Tokenized Sentences: ['AI is coined in the year 1956, but it gained popularity recently.']


#  POS Tagging (Part-of-Speech Tagging)

In [15]:
from nltk import pos_tag

# POS Tagging
pos_tags = pos_tag(word_tokens)
print("POS Tags:", pos_tags)


POS Tags: [('AI', 'NNP'), ('is', 'VBZ'), ('coined', 'VBN'), ('in', 'IN'), ('the', 'DT'), ('year', 'NN'), ('1956', 'CD'), (',', ','), ('but', 'CC'), ('it', 'PRP'), ('gained', 'VBD'), ('popularity', 'NN'), ('recently', 'RB'), ('.', '.')]


#  Stop Words Removal

In [19]:
from nltk.corpus import stopwords

# Stop Words Removal
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in word_tokens if word.lower() not in stop_words]
print("Tokens after Stop Words Removal:", filtered_tokens)


# Importing stopwords from nltk.corpus:
# stopwords is a module in the nltk library that provides a list of common words (e.g., "the", "is", "in", "and") that are generally considered to be of little value in text analysis.

# By importing stopwords from nltk.corpus, we can access a predefined list of these words. 

# stopwords.words('english') loads a list of English stop words.

# We convert this list into a set using set(). A set is used here for faster look-up performance because checking membership in a set is faster than in a list.

# word_tokens is a list of words that we obtained from the tokenization process.

# We use a list comprehension to iterate through each word in word_tokens.

# For each word:

# word.lower() converts the word to lowercase to ensure that the comparison is case-insensitive. For example, "The" and "the" should be treated the same.

# word.lower() not in stop_words checks if the lowercase version of the word is not in the set of stop words.

# If the word is not in the stop words list, it gets added to the filtered_tokens list.

Tokens after Stop Words Removal: ['AI', 'coined', 'year', '1956', ',', 'gained', 'popularity', 'recently', '.']


# Stemming

In [22]:
from nltk.stem import PorterStemmer

# Initialize Stemmer
stemmer = PorterStemmer()

# Stemming the words
stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
print("Stemmed Tokens:", stemmed_tokens)


Stemmed Tokens: ['ai', 'coin', 'year', '1956', ',', 'gain', 'popular', 'recent', '.']


# Lemmatization

In [27]:
from nltk.stem import WordNetLemmatizer

# Initialize Lemmatizer
lemmatizer = WordNetLemmatizer()

# Lemmatizing the words
lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
print("Lemmatized Tokens:", lemmatized_tokens)


Lemmatized Tokens: ['AI', 'coined', 'year', '1956', ',', 'gained', 'popularity', 'recently', '.']


In [29]:


# ### **Term Frequency (TF)**

# **Definition**: Term Frequency (TF) is a measure of how frequently a term appears in a document. It is calculated by dividing the number of times a term appears in the document by the total number of terms in the document.

# $$
# \text{TF}(t) = \frac{\text{Number of times term t appears in a document}}{\text{Total number of terms in the document}}
# $$

# **Example**:
# For the sentence: "AI is popular,"

# * "AI" appears 1 time, and the total number of words in the document is 3.
# * TF for "AI" = $\frac{1}{3} = 0.33$

# ---

# ### **Inverse Document Frequency (IDF)**

# **Definition**: Inverse Document Frequency (IDF) measures how much information the word provides across the entire corpus. It is calculated by the logarithm of the total number of documents divided by the number of documents containing the term.

# $$
# \text{IDF}(t) = \log \left( \frac{N}{\text{Number of documents containing t}} \right)
# $$

# Where:

# * $N$ is the total number of documents.

# **Example**:
# For a corpus of 3 documents:

# * "AI" appears in 2 documents.
# * IDF for "AI" = $\log \left( \frac{3}{2} \right) = 0.176$

# ---

# ### **TF-IDF (Term Frequency-Inverse Document Frequency)**

# The TF-IDF score for a term in a document is the product of TF and IDF:

# $$
# \text{TF-IDF}(t, d) = \text{TF}(t, d) \times \text{IDF}(t)
# $$

# This score indicates the importance of a word in a document relative to the entire corpus. The higher the TF-IDF score, the more relevant the word is in the given document.

# ---

# ### **Formatted Example**:

# Let's say we have the following 3 documents in a corpus:

# 1. "AI is coined in the year 1956, but it gained popularity recently."
# 2. "The field of AI has developed rapidly in recent years."
# 3. "AI applications are becoming widespread in many industries."

# For each word in the document, the **TF** and **IDF** values are calculated, and then multiplied to get the **TF-IDF** score.
# .


In [35]:
import math
from collections import Counter

# Sample Documents
corpus = [
    "AI is gaining popularity in the tech world.",
    "The field of AI has rapidly developed in recent years.",
    "AI is used in many industries such as healthcare, finance, and transportation."
]

# 1. Term Frequency (TF)
def term_frequency(doc):
    words = doc.split()
    word_count = Counter(words)
    tf = {word: count / len(words) for word, count in word_count.items()}
    return tf

# 2. Inverse Document Frequency (IDF)
def inverse_document_frequency(corpus):
    idf = {}
    total_docs = len(corpus)
    for doc in corpus:
        for word in set(doc.split()):
            idf[word] = idf.get(word, 0) + 1
    return {word: math.log(total_docs / count) for word, count in idf.items()}

# Calculate TF for each document
tf_results = [term_frequency(doc) for doc in corpus]

# Calculate IDF for the corpus
idf_results = inverse_document_frequency(corpus)

# Display TF and IDF
print("Term Frequency (TF) for each document:")
for idx, doc_tf in enumerate(tf_results):
    print(f"Document {idx + 1}: {doc_tf}")

print("\nInverse Document Frequency (IDF) for the corpus:")
print(idf_results)

# Calculate and display TF-IDF for each document
print("\nTF-IDF for each document:")
for idx, doc_tf in enumerate(tf_results):
    tfidf = {word: doc_tf[word] * idf_results.get(word, 0) for word in doc_tf}
    print(f"Document {idx + 1}: {tfidf}")


Term Frequency (TF) for each document:
Document 1: {'AI': 0.125, 'is': 0.125, 'gaining': 0.125, 'popularity': 0.125, 'in': 0.125, 'the': 0.125, 'tech': 0.125, 'world.': 0.125}
Document 2: {'The': 0.1, 'field': 0.1, 'of': 0.1, 'AI': 0.1, 'has': 0.1, 'rapidly': 0.1, 'developed': 0.1, 'in': 0.1, 'recent': 0.1, 'years.': 0.1}
Document 3: {'AI': 0.08333333333333333, 'is': 0.08333333333333333, 'used': 0.08333333333333333, 'in': 0.08333333333333333, 'many': 0.08333333333333333, 'industries': 0.08333333333333333, 'such': 0.08333333333333333, 'as': 0.08333333333333333, 'healthcare,': 0.08333333333333333, 'finance,': 0.08333333333333333, 'and': 0.08333333333333333, 'transportation.': 0.08333333333333333}

Inverse Document Frequency (IDF) for the corpus:
{'gaining': 1.0986122886681098, 'is': 0.4054651081081644, 'popularity': 1.0986122886681098, 'tech': 1.0986122886681098, 'AI': 0.0, 'world.': 1.0986122886681098, 'the': 1.0986122886681098, 'in': 0.0, 'developed': 1.0986122886681098, 'has': 1.09861