In [67]:
with open('TextFile.txt','r') as file:
    text = file.read()
print(text)

Text analytics, also known as text mining, is the process of deriving meaningful insights and patterns from unstructured text data. With the rise of the internet and social media, the amount of text data generated daily has skyrocketed, making text analytics an essential tool for businesses and organizations seeking to extract insights from this data.In this essay, we will explore the different techniques used in text analytics, the benefits of text analytics, and the challenges that come with implementing text analytics.Text analytics can be broadly divided into three main techniques: text classification, sentiment analysis, and topic modeling.Text classification involves categorizing text data into predefined categories. This can be useful for automating tasks such as spam detection, content filtering, and customer feedback analysis. For example, a company might use text classification to automatically route customer complaints to the appropriate department.Sentiment analysis, also k

In [68]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer,WordNetLemmatizer
import math

In [69]:
stop_words = set(stopwords.words('english'))
# print(stop_words)
word_tokens = word_tokenize(text)
filtered_sentence = [w for w in word_tokens if w.lower() not in stop_words]
print(filtered_sentence)

['Text', 'analytics', ',', 'also', 'known', 'text', 'mining', ',', 'process', 'deriving', 'meaningful', 'insights', 'patterns', 'unstructured', 'text', 'data', '.', 'rise', 'internet', 'social', 'media', ',', 'amount', 'text', 'data', 'generated', 'daily', 'skyrocketed', ',', 'making', 'text', 'analytics', 'essential', 'tool', 'businesses', 'organizations', 'seeking', 'extract', 'insights', 'data.In', 'essay', ',', 'explore', 'different', 'techniques', 'used', 'text', 'analytics', ',', 'benefits', 'text', 'analytics', ',', 'challenges', 'come', 'implementing', 'text', 'analytics.Text', 'analytics', 'broadly', 'divided', 'three', 'main', 'techniques', ':', 'text', 'classification', ',', 'sentiment', 'analysis', ',', 'topic', 'modeling.Text', 'classification', 'involves', 'categorizing', 'text', 'data', 'predefined', 'categories', '.', 'useful', 'automating', 'tasks', 'spam', 'detection', ',', 'content', 'filtering', ',', 'customer', 'feedback', 'analysis', '.', 'example', ',', 'company'

In [70]:
pos_tagged = nltk.pos_tag(word_tokens)
print(pos_tagged)

[('Text', 'NN'), ('analytics', 'NNS'), (',', ','), ('also', 'RB'), ('known', 'VBN'), ('as', 'IN'), ('text', 'NN'), ('mining', 'NN'), (',', ','), ('is', 'VBZ'), ('the', 'DT'), ('process', 'NN'), ('of', 'IN'), ('deriving', 'VBG'), ('meaningful', 'JJ'), ('insights', 'NNS'), ('and', 'CC'), ('patterns', 'NNS'), ('from', 'IN'), ('unstructured', 'JJ'), ('text', 'NN'), ('data', 'NNS'), ('.', '.'), ('With', 'IN'), ('the', 'DT'), ('rise', 'NN'), ('of', 'IN'), ('the', 'DT'), ('internet', 'NN'), ('and', 'CC'), ('social', 'JJ'), ('media', 'NNS'), (',', ','), ('the', 'DT'), ('amount', 'NN'), ('of', 'IN'), ('text', 'NN'), ('data', 'NNS'), ('generated', 'VBD'), ('daily', 'RB'), ('has', 'VBZ'), ('skyrocketed', 'VBN'), (',', ','), ('making', 'VBG'), ('text', 'JJ'), ('analytics', 'NNS'), ('an', 'DT'), ('essential', 'JJ'), ('tool', 'NN'), ('for', 'IN'), ('businesses', 'NNS'), ('and', 'CC'), ('organizations', 'NNS'), ('seeking', 'VBG'), ('to', 'TO'), ('extract', 'VB'), ('insights', 'NNS'), ('from', 'IN'), 

In [71]:
ps = PorterStemmer()
stemmed = []
for w in filtered_sentence:
    stemmed.append(ps.stem(w))
    print(w, " : ", ps.stem(w))

Text  :  text
analytics  :  analyt
,  :  ,
also  :  also
known  :  known
text  :  text
mining  :  mine
,  :  ,
process  :  process
deriving  :  deriv
meaningful  :  meaning
insights  :  insight
patterns  :  pattern
unstructured  :  unstructur
text  :  text
data  :  data
.  :  .
rise  :  rise
internet  :  internet
social  :  social
media  :  media
,  :  ,
amount  :  amount
text  :  text
data  :  data
generated  :  gener
daily  :  daili
skyrocketed  :  skyrocket
,  :  ,
making  :  make
text  :  text
analytics  :  analyt
essential  :  essenti
tool  :  tool
businesses  :  busi
organizations  :  organ
seeking  :  seek
extract  :  extract
insights  :  insight
data.In  :  data.in
essay  :  essay
,  :  ,
explore  :  explor
different  :  differ
techniques  :  techniqu
used  :  use
text  :  text
analytics  :  analyt
,  :  ,
benefits  :  benefit
text  :  text
analytics  :  analyt
,  :  ,
challenges  :  challeng
come  :  come
implementing  :  implement
text  :  text
analytics.Text  :  analytics.te

In [72]:
lemmatizer = WordNetLemmatizer()
lemmatized1 = []
for w in filtered_sentence:
    lemmatized1.append(lemmatizer.lemmatize(w,pos="a"))
    print(w, " : ", lemmatizer.lemmatize(w,pos="a"))

Text  :  Text
analytics  :  analytics
,  :  ,
also  :  also
known  :  known
text  :  text
mining  :  mining
,  :  ,
process  :  process
deriving  :  deriving
meaningful  :  meaningful
insights  :  insights
patterns  :  patterns
unstructured  :  unstructured
text  :  text
data  :  data
.  :  .
rise  :  rise
internet  :  internet
social  :  social
media  :  media
,  :  ,
amount  :  amount
text  :  text
data  :  data
generated  :  generated
daily  :  daily
skyrocketed  :  skyrocketed
,  :  ,
making  :  making
text  :  text
analytics  :  analytics
essential  :  essential
tool  :  tool
businesses  :  businesses
organizations  :  organizations
seeking  :  seeking
extract  :  extract
insights  :  insights
data.In  :  data.In
essay  :  essay
,  :  ,
explore  :  explore
different  :  different
techniques  :  techniques
used  :  used
text  :  text
analytics  :  analytics
,  :  ,
benefits  :  benefits
text  :  text
analytics  :  analytics
,  :  ,
challenges  :  challenges
come  :  come
implementi

In [73]:
# This is done because Normal Lemmatization didn't provide any changes
#therefore changing the Penn Treebank POS tags to WordNet POS tags (POS = part-of-speech) 
# Initialize the WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Function to convert Penn Treebank POS tags to WordNet POS tags
def penn_to_wordnet_pos(tag):
    if tag.startswith('J'):
        return 'a'  # Adjective
    elif tag.startswith('V'):
        return 'v'  # Verb
    elif tag.startswith('N'):
        return 'n'  # Noun
    elif tag.startswith('R'):
        return 'r'  # Adverb
    else:
        return None

pos_tagged = nltk.pos_tag(filtered_sentence)
lemmatized = []
for word, tag in pos_tagged:
    wn_tag = penn_to_wordnet_pos(tag)
    if wn_tag is not None:
        lemma = lemmatizer.lemmatize(word, pos=wn_tag)
    else:
        lemma = lemmatizer.lemmatize(word)
    lemmatized.append(lemma)
    print(word, " : ", lemma) 

Text  :  Text
analytics  :  analytics
,  :  ,
also  :  also
known  :  know
text  :  text
mining  :  mining
,  :  ,
process  :  process
deriving  :  derive
meaningful  :  meaningful
insights  :  insight
patterns  :  pattern
unstructured  :  unstructured
text  :  text
data  :  data
.  :  .
rise  :  rise
internet  :  internet
social  :  social
media  :  medium
,  :  ,
amount  :  amount
text  :  text
data  :  data
generated  :  generate
daily  :  daily
skyrocketed  :  skyrocket
,  :  ,
making  :  make
text  :  text
analytics  :  analytics
essential  :  essential
tool  :  tool
businesses  :  business
organizations  :  organization
seeking  :  seek
extract  :  extract
insights  :  insight
data.In  :  data.In
essay  :  essay
,  :  ,
explore  :  explore
different  :  different
techniques  :  technique
used  :  use
text  :  text
analytics  :  analytics
,  :  ,
benefits  :  benefit
text  :  text
analytics  :  analytics
,  :  ,
challenges  :  challenge
come  :  come
implementing  :  implement
tex

In [74]:
# TF for lemmatized
# TF(t,d) = (Number of times term t appears in document d) / (Total number of terms in document d)

total_frequency = len(lemmatized)
freq_dict = {}

for word in lemmatized:
    if word in freq_dict:
        freq_dict[word] += 1
    else:
        freq_dict[word] = 1

# Normal Count
# print(freq_dict)

#TF
for key, value in freq_dict.items():
    value = float(value) / float(total_frequency)
    freq_dict[key] = value

print(freq_dict)


{'Text': 0.0025974025974025974, 'analytics': 0.03636363636363636, ',': 0.0987012987012987, 'also': 0.007792207792207792, 'know': 0.005194805194805195, 'text': 0.05974025974025974, 'mining': 0.005194805194805195, 'process': 0.005194805194805195, 'derive': 0.007792207792207792, 'meaningful': 0.0025974025974025974, 'insight': 0.012987012987012988, 'pattern': 0.0025974025974025974, 'unstructured': 0.0025974025974025974, 'data': 0.023376623376623377, '.': 0.04415584415584416, 'rise': 0.0025974025974025974, 'internet': 0.0025974025974025974, 'social': 0.012987012987012988, 'medium': 0.012987012987012988, 'amount': 0.007792207792207792, 'generate': 0.005194805194805195, 'daily': 0.0025974025974025974, 'skyrocket': 0.0025974025974025974, 'make': 0.007792207792207792, 'essential': 0.0025974025974025974, 'tool': 0.005194805194805195, 'business': 0.015584415584415584, 'organization': 0.012987012987012988, 'seek': 0.005194805194805195, 'extract': 0.0025974025974025974, 'data.In': 0.002597402597402

In [75]:
# IDF for lemmatized
# IDF(t)=log((Total number of documents in the corpus)/ (Number of documents containing term t))
length = len(lemmatized)
substring_length = length // 3
substring1 = lemmatized[:substring_length]
substring2 = lemmatized[substring_length:2*substring_length]
substring3 = lemmatized[2*substring_length:]
idf_dict = {}
N = 3
for word in set(lemmatized):
    count = 0
    if word in substring1:
        count += 1
    if word in substring2:
        count += 1
    if word in substring3:
        count += 1
    tp = float(N / count)
    ans = math.log10(tp)
    idf_dict[word] = ans
print(idf_dict)

{'quality': 0.47712125471966244, 'concern': 0.47712125471966244, 'Text': 0.47712125471966244, 'platform': 0.47712125471966244, 'trend': 0.47712125471966244, 'modeling.Text': 0.47712125471966244, 'analyze': 0.17609125905568124, 'pattern': 0.47712125471966244, 'seek': 0.17609125905568124, 'common': 0.47712125471966244, 'keywords': 0.47712125471966244, 'know': 0.47712125471966244, 'determine': 0.47712125471966244, 'analysis': 0.0, 'involve': 0.47712125471966244, 'insight': 0.0, 'continued': 0.47712125471966244, 'leverage': 0.47712125471966244, 'popular': 0.47712125471966244, 'decision': 0.47712125471966244, 'take': 0.17609125905568124, 'sentiment': 0.17609125905568124, 'amount': 0.0, 'different': 0.47712125471966244, 'brand': 0.47712125471966244, 'sense': 0.47712125471966244, 'resource': 0.47712125471966244, 'reputation': 0.47712125471966244, 'medium': 0.0, 'launch.Topic': 0.47712125471966244, 'team': 0.47712125471966244, 'derive': 0.17609125905568124, 'phrase': 0.47712125471966244, 'news

In [76]:
# Calculate TF-IDF
# TF-IDF(t,d)=TF(t,d)×IDF(t)

tfidf_dict = {}

for word in set(lemmatized):
    tf = freq_dict[word]
    idf = idf_dict[word]
    tfidf = tf * idf
    tfidf_dict[word] = tfidf

print(tfidf_dict)

{'quality': 0.0012392759862848374, 'concern': 0.0012392759862848374, 'Text': 0.0012392759862848374, 'platform': 0.0012392759862848374, 'trend': 0.0012392759862848374, 'modeling.Text': 0.0012392759862848374, 'analyze': 0.0013721396809533602, 'pattern': 0.0012392759862848374, 'seek': 0.0009147597873022402, 'common': 0.0012392759862848374, 'keywords': 0.0012392759862848374, 'know': 0.002478551972569675, 'determine': 0.0012392759862848374, 'analysis': 0.0, 'involve': 0.002478551972569675, 'insight': 0.0, 'continued': 0.0012392759862848374, 'leverage': 0.0012392759862848374, 'popular': 0.0012392759862848374, 'decision': 0.0012392759862848374, 'take': 0.0009147597873022402, 'sentiment': 0.0018295195746044805, 'amount': 0.0, 'different': 0.0012392759862848374, 'brand': 0.0012392759862848374, 'sense': 0.0012392759862848374, 'resource': 0.0012392759862848374, 'reputation': 0.0012392759862848374, 'medium': 0.0, 'launch.Topic': 0.0012392759862848374, 'team': 0.0012392759862848374, 'derive': 0.001