# Text Analytics
1. Extract Sample document and apply following document preprocessing
methods: Tokenization, POS Tagging, stop words removal, Stemming and
Lemmatization.
2. Create representation of document by calculating Term Frequency and Inverse
Document Frequency.

In [1]:
# Downloads required

# for tokenization
# nltk.download('punkt')

# for POS(Parts of speech) tagging
# nltk.download('averaged_perceptron_tagger')

# for stop words
# nltk.download('stopwords')

# for lemmatization
# nltk.download('wordnet')

In [2]:
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer

In [3]:
x = open("testdoc.txt").read()

In [4]:
x

'Millions of people in India took part in an annual tree planting drive Sunday. More than 250 million saplings were planted in a single day across the country\'s most-populous state.\nThe campaign was led by Uttar Pradesh state government officials, lawmakers, and activists, in a bid to reduce carbon emissions and combat climate change.\nWhere were the trees planted?\nThe saplings were planted by volunteers in forests, farms, schools, and along riverbanks and highways.\n"We are committed to increasing the forest cover of Uttar Pradesh to over 15% of the total land area in the next five years,\'\' said state forest official Manoj Singh.\nAccording to another government official, the forest cover of the state has increased over the last few years.\n"There has been an increase of 127 sqare kilometers [79 sqare miles]Â\xa0in the forest cover in Uttar Pradesh as compared to 2017," a state government spokesperson was quoted as saying inÂ\xa0The Indian Express newspaper.\n"There has also been

# Tokenization

In [5]:
tokens = word_tokenize(x)
print(tokens)

['Millions', 'of', 'people', 'in', 'India', 'took', 'part', 'in', 'an', 'annual', 'tree', 'planting', 'drive', 'Sunday', '.', 'More', 'than', '250', 'million', 'saplings', 'were', 'planted', 'in', 'a', 'single', 'day', 'across', 'the', 'country', "'s", 'most-populous', 'state', '.', 'The', 'campaign', 'was', 'led', 'by', 'Uttar', 'Pradesh', 'state', 'government', 'officials', ',', 'lawmakers', ',', 'and', 'activists', ',', 'in', 'a', 'bid', 'to', 'reduce', 'carbon', 'emissions', 'and', 'combat', 'climate', 'change', '.', 'Where', 'were', 'the', 'trees', 'planted', '?', 'The', 'saplings', 'were', 'planted', 'by', 'volunteers', 'in', 'forests', ',', 'farms', ',', 'schools', ',', 'and', 'along', 'riverbanks', 'and', 'highways', '.', '``', 'We', 'are', 'committed', 'to', 'increasing', 'the', 'forest', 'cover', 'of', 'Uttar', 'Pradesh', 'to', 'over', '15', '%', 'of', 'the', 'total', 'land', 'area', 'in', 'the', 'next', 'five', 'years', ',', "''", 'said', 'state', 'forest', 'official', 'Mano

# POS Tagging

In [6]:
postags = pos_tag(tokens)
print(postags)

[('Millions', 'NNS'), ('of', 'IN'), ('people', 'NNS'), ('in', 'IN'), ('India', 'NNP'), ('took', 'VBD'), ('part', 'NN'), ('in', 'IN'), ('an', 'DT'), ('annual', 'JJ'), ('tree', 'NN'), ('planting', 'VBG'), ('drive', 'JJ'), ('Sunday', 'NNP'), ('.', '.'), ('More', 'JJR'), ('than', 'IN'), ('250', 'CD'), ('million', 'CD'), ('saplings', 'NNS'), ('were', 'VBD'), ('planted', 'VBN'), ('in', 'IN'), ('a', 'DT'), ('single', 'JJ'), ('day', 'NN'), ('across', 'IN'), ('the', 'DT'), ('country', 'NN'), ("'s", 'POS'), ('most-populous', 'JJ'), ('state', 'NN'), ('.', '.'), ('The', 'DT'), ('campaign', 'NN'), ('was', 'VBD'), ('led', 'VBN'), ('by', 'IN'), ('Uttar', 'NNP'), ('Pradesh', 'NNP'), ('state', 'NN'), ('government', 'NN'), ('officials', 'NNS'), (',', ','), ('lawmakers', 'NNS'), (',', ','), ('and', 'CC'), ('activists', 'NNS'), (',', ','), ('in', 'IN'), ('a', 'DT'), ('bid', 'NN'), ('to', 'TO'), ('reduce', 'VB'), ('carbon', 'NN'), ('emissions', 'NNS'), ('and', 'CC'), ('combat', 'NN'), ('climate', 'NN'), ('

# Removing stop words

In [7]:
stop_words = set(stopwords.words('english'))
print(stop_words)

{"should've", 'between', 'don', 'those', 'o', "don't", 'herself', 'with', 'were', 'down', 'theirs', 'ain', "isn't", 'should', 'when', 'for', 'y', 'most', 'myself', 'further', 'there', 'few', 'haven', 'because', 'isn', 've', 'who', 'very', "wasn't", 'their', 'had', 'over', 'yourselves', 'he', 'of', 'from', 'only', 'same', "weren't", "that'll", 'it', 'some', 'too', 'while', "you'll", 're', "haven't", 'shouldn', 'them', "didn't", 'we', 'hers', "she's", 'any', 'more', "hadn't", 't', 'shan', 's', 'll', 'these', 'have', 'is', "it's", 'm', 'doing', "you're", 'until', "shan't", 'why', 'all', 'its', 'themselves', 'aren', "you've", 'me', 'does', 'a', 'ma', 'has', 'about', "hasn't", 'having', "needn't", 'she', 'in', 'against', 'whom', "shouldn't", "won't", 'how', 'd', 'not', 'that', 'himself', 'so', 'below', 'own', 'her', 'through', 'yourself', 'will', 'be', 'they', 'doesn', 'during', 'i', 'as', 'which', 'or', 'this', 'than', "mightn't", 'weren', 'am', 'where', 'hadn', 'then', 'what', 'being', 'o

In [8]:
li = []
for words in tokens:
    if words not in stop_words:
        li.append(words)
print(li)

['Millions', 'people', 'India', 'took', 'part', 'annual', 'tree', 'planting', 'drive', 'Sunday', '.', 'More', '250', 'million', 'saplings', 'planted', 'single', 'day', 'across', 'country', "'s", 'most-populous', 'state', '.', 'The', 'campaign', 'led', 'Uttar', 'Pradesh', 'state', 'government', 'officials', ',', 'lawmakers', ',', 'activists', ',', 'bid', 'reduce', 'carbon', 'emissions', 'combat', 'climate', 'change', '.', 'Where', 'trees', 'planted', '?', 'The', 'saplings', 'planted', 'volunteers', 'forests', ',', 'farms', ',', 'schools', ',', 'along', 'riverbanks', 'highways', '.', '``', 'We', 'committed', 'increasing', 'forest', 'cover', 'Uttar', 'Pradesh', '15', '%', 'total', 'land', 'area', 'next', 'five', 'years', ',', "''", 'said', 'state', 'forest', 'official', 'Manoj', 'Singh', '.', 'According', 'another', 'government', 'official', ',', 'forest', 'cover', 'state', 'increased', 'last', 'years', '.', '``', 'There', 'increase', '127', 'sqare', 'kilometers', '[', '79', 'sqare', 'mil

# Stemming

In [9]:
ps = PorterStemmer()
stemlist = []
for words in li:
    stemlist.append([words, ps.stem(words)])
print(stemlist)

[['Millions', 'million'], ['people', 'peopl'], ['India', 'india'], ['took', 'took'], ['part', 'part'], ['annual', 'annual'], ['tree', 'tree'], ['planting', 'plant'], ['drive', 'drive'], ['Sunday', 'sunday'], ['.', '.'], ['More', 'more'], ['250', '250'], ['million', 'million'], ['saplings', 'sapl'], ['planted', 'plant'], ['single', 'singl'], ['day', 'day'], ['across', 'across'], ['country', 'countri'], ["'s", "'s"], ['most-populous', 'most-popul'], ['state', 'state'], ['.', '.'], ['The', 'the'], ['campaign', 'campaign'], ['led', 'led'], ['Uttar', 'uttar'], ['Pradesh', 'pradesh'], ['state', 'state'], ['government', 'govern'], ['officials', 'offici'], [',', ','], ['lawmakers', 'lawmak'], [',', ','], ['activists', 'activist'], [',', ','], ['bid', 'bid'], ['reduce', 'reduc'], ['carbon', 'carbon'], ['emissions', 'emiss'], ['combat', 'combat'], ['climate', 'climat'], ['change', 'chang'], ['.', '.'], ['Where', 'where'], ['trees', 'tree'], ['planted', 'plant'], ['?', '?'], ['The', 'the'], ['sap

# Lemmatization

In [10]:
wl = WordNetLemmatizer()
lemilist = []
for words in li:
    lemilist.append([words, wl.lemmatize(words)])
print(lemilist)

[['Millions', 'Millions'], ['people', 'people'], ['India', 'India'], ['took', 'took'], ['part', 'part'], ['annual', 'annual'], ['tree', 'tree'], ['planting', 'planting'], ['drive', 'drive'], ['Sunday', 'Sunday'], ['.', '.'], ['More', 'More'], ['250', '250'], ['million', 'million'], ['saplings', 'sapling'], ['planted', 'planted'], ['single', 'single'], ['day', 'day'], ['across', 'across'], ['country', 'country'], ["'s", "'s"], ['most-populous', 'most-populous'], ['state', 'state'], ['.', '.'], ['The', 'The'], ['campaign', 'campaign'], ['led', 'led'], ['Uttar', 'Uttar'], ['Pradesh', 'Pradesh'], ['state', 'state'], ['government', 'government'], ['officials', 'official'], [',', ','], ['lawmakers', 'lawmaker'], [',', ','], ['activists', 'activist'], [',', ','], ['bid', 'bid'], ['reduce', 'reduce'], ['carbon', 'carbon'], ['emissions', 'emission'], ['combat', 'combat'], ['climate', 'climate'], ['change', 'change'], ['.', '.'], ['Where', 'Where'], ['trees', 'tree'], ['planted', 'planted'], ['?

# Word frequency

In [11]:
fre = dict()
for words in li:
    if words in fre:
        fre[words] += 1
    else:
        fre[words] = 1
print(fre)

{'Millions': 1, 'people': 2, 'India': 6, 'took': 2, 'part': 2, 'annual': 3, 'tree': 5, 'planting': 3, 'drive': 3, 'Sunday': 2, '.': 27, 'More': 2, '250': 2, 'million': 3, 'saplings': 11, 'planted': 7, 'single': 2, 'day': 2, 'across': 3, 'country': 3, "'s": 3, 'most-populous': 2, 'state': 10, 'The': 11, 'campaign': 2, 'led': 2, 'Uttar': 7, 'Pradesh': 7, 'government': 7, 'officials': 3, ',': 32, 'lawmakers': 2, 'activists': 2, 'bid': 2, 'reduce': 2, 'carbon': 2, 'emissions': 2, 'combat': 2, 'climate': 2, 'change': 2, 'Where': 2, 'trees': 4, '?': 5, 'volunteers': 2, 'forests': 2, 'farms': 2, 'schools': 2, 'along': 2, 'riverbanks': 2, 'highways': 2, '``': 8, 'We': 2, 'committed': 2, 'increasing': 2, 'forest': 9, 'cover': 9, '15': 2, '%': 8, 'total': 3, 'land': 4, 'area': 3, 'next': 2, 'five': 2, 'years': 4, "''": 8, 'said': 7, 'official': 6, 'Manoj': 2, 'Singh': 3, 'According': 2, 'another': 2, 'increased': 4, 'last': 3, 'There': 4, 'increase': 4, '127': 2, 'sqare': 4, 'kilometers': 2, '['

# Calculate TF for each word in li

In [12]:
total_words = len(li)
tf = {}
for word in li:
    if word in tf:
        tf[word] += 1 / total_words
    else:
        tf[word] = 1 / total_words

print(tf)

{'Millions': 0.002136752136752137, 'people': 0.004273504273504274, 'India': 0.01282051282051282, 'took': 0.004273504273504274, 'part': 0.004273504273504274, 'annual': 0.006410256410256411, 'tree': 0.010683760683760684, 'planting': 0.006410256410256411, 'drive': 0.006410256410256411, 'Sunday': 0.004273504273504274, '.': 0.05769230769230768, 'More': 0.004273504273504274, '250': 0.004273504273504274, 'million': 0.006410256410256411, 'saplings': 0.0235042735042735, 'planted': 0.014957264957264956, 'single': 0.004273504273504274, 'day': 0.004273504273504274, 'across': 0.006410256410256411, 'country': 0.006410256410256411, "'s": 0.006410256410256411, 'most-populous': 0.004273504273504274, 'state': 0.021367521367521364, 'The': 0.0235042735042735, 'campaign': 0.004273504273504274, 'led': 0.004273504273504274, 'Uttar': 0.014957264957264956, 'Pradesh': 0.014957264957264956, 'government': 0.014957264957264956, 'officials': 0.006410256410256411, ',': 0.06837606837606838, 'lawmakers': 0.00427350427

# Calculate IDF for each word in li

In [13]:
import math

idf = {}
num_docs = len(x)
for word in li:
    count = sum(1 for doc in x if word in doc)
    idf[word] = math.log(num_docs / count) if count > 0 else 0

print(idf)

{'Millions': 0, 'people': 0, 'India': 0, 'took': 0, 'part': 0, 'annual': 0, 'tree': 0, 'planting': 0, 'drive': 0, 'Sunday': 0, '.': 4.6481716219812785, 'More': 0, '250': 0, 'million': 0, 'saplings': 0, 'planted': 0, 'single': 0, 'day': 0, 'across': 0, 'country': 0, "'s": 0, 'most-populous': 0, 'state': 0, 'The': 0, 'campaign': 0, 'led': 0, 'Uttar': 0, 'Pradesh': 0, 'government': 0, 'officials': 0, ',': 4.678943280648032, 'lawmakers': 0, 'activists': 0, 'bid': 0, 'reduce': 0, 'carbon': 0, 'emissions': 0, 'combat': 0, 'climate': 0, 'change': 0, 'Where': 0, 'trees': 0, '?': 6.535241271013659, 'volunteers': 0, 'forests': 0, 'farms': 0, 'schools': 0, 'along': 0, 'riverbanks': 0, 'highways': 0, '``': 0, 'We': 0, 'committed': 0, 'increasing': 0, 'forest': 0, 'cover': 0, '15': 0, '%': 6.065237641767923, 'total': 0, 'land': 0, 'area': 0, 'next': 0, 'five': 0, 'years': 0, "''": 0, 'said': 0, 'official': 0, 'Manoj': 0, 'Singh': 0, 'According': 0, 'another': 0, 'increased': 0, 'last': 0, 'There': 