Perform the following text mining preprocessing steps on a text document: <br>
a. Stop Word Removal <br>
b. Stemming <br>
c. Removal of punctuation marks <br>
d. Compute the inverse document frequency of the words in the document 

In [124]:
import nltk
import string
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
#nltk.download('all')

In [125]:
# Sample document used throughout the preprocessing steps. Adjacent string
# literals are concatenated by Python, so this is a single string.
text = (
    "Google is a technology company best no issue too good known most "
    "beautiful for its search engine, which allows users to search the "
    "internet for information, images, news, and other resources. "
    "Founded in 1998 by Larry Page and Sergey Brin, Google has since "
    "expanded into a vast array of services and products."
)

## Removal of stop words

In [126]:
# Load the English stop-word list from the NLTK stopwords corpus.
stop_words = stopwords.words("english")
print(
    "stop words, length: ",
    stop_words[:10],  # first ten entries as a preview
    len(stop_words),  # total number of stop words
)

stop words, length:  ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"] 179


In [127]:
# Split the raw text into word and punctuation tokens.
tokanized_text = word_tokenize(text)
# A set gives O(1) membership tests; the list would be scanned for every token.
stop_word_set = set(stop_words)
# NLTK's English stop words are lowercase, so compare on the lowercased token;
# otherwise capitalised stop words (e.g. a sentence-initial "The") slip through.
# The token itself is kept in its original case.
filtered_text = [word for word in tokanized_text if word.lower() not in stop_word_set]
print(filtered_text)

['Google', 'technology', 'company', 'best', 'issue', 'good', 'known', 'beautiful', 'search', 'engine', ',', 'allows', 'users', 'search', 'internet', 'information', ',', 'images', ',', 'news', ',', 'resources', '.', 'Founded', '1998', 'Larry', 'Page', 'Sergey', 'Brin', ',', 'Google', 'since', 'expanded', 'vast', 'array', 'services', 'products', '.']


## Apply stemming
- stemming: consolidating variations of the same word to reduce dimensionality and improve recall.

In [128]:
# Reduce every remaining token to its Porter stem.
stemmer = PorterStemmer()
stemmed_text = list(map(stemmer.stem, filtered_text))
# Print the token list before and after stemming for comparison.
print("without stemming: ",filtered_text,"\nwith Stemming: ",stemmed_text)

without stemming:  ['Google', 'technology', 'company', 'best', 'issue', 'good', 'known', 'beautiful', 'search', 'engine', ',', 'allows', 'users', 'search', 'internet', 'information', ',', 'images', ',', 'news', ',', 'resources', '.', 'Founded', '1998', 'Larry', 'Page', 'Sergey', 'Brin', ',', 'Google', 'since', 'expanded', 'vast', 'array', 'services', 'products', '.'] 
with Stemming:  ['googl', 'technolog', 'compani', 'best', 'issu', 'good', 'known', 'beauti', 'search', 'engin', ',', 'allow', 'user', 'search', 'internet', 'inform', ',', 'imag', ',', 'news', ',', 'resourc', '.', 'found', '1998', 'larri', 'page', 'sergey', 'brin', ',', 'googl', 'sinc', 'expand', 'vast', 'array', 'servic', 'product', '.']


## Remove punctuation

In [129]:
# Translation table that deletes every character in string.punctuation.
translator = str.maketrans("", "", string.punctuation)
# Strip punctuation from each token once, then drop tokens that end up empty
# (tokens that consisted solely of punctuation characters).
filtered_text = [tok for tok in (w.translate(translator) for w in filtered_text) if tok]
stemmed_text = [tok for tok in (w.translate(translator) for w in stemmed_text) if tok]


## Computing inverse document frequency of words in the document
- *IDF*: in this process, words with high frequency are weighted less <br>
    - because higher-frequency words create noise in the TF (term frequency) matrix

In [130]:
# NOTE(review): stemmed_text is a list of single tokens, and TfidfVectorizer
# treats each element of its input as a separate document. The "document
# frequency" below is therefore effectively the number of times a token occurs
# in the list -- confirm this is the intended notion of IDF for one document.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(stemmed_text)
# Pair every vocabulary term with its learned IDF weight.
idf_scores = {
    term: weight
    for term, weight in zip(vectorizer.get_feature_names_out(), vectorizer.idf_)
}

# Display IDF scores
for word, score in idf_scores.items():
    print(f"{word}: {score}")


1998: 3.772588722239781
allow: 3.772588722239781
array: 3.772588722239781
beauti: 3.772588722239781
best: 3.772588722239781
brin: 3.772588722239781
compani: 3.772588722239781
engin: 3.772588722239781
expand: 3.772588722239781
found: 3.772588722239781
good: 3.772588722239781
googl: 3.367123614131617
imag: 3.772588722239781
inform: 3.772588722239781
internet: 3.772588722239781
issu: 3.772588722239781
known: 3.772588722239781
larri: 3.772588722239781
news: 3.772588722239781
page: 3.772588722239781
product: 3.772588722239781
resourc: 3.772588722239781
search: 3.367123614131617
sergey: 3.772588722239781
servic: 3.772588722239781
sinc: 3.772588722239781
technolog: 3.772588722239781
user: 3.772588722239781
vast: 3.772588722239781
