## Text analysis: 
### 1. Extract a sample document and apply the following preprocessing methods: tokenization, POS tagging, stop-word removal, stemming, and lemmatization.
### 2. Create a representation of the document by calculating Term Frequency and Inverse Document Frequency (TF-IDF).

In [1]:
# Sample document: the single sentence the preprocessing cells below operate on.
sample_sentence = 'Wow, it is such a beautiful day!'

### TOKENISATION

In [2]:
import nltk
from nltk import word_tokenize, sent_tokenize

# Fetch the POS-tagger model and the Punkt tokenizer tables, in that order.
for resource in ("averaged_perceptron_tagger_eng", "punkt_tab"):
    nltk.download(resource)

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\athar\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\athar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [3]:
# Split the document into sentences, then split each sentence into word tokens
# (the result is one token list per sentence).
sentences = sent_tokenize(sample_sentence)
tokenized_words = list(map(word_tokenize, sentences))

print('sentence words: ', sentences)
print('tokenized words: ', tokenized_words)

sentence words:  ['Wow, it is such a beautiful day!']
tokenized words:  [['Wow', ',', 'it', 'is', 'such', 'a', 'beautiful', 'day', '!']]


### STOPWORDS REMOVAL

In [4]:
from nltk.corpus import stopwords
import nltk

In [5]:
# Fetch NLTK's stop-word lists (NLTK reports "already up-to-date" when they are
# present). The call is the cell's last expression, so its return value is shown
# as the cell output.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\athar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
# English stop words, held in a set for fast membership checks.
stop_words = set(stopwords.words('english'))

# tokenized_words may contain token lists (one per sentence) or bare tokens;
# either way, collapse it into one flat list of tokens.
flat_tokens = [
    token
    for entry in tokenized_words
    for token in (entry if isinstance(entry, list) else [entry])
]

# Keep only the tokens whose lowercase form is not a stop word.
filtered_tokens = [token for token in flat_tokens if token.lower() not in stop_words]
print("Filtered tokens:", filtered_tokens)

Filtered tokens: ['Wow', ',', 'beautiful', 'day', '!']


### POS tagging

In [7]:
from nltk import pos_tag

# Tokenize the whole sentence, stop words included, as input for the tagger.
token_words = nltk.word_tokenize(sample_sentence)
token_words

['Wow', ',', 'it', 'is', 'such', 'a', 'beautiful', 'day', '!']

In [8]:
# Pair every token with its part-of-speech tag.
pos_tags = nltk.pos_tag(token_words)
print("tagging parts of speech: ", pos_tags)

tagging parts of speech:  [('Wow', 'NNP'), (',', ','), ('it', 'PRP'), ('is', 'VBZ'), ('such', 'JJ'), ('a', 'DT'), ('beautiful', 'JJ'), ('day', 'NN'), ('!', '.')]


### Stemming

In [9]:
from nltk.stem import PorterStemmer

# Reduce each stop-word-filtered token to its Porter stem.
stemmer = PorterStemmer()
stemmed_tokens = list(map(stemmer.stem, filtered_tokens))
print("Stemmed tokens: ", stemmed_tokens)

Stemmed tokens:  ['wow', ',', 'beauti', 'day', '!']


### Lemmatization

In [10]:
from nltk.stem import WordNetLemmatizer

# Download the NLTK data needed for lemmatization (a one-time download; NLTK
# reports "already up-to-date" on later runs).
nltk.download('wordnet')
# Kept as the last expression, so this call's return value is the cell output.
nltk.download('omw-1.4')  # Open Multilingual WordNet; required for some languages

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\athar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\athar\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [11]:
def _wordnet_pos(treebank_tag):
    """Map a Penn Treebank tag to the WordNet POS code used by the lemmatizer.

    Tags starting with J (adjective), V (verb) and R (adverb) map to 'a', 'v'
    and 'r'. Every other tag, including punctuation and the empty string,
    falls back to 'n', which is also the lemmatizer's own default.
    """
    return {'J': 'a', 'V': 'v', 'R': 'r'}.get(treebank_tag[:1], 'n')


# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# lemmatize() treats every word as a noun unless a POS is given, which leaves
# verbs and adjectives (e.g. "running", "better") unreduced. Tag the filtered
# tokens and pass the matching WordNet POS so those forms are lemmatized too.
lemmatized_tokens = [
    lemmatizer.lemmatize(token, pos=_wordnet_pos(tag))
    for token, tag in pos_tag(filtered_tokens)
]
print("Lemmatized tokens:", lemmatized_tokens)

Lemmatized tokens: ['Wow', ',', 'beautiful', 'day', '!']


### Calculating Term Frequency and Inverse Document Frequency (TF-IDF)

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Join the lemmatized tokens back into one space-separated string for the vectorizer.
preprocessed_text = " ".join(lemmatized_tokens)

In [13]:
# Fit the vectorizer on a one-document corpus and transform that document.
# NOTE: with a single document every term gets the same IDF weight, so the
# resulting row is effectively the L2-normalized term counts.
tfidf_vectorizer = TfidfVectorizer()
corpus = [preprocessed_text]
tfidf_representation = tfidf_vectorizer.fit_transform(corpus)

print("preprocessed text: ", preprocessed_text)
print("\n TFIDF representation: \n", tfidf_representation.toarray())

preprocessed text:  Wow , beautiful day !

 TFIDF representation: 
 [[0.57735027 0.57735027 0.57735027]]
