In [None]:
import math
from collections import Counter

import nltk
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize

In [None]:
# Tokenizer models used by word_tokenize / sent_tokenize. Recent NLTK releases
# (3.9+) load 'punkt_tab' instead of the pickled 'punkt' models, so both are
# requested to keep the notebook working on old and new versions alike.
nltk.download('punkt')
nltk.download('punkt_tab')
# Stop-word lists read through stopwords.words('english').
nltk.download('stopwords')
# Tagger model used by pos_tag; recent NLTK releases look for the '_eng' variant.
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')
# WordNet data used by WordNetLemmatizer.
nltk.download('wordnet')

In [None]:
# Read the whole sample text file into `doc`; the cells below tokenize it.
with open("datasets/SampleText.txt", "r", encoding="utf-8") as file:
    doc = file.read()

In [None]:
# Show the raw text exactly as it was read from the file.
print(doc)

In [None]:
"""
Tokenization - breaking a text or document into smaller units called tokens.
Depending on the tokenizer, a token can be a word, a phrase or a symbol.
word_tokenize() : splits text into word-level tokens
sent_tokenize() : splits a document or paragraph into sentences
"""
word_tokens, sent_tokens = word_tokenize(doc), sent_tokenize(doc)
# One print call with a newline separator emits the same two lines as two prints.
print(word_tokens, sent_tokens, sep="\n")

In [None]:
"""
POS Tagging - assigning a part-of-speech tag (e.g. noun, verb, adjective)
to each token in a sentence. The tags help in understanding the grammatical
structure of the sentence.
"""
# Tag the word-level tokens produced by the tokenization cell.
tags = pos_tag(word_tokens)
print(tags)

In [None]:
"""
Stop Words Removal - stop words are very common words (articles, prepositions,
conjunctions and the like) that are usually filtered out of text data because
they add little to its meaning.
"""

stop_words = set(stopwords.words('english'))
# Compare in lowercase for the stop-word check, but keep each surviving
# token in its original casing.
filtered_words = [word for word in word_tokens if word.lower() not in stop_words]
# Token counts before and after filtering, one per line.
print(len(word_tokens), len(filtered_words), sep="\n")

In [None]:
"""
Stemming - reducing words to a root or base form by stripping affixes
(e.g. prefixes, suffixes), so that related word forms share one stem. This
helps in information retrieval and text analysis.
Because a stemmer simply cuts characters off a word, the resulting stem is
often not a correctly spelled or meaningful word.
"""

porter = PorterStemmer()
# Apply the Porter stemmer to every token, including punctuation tokens.
stemmed_tokens = list(map(porter.stem, word_tokens))
print(stemmed_tokens)

In [None]:
"""
Lemmatization - like stemming, it reduces words to a base form, but the base
form is the dictionary form (lemma), found using a vocabulary and
morphological analysis of the word. The result is therefore a valid word.
"""

lemmatizer = WordNetLemmatizer()
# Each token is lemmatized on its own, with the lemmatizer's default arguments.
lemmatized_tokens = list(map(lemmatizer.lemmatize, word_tokens))
print(lemmatized_tokens)

In [None]:
"""
Term Frequency - Measures how frequently a term appears in a document. TF measures the frequency of a term (word) in a 
document relative to the total number of words in that document.
TF = (freq of term in a doc / total number of terms in doc)

Inverse Document Frequency - IDF measures the rarity of a term across all documents in the corpus.
IDF = log(total number of docs / (number of docs containing the term + 1))

TF-IDF combines TF and IDF to assign a weight to each term in a document relative to its importance in the entire corpus.
"""

def get_tf(docs):
    """Compute the term frequency of every token in every document.

    Each document is tokenized with ``word_tokenize``. A token's term
    frequency is the number of times it occurs in the document divided by
    the total number of tokens in that document.

    Args:
        docs: Iterable of document strings.

    Returns:
        dict mapping ``(token, doc)`` to the term frequency of ``token`` in
        ``doc``, where ``doc`` is the full document string. Documents that
        yield no tokens contribute no entries, and identical document
        strings share the same keys.
    """
    tf = {}
    for doc in docs:
        tokens = word_tokenize(doc)
        total_terms = len(tokens)
        # Counter tallies all tokens in a single pass, instead of rescanning
        # the token list with list.count() once per distinct token.
        for token, frequency in Counter(tokens).items():
            tf[(token, doc)] = frequency / total_terms
    return tf

def get_idf(docs):
    """Compute the inverse document frequency of every token in the corpus.

    The value for a token is ``log(N / (df + 1))``, where ``N`` is the number
    of documents and ``df`` is the number of documents containing the token.
    Because of the ``+ 1`` in the denominator, a token that appears in all
    but one document gets 0 and a token that appears in every document gets
    a negative value.

    Args:
        docs: Sequence of document strings (``len(docs)`` must be supported).

    Returns:
        dict mapping each distinct token to its IDF value; empty when
        ``docs`` is empty.
    """
    # Tokenize each document exactly once; sets make the per-document
    # membership implicit and avoid repeated list scans.
    token_sets = [set(word_tokenize(doc)) for doc in docs]
    total_docs = len(docs)

    # Document frequency: in how many documents each token occurs.
    doc_freq = {}
    for token_set in token_sets:
        for token in token_set:
            doc_freq[token] = doc_freq.get(token, 0) + 1

    return {
        token: math.log(total_docs / (count + 1))
        for token, count in doc_freq.items()
    }

def get_tfidf(docs):
    """Combine term frequency and inverse document frequency into TF-IDF.

    Args:
        docs: Sequence of document strings, passed unchanged to ``get_tf``
            and ``get_idf``.

    Returns:
        dict with the same ``(token, doc)`` keys as ``get_tf(docs)``; each
        value is that key's term frequency multiplied by the token's IDF.
    """
    term_freqs = get_tf(docs)
    inverse_doc_freqs = get_idf(docs)
    # key[0] is the token part of the (token, doc) key.
    return {
        key: weight * inverse_doc_freqs[key[0]]
        for key, weight in term_freqs.items()
    }

# Three short sample documents forming the corpus for the TF-IDF demonstration.
doc1 = "Natural language processing (NLP) is a field of artificial intelligence concerned with the interaction between computers and humans in natural language. It aims to enable computers to understand, interpret, and generate human language in a way that is both meaningful and useful. NLP techniques are used in a wide range of applications, including machine translation, sentiment analysis, information extraction, and text summarization. One of the key challenges in NLP is dealing with the ambiguity and variability of natural language, which can make it difficult for computers to accurately process and understand text. However, recent advances in machine learning and deep learning have led to significant improvements in NLP performance, making it an increasingly important area of research and development."
doc2 = "Machine learning (ML) is a subset of artificial intelligence that focuses on the development of algorithms that can learn from and make predictions or decisions based on data. ML algorithms can be categorized into supervised learning, unsupervised learning, and reinforcement learning, depending on the type of training data and the learning task. Supervised learning involves training a model on labeled data, while unsupervised learning involves training on unlabeled data. Reinforcement learning involves training a model to interact with an environment and learn from feedback. ML techniques have applications in various domains, including image recognition, speech recognition, medical diagnosis, and autonomous vehicles."
doc3 = "Data science is an interdisciplinary field that combines techniques from statistics, computer science, and domain-specific knowledge to extract insights and knowledge from data. It involves various stages of the data lifecycle, including data collection, data cleaning, data analysis, and data visualization. Data scientists use a variety of tools and techniques, such as machine learning, statistical modeling, and data mining, to uncover patterns and trends in data and make data-driven decisions. Data science has applications in numerous industries, including healthcare, finance, marketing, and e-commerce."

# Build the corpus list once so all three computations receive the same input.
corpus = [doc1, doc2, doc3]

tf = get_tf(corpus)
idf = get_idf(corpus)
tfidf = get_tfidf(corpus)

# The tf / tfidf keys are (token, document text) pairs. The document part is
# bound to a throwaway name rather than `doc`, so these loops do not overwrite
# the sample text that the file-loading cell stored in `doc`.
for (token, _source_doc), value in tf.items():
    print(token, ":", value)

for token, value in idf.items():
    print(token, ":", value)

for (token, _source_doc), value in tfidf.items():
    print(token, ":", value)