In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer
import string

In [2]:
document = "Natural language processing (NLP) is a field of artificial intelligence that focuses on the interaction between computers and humans using natural language. It involves the application of computational techniques to analyze and understand human language. NLP can be used for various tasks such as machine translation, sentiment analysis, information extraction, and question answering."

##     Punctuation Removal 

In [3]:
def remove_punctuation(text):
    # Remove punctuation characters
    translator = str.maketrans("", "", string.punctuation)
    text = text.translate(translator)
    return text

document = remove_punctuation(document)
document

'Natural language processing NLP is a field of artificial intelligence that focuses on the interaction between computers and humans using natural language It involves the application of computational techniques to analyze and understand human language NLP can be used for various tasks such as machine translation sentiment analysis information extraction and question answering'

## Tokenization 

In [4]:
tokens = word_tokenize(document)
tokens

LookupError: ignored

## POS Tagging

In [5]:
pos_tags = pos_tag(tokens)
pos_tags

NameError: ignored

## Stop Words Removal 

In [None]:
stop_words = set(stopwords.words("english"))

filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
filtered_tokens

## Stemming and Lemmatization 

In [None]:
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]
lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
lemmatized_tokens

In [None]:
preprocessed_document = " ".join(lemmatized_tokens)
preprocessed_document

## TF-IDF representation 

In [None]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform([preprocessed_document])

feature_names = tfidf_vectorizer.get_feature_names_out()
for (i,feature)in enumerate(feature_names):
    tf_idf_value = tfidf_matrix[0,i]
    if tf_idf_value>0:
        print(f"{feature}: {tf_idf_value}")