In [1]:
import nltk
import pandas as pd
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the NLTK resources used by this notebook: the punkt tokenizer models,
# the stop-word lists, WordNet, and the averaged-perceptron POS tagger.
# nltk.download() is invoked each time this cell runs.
for package_id in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(package_id)
# Left as the cell's final expression so its return value is still displayed.
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\amard\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\amard\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\amard\AppData\Roaming\nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\amard\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [2]:
# Two-sentence sample passage that the tokenization cell operates on.
text = (
    "Natural Language Processing is a powerful tool in data science. "
    "It helps machines understand human language."
)


In [3]:
# Tokenize the sample passage. As the printed result shows, punctuation
# marks (the sentence-ending periods) come back as tokens of their own.
tokens = word_tokenize(text)
print("Tokens:", tokens)


Tokens: ['Natural', 'Language', 'Processing', 'is', 'a', 'powerful', 'tool', 'in', 'data', 'science', '.', 'It', 'helps', 'machines', 'understand', 'human', 'language', '.']


In [4]:
# Tag every token with its part of speech. The result is a list of
# (token, tag) pairs, one per input token, in the original order.
pos_tags = pos_tag(tokens)
print("POS Tags:", pos_tags)


POS Tags: [('Natural', 'JJ'), ('Language', 'NNP'), ('Processing', 'NNP'), ('is', 'VBZ'), ('a', 'DT'), ('powerful', 'JJ'), ('tool', 'NN'), ('in', 'IN'), ('data', 'NNS'), ('science', 'NN'), ('.', '.'), ('It', 'PRP'), ('helps', 'VBZ'), ('machines', 'NNS'), ('understand', 'JJ'), ('human', 'JJ'), ('language', 'NN'), ('.', '.')]


In [5]:
# Load the English stop-word list into a set for fast membership checks,
# then keep only the tokens whose lowercase form is not in that set.
# The original casing of the surviving tokens is preserved; punctuation
# tokens are not stop words, so they pass through unchanged.
stop_words = {*stopwords.words("english")}
filtered_tokens = [
    token
    for token in tokens
    if token.lower() not in stop_words
]
print("After Stop Words Removal:", filtered_tokens)


After Stop Words Removal: ['Natural', 'Language', 'Processing', 'powerful', 'tool', 'data', 'science', '.', 'helps', 'machines', 'understand', 'human', 'language', '.']


In [6]:
# Reduce each remaining token to its Porter stem. Stems are crude suffix
# truncations and need not be dictionary words (see the printed result).
stemmer = PorterStemmer()
stemmed = list(map(stemmer.stem, filtered_tokens))
print("After Stemming:", stemmed)


After Stemming: ['natur', 'languag', 'process', 'power', 'tool', 'data', 'scienc', '.', 'help', 'machin', 'understand', 'human', 'languag', '.']


In [7]:
# Map each remaining token to its WordNet lemma.
# NOTE(review): lemmatize() is called without a `pos` argument, so every
# token is looked up under the lemmatizer's default part of speech (noun,
# per the NLTK docs). Pass POS information if verb/adjective forms should
# be normalized as well.
lemmatizer = WordNetLemmatizer()
lemmatized = [lemmatizer.lemmatize(token) for token in filtered_tokens]
print("After Lemmatization:", lemmatized)


After Lemmatization: ['Natural', 'Language', 'Processing', 'powerful', 'tool', 'data', 'science', '.', 'help', 'machine', 'understand', 'human', 'language', '.']


In [9]:
# Small three-document corpus for the TF-IDF demonstration.
documents = [
    "Natural Language Processing is a powerful tool in data science.",
    "Machines can learn human language through NLP.",
    "TF and IDF help to represent documents numerically."
]

# Learn the vocabulary and IDF weights from the corpus and transform it in
# one step; each row of the resulting matrix corresponds to one document.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

# Convert to a dense array and label the columns with the learned terms so
# the weights can be read as a table (pandas is imported with the other
# dependencies in the notebook's import cell).
df_tfidf = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
print(df_tfidf)


        and       can      data  documents      help     human       idf  \
0  0.000000  0.000000  0.341426   0.000000  0.000000  0.000000  0.000000   
1  0.000000  0.389888  0.000000   0.000000  0.000000  0.389888  0.000000   
2  0.353553  0.000000  0.000000   0.353553  0.353553  0.000000  0.353553   

         in        is  language  ...       nlp  numerically  powerful  \
0  0.341426  0.341426  0.259663  ...  0.000000     0.000000  0.341426   
1  0.000000  0.000000  0.296520  ...  0.389888     0.000000  0.000000   
2  0.000000  0.000000  0.000000  ...  0.000000     0.353553  0.000000   

   processing  represent   science        tf   through        to      tool  
0    0.341426   0.000000  0.341426  0.000000  0.000000  0.000000  0.341426  
1    0.000000   0.000000  0.000000  0.000000  0.389888  0.000000  0.000000  
2    0.000000   0.353553  0.000000  0.353553  0.000000  0.353553  0.000000  

[3 rows x 23 columns]
