<a href="https://colab.research.google.com/github/2403a52030-sketch/NLP-LAB/blob/main/NLTK_Lab_2_2403a52030.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk
import spacy

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
medical_text = """
Diabetes is a chronic disease that affects how the body processes blood sugar.
If untreated, diabetes may cause heart disease, kidney failure, nerve damage and vision problems.
Early diagnosis and proper treatment help improve patient outcomes.
"""

In [None]:
nltk.download('punkt_tab')

# Sentence Tokenization
sentences_nltk = sent_tokenize(medical_text)

# Word Tokenization
words_nltk = word_tokenize(medical_text)

print("NLTK Sentence Tokens:")
for s in sentences_nltk:
    print("- ", s)

print("\nNLTK Word Tokens:")
print(words_nltk)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


NLTK Sentence Tokens:
-  
Patients with diabetes mellitus often develop cardiovascular complications.
-  The administration of insulin reduces blood glucose levels.
-  Doctors are monitoring patients who were diagnosed with chronic kidney disease.
-  Early diagnosis and treatments improve patient outcomes.

NLTK Word Tokens:
['Patients', 'with', 'diabetes', 'mellitus', 'often', 'develop', 'cardiovascular', 'complications', '.', 'The', 'administration', 'of', 'insulin', 'reduces', 'blood', 'glucose', 'levels', '.', 'Doctors', 'are', 'monitoring', 'patients', 'who', 'were', 'diagnosed', 'with', 'chronic', 'kidney', 'disease', '.', 'Early', 'diagnosis', 'and', 'treatments', 'improve', 'patient', 'outcomes', '.']


In [None]:
nlp = spacy.load("en_core_web_sm")
doc = nlp(medical_text)

sentences_spacy = [sent.text for sent in doc.sents]
words_spacy = [token.text for token in doc if not token.is_punct]

print("spaCy Sentence Tokens:")
for s in sentences_spacy:
    print("-", s)

print("\nspaCy Word Tokens:")
print(words_spacy)


spaCy Sentence Tokens:
- 
Diabetes is a chronic disease that affects how the body processes blood sugar.

- If untreated, diabetes may cause heart disease, kidney failure, nerve damage and vision problems.

- Early diagnosis and proper treatment help improve patient outcomes.


spaCy Word Tokens:
['\n', 'Diabetes', 'is', 'a', 'chronic', 'disease', 'that', 'affects', 'how', 'the', 'body', 'processes', 'blood', 'sugar', '\n', 'If', 'untreated', 'diabetes', 'may', 'cause', 'heart', 'disease', 'kidney', 'failure', 'nerve', 'damage', 'and', 'vision', 'problems', '\n', 'Early', 'diagnosis', 'and', 'proper', 'treatment', 'help', 'improve', 'patient', 'outcomes', '\n']


In [None]:
stemmer = PorterStemmer()

stemmed_words = [stemmer.stem(word) for word in words_nltk if word.isalpha()]

print("Stemmed Words:")
print(stemmed_words)


Stemmed Words:
['patient', 'with', 'diabet', 'mellitu', 'often', 'develop', 'cardiovascular', 'complic', 'the', 'administr', 'of', 'insulin', 'reduc', 'blood', 'glucos', 'level', 'doctor', 'are', 'monitor', 'patient', 'who', 'were', 'diagnos', 'with', 'chronic', 'kidney', 'diseas', 'earli', 'diagnosi', 'and', 'treatment', 'improv', 'patient', 'outcom']


In [None]:
lemmatizer = WordNetLemmatizer()

lemmatized_words_nltk = [lemmatizer.lemmatize(word.lower())
                          for word in words_nltk if word.isalpha()]

print("NLTK Lemmatized Words:")
print(lemmatized_words_nltk)


NLTK Lemmatized Words:
['patient', 'with', 'diabetes', 'mellitus', 'often', 'develop', 'cardiovascular', 'complication', 'the', 'administration', 'of', 'insulin', 'reduces', 'blood', 'glucose', 'level', 'doctor', 'are', 'monitoring', 'patient', 'who', 'were', 'diagnosed', 'with', 'chronic', 'kidney', 'disease', 'early', 'diagnosis', 'and', 'treatment', 'improve', 'patient', 'outcome']


In [None]:
lemmatized_words_spacy = [token.lemma_ for token in doc if token.is_alpha]
print("spaCy Lemmatized Words:")
print(lemmatized_words_spacy)


spaCy Lemmatized Words:
['Diabetes', 'be', 'a', 'chronic', 'disease', 'that', 'affect', 'how', 'the', 'body', 'process', 'blood', 'sugar', 'if', 'untreate', 'diabete', 'may', 'cause', 'heart', 'disease', 'kidney', 'failure', 'nerve', 'damage', 'and', 'vision', 'problem', 'early', 'diagnosis', 'and', 'proper', 'treatment', 'help', 'improve', 'patient', 'outcome']


In [16]:
import pandas as pd

doc = nlp(medical_text)

# Define tokens to iterate over, ensuring consistency with the current medical_text and doc
tokens = [token.text for token in doc if token.is_alpha]

# STEP 4: Compare original, stemmed, lemmatized
comparison = []

for word in tokens:
    stemmed = stemmer.stem(word.lower()) # Apply lowercasing for consistency
    lemma_nltk = lemmatizer.lemmatize(word.lower()) # Apply lowercasing for consistency
    # Ensure consistent lowercasing for lookup in spaCy doc
    lemma_spacy = next(token.lemma_ for token in doc if token.text.lower() == word.lower())
    comparison.append([word, stemmed, lemma_nltk, lemma_spacy])

# STEP 5: Display results
df = pd.DataFrame(
    comparison,
    columns=["Original Word", "Stemmed (NLTK)", "Lemmatized (NLTK)", "Lemmatized (spaCy)"]
)

df

Unnamed: 0,Original Word,Stemmed (NLTK),Lemmatized (NLTK),Lemmatized (spaCy)
0,Diabetes,diabet,diabetes,Diabetes
1,is,is,is,be
2,a,a,a,a
3,chronic,chronic,chronic,chronic
4,disease,diseas,disease,disease
5,that,that,that,that
6,affects,affect,affect,affect
7,how,how,how,how
8,the,the,the,the
9,body,bodi,body,body
