In [6]:
import nltk

# Install NLTK library (if not already installed)
!pip install nltk

# Download necessary NLTK data packages for tokenization and lemmatization
nltk.download('punkt') # Required for sentence and word tokenizers
nltk.download('wordnet') # Required for WordNetLemmatizer
nltk.download('punkt_tab') # Required for sentence tokenization in some cases

# Import required modules from NLTK
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import pandas as pd # Import pandas for data manipulation and display

print("NLTK installed, data downloaded, and modules imported successfully.")

NLTK installed, data downloaded, and modules imported successfully.


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [7]:
# Sample paragraph used by every later cell: three sentences, one per line.
medical_text = "\n".join([
    "Diabetes is a chronic disease that affects how the body processes blood sugar.",
    "If untreated, diabetes may cause heart disease, kidney failure, nerve damage and vision problems.",
    "Early diagnosis and proper treatment help improve patient outcomes.",
])

# Confirm that the variable now exists in the notebook namespace
print("Medical text variable 'medical_text' defined successfully.")

Medical text variable 'medical_text' defined successfully.


In [8]:
# Split the paragraph into sentences with NLTK's sent_tokenize
medical_sentences = sent_tokenize(medical_text)

print("Sentence tokenization of medical text:")
# Show each sentence with a 1-based position
for position, sentence_text in enumerate(medical_sentences, start=1):
    print(f"Sentence {position}: {sentence_text}")

Sentence tokenization of medical text:
Sentence 1: Diabetes is a chronic disease that affects how the body processes blood sugar.
Sentence 2: If untreated, diabetes may cause heart disease, kidney failure, nerve damage and vision problems.
Sentence 3: Early diagnosis and proper treatment help improve patient outcomes.


In [9]:
# Tokenize each sentence into words and flatten the per-sentence token lists
# into a single list, preserving sentence order.
medical_words = [
    token
    for sentence_text in medical_sentences
    for token in word_tokenize(sentence_text)
]

# Report how many tokens were collected (punctuation tokens are included)
print(f"Total number of words collected: {len(medical_words)}")
print("First 20 words:")
# Preview only the first 20 tokens
print(medical_words[:20])

print("Word tokenization completed successfully.")

Total number of words collected: 42
First 20 words:
['Diabetes', 'is', 'a', 'chronic', 'disease', 'that', 'affects', 'how', 'the', 'body', 'processes', 'blood', 'sugar', '.', 'If', 'untreated', ',', 'diabetes', 'may', 'cause']
Word tokenization completed successfully.


In [10]:
# Stem every token with the Porter algorithm; the result is index-aligned with
# medical_words (one stem per token, same order).
porter_stemmer = PorterStemmer()
stemmed_words = [porter_stemmer.stem(token) for token in medical_words]

# Report the number of stems and preview the first 20
print(f"Total number of stemmed words: {len(stemmed_words)}")
print("First 20 stemmed words:")
print(stemmed_words[:20])

# Side-by-side table of the first 20 tokens and their stems
comparison_df = pd.DataFrame(
    {
        'Original Word': medical_words[:20],
        'Stemmed Word': stemmed_words[:20],
    }
)

print("\nComparison of Original vs. Stemmed Words (First 20):")
print(comparison_df)

Total number of stemmed words: 42
First 20 stemmed words:
['diabet', 'is', 'a', 'chronic', 'diseas', 'that', 'affect', 'how', 'the', 'bodi', 'process', 'blood', 'sugar', '.', 'if', 'untreat', ',', 'diabet', 'may', 'caus']

Comparison of Original vs. Stemmed Words (First 20):
   Original Word Stemmed Word
0       Diabetes       diabet
1             is           is
2              a            a
3        chronic      chronic
4        disease       diseas
5           that         that
6        affects       affect
7            how          how
8            the          the
9           body         bodi
10     processes      process
11         blood        blood
12         sugar        sugar
13             .            .
14            If           if
15     untreated      untreat
16             ,            ,
17      diabetes       diabet
18           may          may
19         cause         caus


In [11]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet # Not directly used but often imported with WordNetLemmatizer

# Instantiate a WordNetLemmatizer object
lemmatizer = WordNetLemmatizer()

lemmatized_words = [] # One lemma per token, index-aligned with medical_words

# No POS tagging is performed here, so each token is lemmatized with a simple
# heuristic: try it as a verb first and fall back to a noun lookup.
for word in medical_words:
    # Verb lookup (pos='v'), e.g. 'is' -> 'be', 'affects' -> 'affect'
    verb_lemma = lemmatizer.lemmatize(word, pos='v')

    if verb_lemma != word:
        final_lemma = verb_lemma # The verb lookup changed the word, so keep it
    else:
        # Noun lookup (pos='n'). lemmatize() already defaults to pos='n', so a
        # further call without a POS argument would only repeat this lookup;
        # when it finds nothing the word is returned unchanged.
        final_lemma = lemmatizer.lemmatize(word, pos='n')

    lemmatized_words.append(final_lemma)

# Print the total number of lemmatized words
print(f"Total number of lemmatized words: {len(lemmatized_words)}")
print("First 20 lemmatized words:")
# Print the first 20 lemmatized words for preview
print(lemmatized_words[:20])

# Create a DataFrame for comparison of original and lemmatized words
lemmatization_comparison_df = pd.DataFrame({
    'Original Word': medical_words[:20], # Limiting to first 20 for display purposes
    'Lemmatized Word': lemmatized_words[:20]
})

print("\nComparison of Original vs. Lemmatized Words (First 20):")
print(lemmatization_comparison_df)

Total number of lemmatized words: 42
First 20 lemmatized words:
['Diabetes', 'be', 'a', 'chronic', 'disease', 'that', 'affect', 'how', 'the', 'body', 'process', 'blood', 'sugar', '.', 'If', 'untreated', ',', 'diabetes', 'may', 'cause']

Comparison of Original vs. Lemmatized Words (First 20):
   Original Word Lemmatized Word
0       Diabetes        Diabetes
1             is              be
2              a               a
3        chronic         chronic
4        disease         disease
5           that            that
6        affects          affect
7            how             how
8            the             the
9           body            body
10     processes         process
11         blood           blood
12         sugar           sugar
13             .               .
14            If              If
15     untreated       untreated
16             ,               ,
17      diabetes        diabetes
18           may             may
19         cause           cause


In [12]:
# Combine the three index-aligned token lists into one table so each row shows
# a token next to its stem and its lemma.
all_forms = {
    'Original Word': medical_words,
    'Stemmed Word': stemmed_words,
    'Lemmatized Word': lemmatized_words,
}
comparison_all_df = pd.DataFrame(all_forms)

print("Comparison of Original, Stemmed, and Lemmatized Words:")
# Show every row of the comparison table
print(comparison_all_df)

Comparison of Original, Stemmed, and Lemmatized Words:
   Original Word Stemmed Word Lemmatized Word
0       Diabetes       diabet        Diabetes
1             is           is              be
2              a            a               a
3        chronic      chronic         chronic
4        disease       diseas         disease
5           that         that            that
6        affects       affect          affect
7            how          how             how
8            the          the             the
9           body         bodi            body
10     processes      process         process
11         blood        blood           blood
12         sugar        sugar           sugar
13             .            .               .
14            If           if              If
15     untreated      untreat       untreated
16             ,            ,               ,
17      diabetes       diabet        diabetes
18           may          may             may
19         cause         