In [187]:
# Cell: zxplu1JDfPSV
# Description: Installs and imports NLTK and SpaCy libraries.

# Install NLTK and SpaCy if not already present
!pip install nltk spacy

# Import the Natural Language Toolkit (NLTK) library
import nltk
# Import the SpaCy library
import spacy

# Print a confirmation message indicating successful installation and import
print("NLTK and SpaCy installed and imported successfully.")

NLTK and SpaCy installed and imported successfully.


In [188]:
# Cell: 5a07127f
# Description: Defines a sample text and counts the number of words in it.

# Define the input text string for analysis
text = "The students were studying diligently in the library, reading books, writing notes, and discussing ideas with their friends. They enjoyed learning new concepts, but sometimes struggled with understanding complex theories and solving difficult problems."

# Split the text into words based on whitespace to create a list of words
words = text.split()
# Count the number of words by getting the length of the words list
word_count = len(words)

# Print the total number of words found in the text
print(f"Number of words: {word_count}")

Number of words: 34


In [189]:
# Cell: xgw94XAdh_PB
# Description: Converts the defined text to lowercase.

# Convert the entire text to lowercase for case-insensitive processing
lowercase_text = text.lower()

# Print the text in its lowercase form
print(f"Lowercase text: {lowercase_text}")

Lowercase text: the students were studying diligently in the library, reading books, writing notes, and discussing ideas with their friends. they enjoyed learning new concepts, but sometimes struggled with understanding complex theories and solving difficult problems.


In [190]:
# Cell: 5ddfe292
# Description: Downloads necessary NLTK data packages for tokenization and lemmatization.

# Download the 'punkt_tab' resource, used for tokenization (sentence splitting and word tokenization)
nltk.download('punkt_tab')
# Download the 'wordnet' corpus, essential for WordNetLemmatizer
nltk.download('wordnet')
# Download the 'omw-1.4' (Open Multilingual Wordnet) resource, often used with WordNet
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [191]:
# Cell: 3bad9497
# Description: Performs word tokenization on the text using NLTK.

# Import the word_tokenize function from NLTK's tokenize module
from nltk.tokenize import word_tokenize

# Tokenize the input text into a list of individual words
tokenized_words = word_tokenize(text)

# Print the list of tokenized words
print(f"Tokenized words: {tokenized_words}")

Tokenized words: ['The', 'students', 'were', 'studying', 'diligently', 'in', 'the', 'library', ',', 'reading', 'books', ',', 'writing', 'notes', ',', 'and', 'discussing', 'ideas', 'with', 'their', 'friends', '.', 'They', 'enjoyed', 'learning', 'new', 'concepts', ',', 'but', 'sometimes', 'struggled', 'with', 'understanding', 'complex', 'theories', 'and', 'solving', 'difficult', 'problems', '.']


In [192]:
# Cell: 693ff748
# Description: Performs stemming on the tokenized words using NLTK's Porter Stemmer.

# Import the PorterStemmer class from NLTK's stem module
from nltk.stem import PorterStemmer

# Initialize the Porter Stemmer
stemmer = PorterStemmer()
# Apply stemming to each word in the tokenized_words list
# The stem() method reduces words to their root forms
stemmed_words = [stemmer.stem(word) for word in tokenized_words]

# Print the list of stemmed words
print(f"Stemmed words: {stemmed_words}")

Stemmed words: ['the', 'student', 'were', 'studi', 'dilig', 'in', 'the', 'librari', ',', 'read', 'book', ',', 'write', 'note', ',', 'and', 'discuss', 'idea', 'with', 'their', 'friend', '.', 'they', 'enjoy', 'learn', 'new', 'concept', ',', 'but', 'sometim', 'struggl', 'with', 'understand', 'complex', 'theori', 'and', 'solv', 'difficult', 'problem', '.']


In [193]:
# Cell: f005644b
# Description: Performs lemmatization on the tokenized words using NLTK's WordNet Lemmatizer.

# Import the WordNetLemmatizer class from NLTK's stem module
from nltk.stem import WordNetLemmatizer

# Initialize the WordNet Lemmatizer
lemmatizer = WordNetLemmatizer()
# Apply lemmatization to each word in the tokenized_words list
# The lemmatize() method reduces words to their dictionary base form (lemma)
lemmatized_words = [lemmatizer.lemmatize(word) for word in tokenized_words]

# Print the list of lemmatized words
print(f"Lemmatized words: {lemmatized_words}")

Lemmatized words: ['The', 'student', 'were', 'studying', 'diligently', 'in', 'the', 'library', ',', 'reading', 'book', ',', 'writing', 'note', ',', 'and', 'discussing', 'idea', 'with', 'their', 'friend', '.', 'They', 'enjoyed', 'learning', 'new', 'concept', ',', 'but', 'sometimes', 'struggled', 'with', 'understanding', 'complex', 'theory', 'and', 'solving', 'difficult', 'problem', '.']


In [194]:
with open('/content/lab1 nlp.txt', 'r') as file:
    text_from_file = file.read()

print(f"First 10000 characters of loaded text:\n{text_from_file[:10000]}")

First 10000 characters of loaded text:
  
India is a vast and diverse country located in South Asia, known for its rich history, cultural heritage, and vibrant traditions. It is the seventh-largest nation by land area and the most populous country in the world. With landscapes ranging from the towering Himalayas in the north to the tropical coasts in the south, India offers remarkable geographical variety. Its history spans thousands of years, marked by ancient civilizations, powerful kingdoms, and influential philosophies.   
The nation is often described as a land of unity in diversity, with hundreds of languages, religions, and ethnic groups coexisting. Hinduism, Buddhism, Jainism, and Sikhism originated here, while Islam and Christianity also have deep roots. Festivals such as Diwali, Holi, Eid, and Christmas are celebrated with equal enthusiasm, reflecting the pluralistic spirit of the country. Indian cuisine, famous for its spices and flavors, varies widely across regions, from t

In [195]:
# Cell: 7fb3a460
# Description: Performs word tokenization on the text from the file using NLTK.

# Import the word_tokenize function from NLTK's tokenize module
from nltk.tokenize import word_tokenize

# Tokenize the input text from the file into a list of individual words
tokenized_file_words = word_tokenize(text_from_file)

# Print the list of tokenized words from the file
print(f"Tokenized words from file: {tokenized_file_words}")

Tokenized words from file: ['India', 'is', 'a', 'vast', 'and', 'diverse', 'country', 'located', 'in', 'South', 'Asia', ',', 'known', 'for', 'its', 'rich', 'history', ',', 'cultural', 'heritage', ',', 'and', 'vibrant', 'traditions', '.', 'It', 'is', 'the', 'seventh-largest', 'nation', 'by', 'land', 'area', 'and', 'the', 'most', 'populous', 'country', 'in', 'the', 'world', '.', 'With', 'landscapes', 'ranging', 'from', 'the', 'towering', 'Himalayas', 'in', 'the', 'north', 'to', 'the', 'tropical', 'coasts', 'in', 'the', 'south', ',', 'India', 'offers', 'remarkable', 'geographical', 'variety', '.', 'Its', 'history', 'spans', 'thousands', 'of', 'years', ',', 'marked', 'by', 'ancient', 'civilizations', ',', 'powerful', 'kingdoms', ',', 'and', 'influential', 'philosophies', '.', 'The', 'nation', 'is', 'often', 'described', 'as', 'a', 'land', 'of', 'unity', 'in', 'diversity', ',', 'with', 'hundreds', 'of', 'languages', ',', 'religions', ',', 'and', 'ethnic', 'groups', 'coexisting', '.', 'Hindui

In [196]:
# Cell: 6bc6fcfe
# Description: Performs stemming on the tokenized words from the file using NLTK's Porter Stemmer.

# Import the PorterStemmer class from NLTK's stem module
from nltk.stem import PorterStemmer

# Initialize the Porter Stemmer
stemmer = PorterStemmer()
# Apply stemming to each word in the tokenized_file_words list
# The stem() method reduces words to their root forms
stemmed_file_words = [stemmer.stem(word) for word in tokenized_file_words]

# Print the list of stemmed words from the file
print(f"Stemmed words from file: {stemmed_file_words}")

Stemmed words from file: ['india', 'is', 'a', 'vast', 'and', 'divers', 'countri', 'locat', 'in', 'south', 'asia', ',', 'known', 'for', 'it', 'rich', 'histori', ',', 'cultur', 'heritag', ',', 'and', 'vibrant', 'tradit', '.', 'it', 'is', 'the', 'seventh-largest', 'nation', 'by', 'land', 'area', 'and', 'the', 'most', 'popul', 'countri', 'in', 'the', 'world', '.', 'with', 'landscap', 'rang', 'from', 'the', 'tower', 'himalaya', 'in', 'the', 'north', 'to', 'the', 'tropic', 'coast', 'in', 'the', 'south', ',', 'india', 'offer', 'remark', 'geograph', 'varieti', '.', 'it', 'histori', 'span', 'thousand', 'of', 'year', ',', 'mark', 'by', 'ancient', 'civil', ',', 'power', 'kingdom', ',', 'and', 'influenti', 'philosophi', '.', 'the', 'nation', 'is', 'often', 'describ', 'as', 'a', 'land', 'of', 'uniti', 'in', 'divers', ',', 'with', 'hundr', 'of', 'languag', ',', 'religion', ',', 'and', 'ethnic', 'group', 'coexist', '.', 'hinduism', ',', 'buddhism', ',', 'jainism', ',', 'and', 'sikhism', 'origin', 'he

In [197]:
# Cell: 3c6c2577
# Description: Performs lemmatization on the tokenized words from the file using NLTK's WordNet Lemmatizer.

# Import the WordNetLemmatizer class from NLTK's stem module
from nltk.stem import WordNetLemmatizer

# Initialize the WordNet Lemmatizer
lemmatizer = WordNetLemmatizer()
# Apply lemmatization to each word in the tokenized_file_words list
# The lemmatize() method reduces words to their dictionary base form (lemma)
lemmatized_file_words = [lemmatizer.lemmatize(word) for word in tokenized_file_words]

# Print the list of lemmatized words from the file
print(f"Lemmatized words from file: {lemmatized_file_words}")

Lemmatized words from file: ['India', 'is', 'a', 'vast', 'and', 'diverse', 'country', 'located', 'in', 'South', 'Asia', ',', 'known', 'for', 'it', 'rich', 'history', ',', 'cultural', 'heritage', ',', 'and', 'vibrant', 'tradition', '.', 'It', 'is', 'the', 'seventh-largest', 'nation', 'by', 'land', 'area', 'and', 'the', 'most', 'populous', 'country', 'in', 'the', 'world', '.', 'With', 'landscape', 'ranging', 'from', 'the', 'towering', 'Himalayas', 'in', 'the', 'north', 'to', 'the', 'tropical', 'coast', 'in', 'the', 'south', ',', 'India', 'offer', 'remarkable', 'geographical', 'variety', '.', 'Its', 'history', 'span', 'thousand', 'of', 'year', ',', 'marked', 'by', 'ancient', 'civilization', ',', 'powerful', 'kingdom', ',', 'and', 'influential', 'philosophy', '.', 'The', 'nation', 'is', 'often', 'described', 'a', 'a', 'land', 'of', 'unity', 'in', 'diversity', ',', 'with', 'hundred', 'of', 'language', ',', 'religion', ',', 'and', 'ethnic', 'group', 'coexisting', '.', 'Hinduism', ',', 'Buddh