<a href="https://colab.research.google.com/github/DeepthiTabithaBennet/NaturalLanguageProcessing/blob/main/NLP_LexicalAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Write a Python program with functions to:
# a) Remove punctuation, special symbols, and numbers using a regular
#    expression
# b) Tokenize the given text (split the text into sentences)
# c) Add custom stopwords and list the removed stopwords
# d) Perform stemming and lemmatization on the text
# e) Extract the usernames from the email addresses present
# f) Find the most common words in the text to exclude as stopwords
# g) Correct the spelling errors using TextBlob

# You may use nltk, spaCy, or TextBlob.

In [None]:
import nltk

# Resources needed by the cells below: the Punkt sentence/word tokenizer,
# the stopword lists, and WordNet for the lemmatizer.
nltk.download('punkt')
# Recent NLTK releases (3.9+) load the tokenizer from 'punkt_tab' rather than
# the pickled 'punkt' model; without it sent_tokenize/word_tokenize raise
# LookupError on a fresh kernel. Older releases just report the package as
# unknown and carry on, so requesting it is harmless there.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
! pip install pyspellchecker
! pip install textblob



In [None]:
import re
import string
from collections import Counter

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from spellchecker import SpellChecker

from textblob import TextBlob
import spacy

In [None]:
# Function to remove punctuations, special symbols, and numbers using regex
def remove_non_alpha(text):
    """Delete every character that is not an ASCII letter or whitespace.

    Punctuation, digits and other symbols are removed outright (not replaced
    by a space), so the fragments on either side of them are joined together.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    non_alpha = re.compile(r'[^a-zA-Z\s]')
    return non_alpha.sub('', text)

In [None]:
# Function to tokenize text into sentences
def tokenize_sentences(text):
    """Split ``text`` into sentences using NLTK's ``sent_tokenize``.

    Args:
        text: The string to split.

    Returns:
        The result of ``sent_tokenize(text)``.
    """
    sentences = sent_tokenize(text)
    return sentences

In [None]:
# Function to tokenize text into words
def tokenize_words(text):
    """Split ``text`` into word-level tokens using NLTK's ``word_tokenize``.

    Args:
        text: The string to split.

    Returns:
        The result of ``word_tokenize(text)``.
    """
    tokens = word_tokenize(text)
    return tokens

In [None]:
# Function to add custom stopwords
def add_custom_stopwords(stopwords_list, custom_stopwords):
    """Append every entry of ``custom_stopwords`` to ``stopwords_list`` in place.

    Args:
        stopwords_list: The collection to grow; it is mutated through its
            ``extend`` method.
        custom_stopwords: Iterable of extra stopwords to add.

    Returns:
        None. The change is visible through ``stopwords_list`` itself.
    """
    extend_in_place = stopwords_list.extend
    extend_in_place(custom_stopwords)

In [None]:
# Function to list removed stopwords
def list_removed_stopwords(text, stopwords_list):
    """Return the tokens of ``text`` that are stopwords, i.e. the removed words.

    The text is lower-cased and word-tokenized before matching. Matching
    tokens are returned in the order they occur, duplicates included.

    Args:
        text: The string to inspect.
        stopwords_list: Iterable of (lower-case) stopwords to look for.

    Returns:
        A list of the tokens from ``text`` that appear in ``stopwords_list``.
    """
    words = tokenize_words(text.lower())
    # A set gives constant-time membership tests instead of scanning the
    # whole stopword list once per token.
    stopword_set = set(stopwords_list)
    # Keep the tokens that ARE stopwords: those are the ones that get removed.
    removed_stopwords = [word for word in words if word in stopword_set]
    return removed_stopwords

In [None]:
# Function to perform stemming and lemmatization
def stem_and_lemmatize(text):
    """Stem and lemmatize every token of the lower-cased ``text``.

    Tokens are produced by ``tokenize_words``. Stemming uses NLTK's
    ``PorterStemmer`` and lemmatization uses ``WordNetLemmatizer`` with its
    default arguments.

    Args:
        text: The string to process.

    Returns:
        A ``(stemmed_words, lemmatized_words)`` tuple of two lists, each
        aligned with the token sequence.
    """
    tokens = tokenize_words(text.lower())

    stemmer = PorterStemmer()
    stems = list(map(stemmer.stem, tokens))

    wordnet_lemmatizer = WordNetLemmatizer()
    lemmas = list(map(wordnet_lemmatizer.lemmatize, tokens))

    return stems, lemmas

In [None]:
 # Function to extract usernames from email addresses
def extract_usernames(email_addresses):
    """Collect the local part (text before ``@``) of each e-mail address.

    Each string is searched for its first e-mail-shaped substring; strings
    without one are skipped.

    Args:
        email_addresses: Iterable of strings to search.

    Returns:
        A list of usernames, in input order, one per string that matched.
    """
    email_pattern = re.compile(r'([a-zA-Z0-9._%+-]+)@([a-zA-Z0-9.-]+\.[a-zA-Z]{2,})')
    matches = (email_pattern.search(candidate) for candidate in email_addresses)
    return [match.group(1) for match in matches if match]

In [None]:
# Function to find most common words in the text
def find_most_common_words(text, num_words):
    """Return the ``num_words`` most frequent tokens of the lower-cased ``text``.

    Args:
        text: The string to analyse; it is lower-cased and passed through
            ``tokenize_words``.
        num_words: How many entries to return, forwarded to
            ``Counter.most_common``.

    Returns:
        A list of ``(token, count)`` pairs ordered from most to least common.
    """
    tokens = tokenize_words(text.lower())
    return Counter(tokens).most_common(num_words)

In [None]:
# Function to correct spelling errors using TextBlob
def correct_spelling(text):
    """Return ``text`` with spelling corrected by TextBlob.

    Args:
        text: The string to correct.

    Returns:
        The result of ``TextBlob(text).correct()`` converted to ``str``.
    """
    corrected = TextBlob(text).correct()
    return str(corrected)

In [None]:
# Sample input: deliberately contains misspellings, punctuation, a number and
# an e-mail address so every task below has something to work on.
text = "Hello, Deepthi Tabitha here! Yoou caen call me Deepthi. How are you doing? I hav 2 cats andd they're vry cute! Mmy e-mail is deepthitabitha@student.tce.edu"
print("Original Text :\n", text)

# a) Remove punctuations, special symbols, and numbers
task_a = remove_non_alpha(text)
print("\nText after removing punctuations, special symbols, and numbers:")
print(task_a)

# b) Tokenize the given text into sentences
task_b = tokenize_sentences(text)
print("\nTokenized sentences:")
print(task_b)

# c) Add Custom Stopwords and List Removed Stopwords
custom_stopwords = ['example', 'contains', 'like']
# stopwords.words() is called afresh here, so extending the list in place
# does not accumulate duplicates when this cell is re-run.
nltk_stopwords = stopwords.words('english')
add_custom_stopwords(nltk_stopwords, custom_stopwords)
task_c = list_removed_stopwords(text, nltk_stopwords)
print("\nList of removed stopwords:")
print(task_c)

# d) Perform stemming and lemmatization on text
task_d1, task_d2 = stem_and_lemmatize(text)
print("\nStemmed words:")
print(task_d1)
print("\nLemmatized words:")
print(task_d2)

# e) Extract the usernames from the email addresses present
# The local part may contain '.', '%', '+' and '-', and the domain may have
# several dot-separated labels (e.g. student.tce.edu); the simpler
# \w+@\w+\.\w+ cut such addresses short.
email_addresses = re.findall(r'[\w.%+-]+@[\w-]+(?:\.[\w-]+)+', text)
task_e = extract_usernames(email_addresses)
print("\nExtracted usernames:")
print(task_e)

# f) Find the most common words in the text to exclude as stopwords
# Uses the cleaned text from (a) so punctuation tokens are not counted.
task_f = find_most_common_words(task_a, 3)
print("\nMost common words in the text:")
print(task_f)

# g) Correct the spelling errors using TextBlob
task_g = correct_spelling(text)
print("\nText after spelling correction:")
print(task_g)

Original Text :
 Hello, Deepthi Tabitha here! Yoou caen call me Deepthi. How are you doing? I hav 2 cats andd they're vry cute! Mmy e-mail is deepthitabitha@student.tce.edu

Text after removing punctuations, special symbols, and numbers:
Hello Deepthi Tabitha here Yoou caen call me Deepthi How are you doing I hav  cats andd theyre vry cute Mmy email is deepthitabithastudenttceedu

Tokenized sentences:
['Hello, Deepthi Tabitha here!', 'Yoou caen call me Deepthi.', 'How are you doing?', "I hav 2 cats andd they're vry cute!", 'Mmy e-mail is deepthitabitha@student.tce.edu']

List of removed stopwords:
['hello', ',', 'deepthi', 'tabitha', '!', 'yoou', 'caen', 'call', 'deepthi', '.', '?', 'hav', '2', 'cats', 'andd', "'re", 'vry', 'cute', '!', 'mmy', 'e-mail', 'deepthitabitha', '@', 'student.tce.edu']

Stemmed words:
['hello', ',', 'deepthi', 'tabitha', 'here', '!', 'yoou', 'caen', 'call', 'me', 'deepthi', '.', 'how', 'are', 'you', 'do', '?', 'i', 'hav', '2', 'cat', 'andd', 'they', "'re", 'vr