In [1]:
import nltk

In [2]:
# Five short sample documents covering contractions, digits, abbreviations,
# HTML markup and emoji.
corpus = [
    "I can't wait for the new season of my favorite show!",
    "The COVID-19 pandemic has affected millions of people worldwide.",
    "U.S. stocks fell on Friday after news of rising inflation.",
    "<html><body>Welcome to the website!</body></html>",
    "Python is a great programming language!!! 😃😃",
]

## Text Cleaning

In [3]:
import re
import string
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize, word_tokenize

In [4]:
def clean_text(text):
    """Normalise one raw document into lowercase, punctuation-free text.

    Steps, applied in order:
      1. Parse the input with BeautifulSoup's 'html.parser' and keep only
         the text content.
      2. Lowercase everything.
      3. Delete every run of digits.
      4. Delete every character listed in ``string.punctuation``.
      5. Replace each remaining non-word character with a single space.

    Step 5 substitutes character by character, so a run of such characters
    (for example trailing emoji) leaves several consecutive spaces in the
    result; no whitespace collapsing or stripping is performed.

    Args:
        text: Raw document, possibly containing HTML markup.

    Returns:
        The cleaned string.
    """
    plain = BeautifulSoup(text, 'html.parser').get_text()
    lowered = plain.lower()
    without_digits = re.sub(r'\d+', '', lowered)
    punctuation_table = str.maketrans('', '', string.punctuation)
    without_punct = without_digits.translate(punctuation_table)
    # Any non-word character that survived the punctuation table (emoji,
    # non-ASCII symbols, ...) is turned into a space, one for one.
    return re.sub(r'\W', ' ', without_punct)

In [5]:
# Run every document in the corpus through clean_text, preserving order.
cleaned_corpus = list(map(clean_text, corpus))

In [6]:
cleaned_corpus

['i cant wait for the new season of my favorite show',
 'the covid pandemic has affected millions of people worldwide',
 'us stocks fell on friday after news of rising inflation',
 'welcome to the website',
 'python is a great programming language   ']

## Tokenization 

In [7]:
# word_tokenize relies on the Punkt sentence models. NLTK 3.9+ loads them from
# the 'punkt_tab' package while older releases use 'punkt', so fetch both to
# keep the notebook runnable on a fresh kernel with either version.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /home/falcon/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
# Split each cleaned document into its list of word tokens.
tokenization_corpus = list(map(word_tokenize, cleaned_corpus))

In [9]:
tokenization_corpus

[['i',
  'cant',
  'wait',
  'for',
  'the',
  'new',
  'season',
  'of',
  'my',
  'favorite',
  'show'],
 ['the',
  'covid',
  'pandemic',
  'has',
  'affected',
  'millions',
  'of',
  'people',
  'worldwide'],
 ['us',
  'stocks',
  'fell',
  'on',
  'friday',
  'after',
  'news',
  'of',
  'rising',
  'inflation'],
 ['welcome', 'to', 'the', 'website'],
 ['python', 'is', 'a', 'great', 'programming', 'language']]

## Stop Words Removal

In [10]:
from nltk.corpus import stopwords

In [11]:
# Fetch NLTK's stop-word lists; the download is skipped when the package is
# already up to date locally.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/falcon/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
# Hold the English stop words in a set for quick membership checks while filtering.
stop_words = {*stopwords.words('english')}

In [13]:
# Keep only the tokens that are not English stop words, document by document.
filtered_corpus = [
    [word for word in tokens if word not in stop_words]
    for tokens in tokenization_corpus
]

In [14]:
filtered_corpus

[['cant', 'wait', 'new', 'season', 'favorite', 'show'],
 ['covid', 'pandemic', 'affected', 'millions', 'people', 'worldwide'],
 ['us', 'stocks', 'fell', 'friday', 'news', 'rising', 'inflation'],
 ['welcome', 'website'],
 ['python', 'great', 'programming', 'language']]

## Lemmatization

In [15]:
from nltk.stem import WordNetLemmatizer

In [16]:
# Fetch the WordNet data needed for the lemmatization step; the download is
# skipped when the package is already up to date locally.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/falcon/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [17]:
# Single lemmatizer instance, reused for every token in the corpus.
lemmatizer = WordNetLemmatizer()

In [18]:
# Lemmatize every remaining token. No part-of-speech tag is passed, so the
# lemmatizer's default applies (e.g. 'millions' -> 'million', but also
# 'us' -> 'u', while 'rising' is left unchanged).
lemmatized_corpus = [
    [lemmatizer.lemmatize(token) for token in tokens]
    for tokens in filtered_corpus
]

In [19]:
lemmatized_corpus

[['cant', 'wait', 'new', 'season', 'favorite', 'show'],
 ['covid', 'pandemic', 'affected', 'million', 'people', 'worldwide'],
 ['u', 'stock', 'fell', 'friday', 'news', 'rising', 'inflation'],
 ['welcome', 'website'],
 ['python', 'great', 'programming', 'language']]

## Stemming

In [20]:
from nltk.stem import PorterStemmer

In [21]:
# Single Porter stemmer instance, reused for every token in the corpus.
stemmer = PorterStemmer()

In [22]:
# Stem the already-lemmatized tokens, so the two reductions are applied in sequence.
stemmer_corpus = [
    [stemmer.stem(token) for token in tokens]
    for tokens in lemmatized_corpus
]

In [23]:
stemmer_corpus

[['cant', 'wait', 'new', 'season', 'favorit', 'show'],
 ['covid', 'pandem', 'affect', 'million', 'peopl', 'worldwid'],
 ['u', 'stock', 'fell', 'friday', 'news', 'rise', 'inflat'],
 ['welcom', 'websit'],
 ['python', 'great', 'program', 'languag']]