In [1]:
import pandas as pd
import string
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Exercise 1: Lowercase

In [2]:
# Two short example strings wrapped in a named pandas Series
list_ = [
    "This is my first NLP exercise",
    "wtf!!!!!",
]
series_data = pd.Series(data=list_, name='text')

# Show the untouched Series first, then each case conversion as a plain list
print(series_data)
for heading, converted in (
    ("Lowercase text\n", series_data.str.lower()),
    ("Uppercase text\n", series_data.str.upper()),
):
    print(heading, converted.tolist())

0    This is my first NLP exercise
1                         wtf!!!!!
Name: text, dtype: object
Lowercase text
 ['this is my first nlp exercise', 'wtf!!!!!']
Uppercase text
 ['THIS IS MY FIRST NLP EXERCISE', 'WTF!!!!!']


# Exercise 2: Punctuation

In [3]:
# Sample sentence. It contains a literal double quote, so it is written as a
# single-quoted string with the apostrophe escaped. Inside plain double quotes
# the string would end at `!"` and everything from `#` onward would be parsed
# as a comment, silently dropping the trailing punctuation from the sample.
sentence = 'Remove, this from .? the sentence !!!! !"#&\'()*+,-./:;<=>_'

# string.punctuation contains all ASCII punctuation marks.
# str.maketrans('', '', chars) builds a table that deletes every character in chars.
cleaned = sentence.translate(str.maketrans('', '', string.punctuation))
print("Cleaned sentence:")
print(cleaned)

# Alternatively: keep only the characters that are not punctuation
cleaned_2 = ''.join(char for char in sentence if char not in string.punctuation)
print("\nAlternative method:")
print(cleaned_2)

Cleaned sentence:
Remove this from  the sentence  

Alternative method:
Remove this from  the sentence  


# Exercise 3: Tokenization

In [4]:
# Download the 'punkt_tab' tokenizer data through the NLTK downloader.
# NOTE(review): presumably required by sent_tokenize / word_tokenize in the
# next cell — confirm against the installed NLTK version.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/barraotieno/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [5]:
# Two-sentence sample paragraph used for both tokenization demos
text = (
    "Bitcoin is a cryptocurrency invented in 2008 by an unknown person or group "
    "of people using the name Satoshi Nakamoto. "
    "The currency began use in 2009 when its implementation was released as "
    "open-source software."
)

# Split the paragraph into sentences
sentences = sent_tokenize(text)
print("Sentences:", sentences)

# Split the paragraph into word-level tokens; only the first ten are printed
words = word_tokenize(text)
first_ten_words = words[:10]
print("\nWords:", first_ten_words, "...")

Sentences: ['Bitcoin is a cryptocurrency invented in 2008 by an unknown person or group of people using the name Satoshi Nakamoto.', 'The currency began use in 2009 when its implementation was released as open-source software.']

Words: ['Bitcoin', 'is', 'a', 'cryptocurrency', 'invented', 'in', '2008', 'by', 'an', 'unknown'] ...


# EXERCISE 4: STOP WORDS REMOVAL

In [6]:
# Download the NLTK 'stopwords' package (only needed once per environment).
# NOTE(review): presumably the data behind stopwords.words('english') used in
# the next cell — confirm.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/barraotieno/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [7]:
# Sample text (leading/trailing newlines and the double space are part of the original value)
text = (
    "\n"
    "The goal of this exercise is to learn to remove stop words with NLTK.  "
    "Stop words usually refers to the most common words in a language."
    "\n"
)

# Lowercase first so the tokens can be compared with the lowercase stop word list
lowercased_text = text.lower()
tokens = word_tokenize(lowercased_text)

# English stop words as a set for fast membership checks
stop_words = set(stopwords.words('english'))

# Keep purely alphabetic tokens that are not stop words
filtered_tokens = [
    token
    for token in tokens
    if token.isalpha()
    if token not in stop_words
]
print("Filtered tokens:", filtered_tokens)

Filtered tokens: ['goal', 'exercise', 'learn', 'remove', 'stop', 'words', 'nltk', 'stop', 'words', 'usually', 'refers', 'common', 'words', 'language']


# EXERCISE 5: STEMMING

In [8]:
# Example sentence containing several words that share the stem "interview"
text = "The interviewer interviews the president in an interview"

# Lowercase and tokenize first; stemming is applied only to alphabetic tokens
tokens = word_tokenize(text.lower())
stemmer = PorterStemmer()

stemmed = list(map(stemmer.stem, filter(str.isalpha, tokens)))
print("Stemmed tokens:", stemmed)

Stemmed tokens: ['the', 'interview', 'interview', 'the', 'presid', 'in', 'an', 'interview']


# EXERCISE 6: TEXT PREPROCESSING

In [9]:
# Sample paragraph, kept verbatim: the line breaks and the trailing spaces
# before them are written out explicitly so they stay visible.
text = (
    "\n"
    "01 Edu System presents an innovative curriculum in software engineering and programming. \n"
    "With a renowned industry-leading reputation, the curriculum has been rigorously designed for \n"
    "learning skills of the digital world and technology industry. Taking a different approach than \n"
    "the classic teaching methods today, learning is facilitated through a collective and co-creative \n"
    "process in a professional environment.\n"
)

# Preprocessing pipeline wrapped in a single reusable function
def preprocess_text(text):
    """
    Turn raw text into a list of stemmed, stopword-free word tokens.

    The steps run in this order:
      1. lowercase the text
      2. delete every character found in string.punctuation
      3. split the result into tokens with word_tokenize
      4. keep only alphabetic tokens that are not English stop words
      5. reduce each remaining token to its Porter stem

    Args:
        text: the string to preprocess.

    Returns:
        A list of stemmed tokens, in the order they appear in the text.

    Note:
        Punctuation is deleted rather than replaced by a space, so hyphenated
        words are fused before tokenization (e.g. "industry-leading" becomes
        "industryleading"). Tokens that are not purely alphabetic, such as
        "01", are dropped by the isalpha() check.
    """
    # Steps 1-2: normalize case, then strip punctuation characters
    punctuation_remover = str.maketrans('', '', string.punctuation)
    normalized = text.lower().translate(punctuation_remover)

    # Step 3: word-level tokens
    candidates = word_tokenize(normalized)

    # Steps 4-5: filter and stem in a single pass
    english_stop_words = set(stopwords.words('english'))
    porter = PorterStemmer()
    stems = []
    for candidate in candidates:
        if candidate in english_stop_words or not candidate.isalpha():
            continue
        stems.append(porter.stem(candidate))

    return stems

# Run the full pipeline on the sample paragraph and print the resulting token list
print(preprocess_text(text))

['edu', 'system', 'present', 'innov', 'curriculum', 'softwar', 'engin', 'program', 'renown', 'industrylead', 'reput', 'curriculum', 'rigor', 'design', 'learn', 'skill', 'digit', 'world', 'technolog', 'industri', 'take', 'differ', 'approach', 'classic', 'teach', 'method', 'today', 'learn', 'facilit', 'collect', 'cocr', 'process', 'profession', 'environ']
