### Importing libraries

In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

In [2]:
# Fetch the NLTK data packages the functions below rely on. When a package is
# already installed, download() reports it as up-to-date and returns True.
nltk.download('punkt')  # Download the necessary resource for tokenization
nltk.download('stopwords')  # Download the stopwords corpus

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Python function for removing stopwords and printing output as tokens

In [3]:
def tokenize_text_with_stopwords(text):
    """Split ``text`` into word tokens and discard English stopwords.

    Despite its name, this function *removes* stopwords. Tokenization is
    delegated to ``word_tokenize``; a token is dropped when its lower-cased
    form appears in the NLTK English stopword list. Punctuation tokens are
    not stopwords, so they stay in the result.

    Args:
        text: The string to tokenize.

    Returns:
        A list of the surviving tokens, in their original order and casing.
    """
    english_stopwords = frozenset(stopwords.words('english'))
    kept = []
    for token in word_tokenize(text):
        # Compare case-insensitively so "The" is treated like "the".
        if token.lower() in english_stopwords:
            continue
        kept.append(token)
    return kept

In [4]:
# Sample sentence containing several English stopwords ("the", "with", "is", ...)
# that the stopword filter is expected to remove.
text_document = "Tokenizing the text with stop words such as is, the, and was as delimiters should identify meaningful phrases."

In [5]:
# Filter the sample sentence. Only stopwords are removed at this stage, so
# punctuation tokens such as ',' and '.' still appear in the printed list.
tokens = tokenize_text_with_stopwords(text_document)

print("Tokens:", tokens)

Tokens: ['Tokenizing', 'text', 'stop', 'words', ',', ',', 'delimiters', 'identify', 'meaningful', 'phrases', '.']


### Performing stopword removal on a text document and printing output as tokens

In [6]:
def tokenize_text_file_stopwords(file_path):
    """Read a UTF-8 text file and return its tokens with stopwords removed.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The list produced by ``tokenize_text_with_stopwords`` for the file's
        full contents.
    """
    with open(file_path, 'r', encoding='utf-8') as source:
        contents = source.read()
    # The whole file is already in memory, so filtering needs no open handle.
    return tokenize_text_with_stopwords(contents)

In [7]:
# Text file to process. The path is relative, so it is resolved against the
# current working directory.
file_path = 'demo.txt'

# Apply the stopword filter to the file's contents and show the tokens.
tokens = tokenize_text_file_stopwords(file_path)
print("Tokens:", tokens)

Tokens: ['good', 'morning', '!', 'Good', 'morning', 'ye', 'thou', '!', '’', 'say', 'patients', ',', 'worse', 'hypocrites', ',', 'hypocrites', ',', 'cruel', 'phony', 'hypocrites', ',', 'worst', '.', '”']


### Python function for removing stopwords and punctuation and printing output as tokens

In [8]:
def tokenize_text_without_punctuation(text):
    """Tokenize ``text`` and drop English stopwords and punctuation tokens.

    A token is discarded when its lower-cased form is an English stopword, or
    when every one of its characters is listed in ``string.punctuation``.
    Only that ASCII punctuation set is considered: typographic characters such
    as curly quotes are not in ``string.punctuation`` and are therefore kept.

    Args:
        text: The string to tokenize.

    Returns:
        A list of the surviving tokens, in their original order and casing.
    """
    stop_words = set(stopwords.words('english'))
    punctuation_chars = set(string.punctuation)
    filtered_tokens = []
    for word in word_tokenize(text):
        if word.lower() in stop_words:
            continue
        # `word in string.punctuation` would be a substring test against one
        # long string, so multi-character tokens such as '...', '--' or the
        # quote tokens '``' and "''" would slip through. Checking each
        # character against the set removes every punctuation-only token.
        if all(char in punctuation_chars for char in word):
            continue
        filtered_tokens.append(word)
    return filtered_tokens

In [9]:
# Sample sentence with extra commas, to show that punctuation tokens are
# removed along with the stopwords.
text_document = "Tokenizing the text, with stop words such as is, the, and was, as delimiters should identify meaningful phrases."

tokens = tokenize_text_without_punctuation(text_document)

print('Tokens:', tokens)

Tokens: ['Tokenizing', 'text', 'stop', 'words', 'delimiters', 'identify', 'meaningful', 'phrases']


### Removing stopwords and punctuation from a text document and printing output as tokens

In [10]:
def tokenize_text_file_stopwords_punctuation(file_path):
    """Read a UTF-8 text file and return its tokens without stopwords or punctuation.

    The filtering itself is done by ``tokenize_text_without_punctuation`` so
    that strings and files are processed by one shared implementation, rather
    than this function filtering stopwords a second time with its own copy of
    the logic.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The list produced by ``tokenize_text_without_punctuation`` for the
        file's full contents.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    # The file is closed before the (potentially slower) tokenization runs.
    return tokenize_text_without_punctuation(text)

In [11]:
# Text file to process (relative to the current working directory).
file_path = 'demo.txt'

# Remove both stopwords and punctuation tokens from the file's contents.
tokens = tokenize_text_file_stopwords_punctuation(file_path)
print("Tokens:", tokens)

Tokens: ['good', 'morning', 'Good', 'morning', 'ye', 'thou', '’', 'say', 'patients', 'worse', 'hypocrites', 'hypocrites', 'cruel', 'phony', 'hypocrites', 'worst', '”']


### Python function for removing stopwords and printing output as a sentence

In [12]:
def remove_stopwords_without_tokenizing(text):
    """Return ``text`` with English stopwords removed, as a single string.

    The input is split on whitespace only, so punctuation stays attached to
    its neighbouring word (a word followed directly by "." is compared with
    the period included). The surviving words are re-joined with single
    spaces, which also collapses any run of whitespace in the input.

    Args:
        text: The string to filter.

    Returns:
        The filtered words joined by single spaces.
    """
    english_stopwords = frozenset(stopwords.words('english'))
    surviving_words = []
    for chunk in text.split():
        # Case-insensitive comparison against the stopword list.
        if chunk.lower() not in english_stopwords:
            surviving_words.append(chunk)
    return ' '.join(surviving_words)

In [13]:
# Two-sentence sample used to show stopword removal without tokenization.
text = "This is an example sentence demonstrating stop word removal. Here all the stop words are removed"
filtered_text = remove_stopwords_without_tokenizing(text)
# NOTE(review): the label says "Tokens:" but the value printed is one joined
# string, not a list of tokens.
print("Tokens:", filtered_text)

Tokens: example sentence demonstrating stop word removal. stop words removed


### Performing stopword removal on a text document and printing output as a sentence

In [14]:
def remove_stopwords_without_tokenizing_on_document(text):
    """Read a UTF-8 text file and return its contents with stopwords removed.

    Args:
        text: Path of the text file to read. The parameter keeps its original
            name for backward compatibility, although it holds a file path
            rather than the text itself.

    Returns:
        The file's contents filtered by ``remove_stopwords_without_tokenizing``:
        one string with the surviving words separated by single spaces.
    """
    # Open the path that was passed in. The previous version ignored its
    # argument and read the notebook-level global `file_path`, so it failed on
    # a fresh kernel and could silently read a different file than requested.
    with open(text, 'r', encoding='utf-8') as file:
        document_text = file.read()
    # Reuse the string-level helper so both entry points filter identically.
    return remove_stopwords_without_tokenizing(document_text)

In [15]:
# Text file to process (relative to the current working directory).
file_path = "demo.txt"
filtered_text = remove_stopwords_without_tokenizing_on_document(file_path)
# NOTE(review): the label says "Tokens:" but the value printed is one joined
# string, not a list of tokens.
print("Tokens:", filtered_text)

Tokens: good morning! Good morning ye thou! I’d say patients, worse hypocrites, hypocrites, cruel phony hypocrites, worst.”
