In [13]:
import re #regular expression

In [14]:
# Function to tokenize text with specified stopwords as delimiters
def tokenize_with_stopwords(text, stopwords):
    # Create a regular expression pattern
    pattern = re.compile(r'\b(?:' + '|'.join(stopwords) + r')\b|\b\w+\b')
    
    # Tokenize the text using the pattern
    tokens = pattern.findall(text)
    return tokens

##### This line constructs a regular expression pattern. Let's break it down:
    ### \b: This denotes a word boundary.
    ### (?: ... ): This is a non-capturing group that allows grouping of the specified stopwords.
    ### |: This is the alternation operator, which acts like a logical OR.
    ###' '.join(stopwords): This joins the stopwords with the alternation operator, creating a pattern like word1|word2|word3.
    ###\b: Another word boundary.
    ###|\b\w+\b: This part of the pattern matches any word (sequence of word characters) that is not in the stopwords list.

In [15]:
# Sample text for demonstration
sample_text = "This is a sample text, and it was written to demonstrate tokenization with stopwords."

In [16]:
# Define stopwords
stopwords = ["is", "the", "was"]

In [17]:
# Tokenize the sample text with stopwords as delimiters
tokens = tokenize_with_stopwords(sample_text, stopwords)

In [18]:
# Print the tokens
print("Tokens:", tokens)

Tokens: ['This', 'is', 'a', 'sample', 'text', 'and', 'it', 'was', 'written', 'to', 'demonstrate', 'tokenization', 'with', 'stopwords']


## Using a Text File as Input Data

In [19]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [20]:
# Define a function for tokenization
def tokenize_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file: #uni-code transformation format
        text = file.read()
        tokens = word_tokenize(text)
    return tokens

In [21]:
# Replace 'your_file.txt' with the path to your text file
file_path = 'sample_text.txt'

In [22]:
# Tokenize the content of the file
tokens = tokenize_file(file_path)

In [23]:
# Print the tokens
print("Tokens:", tokens)

Tokens: ['The', 'Taj', 'Mahal', 'complex', 'is', 'believed', 'to', 'have', 'been', 'completed', 'in', 'its', 'entirety', 'in', '1653', 'at', 'a', 'cost', 'estimated', 'at', 'the', 'time', 'to', 'be', 'around', '₹32', 'million', ',', 'which', 'in', '2023', 'would', 'be', 'approximately', '₹35', 'billion', '(', 'US', '$', '498', 'million', ')', '.', '[', '8', ']', 'The', 'construction', 'project', 'employed', 'some', '20,000', 'artisans', 'under', 'the', 'guidance', 'of', 'a', 'board', 'of', 'architects', 'led', 'by', 'Ustad', 'Ahmad', 'Lahori', ',', 'the', 'emperor', "'s", 'court', 'architect', '.', 'Various', 'types', 'of', 'symbolism', 'have', 'been', 'employed', 'in', 'the', 'Taj', 'to', 'reflect', 'natural', 'beauty', 'and', 'divinity', '.', 'The', 'Taj', 'Mahal', 'was', 'designated', 'as', 'a', 'UNESCO', 'World', 'Heritage', 'Site', 'in', '1983', 'for', 'being', '``', 'the', 'jewel', 'of', 'Islamic', 'art', 'in', 'India', 'and', 'one', 'of', 'the', 'universally', 'admired', 'master

## Various Tokenization Techniques

### 1. Word Tokenization

In [24]:
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')

text = "Word tokenization is important for natural language processing."
tokens = word_tokenize(text)
print("Word Tokens:", tokens)

Word Tokens: ['Word', 'tokenization', 'is', 'important', 'for', 'natural', 'language', 'processing', '.']


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### 2. Sentence Tokenization

In [25]:
import nltk
from nltk.tokenize import sent_tokenize

nltk.download('punkt')

text = "Sentence tokenization breaks text into sentences. This is an example."
sentences = sent_tokenize(text)
print("Sentences:", sentences)

Sentences: ['Sentence tokenization breaks text into sentences.', 'This is an example.']


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### 3. Whitespace Tokenization

In [26]:
text = "Whitespace tokenization splits text based on spaces."
tokens = text.split()
print("Whitespace Tokens:", tokens)

Whitespace Tokens: ['Whitespace', 'tokenization', 'splits', 'text', 'based', 'on', 'spaces.']


### 4. Custom Delimiter Tokenization

In [27]:
import re

text = "Custom delimiter tokenization, separating items: apple, orange, banana."
tokens = re.split(r'[,\s]', text)
print("Custom Delimiter Tokens:", tokens)

Custom Delimiter Tokens: ['Custom', 'delimiter', 'tokenization', '', 'separating', 'items:', 'apple', '', 'orange', '', 'banana.']
