

# Steps:

Lowercasing the sentence.

Removing everything except lowercase letters and whitespace using a regex pattern with re.sub.

Splitting the sentence into individual words.

Removing stopwords (like 'at', 'the', etc.).

Stemming the words (reducing them to their root form).


# Applying to a single sentence

In [12]:
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk


# Sample sentence: contains mixed case, digits, punctuation and a URL.
sentence = "Traffic is I Light loving intelligently detected at 4th intersection! 3423423423 Check details here: https://example.com"
# Fetch the NLTK stopword corpus that stopwords.words('english') reads later.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [13]:
# Lowercase the sentence so that the a-z-only regex applied in the next step
# does not discard capital letters.
sentence = sentence.lower()
sentence  # bare expression: the notebook displays the value

'traffic is i light loving intelligently detected at 4th intersection! 3423423423 check details here: https://example.com'

In [14]:
# Remove everything except lowercase letters and whitespace. Characters are
# deleted rather than replaced by a space, so "4th" becomes "th" and the URL
# collapses into the single token "httpsexamplecom".
sentence = re.sub(r'[^a-z\s]', '', sentence)
sentence

'traffic is i light loving intelligently detected at th intersection  check details here httpsexamplecom'

In [15]:
# Split the sentence into words. split() with no argument splits on any run
# of whitespace, so the double space left where the number was removed does
# not produce an empty token.
words = sentence.split()
words

['traffic',
 'is',
 'i',
 'light',
 'loving',
 'intelligently',
 'detected',
 'at',
 'th',
 'intersection',
 'check',
 'details',
 'here',
 'httpsexamplecom']

In [16]:

# Remove stopwords (using NLTK's stopword list).
# The list is converted to a set so each membership test is a hash lookup.
stop_words = set(stopwords.words('english'))
filtered_words = [word for word in words if word not in stop_words]

filtered_words

['traffic',
 'light',
 'loving',
 'intelligently',
 'detected',
 'th',
 'intersection',
 'check',
 'details',
 'httpsexamplecom']

In [17]:
# Initialize the stemmer
stemmer = PorterStemmer()

# Apply stemming to each word
stemmed_words = [stemmer.stem(word) for word in filtered_words]

# Final result
# NOTE(review): the "Original Sentence" literal printed here is not the
# sentence defined at the top of this section (that one also contains "is I",
# "loving intelligently" and "3423423423"). The real original was overwritten
# by the lowercasing step, so the two printed lines do not correspond.
print("Original Sentence: ", "Traffic light detected at 4th intersection! Check details here: https://example.com")
print("Processed Sentence: ", " ".join(stemmed_words))

Original Sentence:  Traffic light detected at 4th intersection! Check details here: https://example.com
Processed Sentence:  traffic light love intellig detect th intersect check detail httpsexamplecom


# Applying to a dataset

In [18]:
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Function to process sentences
def process_sentences(sentences):
    """Clean, filter and stem every sentence in an iterable of strings.

    Each sentence goes through the same pipeline:
      1. lowercase the text;
      2. delete every character that is not a-z or whitespace;
      3. split on whitespace;
      4. drop NLTK English stopwords;
      5. reduce each remaining word with the Porter stemmer;
      6. re-join the stems with single spaces.

    Args:
        sentences: iterable of strings to process.

    Returns:
        A list of processed strings, one per input sentence, in input order.
    """
    english_stopwords = set(stopwords.words('english'))
    porter = PorterStemmer()

    def _clean(text):
        # Characters are deleted, not replaced by spaces, so tokens such as
        # URLs collapse into a single run of letters.
        letters_only = re.sub(r'[^a-z\s]', '', text.lower())
        kept = (token for token in letters_only.split()
                if token not in english_stopwords)
        return " ".join(porter.stem(token) for token in kept)

    return [_clean(text) for text in sentences]

# Example sentences to process (they include digits, punctuation and URLs)
sentences = [
    "Traffic light detected at 4th intersection! Check details here: https://example.com",
    "Road signs indicate a detour due to construction near the city center.",
    "Accidents were reported at the 10th street junction. Visit www.accidents.com for more info."
]

# Process the sentences
processed = process_sentences(sentences)

# Print processed sentences, numbered from 1.
# Note: the loop rebinds the module-level name `sentence`.
for i, sentence in enumerate(processed):
    print(f"Processed Sentence {i + 1}: {sentence}")


Processed Sentence 1: traffic light detect th intersect check detail httpsexamplecom
Processed Sentence 2: road sign indic detour due construct near citi center
Processed Sentence 3: accid report th street junction visit wwwaccidentscom info


# Tokenization (Embeddings)

Steps:

Tokenize the sentence into words.

Convert each word into an integer index using Keras' Tokenizer.

Apply embeddings using the Keras Embedding layer (in the code below, this runs after the padding step).

Pad the tokenized sequence to ensure all sequences have the same length.

In [19]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding
import numpy as np

# Example sentence with 5 words, wrapped in a list because it is passed to
# fit_on_texts / texts_to_sequences as a collection of texts.
sentence = ["This is a sample sentence"] # 10: presumably the padded length used further down

In [20]:
# Initialize the Tokenizer
tokenizer = Tokenizer()

# Fit the tokenizer on the sentence (this populates tokenizer.word_index)
tokenizer.fit_on_texts(sentence)

# Convert the sentence into a sequence of tokens (integer indices).
# Indices start at 1, which leaves 0 free to serve as the padding value.
sequences = tokenizer.texts_to_sequences(sentence)
sequences

[[1, 2, 3, 4, 5]]

In [23]:
# Pad the sequences (padding to a fixed length, here 10 for demonstration).
# padding='post' places the zeros after the real tokens.
padded_sequences = pad_sequences(sequences, padding='post', maxlen=10)
padded_sequences

array([[1, 2, 3, 4, 5, 0, 0, 0, 0, 0]], dtype=int32)

In [24]:
# Define the embedding layer (using random embeddings for demonstration).
# input_dim is the vocabulary size + 1 because token indices start at 1 and
# 0 is the padding value; every index maps to a 50-dimensional vector.
# NOTE(review): input_length is deprecated in newer Keras releases — confirm
# against the installed version.
embedding_layer = Embedding(input_dim=len(tokenizer.word_index) + 1,
                            output_dim=50, input_length=10)

# Apply the embedding layer to the padded sequences.
# The layer has not been trained, so the vectors are its initial weights.
embedded_sequences = embedding_layer(np.array(padded_sequences))

# Print the padded sequences and embeddings
print("Tokenized Sequences:", sequences)
print("Padded Sequences:", padded_sequences)
print("Embedded Sequences:", embedded_sequences.numpy())

Tokenized Sequences: [[1, 2, 3, 4, 5]]
Padded Sequences: [[1 2 3 4 5 0 0 0 0 0]]
Embedded Sequences: [[[-0.02770307 -0.03901814  0.02857963 -0.03703789  0.00625979
    0.03331539 -0.03900361  0.00171049 -0.01270552 -0.00782112
    0.0462851   0.02584353 -0.00704037 -0.02377622 -0.01201887
   -0.02059838 -0.04683933 -0.0069509  -0.04808817 -0.03649639
   -0.02652862 -0.0153277  -0.02721206 -0.04391104  0.00010464
   -0.03048887  0.01785128 -0.04152553 -0.0030301  -0.04668398
   -0.03317467 -0.04377502  0.00439275 -0.00600898  0.03006338
    0.00093473  0.00305034 -0.00997962  0.0395583   0.01745769
    0.02237098 -0.04808968  0.03399346 -0.01484156  0.0298318
    0.00585966  0.03861896 -0.04303154  0.00226866 -0.04007905]
  [-0.01282611  0.01882828 -0.01180019 -0.02730849  0.02534733
   -0.02903639 -0.04109927 -0.02885828  0.01317869 -0.02102422
    0.04284105 -0.03320094  0.02794321 -0.00050126 -0.03020349
   -0.00153934 -0.01677737  0.04621373 -0.01123105  0.03460467
    0.03964236  0

