In [4]:
# Chapter 4 - Notebook 1: Text Cleaning Demo using ISL_CSLRT Dataset

"""
This notebook demonstrates basic text preprocessing using your ISL_CSLRT dataset.
We will:
- Load sentences from the provided CSV file
- Apply text normalization (lowercasing, punctuation removal, whitespace normalization)
- Optionally remove English stopwords
- Prepare gloss-style cleaned sentences (uppercase tokens)
"""
# Import necessary libraries
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords

# Initialize stopwords, downloading the NLTK corpus only if it is missing.
# The corpus reader raises LookupError when the data is not installed, so the
# common case (already downloaded) needs no network round-trip.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

# Load dataset
df = pd.read_csv('isl_train_meta.csv')

# Preview raw data
print("Sample Raw Sentences:")
print(df['Sentences'].head())

# Function: Text Cleaning Pipeline
def clean_text(sentence, remove_stopwords=True):
    """Convert a raw sentence into an uppercase, gloss-style token string.

    The sentence is lowercased, stripped of punctuation, whitespace-normalized
    and split on whitespace. Tokens are optionally filtered against the
    module-level ``stop_words`` set and finally re-joined in uppercase.

    Args:
        sentence: Raw sentence text. ``None`` and float NaN (what pandas
            yields for an empty CSV cell) are treated as an empty sentence;
            any other non-string value is converted with ``str()``.
        remove_stopwords: When True (default), drop tokens that appear in
            ``stop_words``.

    Returns:
        The remaining tokens joined by single spaces, in uppercase. An empty
        string is returned when no tokens remain.
    """
    # Guard against missing / non-string cells instead of crashing on .lower()
    if not isinstance(sentence, str):
        # NaN is the only float that is not equal to itself
        if sentence is None or (isinstance(sentence, float) and sentence != sentence):
            return ''
        sentence = str(sentence)

    # Step 1: Lowercase the sentence
    sentence = sentence.lower()

    # Step 2: Remove punctuation (anything that is neither a word character
    # nor whitespace)
    sentence = re.sub(r'[^\w\s]', '', sentence)

    # Step 3: Normalize whitespace
    sentence = re.sub(r'\s+', ' ', sentence).strip()

    # Step 4: Tokenize the sentence
    tokens = sentence.split()

    # Step 5: Optional - Remove Stopwords
    # NOTE(review): with the NLTK English list this also drops negations such
    # as "not" ("do not worry" -> "WORRY" in the notebook output) - confirm
    # that this is intended for gloss generation.
    if remove_stopwords:
        tokens = [word for word in tokens if word not in stop_words]

    # Step 6: Convert to gloss style (uppercase tokens)
    return ' '.join(tokens).upper()

# Run every raw sentence through the cleaning pipeline (default settings)
df['cleaned_gloss'] = df['Sentences'].apply(clean_text)

# Show the raw sentences side by side with their gloss form
preview_columns = ['Sentences', 'cleaned_gloss']
print("\nSample Cleaned Gloss Sequences:")
print(df[preview_columns].head())


Sample Raw Sentences:
0    it does not make any difference to me
1                            tell me truth
2                           do me a favour
3                             do not worry
4                         do not abuse him
Name: Sentences, dtype: object

Sample Cleaned Gloss Sequences:
                               Sentences    cleaned_gloss
0  it does not make any difference to me  MAKE DIFFERENCE
1                          tell me truth       TELL TRUTH
2                         do me a favour           FAVOUR
3                           do not worry            WORRY
4                       do not abuse him            ABUSE


[nltk_data] Downloading package stopwords to /Users/mac/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
