In [5]:
# Chapter 4 - Notebook 1: Text Cleaning Demo using ISL_CSLRT Dataset

"""
This notebook demonstrates basic text preprocessing using your ISL_CSLRT dataset.
We will:
- Load sentences from the provided CSV file
- Apply text normalization (case folding, punctuation removal, whitespace collapsing)
- Prepare gloss-style (uppercase) cleaned sentences
"""

import pandas as pd
import re

# Location of the metadata CSV; adjust the path if the file lives elsewhere.
meta_file = "isl_train_meta.csv"

# Read the CSV into a DataFrame and report how many rows it holds.
df = pd.read_csv(meta_file)
print(f"Dataset Loaded: {len(df)} samples")

# Preview the first rows of the raw 'Sentences' column.
print("\nSample Raw Sentences:")
raw_preview = df['Sentences'].head()
print(raw_preview)

# Simple Text Cleaning Function
def clean_text_to_gloss(text):
    """Convert a raw sentence into an uppercase, punctuation-free gloss string.

    Args:
        text: The sentence to clean. ``None`` and float NaN (what pandas
            yields for an empty CSV cell) are treated as missing; any other
            non-string value is converted with ``str()`` first.

    Returns:
        The uppercased text with every character that is neither a word
        character nor whitespace removed, whitespace runs collapsed to a
        single space, and leading/trailing whitespace stripped. Missing
        input yields ``""``.
    """
    # NaN is the only float that is not equal to itself; checking it this way
    # avoids an AttributeError from calling .lower() on a missing cell.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()
    # \w counts the underscore as a word character, so "_" is kept.
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text.upper()

# Derive the gloss-style column by running the cleaner over every raw sentence.
df['cleaned_sentence'] = df['Sentences'].apply(clean_text_to_gloss)

# Raw and cleaned columns side by side, used for both the preview and the export.
paired_columns = ['Sentences', 'cleaned_sentence']
paired_view = df[paired_columns]

print("\nCleaned sentence examples:")
print(paired_view.head())

# Optional: persist the paired columns, omitting the DataFrame index.
paired_view.to_csv("cleaned_sentences.csv", index=False)

Dataset Loaded: 500 samples

Sample Raw Sentences:
0    it does not make any difference to me
1                            tell me truth
2                           do me a favour
3                             do not worry
4                         do not abuse him
Name: Sentences, dtype: object

Cleaned sentence examples:
                               Sentences  \
0  it does not make any difference to me   
1                          tell me truth   
2                         do me a favour   
3                           do not worry   
4                       do not abuse him   

                        cleaned_sentence  
0  IT DOES NOT MAKE ANY DIFFERENCE TO ME  
1                          TELL ME TRUTH  
2                         DO ME A FAVOUR  
3                           DO NOT WORRY  
4                       DO NOT ABUSE HIM  
