# Text Cleaning for the ISL_CSLRT Dataset

This notebook demonstrates how to clean sentence-level data from the ISL_CSLRT dataset for Sign Language Translation tasks. It covers:
- Lowercasing
- Punctuation removal
- Whitespace normalization
- Stopword removal
- Tokenization
- Conversion to gloss-style (uppercase tokens)


In [None]:
# ✅ Colab Setup
from google.colab import drive
drive.mount('/content/drive')
!pip install nltk pandas

Mounted at /content/drive


In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords

# Download the NLTK 'stopwords' corpus that `stopwords.words('english')`
# reads later in this notebook. The call returns True on success.
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Metadata CSV stored on the mounted Drive.
meta_csv_path = '/content/drive/MyDrive/IETGenAI-SLT/Chapter 4/isl_train_meta.csv'
df = pd.read_csv(meta_csv_path)
# Preview the first rows to confirm the expected columns were loaded.
df.head()


Unnamed: 0,Sentences,File location,gloss_sequence,signer_id,sample_id
0,it does not make any difference to me,ISL_CSLRT_Corpus\Videos_Sentence_Level\it does...,IT MAKE ANY DIFFERENCE ME DO NOT,6,ISL_0278_S6
1,tell me truth,ISL_CSLRT_Corpus\Videos_Sentence_Level\tell me...,TELL TRUTH,6,ISL_0341_S6
2,do me a favour,ISL_CSLRT_Corpus\Videos_Sentence_Level\do me a...,DO FAVOUR ME,4,ISL_0046_S4
3,do not worry,ISL_CSLRT_Corpus\Videos_Sentence_Level\do not ...,DONOT WORRY,4,ISL_0065_S4
4,do not abuse him,ISL_CSLRT_Corpus\Videos_Sentence_Level\do not ...,HIM ABUSE DONOT,5,ISL_0048_S5


In [None]:
# English stopword list from NLTK, stored as a set so the per-token
# membership test in clean_text is a hash lookup rather than a list scan.
stop_words = set(stopwords.words('english'))

def clean_text(sentence, remove_stopwords=True):
    """Normalize an English sentence into a gloss-style string of uppercase tokens.

    Steps: lowercase, strip punctuation (including underscores), collapse
    whitespace, optionally drop English stopwords, then upper-case the
    remaining tokens and join them with single spaces.

    Args:
        sentence: The raw sentence. ``None`` and float NaN (how pandas
            represents an empty CSV cell) are treated as missing.
        remove_stopwords: When True (default), tokens found in the
            module-level ``stop_words`` set are dropped.

    Returns:
        The cleaned, space-separated, uppercase token string; ``''`` when the
        input is missing or nothing survives cleaning.

    Note:
        Stopword filtering also removes negations and pronouns: in this
        notebook's output "do not worry" becomes "WORRY". Pass
        ``remove_stopwords=False`` when negation must be preserved.
    """
    # Missing values would otherwise fail on .lower(); NaN is the only float
    # that compares unequal to itself.
    if sentence is None or (isinstance(sentence, float) and sentence != sentence):
        return ''
    sentence = sentence.lower()
    # \w treats "_" as a word character, so it has to be removed explicitly.
    sentence = re.sub(r'[^\w\s]|_', '', sentence)
    sentence = re.sub(r'\s+', ' ', sentence).strip()
    tokens = sentence.split()
    if remove_stopwords:
        tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens).upper()


In [None]:
# Add the gloss-style column computed from each raw sentence
# (clean_text's default removes stopwords).
df['cleaned_gloss'] = df['Sentences'].apply(clean_text)
# Show the raw sentences side by side with their cleaned glosses.
df[['Sentences', 'cleaned_gloss']].head()


Unnamed: 0,Sentences,cleaned_gloss
0,it does not make any difference to me,MAKE DIFFERENCE
1,tell me truth,TELL TRUTH
2,do me a favour,FAVOUR
3,do not worry,WORRY
4,do not abuse him,ABUSE


In [None]:
# Persist the augmented table. The relative path resolves against the kernel's
# current working directory, not the Drive folder the input was read from.
cleaned_csv_name = 'isl_train_meta_cleaned.csv'
df.to_csv(cleaned_csv_name, index=False)
print("Cleaned file saved as isl_train_meta_cleaned.csv")


Cleaned file saved as isl_train_meta_cleaned.csv


### Summary:
This notebook performed basic text preprocessing steps suitable for sign language translation tasks.


In [None]:
# ✅ Colab Setup
from google.colab import drive
drive.mount('/content/drive')
!pip install nltk pandas

import pandas as pd
import re
import nltk
from nltk.corpus import stopwords

# Download stopwords (one-time)
nltk.download('stopwords')

df = pd.read_csv('/content/drive/MyDrive/IETGenAI-SLT/Chapter 4/isl_train_meta.csv')
# display(df.head()) # Optional: uncomment to display head after loading

stop_words = set(stopwords.words('english'))

def clean_text(sentence, remove_stopwords=True):
    """Normalize an English sentence into a gloss-style string of uppercase tokens.

    Steps: lowercase, strip punctuation (including underscores), collapse
    whitespace, optionally drop English stopwords, then upper-case the
    remaining tokens and join them with single spaces.

    Args:
        sentence: The raw sentence. ``None`` and float NaN (how pandas
            represents an empty CSV cell) are treated as missing.
        remove_stopwords: When True (default), tokens found in the
            module-level ``stop_words`` set are dropped.

    Returns:
        The cleaned, space-separated, uppercase token string; ``''`` when the
        input is missing or nothing survives cleaning.

    Note:
        Stopword filtering also removes negations and pronouns: in this
        notebook's output "do not worry" becomes "WORRY". Pass
        ``remove_stopwords=False`` when negation must be preserved.
    """
    # Missing values would otherwise fail on .lower(); NaN is the only float
    # that compares unequal to itself.
    if sentence is None or (isinstance(sentence, float) and sentence != sentence):
        return ''
    sentence = sentence.lower()
    # \w treats "_" as a word character, so it has to be removed explicitly.
    sentence = re.sub(r'[^\w\s]|_', '', sentence)
    sentence = re.sub(r'\s+', ' ', sentence).strip()
    tokens = sentence.split()
    if remove_stopwords:
        tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens).upper()

# Add the gloss-style column computed from each raw sentence
# (clean_text's default removes stopwords).
df['cleaned_gloss'] = df['Sentences'].apply(clean_text)
# display(df[['Sentences', 'cleaned_gloss']].head()) # Optional: uncomment to display head after cleaning

# Persist the augmented table. The relative path resolves against the kernel's
# current working directory, not the Drive folder the input was read from.
cleaned_csv_name = 'isl_train_meta_cleaned.csv'
df.to_csv(cleaned_csv_name, index=False)
print("Cleaned file saved as isl_train_meta_cleaned.csv")