## Importing Required Dependencies

In [9]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split

# Download the NLTK resources this notebook relies on:
#   - 'stopwords': the word lists read through stopwords.words("english")
#   - 'punkt' and 'punkt_tab': tokenizer data used by word_tokenize. Newer NLTK
#     releases (3.9 and later) load 'punkt_tab' instead of the pickled 'punkt'
#     model, so both are fetched to keep the notebook runnable on a fresh
#     kernel regardless of the installed NLTK version.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Reads the tab-separated dataset from a given file path and assigns the column names `label` and `text`.

In [10]:
def read_dataset(file_path, sep = ','):
    """Load a header-less, two-column delimited text file into a DataFrame.

    Parameters
    ----------
    file_path : str or path-like
        Location of the file to read.
    sep : str, default ','
        Field delimiter forwarded to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame or None
        A frame with the columns ``label`` and ``text``. If reading fails for
        any reason, the error is printed and ``None`` is returned instead.
    """
    try:
        # Honour the caller-supplied delimiter; it used to be ignored in favour
        # of a hard-coded tab, which made the `sep` parameter a no-op.
        return pd.read_csv(file_path, sep=sep, names=['label', 'text'], header=None, encoding='latin-1')
    except Exception as e:
        # Best-effort contract kept from the original: report and return None.
        print(f"Error reading file: {e}")
        return None
    
# Load the SMS file as tab-separated text. read_dataset prints the error and
# returns None if the file cannot be read, so `dataset` may be None here.
dataset = read_dataset('SMSSpamCollection.csv', sep='\t')


### Preprocesses text by converting it to lowercase and removing special characters and stopwords, then drops rows with missing values and duplicate texts.

In [12]:

# Stop-word sets keyed by language, filled lazily so the NLTK corpus file is
# read once per kernel session instead of once per cleaned message.
_STOP_WORD_CACHE = {}


def _stop_words(language="english"):
    """Return the NLTK stop-word list for ``language`` as a frozenset, loading it at most once."""
    cached = _STOP_WORD_CACHE.get(language)
    if cached is None:
        cached = _STOP_WORD_CACHE[language] = frozenset(stopwords.words(language))
    return cached


def clean_text(text):
    """Normalise one message for modelling.

    The text is lower-cased and stripped, every character that is not a
    letter, digit or whitespace is removed, the result is tokenised with
    ``word_tokenize`` and English stop words are dropped.

    Parameters
    ----------
    text : object
        The raw message. Anything that is not a ``str`` is treated as invalid.

    Returns
    -------
    str
        The remaining tokens joined by single spaces; ``""`` for non-string
        input.
    """
    if not isinstance(text, str):
        return ""  # Return empty string for invalid data

    text = text.lower().strip()  # Convert to lowercase and strip spaces
    # Punctuation is removed before tokenising, so contractions such as
    # "don't" become "dont" and are matched against the stop-word list in
    # that form.
    text = re.sub(r"[^a-zA-Z\d\s]", "", text)  # Remove special characters

    stop_words = _stop_words()
    return " ".join(word for word in word_tokenize(text) if word not in stop_words)

def preprocess_data(data, text_column='text'):
    """Clean a dataset: drop incomplete rows, normalise the text, drop duplicate texts.

    Parameters
    ----------
    data : pandas.DataFrame
        The raw dataset. It is not modified.
    text_column : str, default 'text'
        Name of the column holding the message text.

    Returns
    -------
    pandas.DataFrame
        A new frame in which rows with a missing value in any column have been
        removed, ``text_column`` has been passed through ``clean_text``, and
        only the first row for each distinct cleaned text is kept.

    Raises
    ------
    ValueError
        If ``text_column`` is not a column of ``data``.
    """
    if text_column not in data.columns:
        raise ValueError(f"Column '{text_column}' not found in dataset")

    # dropna() with no subset removes rows that have a NaN in ANY column, not
    # only in the text column. The explicit copy() makes the column assignment
    # below operate on an independent frame, which avoids
    # SettingWithCopyWarning and guarantees the caller's frame is untouched.
    cleaned = data.dropna().copy()
    cleaned[text_column] = cleaned[text_column].apply(clean_text)  # Apply text cleaning

    # De-duplicate after cleaning so texts that differ only in case,
    # punctuation or stop words collapse into a single row.
    return cleaned.drop_duplicates(subset=[text_column])

# Rebind `dataset` to the cleaned frame returned by preprocess_data
# (incomplete rows dropped, text normalised, duplicate texts removed).
dataset = preprocess_data(dataset, text_column='text')



### Splits dataset into training, validation, and test sets.

In [13]:

def partition_data(data, test_size=0.2, val_size=0.5, random_state=42):
    """Split a dataset into training, validation and test subsets.

    The split is done in two steps: ``test_size`` of ``data`` is held out
    first, then the held-out part is divided into validation and test.

    Parameters
    ----------
    data : array-like or DataFrame
        The dataset to split.
    test_size : float or int, default 0.2
        Share (or absolute number) of ``data`` that is held out of training.
    val_size : float or int, default 0.5
        Share (or absolute number) of the held-out part that becomes the
        validation set; the remainder becomes the test set.
    random_state : int, default 42
        Seed used for both shuffles so the split is reproducible.

    Returns
    -------
    tuple
        ``(train_data, val_data, test_data)``.
    """
    train_data, holdout_data = train_test_split(data, test_size=test_size, random_state=random_state)

    # val_size is passed as train_size because the FIRST returned subset is
    # the validation set. Passing it as test_size (as before) made it size the
    # test set instead, inverting the meaning of any non-default value. The
    # default of 0.5 yields the same split sizes either way.
    val_data, test_data = train_test_split(holdout_data, train_size=val_size, random_state=random_state)

    return train_data, val_data, test_data

# Split with the defaults (test_size=0.2, val_size=0.5, random_state=42):
# 20% of the rows are held out of training and that part is halved into
# validation and test.
train_set, val_set, test_set = partition_data(dataset)



### Saves the split datasets into CSV files.

In [14]:
def save_splits(train_set, val_set, test_set, train_file="train.csv", val_file="validation.csv", test_file="test.csv"):
    """Write the three splits to CSV files and print a confirmation line.

    Parameters
    ----------
    train_set, val_set, test_set : pandas.DataFrame
        The splits to persist.
    train_file, val_file, test_file : str
        Destination paths for the corresponding split.

    Returns
    -------
    None
    """
    destinations = (
        (train_set, train_file),
        (val_set, val_file),
        (test_set, test_file),
    )
    for split, target in destinations:
        # index=False keeps the row index out of the written file.
        split.to_csv(target, index=False)

    print(f"Data saved: {train_file}, {val_file}, {test_file}")

# Persist the splits under the default file names (train.csv, validation.csv,
# test.csv), resolved relative to the current working directory.
save_splits(train_set, val_set, test_set)


Data saved: train.csv, validation.csv, test.csv
