In [48]:
import re
import random
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
import collections

dataset = pd.read_csv('./dataset/SMSSpamCollection', sep='\t', header=None, names=['label', 'text'])

stopwords = stopwords.words('english')
def clean_text(text):
    if isinstance(text, str): 
        text = text.lower()
        text = re.sub(r'[^a-z\s]', '', text)
        text = ' '.join([word for word in text.split() if word not in stopwords])
        return text
    else:
        return ''  

dataset['text'] = dataset['text'].apply(clean_text)
dataset = dataset[dataset['text'] != '']  
dataset.dropna(subset=['text'], inplace=True)   
dataset.head(10)

Unnamed: 0,label,text
0,ham,go jurong point crazy available bugis n great ...
1,ham,ok lar joking wif u oni
2,spam,free entry wkly comp win fa cup final tkts st ...
3,ham,u dun say early hor u c already say
4,ham,nah dont think goes usf lives around though
5,spam,freemsg hey darling weeks word back id like fu...
6,ham,even brother like speak treat like aids patent
7,ham,per request melle melle oru minnaminunginte nu...
8,spam,winner valued network customer selected receiv...
9,spam,mobile months u r entitled update latest colou...


In [49]:
from argparse import Namespace
args = Namespace(                               # Create key, value pairs, and access values later on, e.g., args.seed
    train_proportion=0.8,
    val_proportion=0.1,
    test_proportion=0.1,
    seed=1337
)

# Create dict
by_category = collections.defaultdict(list)  

for _, row in dataset.iterrows():
    by_category[row.label].append(row.to_dict())
    

# Create split data
final_list = []
np.random.seed(args.seed)

for _, item_list in sorted(by_category.items()):

    np.random.shuffle(item_list)
    
    n_total = len(item_list)
    n_train = int(args.train_proportion * n_total)
    n_val = int(args.val_proportion * n_total)
    n_test=int(args.test_proportion * n_total)
    
    # Give data point a split attribute
    for item in item_list[:n_train]:
        item['split'] = 'train'
    
    for item in item_list[n_train:n_train+n_val]:
        item['split'] = 'val'
        
    for item in item_list[n_train+n_val:n_train+n_val+n_test]:
        item['split'] = 'test'

    # Add to final list
    final_list.extend(item_list) # use extend when adding multiple elements to a list; us


In [50]:
final_message = pd.DataFrame(final_list)


In [51]:
final_message.split.value_counts()

train    4452
val       555
test      555
Name: split, dtype: int64

In [52]:
final_message.head()

Unnamed: 0,label,text,split
0,ham,si take mokka players,train
1,ham,food,train
2,ham,pizza u want,train
3,ham,heart empty without love mind empty without wi...,train
4,ham,whats feathery bowa something guys dont know,train


In [53]:
final_message.to_csv('./dataset/SMSSpamCollection_Split', index=False)  # don't write row names (index).