In [37]:
import urllib.request   # to download the zip file from the web
import zipfile          # to unzip the downloaded file
import os               # to work with file paths
import pandas as pd     # to read and work with tabular data

### 1. Where to download from and local filenames

In [38]:
url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
zip_name = "sms_spam_collection.zip"    # downloaded zip
folder = "sms_spam_collection"          # folder to extract into
data_name_inside_zip = "SMSSpamCollection"  # file name inside the zip (no extension)
tsv_name = os.path.join(folder, "SMSSpamCollection.tsv")  # we will rename to .tsv

### 2. Download the zip if we don't already have it

In [39]:
if not os.path.exists(tsv_name):
    print("Downloading dataset (this may take a few seconds)...")
    urllib.request.urlretrieve(url, zip_name)
    print("Download finished.")

    # 3) Extract the zip
    print("Extracting the zip file...")
    with zipfile.ZipFile(zip_name, "r") as z:
        z.extractall(folder)
    print("Extraction done.")

    # 4) Rename the extracted file to add .tsv for easy reading by pandas
    original = os.path.join(folder, data_name_inside_zip)
    if os.path.exists(original):
        os.rename(original, tsv_name)
        print("Renamed to .tsv for easy reading.")
    else:
        raise FileNotFoundError("Expected file not found inside the zip.")
else:
    print("Dataset already prepared. Skipping download and extraction.")


Dataset already prepared. Skipping download and extraction.


### 5. Load the data into a pandas DataFrame

In [40]:
# The file is tab-separated, has no header row. We name the columns 'Label' and 'Text'.
print("Loading data into a table...")
df = pd.read_csv(tsv_name, sep="\t", header=None, names=["Label", "Text"])
print("Loaded. Number of rows:", len(df))

Loading data into a table...
Loaded. Number of rows: 5572


### 6. Look at how many 'ham' (not spam) and 'spam' messages we have

In [41]:
print("\nCounts before balancing:")
print(df["Label"].value_counts())


Counts before balancing:
Label
ham     4825
spam     747
Name: count, dtype: int64


### 7. Make the dataset balanced by taking the same number of 'ham' as 'spam'

In [42]:
num_spam = df[df["Label"] == "spam"].shape[0]
ham_sample = df[df["Label"] == "ham"].sample(num_spam, random_state=123)  # reproducible sample
balanced = pd.concat([ham_sample, df[df["Label"] == "spam"]]).sample(frac=1, random_state=123).reset_index(drop=True)
print("\nCounts after balancing:")
print(balanced["Label"].value_counts())


Counts after balancing:
Label
ham     747
spam    747
Name: count, dtype: int64


### 8. Convert text labels to numbers: ham -> 0, spam -> 1

In [43]:
balanced["Label"] = balanced["Label"].map({"ham": 0, "spam": 1})
print("\nSample rows (labels are now 0 or 1):")
balanced.head()


Sample rows (labels are now 0 or 1):


Unnamed: 0,Label,Text
0,0,Dude how do you like the buff wind.
1,0,Tessy..pls do me a favor. Pls convey my birthd...
2,1,Reminder: You have not downloaded the content ...
3,1,Got what it takes 2 take part in the WRC Rally...
4,1,"Shop till u Drop, IS IT YOU, either 10K, 5K, £..."


### 9. Split into train / validation / test using simple indexing

In [44]:
n = len(balanced)
train_end = int(0.7 * n)               # first 70%
valid_end = train_end + int(0.1 * n)   # next 10%
train = balanced[:train_end].reset_index(drop=True)
valid = balanced[train_end:valid_end].reset_index(drop=True)
test = balanced[valid_end:].reset_index(drop=True)

print(f"\nSplit sizes -> train: {len(train)}, valid: {len(valid)}, test: {len(test)}")


Split sizes -> train: 1045, valid: 149, test: 300


### 10. Save to CSV so they can be reused later

In [45]:

train.to_csv("train.csv", index=False)
valid.to_csv("validation.csv", index=False)
test.to_csv("test.csv", index=False)
print("Saved train.csv, validation.csv, test.csv")

Saved train.csv, validation.csv, test.csv


### Dataset is ready for tokenization and model training.
Next simple steps:
 - Tokenize the 'Text' column using a tokenizer (convert words to numbers)
 - Create DataLoaders (batches) and feed them to your model for training