## Download the dataset
This notebook downloads the SMS Spam Collection dataset and prepares the folder that will hold the train, validation, and test splits.

In [7]:
import urllib.request
import zipfile
import os
from pathlib import Path

In [8]:
# Primary download location of the zipped SMS spam collection.
url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"

# Folder that receives both the downloaded archive and its extracted contents.
extracted_path = "sms_spam_collection"
zip_destination_dir = Path(extracted_path)

# Where the archive is stored, and the final name of the extracted data file
# (the archive member gets a ".tsv" extension after extraction).
zip_path = zip_destination_dir / "sms_spam_collection.zip"
data_file_path = zip_destination_dir / "SMSSpamCollection.tsv"

In [9]:
def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):
    """Download a zip archive, extract it, and rename the data file to its final name.

    The function is a no-op when ``data_file_path`` already exists, so the
    cell calling it can be re-run without downloading again.

    Args:
        url: Address of the zip archive; passed to ``urllib.request.urlopen``.
        zip_path: Where the downloaded archive is written (str or Path).
            Its parent folder is created if it is missing.
        extracted_path: Folder the archive is extracted into. The archive is
            expected to contain a member named ``SMSSpamCollection``.
        data_file_path: Final location of the extracted data file (str or Path).

    Returns:
        None.

    Raises:
        urllib.error.URLError: If the download fails (``HTTPError`` is a subclass).
        zipfile.BadZipFile: If the downloaded file is not a valid zip archive.
        OSError: If a file or folder cannot be written or renamed.
    """
    # Accept plain strings as well as Path objects.
    zip_path = Path(zip_path)
    data_file_path = Path(data_file_path)

    if data_file_path.exists():
        print(f"{data_file_path} already exists. Skipping download and extraction.")
        return

    # Downloading the file
    with urllib.request.urlopen(url) as response:
        # Derive the target folder from the argument rather than from a
        # notebook-level global; parents=True/exist_ok=True makes this safe for
        # nested folders and for repeated runs.
        zip_path.parent.mkdir(parents=True, exist_ok=True)
        with open(zip_path, "wb") as out_file:
            out_file.write(response.read())

    # Unzipping the file
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(extracted_path)

    # Add .tsv file extension
    original_file_path = Path(extracted_path) / "SMSSpamCollection"
    os.rename(original_file_path, data_file_path)
    print(f"File downloaded and saved as {data_file_path}")

In [10]:
# Fetch the dataset from the primary URL; fall back to the backup mirror when
# the primary download fails with a network-level error.
try:
    download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)
except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e:
    print(f"Primary URL failed: {e}. Trying backup URL...")
    url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip"
    download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)

# Create the folder for the train, validation, and test datasets for later.
# This runs after the try/except so the folder exists whichever URL succeeded,
# and exist_ok=True lets the cell be re-run without raising FileExistsError.
data_split_dir = extracted_path + "/" + "data_splits"
os.makedirs(data_split_dir, exist_ok=True)

File downloaded and saved as sms_spam_collection\SMSSpamCollection.tsv
