# 6.2 Preparing the dataset
To produce some novel results, I used the UCI Sentiment Labelled Sentences dataset, which was provided by Dimitrios Kotzias, Misha Denil, Nando de Freitas, and Padhraic Smyth in their 2015 paper "From Group to Individual Labels Using Deep Features", published in Knowledge Discovery and Data Mining.

In [4]:
# Prepare to download the dataset
# import urllib.request
# import zipfile
# import os
# from pathlib import Path
# url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
# zip_path = "sms_spam_collection.zip"
# extracted_path = 'sms_spam_collection'
# data_file_path = Path(extracted_path) / "SMSSpamCollection.tsv"

In [None]:
# A function for downloading the dataset
# def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):
#     if data_file_path.exists():
#         print(f"{data_file_path} already exists. Skipping download and extraction.")
#         return

#     # Downloading the file
#     response = requests.get(url, stream=True, timeout=60)
#     response.raise_for_status()
#     with open(zip_path, "wb") as out_file:
#         for chunk in response.iter_content(chunk_size=8192):
#             if chunk:
#                 out_file.write(chunk)

#     # Unzipping the file
#     with zipfile.ZipFile(zip_path, "r") as zip_ref:
#         zip_ref.extractall(extracted_path)

#     # Add .tsv file extension
#     original_file_path = Path(extracted_path) / "SMSSpamCollection"
#     os.rename(original_file_path, data_file_path)
#     print(f"File downloaded and saved as {data_file_path}")

In [6]:
# Run the function
# download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)

In [7]:
import pandas as pd
# Parse the tab-separated file: each line is "<sentence>\t<label>".
# The split happens on the LAST tab so that tabs inside a sentence stay in the text.
data = []
with open("sentiment_combined.txt", "r", encoding="utf-8") as f:
    for line_number, line in enumerate(f, start=1):
        line = line.rstrip('\n')
        # Skip blank lines (e.g. a trailing empty line at the end of the file);
        # they carry no example and would otherwise fail to unpack below.
        if not line.strip():
            continue
        text, separator, label = line.rpartition('\t')
        if not separator:
            # Fail loudly with the position instead of an opaque unpacking error.
            raise ValueError(
                f"sentiment_combined.txt, line {line_number}: expected '<text>\\t<label>', got {line!r}"
            )
        data.append((text, label))
# Labels are kept as the strings read from the file.
df = pd.DataFrame(data, columns=["Text", "Label"])
df

Unnamed: 0,Text,Label
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
...,...,...
2995,The screen does get smudged easily because it ...,0
2996,What a piece of junk.. I lose more calls on th...,0
2997,Item Does Not Match Picture.,0
2998,The only thing that disappoint me is the infra...,0


In [8]:
# Count how many examples carry each class label
label_counts = df["Label"].value_counts()
print(label_counts)

Label
1    1500
0    1500
Name: count, dtype: int64


In [9]:
# A function for creating a balanced dataset
# def create_balanced_dataset(df):
#     num_negative = df[df["Label"] == 0].shape[0]
#     ham_subset = df[df["Label"] == 1].sample(num_negative, random_state = 123)
#     balanced_df = pd.concat([ham_subset, df[df["Label"] == 0.0]])
#     return balanced_df

In [10]:
# # Create a balanced dataset and convert class labels
# balanced_df = create_balanced_dataset(df)
# print(balanced_df["Label"].value_counts())

In [11]:
# Reorder the columns so that the label comes before the text
columns_titles = ["Label", "Text"]
df = df.reindex(columns=columns_titles)

# Re-check the class distribution after reordering
label_counts = df["Label"].value_counts()
print(label_counts)

Label
1    1500
0    1500
Name: count, dtype: int64


In [12]:
# A function for splitting the dataset
def random_split(df, train_frac, validation_frac, random_state = 123):
    """Shuffle ``df`` and cut it into train, validation and test partitions.

    Args:
        df: The DataFrame to split. It is not modified; a shuffled copy is used.
        train_frac: Fraction of rows (0..1) assigned to the training partition.
        validation_frac: Fraction of rows (0..1) assigned to the validation partition.
        random_state: Seed passed to ``DataFrame.sample`` for the shuffle.
            Defaults to 123, the value that was previously hard-coded.

    Returns:
        A ``(train_df, validation_df, test_df)`` tuple. The test partition
        receives every row left over after the first two partitions.

    Raises:
        ValueError: If a fraction is negative or the two fractions sum to more than 1.
    """
    if train_frac < 0 or validation_frac < 0:
        raise ValueError("train_frac and validation_frac must be non-negative")
    # Small tolerance so that sums such as 0.9 + 0.1 are not rejected by float rounding.
    if train_frac + validation_frac > 1 + 1e-9:
        raise ValueError("train_frac + validation_frac must not exceed 1")

    # Shuffle all rows and renumber them so the positional cuts below are contiguous.
    shuffled = df.sample(frac = 1, random_state = random_state).reset_index(drop = True)
    num_rows = len(shuffled)

    # int() truncates, so any rounding remainder ends up in the test partition.
    train_end = int(num_rows * train_frac)
    validation_end = train_end + int(num_rows * validation_frac)

    train_df = shuffled.iloc[:train_end]
    validation_df = shuffled.iloc[train_end:validation_end]
    test_df = shuffled.iloc[validation_end:]
    return train_df, validation_df, test_df

In [13]:
# Split the dataset (70% train, 10% validation, the remainder for test)
train_df, validation_df, test_df = random_split(df, 0.7, 0.1)

# Write each partition to its own CSV file, without the index column
for split_df, csv_name in (
    (train_df, "train.csv"),
    (validation_df, "validation.csv"),
    (test_df, "test.csv"),
):
    split_df.to_csv(csv_name, index = None)

# 6.3 Creating data loaders

In [14]:
# Load the tokenizer and add special padding token
import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")

# Show the token id(s) of the end-of-text marker, explicitly allowing it as a special token
end_of_text = "<|endoftext|>"
print(tokenizer.encode(end_of_text, allowed_special = {end_of_text}))

[50256]


In [15]:
# Setting up a PyTorch Dataset class
import torch
from torch.utils.data import Dataset

class SentimentDataset(Dataset):
    """Dataset of tokenized sentences with labels, all padded to one common length.

    The CSV file must provide a "Text" column (encoded with ``tokenizer.encode``)
    and a "Label" column (returned as a ``torch.long`` tensor).

    Args:
        csv_file: Path of the CSV file to load with ``pd.read_csv``.
        tokenizer: Object whose ``encode(text)`` returns a list of token ids.
        max_length: Length every sequence is truncated/padded to. If ``None``,
            the longest encoded text in this file determines the length.
        pad_token_id: Token id appended to sequences shorter than ``max_length``.

    Attributes:
        data: The DataFrame read from ``csv_file``.
        encoded_texts: One list of token ids per row, each of length ``max_length``.
        max_length: The common sequence length in use.
    """

    def __init__(self, csv_file, tokenizer, max_length=None, pad_token_id=50256):
        self.data = pd.read_csv(csv_file)
        self.encoded_texts = [tokenizer.encode(text) for text in self.data["Text"]]

        # Without an explicit length, fall back to the longest sequence in this file.
        if max_length is None:
            self.max_length = self._longest_encoded_length()
        else:
            self.max_length = max_length

        # Truncate and pad in a single pass. For sequences that are already long
        # enough the pad count is <= 0, and multiplying a list by it yields [].
        self.encoded_texts = [
            tokens[:self.max_length] + [pad_token_id] * (self.max_length - len(tokens))
            for tokens in self.encoded_texts
        ]

    def __getitem__(self, index):
        """Return ``(token_ids, label)`` for row ``index`` as two ``torch.long`` tensors."""
        encoded = self.encoded_texts[index]
        # Read the scalar straight from the column: `self.data.iloc[index]` would
        # build a whole row Series for every sample requested by the DataLoader.
        label = self.data["Label"].iloc[index]
        return (torch.tensor(encoded, dtype=torch.long), torch.tensor(label, dtype=torch.long))

    def __len__(self):
        """Return the number of rows in the loaded CSV file."""
        return len(self.data)

    def _longest_encoded_length(self):
        """Return the length of the longest encoded text (0 if there are none)."""
        return max((len(tokens) for tokens in self.encoded_texts), default=0)

In [16]:
# Load the training data with the SentimentDataset class;
# max_length = None lets the dataset derive the length from its longest text
train_dataset = SentimentDataset(
    csv_file = "train.csv",
    tokenizer = tokenizer,
    max_length = None,
)
print("The longest length is", train_dataset.max_length)

The longest length is 103


In [17]:
# Load the validation and test sets, reusing the training set's sequence length
val_dataset = SentimentDataset(
    csv_file = "validation.csv",
    tokenizer = tokenizer,
    max_length = train_dataset.max_length,
)
test_dataset = SentimentDataset(
    csv_file = "test.csv",
    tokenizer = tokenizer,
    max_length = train_dataset.max_length,
)

## Exercise 6.1 - Page 179
This exercise is finished by the end of this chapter.

In [18]:
# Creating PyTorch data loaders
from torch.utils.data import DataLoader
num_workers = 0
batch_size = 8
torch.manual_seed(123)

# Only the training loader shuffles and drops the final incomplete batch
train_loader = DataLoader(
    dataset = train_dataset,
    batch_size = batch_size,
    shuffle = True,
    num_workers = num_workers,
    drop_last = True,
)
# Validation and test loaders keep every example
val_loader = DataLoader(
    dataset = val_dataset,
    batch_size = batch_size,
    num_workers = num_workers,
    drop_last = False,
)
test_loader = DataLoader(
    dataset = test_dataset,
    batch_size = batch_size,
    num_workers = num_workers,
    drop_last = False,
)

In [19]:
# Print the tensor dimensions
# The loop body is intentionally empty: once the loop has finished, input_batch
# and target_batch still refer to the last batch that train_loader yielded.
# NOTE(review): this walks the entire loader just to inspect one batch, and if the
# loader yields nothing the two names are never bound by this cell. Presumably it is
# kept as a full pass on purpose - confirm before swapping in next(iter(train_loader)).
for input_batch, target_batch in train_loader:
    pass
print("Input batch dimensions:", input_batch.shape)
print("Label batch dimensions:", target_batch.shape)

Input batch dimensions: torch.Size([8, 103])
Label batch dimensions: torch.Size([8])


In [20]:
# Report how many batches each loader yields per pass
batch_count_lines = [
    f"{len(train_loader)} training batches",
    f"{len(val_loader)} validation batches",
    f"{len(test_loader)} test batches",
]
print("\n".join(batch_count_lines))

262 training batches
38 validation batches
75 test batches
