# Sumamrizing Emails with Hugging Face and Amazon Sagemaker

In [3]:
model_id = "google/flan-t5-base"

dataset_id = "argilla/FinePersonas-Conversations-Email-Summaries"

## Setup

In [4]:
!pip -q install transformers datasets sagemaker --upgrade

In [5]:
!pip -q install widgetsnbextension ipywidgets

In [6]:
!pip install fsspec s3fs



In [7]:
# Importing SageMaker SDK
import sagemaker

print(sagemaker.__version__)

sess = sagemaker.Session()
bucket = sess.default_bucket()

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
2.232.1


In [8]:
# IMporting Transformers
import transformers
import datasets

print(transformers.__version__)
print(datasets.__version__)

4.45.1
3.0.1


### Loading the Dataset

In [9]:
#Loading DataSet and Printing It Out
from datasets import load_dataset, load_from_disk

dataset = load_dataset(dataset_id)
dataset

DatasetDict({
    train: Dataset({
        features: ['conversation_id', 'email', 'maximum_brevity_summary', 'summary', 'distilabel_metadata', 'model_name'],
        num_rows: 363584
    })
})

In [10]:
from datasets import DatasetDict

# Step 1: Split 36k dataset into 6k for training and the remaining 30k
train_test_split = dataset["train"].train_test_split(train_size=10000, shuffle=True, seed=42)

# Step 2: Now split the remaining 10k rows into 3k for test and 1k for validation
test_val_split = train_test_split["test"].train_test_split(test_size=3000, train_size=1000, shuffle=True, seed=42)

# Combine into a final DatasetDict with the desired splits
dataset = DatasetDict({
    "train": train_test_split["train"],      # 6k rows
    "test": test_val_split["test"],          # 3k rows
    "validation": test_val_split["train"]    # 1k rows
})

# Step 3: Modify the features of the dataset
def transform_features(example):
    return {
        'text': example['email'],
        'summary': example['summary'],
        'title': example['summary']  # Title will be the same as summary
    }

# Apply the transformation to each split and remove the old columns
columns_to_keep = ['text', 'summary', 'title']  # Only keep these columns
dataset = dataset.map(transform_features, remove_columns=['conversation_id', 'email', 'maximum_brevity_summary', 'distilabel_metadata', 'model_name'])

# Print the modified dataset structure
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['summary', 'text', 'title'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['summary', 'text', 'title'],
        num_rows: 3000
    })
    validation: Dataset({
        features: ['summary', 'text', 'title'],
        num_rows: 1000
    })
})


### Preprocessing the Dataset

In [11]:
 from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_id)

prefix = "summarize: "
input_max_length = 1024
output_max_length = 128


def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=input_max_length, truncation=True)
    labels = tokenizer(
        text_target=examples["title"], max_length=output_max_length, truncation=True
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs



In [12]:
# Apply the tokenization to the dataset
dataset
tokenized_dataset = dataset.map(
    preprocess_function, 
    batched=True, 
    remove_columns=["title", "text", "summary"]
)

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

### Upload Dataset that we have processed to S3 

In [13]:

import fsspec
import s3fs 

s3 = s3fs.S3FileSystem()

s3_prefix = "huggingface/email-t5-summarization"

dataset_input_path = "s3://{}/{}".format(bucket, s3_prefix)
train_input_path = "{}/train".format(dataset_input_path)
valid_input_path = "{}/validation".format(dataset_input_path)

print(dataset_input_path)
print(train_input_path)
print(valid_input_path)

s3://sagemaker-eu-north-1-202533513906/huggingface/email-t5-summarization
s3://sagemaker-eu-north-1-202533513906/huggingface/email-t5-summarization/train
s3://sagemaker-eu-north-1-202533513906/huggingface/email-t5-summarization/validation


In [14]:
import fsspec
import s3fs
import os

# Create local directories to save the dataset
local_data_path = "./tokenized_dataset"
os.makedirs(local_data_path, exist_ok=True)

# Save each dataset split to disk locally
tokenized_dataset["train"].save_to_disk(f"{local_data_path}/train")
tokenized_dataset["test"].save_to_disk(f"{local_data_path}/validation")

# Now upload the locally saved files to S3
s3 = s3fs.S3FileSystem()

# Upload the local dataset splits to S3
s3.put(f"{local_data_path}/train", train_input_path, recursive=True)
s3.put(f"{local_data_path}/validation", valid_input_path, recursive=True)

print("Dataset successfully uploaded to S3!")

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3000 [00:00<?, ? examples/s]

Dataset successfully uploaded to S3!


## Fine Tune on SageMaker with Hugging Face Deep Learning Container

In [15]:
!pygmentize train.py

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34mimport[39;49;00m [04m[36margparse[39;49;00m[37m[39;49;00m
[34mimport[39;49;00m [04m[36mlogging[39;49;00m[37m[39;49;00m
[34mimport[39;49;00m [04m[36mos[39;49;00m[37m[39;49;00m
[37m[39;49;00m
[34mimport[39;49;00m [04m[36mevaluate[39;49;00m[37m[39;49;00m
[34mimport[39;49;00m [04m[36mnumpy[39;49;00m [34mas[39;49;00m [04m[36mnp[39;49;00m[37m[39;49;00m
[34mfrom[39;49;00m [04m[36mdatasets[39;49;00m [34mimport[39;49;00m load_from_disk[37m[39;49;00m
[34mfrom[39;49;00m [04m[36mtransformers[39;49;00m [34mimport[39;49;00m ([37m[39;49;00m
    AutoModelForSeq2SeqLM,[37m[39;49;00m
    AutoTokenizer,[37m[39;49;00m
    DataCollatorForSeq2Seq,[37m[39;49;00m
    Seq2SeqTrainer,[37m[39;49;00m
    Seq2SeqTrainingArguments,[37m[39;49;00m
)[37m[39;49;00m
[37m[39;49;00m
rouge = evaluate.load([33m"[39;49;00m[33mrouge[39;49;00m[33m"[39;49;00m)[37m[39;49;00m
[37m[39;49;00m
[37m[39;49;00m
[34mdef[39;49;00m [32mcompute_met

In [16]:
#Defibning HyperParameter
hyperparameters = {
    "epochs": 1,
    "learning-rate": 1e-6,
    "train-batch-size": 1,
    "eval-batch-size": 8,
    "model-name": model_id,
}

In [17]:
from sagemaker.huggingface import HuggingFace

huggingface_estimator = HuggingFace(
    role=sagemaker.get_execution_role(),
    # Fine-tuning script
    entry_point="train.py",
    dependencies=["requirements.txt"],
    hyperparameters=hyperparameters,
    # Infrastructure
    transformers_version="4.26.0",
    pytorch_version="1.13.1",
    py_version="py39",
    instance_type="ml.g5.xlarge",
    instance_count=1,
    distribution={"smdistributed": {"dataparallel": {"enabled": True}}},
)

In [18]:
huggingface_estimator.fit({"train": train_input_path, "valid": valid_input_path})

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: huggingface-pytorch-training-2024-10-01-11-19-17-032


2024-10-01 11:19:21 Starting - Starting the training job...
2024-10-01 11:19:35 Starting - Preparing the instances for training...
2024-10-01 11:20:20 Downloading - Downloading the training image..................
2024-10-01 11:23:17 Training - Training image download completed. Training in progress..[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
  "cipher": algorithms.TripleDES,[0m
  "class": algorithms.TripleDES,[0m
[34m2024-10-01 11:23:25,049 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2024-10-01 11:23:25,069 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2024-10-01 11:23:25,082 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2024-10-01 11:23:25,086 sagemaker_pytorch_container.training INFO     Invoking SMDataParallel[0m
[34m2024-10-01 11:23:

In [19]:
 huggingface_estimator.model_data

's3://sagemaker-eu-north-1-202533513906/huggingface-pytorch-training-2024-10-01-11-19-17-032/output/model.tar.gz'

## Deploy on SageMaker with a Hugging Face Deep Learning Container

In [22]:
 huggingface_predictor = huggingface_estimator.deploy(
    initial_instance_count=2, instance_type="ml.g4dn.xlarge"
)

INFO:sagemaker:Creating model with name: huggingface-pytorch-training-2024-10-01-12-47-16-521
INFO:sagemaker:Creating endpoint-config with name huggingface-pytorch-training-2024-10-01-12-47-16-521
INFO:sagemaker:Creating endpoint with name huggingface-pytorch-training-2024-10-01-12-47-16-521


---------!

In [23]:
test_data = {"inputs": f"{prefix}: {dataset['test'][3]}"}

In [24]:
prediction = huggingface_predictor.predict(test_data)
print(prediction)

[{'generated_text': '"Subject: RE: Upcoming conference and potential collaborationnnHi Alex,'}]


In [25]:
dataset['test'][3]

{'summary': 'Emily will be attending the Evolutionary Biology conference in the city next month and is interested in meeting to discuss a potential collaboration. Emily is excited about creating content that bridges science communication and evolutionary biology, making research more accessible to the public. She suggests meeting for coffee or lunch to brainstorm ideas and discuss project logistics.',
 'text': "Subject: RE: Upcoming conference and potential collaboration\n\nHi Alex,\n\nI hope you're doing well! I wanted to let you know that I'll be attending the Evolutionary Biology conference in your city next month. I was thinking it might be a great opportunity for us to meet up and discuss the potential collaboration we've been emailing about.\n\nI'm really excited about the idea of creating content that explores the connections between our fields and makes our research more accessible to the public. I think your skills in science communication and my expertise in evolutionary biol

In [None]:
 huggingface_predictor.delete_endpoint()