# 1 - Acquire Data

In [8]:
!wget https://md-datasets-cache-zipfiles-prod.s3.eu-west-1.amazonaws.com/f45bkkt8pr-1.zip
!unzip f45bkkt8pr-1.zip
!unzip "SMS PHISHING DATASET FOR MACHINE LEARNING AND PATTERN RECOGNITION/Dataset_5971.zip"

--2022-09-27 02:41:07--  https://md-datasets-cache-zipfiles-prod.s3.eu-west-1.amazonaws.com/f45bkkt8pr-1.zip
Resolving md-datasets-cache-zipfiles-prod.s3.eu-west-1.amazonaws.com (md-datasets-cache-zipfiles-prod.s3.eu-west-1.amazonaws.com)... 52.218.108.72
Connecting to md-datasets-cache-zipfiles-prod.s3.eu-west-1.amazonaws.com (md-datasets-cache-zipfiles-prod.s3.eu-west-1.amazonaws.com)|52.218.108.72|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 474313 (463K) [application/zip]
Saving to: ‘f45bkkt8pr-1.zip’


2022-09-27 02:41:07 (1.74 MB/s) - ‘f45bkkt8pr-1.zip’ saved [474313/474313]

Archive:  f45bkkt8pr-1.zip
  inflating: SMS PHISHING DATASET FOR MACHINE LEARNING AND PATTERN RECOGNITION/Phone_extract.py  
  inflating: SMS PHISHING DATASET FOR MACHINE LEARNING AND PATTERN RECOGNITION/Http_extract.py  
  inflating: SMS PHISHING DATASET FOR MACHINE LEARNING AND PATTERN RECOGNITION/Email_extract.py  
  inflating: SMS PHISHING DATASET FOR MACHINE LEARNING AND PAT

# 2 - Install libraries

In [9]:
!pip install "sagemaker>=2.48.0" "transformers==4.12.3" "datasets[s3]==1.18.3" "torch" --upgrade

[0m

# 3 - Basic Setup

In [10]:
import sagemaker.huggingface

In [11]:
sess = sagemaker.Session()

sagemaker_session_bucket=None

if sagemaker_session_bucket is None and sess is not None:
    sagemaker_session_bucket = sess.default_bucket()

role = sagemaker.get_execution_role()
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

sagemaker role arn: arn:aws:iam::339199775262:role/service-role/AmazonSageMaker-ExecutionRole-20220926T232139
sagemaker bucket: sagemaker-us-east-1-339199775262
sagemaker session region: us-east-1


# 4 - Dataset Processing

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer

In [13]:
df = pd.read_csv('Dataset_5971.csv')

df.LABEL = (df.LABEL.str.lower()=='smishing')*1 #Binarizing problem
df = df[['TEXT', 'LABEL']] #Filtering out extra columns
df.columns = ['text', 'label_ids'] #Adjusting column names

#Train/test split
df_train, df_test = train_test_split(df, 
                                     test_size=0.2, 
                                     stratify=df.label_ids, 
                                     random_state=0)

train_dataset = Dataset.from_pandas(df_train) #Intantiate train dataset from pandas
test_dataset = Dataset.from_pandas(df_test) #Intantiate test dataset from pandas

dataset = DatasetDict({'train':train_dataset, 'eval':test_dataset}) #Define join dataset

#Download tokenizer
tokenizer_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

#Tokenizing function
def tokenize(batch):
    return tokenizer(batch['text'], 
                     truncation=True, 
                     max_length=32, 
                     return_special_tokens_mask=True)

dataset = dataset.map(tokenize) #Apply tokenization

dataset.set_format('torch', columns=['input_ids', 
                                     'attention_mask', 
                                     'special_tokens_mask', 
                                     'label_ids']) #Set format to torch

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]



0ex [00:00, ?ex/s]

0ex [00:00, ?ex/s]

# 5 - Upload Dataset to S3

In [14]:
import botocore
from datasets.filesystems import S3FileSystem

s3 = S3FileSystem()  
s3_prefix = "data"

#Save train dataset to s3
training_input_path = f's3://{sess.default_bucket()}/{s3_prefix}/train'
dataset["train"].save_to_disk(training_input_path, fs=s3)
print(f"Training set saved to: {training_input_path}")

#Save test dataset to s3
test_input_path = f's3://{sess.default_bucket()}/{s3_prefix}/test'
dataset["eval"].save_to_disk(test_input_path, fs=s3)
print(f"Eval set saved to: {test_input_path}")

Training set saved to: s3://sagemaker-us-east-1-339199775262/data/train
Eval set saved to: s3://sagemaker-us-east-1-339199775262/data/test


In [15]:
from sagemaker.huggingface import HuggingFace

#Training job hyper-parameters
hyperparameters={'epochs': 4,
                 'train_batch_size': 32,
                 'model_name':'distilbert-base-uncased'
                 }

In [16]:
huggingface_estimator = HuggingFace(entry_point='train.py',
                            source_dir='./scripts',
                            instance_type="ml.g4dn.xlarge",
                            instance_count=1,
                            role=role,
                            transformers_version='4.12',
                            pytorch_version='1.9',
                            py_version='py38',
                            hyperparameters = hyperparameters)

In [17]:
huggingface_estimator.fit({'train': training_input_path, 'test': test_input_path})

2022-09-27 02:41:37 Starting - Starting the training job...
2022-09-27 02:42:04 Starting - Preparing the instances for trainingProfilerReport-1664246497: InProgress
.........
2022-09-27 02:43:33 Downloading - Downloading input data
2022-09-27 02:43:33 Training - Downloading the training image.............................[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2022-09-27 02:48:17,656 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2022-09-27 02:48:17,687 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2022-09-27 02:48:17,694 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2022-09-27 02:48:18,213 sagemaker-training-toolkit INFO     Invoking user script[0m
[34mTraining Env:[0m
[34m{
    "additional_framework_parameters": {},
    "channel_input_dirs": {
 

### Deploying endpoint

In [18]:
predictor = huggingface_estimator.deploy(1,"ml.g4dn.xlarge")

---------!

In [28]:
sentiment_input= {"inputs":'Dear PAYTM customer your Paytm KYC has expired. Contact customer care No-8536074310 immediately. your account will Block within 24 hr. Thank you PAYTM TEAM.'}

predictor.predict(sentiment_input)

[{'label': 'LABEL_1', 'score': 0.9754716157913208}]

In [30]:
predictor.endpoint

The endpoint attribute has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


'huggingface-pytorch-training-2022-09-27-02-52-47-564'