In [109]:
import sagemaker
from sagemaker.pytorch import PyTorch
import boto3
import os
import json
import pandas as pd
from DataProcess import data_split_and_merge as da_sm


In [92]:
# Print the current working directory; the relative paths used in the cells below are resolved against it.
!pwd

/home/ec2-user/SageMaker/sagemaker


### 数据准备

In [114]:
!python DataPrepare.py \
    --CenterFlag 1 \
    --mode 'single_file' \
    --input_path './raw_data/1961701.jsonl' \
    --output_folder './tmp' \
    --SampleMode 'short'

## with long samples 
!python DataPrepare.py \
    --CenterFlag 1 \
    --mode 'single_file' \
    --input_path './raw_data/2236632.jsonl' \
    --output_folder './tmp' \
    --SampleMode 'long_3'

## with long samples 
!python DataPrepare.py \
    --CenterFlag 1 \
    --mode 'single_file' \
    --input_path './raw_data/2271377.jsonl' \
    --output_folder './tmp' \
    --SampleMode 'long_3'

# new two books for testing
!python DataPrepare.py \
    --CenterFlag 1 \
    --mode 'single_file' \
    --input_path './raw_data/2380945.jsonl' \
    --output_folder './tmp' \
    --SampleMode 'long_3'


!python DataPrepare.py \
    --CenterFlag 1 \
    --mode 'single_file' \
    --input_path './raw_data/2217876.jsonl' \
    --output_folder './tmp' \
    --SampleMode 'long_3'

In [115]:
# List ./tmp, the output folder that was passed to DataPrepare.py above.
!ls tmp

1961701.json  2217876.json  2236632.json  2271377.json	2380945.json


### 统一数据标签风格

In [116]:

def del_longer_answer(data_path, output_folder, max_answer_len=20):
    """Drop samples with long answers and normalise the remaining answer text.

    The input is a JSON file shaped like ``{"data": [sample, ...]}`` where every
    sample carries ``sample['answers']['text'][0]`` (the answer string) and
    ``sample['answers']['answer_start'][0]`` (its start offset).

    For every sample whose *raw* answer is shorter than ``max_answer_len``:
      * trailing spaces are removed,
      * a trailing ``'s`` is removed,
      * leading spaces are removed and ``answer_start`` is advanced by the
        number of characters removed from the front.
    Kept samples get fresh sequential string ids ("0", "1", ...), and the result
    is written to ``<output_folder>/<input file stem>.json``.

    Args:
        data_path: Path of the input JSON file.
        output_folder: Directory for the cleaned file; created if missing.
        max_answer_len: Exclusive upper bound on the raw answer length; samples
            whose answer has this many characters or more are dropped.

    Returns:
        None. Prints ``raw_len`` / ``new_len`` and writes the output file.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Context manager so the input handle is closed even if parsing fails.
    with open(data_path, encoding="utf-8") as f:
        text = json.load(f)['data']

    new_text = []
    for sample in text:
        answers = sample['answers']
        answer = answers['text'][0]
        # The length filter deliberately looks at the raw, un-stripped answer.
        if len(answer) >= max_answer_len:
            continue

        answer = answer.rstrip(' ')
        if answer.endswith('\'s'):
            answer = answer[:-2]

        # Removing leading spaces shifts where the answer starts in the context.
        stripped = answer.lstrip(' ')
        answers['answer_start'][0] += len(answer) - len(stripped)
        answers['text'][0] = stripped
        new_text.append(sample)

    # Re-number so ids stay contiguous after filtering.
    for idx, sample in enumerate(new_text):
        sample['id'] = str(idx)

    print(f"raw_len:{len(text)}, new_len:{len(new_text)}")

    # splitext/basename cope with extension-less and multi-dot file names,
    # where split('.')[-2] raised IndexError or truncated the stem.
    stem = os.path.splitext(os.path.basename(data_path))[0]
    with open(os.path.join(output_folder, stem + '.json'), 'w', encoding="utf-8") as fp:
        json.dump({'data': new_text}, fp)
                
## Count the answers that end with a possessive "'s".
def compute_s_answer(data_path):
    """Collect the samples whose first answer text ends with ``'s``.

    Args:
        data_path: Path of a JSON file shaped like ``{"data": [sample, ...]}``
            where each sample has ``sample['answers']['text'][0]``.

    Returns:
        A list of ``{'index': i, 'text': answer}`` dicts, one per matching
        sample, where ``i`` is the sample's position in ``data``.

    Also prints the file name, the number of matches and their percentage of
    all samples (0.0 when the file holds no samples).
    """
    # Context manager so the handle is closed once the JSON is parsed.
    with open(data_path, encoding="utf-8") as f:
        text = json.load(f)['data']

    ans_list = [
        {'index': i, 'text': sample['answers']['text'][0]}
        for i, sample in enumerate(text)
        if sample['answers']['text'][0].endswith('\'s')
    ]

    # Guard the ratio: an empty data list used to raise ZeroDivisionError.
    percentage = len(ans_list) / len(text) * 100 if text else 0.0
    print(data_path.split('/')[-1], len(ans_list), percentage)

    return ans_list
    
## Count the answers with a leading or trailing space (these may also cause problems).
def compute_blank_answer(data_path):
    """Collect the samples whose first answer text starts or ends with a space.

    Args:
        data_path: Path of a JSON file shaped like ``{"data": [sample, ...]}``
            where each sample has ``sample['answers']['text'][0]``.

    Returns:
        A list of ``{'index': i, 'text': answer}`` dicts, one per matching
        sample, where ``i`` is the sample's position in ``data``.

    Also prints the file name, the number of matches and their percentage of
    all samples (0.0 when the file holds no samples).
    """
    # Context manager so the handle is closed once the JSON is parsed.
    with open(data_path, encoding="utf-8") as f:
        text = json.load(f)['data']

    ans_list = []
    for i, sample in enumerate(text):
        answer = sample['answers']['text'][0]
        if answer.startswith(' ') or answer.endswith(' '):
            ans_list.append({'index': i, 'text': answer})

    # Guard the ratio: an empty data list used to raise ZeroDivisionError.
    percentage = len(ans_list) / len(text) * 100 if text else 0.0
    print(data_path.split('/')[-1], len(ans_list), percentage)

    return ans_list
    
def compute_longer_answer(data_path, max_answer_len=20):
    """Collect the samples whose first answer text is ``max_answer_len`` chars or longer.

    Args:
        data_path: Path of a JSON file shaped like ``{"data": [sample, ...]}``
            where each sample has ``sample['answers']['text'][0]``.
        max_answer_len: Answers with at least this many characters are
            reported as "longer". Defaults to 20.

    Returns:
        A list of ``{'index': i, 'text': answer}`` dicts, one per matching
        sample, where ``i`` is the sample's position in ``data``.

    Also prints the file name, the number of matches and their percentage of
    all samples (0.0 when the file holds no samples).
    """
    # Context manager so the handle is closed once the JSON is parsed.
    with open(data_path, encoding="utf-8") as f:
        text = json.load(f)['data']

    ans_list = []
    for i, sample in enumerate(text):
        answer = sample['answers']['text'][0]
        if len(answer) >= max_answer_len:
            ans_list.append({'index': i, 'text': answer})

    # Guard the ratio: an empty data list used to raise ZeroDivisionError.
    percentage = len(ans_list) / len(text) * 100 if text else 0.0
    print(data_path.split('/')[-1], len(ans_list), percentage)

    return ans_list
    

In [118]:
# Clean every prepared book from ./tmp and write the result under ./data,
# in the same order as before.
for prepared_path in (
    'tmp/2217876.json',
    'tmp/2271377.json',
    'tmp/1961701.json',
    'tmp/2236632.json',
    'tmp/2380945.json',
):
    del_longer_answer(prepared_path, 'data')

raw_len:354, new_len:326
raw_len:408, new_len:406
raw_len:168, new_len:168
raw_len:117, new_len:117
raw_len:278, new_len:278


### 训练数据准备: 4 books for training, 1 book for testing (2217876.json)

In [124]:
## Build the train and test files.
## 4 books are merged for training; 1 book (2217876.json, data_5) is kept apart as the
## test set -- it is the file uploaded as test/test.json further down.
data_1 = "data/1961701.json"
data_2 = "data/2380945.json"
data_3 = "data/2236632.json"
data_4 = "data/2271377.json"
data_5 = "data/2217876.json"

# Merge pairwise first, then merge the two intermediate results into one file.
# NOTE(review): merge_data presumably returns the path of the merged file
# (hence the str(...) wrapping below) -- confirm against DataProcess.
data_12 = da_sm.merge_data(data_1, data_2)
data_34= da_sm.merge_data(data_3, data_4)
data_1234 = da_sm.merge_data(str(data_12), str(data_34)) # data_5 is intentionally left out of the merge

In [125]:
!ls data

1961701+2380945+2236632+2271377.json  2217876.json	    2271377.json
1961701+2380945.json		      2236632+2271377.json  2380945.json
1961701.json			      2236632.json


In [127]:
# Resolve the execution role and the session's default bucket, then upload the
# merged training file and the held-out test file.
role = sagemaker.get_execution_role()
prefix='stary-datalab-QAmodel'
sess = sagemaker.Session()
bucket = sess.default_bucket()

# S3 object keys always use '/' as separator, so build them as plain strings
# instead of os.path.join (whose separator depends on the local OS).
train_key = f"{prefix}/train/train.json"
test_key = f"{prefix}/test/test.json"

# One S3 bucket handle is enough for both uploads.
s3_bucket = boto3.Session().resource("s3").Bucket(bucket)
s3_bucket.Object(train_key).upload_file("data/1961701+2380945+2236632+2271377.json")
s3_bucket.Object(test_key).upload_file("data/2217876.json")

# S3 URIs handed to estimator.fit(); they reuse `bucket` and the keys above so
# the uploaded objects and the training inputs cannot drift apart.
training_input_path = f's3://{bucket}/{train_key}'
test_input_path = f's3://{bucket}/{test_key}'

### 模型训练

In [128]:
# Hyperparameters which are passed to the training job.
hyperparameters = {
    # Training
    'model_name_or_path': 'deepset/bert-base-cased-squad2', # pretrained model name; alternatives: deepset/roberta-base-squad2, deepset/minilm-uncased-squad2, etc.
    # File names match the objects uploaded as train/train.json and test/test.json.
    # NOTE(review): assumes the 'train' / 'test' channels given to fit() are mounted at
    # /opt/ml/input/data/<channel>/ inside the container -- confirm.
    'train_file': '/opt/ml/input/data/train/train.json',
    'validation_file':'/opt/ml/input/data/test/test.json',
    'output_dir':'/opt/ml/model',
    'do_train': True,
    'do_eval': True,
    'per_device_train_batch_size':12,
    'learning_rate':3e-5,
    'num_train_epochs': 5,
    'max_seq_length':512,
    'doc_stride':128,
    'overwrite_cache': True,
    'overwrite_output_dir': True,
    # Evaluate and checkpoint once per epoch; the best checkpoint by exact match is reloaded at the end.
    'evaluation_strategy': 'epoch',
    'save_strategy': 'epoch',
    'prediction_loss_only': False,
    'load_best_model_at_end': True,
    'metric_for_best_model': 'eval_exact_match',
    'greater_is_better': True,
    'save_total_limit': 1
}
# Create the estimator: one ml.p3.2xlarge instance running ./code/run_train.py.
# NOTE(review): `transformers_version` looks like a HuggingFace-estimator argument;
# verify that the PyTorch estimator actually uses it rather than ignoring it.
estimator = PyTorch(
    entry_point = 'run_train.py',
    source_dir = './code',
    instance_type = 'ml.p3.2xlarge',
    instance_count=1,
    role = role,
    framework_version="1.11.0",
    transformers_version='4.6',
    py_version='py38',
    hyperparameters = hyperparameters,
    base_job_name='train-QAmodel-stary-4books-eval-on45',
)

# Start the training job; the dict keys are the input channel names.
estimator.fit({'train':training_input_path,'test': test_input_path})

2022-12-05 13:34:56 Starting - Starting the training job...
2022-12-05 13:35:21 Starting - Preparing the instances for trainingProfilerReport-1670247295: InProgress
.........
2022-12-05 13:36:45 Downloading - Downloading input data
2022-12-05 13:36:45 Training - Downloading the training image........................
2022-12-05 13:40:56 Training - Training image download completed. Training in progress..[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2022-12-05 13:40:59,458 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2022-12-05 13:40:59,486 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2022-12-05 13:40:59,493 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2022-12-05 13:40:59,976 sagemaker-training-toolkit INFO     Installing dependencies from requirements.t

### 模型部署

In [129]:
# Remember where the finished training job stored its model archive and show the S3 URI.
model_data_file = estimator.model_data
print(model_data_file)

s3://sagemaker-us-east-1-551641581032/train-QAmodel-stary-4books-eval-on45-2022-12-05-13-34-55-598/output/model.tar.gz


In [130]:
# Download the trained model archive into ./model_tmp/ ($model_data_file is expanded by IPython from the Python variable).
!aws s3 cp $model_data_file ./model_tmp/

download: s3://sagemaker-us-east-1-551641581032/train-QAmodel-stary-4books-eval-on45-2022-12-05-13-34-55-598/output/model.tar.gz to model_tmp/model.tar.gz


In [132]:
# Unpack the downloaded archive in place so individual model files can be picked out below.
!tar -zxvf ./model_tmp/model.tar.gz -C ./model_tmp/

train_results.json
tokenizer.json
eval_nbest_predictions.json
special_tokens_map.json
checkpoint-81/
checkpoint-81/optimizer.pt
checkpoint-81/tokenizer.json
checkpoint-81/special_tokens_map.json
checkpoint-81/config.json
checkpoint-81/vocab.txt
checkpoint-81/rng_state.pth
checkpoint-81/training_args.bin
checkpoint-81/scheduler.pt
checkpoint-81/tokenizer_config.json
checkpoint-81/trainer_state.json
checkpoint-81/pytorch_model.bin
config.json
vocab.txt
README.md
eval_predictions.json
all_results.json
training_args.bin
tokenizer_config.json
trainer_state.json
pytorch_model.bin
eval_results.json


In [145]:
!cp -r ./code ./deploy_model/code

In [146]:
!cp model_tmp/config.json ./deploy_model/
!cp model_tmp/pytorch_model.bin ./deploy_model/
!cp model_tmp/special_tokens_map.json ./deploy_model/
!cp model_tmp/tokenizer_config.json ./deploy_model/
!cp model_tmp/tokenizer.json ./deploy_model/
!cp model_tmp/training_args.bin ./deploy_model/

In [147]:
!cd deploy_model && tar -czvf ./model-deploy.tar.gz *

code/
code/requirements.txt
code/run_train.py
code/trainer_qa.py
code/utils_qa.py
code/inference.py
code/.ipynb_checkpoints/
config.json
pytorch_model.bin
special_tokens_map.json
tokenizer_config.json
tokenizer.json
training_args.bin


In [148]:
# Upload the deployment bundle to the bucket ($bucket is expanded by IPython from the Python variable).
!aws s3 cp deploy_model/model-deploy.tar.gz s3://$bucket/output-stary/model-deploy.tar.gz

upload: deploy_model/model-deploy.tar.gz to s3://sagemaker-us-east-1-551641581032/output-stary/model-deploy.tar.gz


In [149]:
%%time
## Deploy the model using model_data
import sagemaker

role = sagemaker.get_execution_role()

# instance_type = 'local'
# instance_type = 'ml.m5.xlarge'
instance_type = 'ml.g4dn.xlarge'

# predictor = estimator.deploy(initial_instance_count=1, instance_type=instance_type)

from sagemaker.pytorch.model import PyTorchModel

pytorch_model = PyTorchModel(model_data='s3://{}/output-stary/model-deploy.tar.gz'.format(bucket), role=role,
                             entry_point='inference.py', framework_version='1.11.0', py_version='py38', model_server_workers=4)  # TODO [For GPU], model_server_workers=6

predictor = pytorch_model.deploy(instance_type=instance_type, initial_instance_count=1)

-----------!CPU times: user 28.1 s, sys: 3.1 s, total: 31.2 s
Wall time: 6min


### 模型推理

In [150]:
%%time
# Example request: the payload is a dict with an "inputs" entry holding the question and its context.
# NOTE(review): `time` is imported here but not used in this cell.
import time
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer

# Send and receive JSON.
predictor.serializer = JSONSerializer()
predictor.deserializer = JSONDeserializer()


# Plain "who is the speaker" question over the passage.
data = {
    'inputs': {
        "question": "Who is the speaker?",
        "context": "We were halfway through our meal when Blair jumped up and did the potty dance, \"I really have to go! I'll meet you guys at the house.\" She starts running faster. "
    }
}

# Variant whose context marks the quoted sentence with a "center:" prefix.
# It is defined here but not sent in this cell.
data1 = {
    'inputs': {
        "question": "Who said the center sentence?",
        "context": "We were halfway through our meal when Blair jumped up and did the potty dance, center: \"I really have to go! I'll meet you guys at the house.\" She starts running faster. "
    }
}

# Send the request to the endpoint.
outputs = predictor.predict(data)

# Last expression of the cell, so the response is displayed.
outputs

CPU times: user 10.5 ms, sys: 282 µs, total: 10.8 ms
Wall time: 253 ms


{'Question': 'Who is the speaker?', 'answer': 'Blair', 'Answer_start': 14}

### clean-up

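Delete the endpoint when you're finished. This notebook deploys to a hosted `ml.g4dn.xlarge` instance, which keeps running (and incurring cost) until the endpoint is deleted. If you deployed with `instance_type = 'local'` instead, deleting it is also important since you can only run one local endpoint at a time.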

In [151]:
# Tear down the endpoint behind `predictor`.
# (If the endpoint was created via estimator.deploy(...), estimator.delete_endpoint() is the alternative.)
predictor.delete_endpoint()