In [1]:
# pip install -r requirements.txt -i https://pypi.douban.com/simple

In [2]:
# !PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION="python"

In [1]:
import warnings
warnings.filterwarnings("ignore")

## Load the Model
### Locally

In [2]:
from Translate import Translate

model_path = "../models/mbart-finetuned-cn-to-en-auto"

In [3]:
trans = Translate(model_path)
text = "开空调的情况下，续航掉的太快了，特别是冬天天气冷的时候，不开空调不行，天气一冻，续航就掉的更快"
print(trans.translator(text))

In the case of turning on the air conditioner, the electric range drops too fast, especially when the weather is cold in winter, not turning on the air conditioner is not possible, the weather freezes, and the electric range drops faster.


### Fine-tuning

In [7]:
import pandas as pd

model_name = f"mbart-finetuned-cn-to-en-auto-sample"
model_path = f"../models/{model_name}"

finetuned_model_path = model_path

In [8]:
data_proc = "../data/trans/PROC_2023.csv"
df = pd.read_csv(data_proc)
df = df.head(20)

In [9]:
trans.finetune(df, finetuned_model_path=finetuned_model_path, compute_metrics=False)

Map:   0%|          | 0/18 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

You're using a MBart50TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,No log,0.734511


{'eval_loss': 0.6135677099227905, 'eval_runtime': 0.232, 'eval_samples_per_second': 4.31, 'eval_steps_per_second': 4.31, 'epoch': 1.0}


## Push to S3
```bash
#!/bin/bash

cd mbart-finetuned-cn-to-en-auto
tar zcvf mbart.tar.gz *

aws s3 cp mbart.tar.gz \
  s3://hugging-face/llm/mbart.tar.gz
  
rm mbart.tar.gz
cd ../
```

In [10]:
import os

In [17]:
current_dir = os.getcwd()
print(current_dir)

os.chdir(finetuned_model_path)
print(os.getcwd())

os.chdir(current_dir)
print(os.getcwd())

/home/ec2-user/SageMaker/LocalCat/notebook


In [14]:
print(model_name)

mbart-finetuned-cn-to-en-auto-sample


In [19]:
import subprocess

current_dir = os.getcwd()

file_tar = f"{model_name}.tar.gz"
bucket = "hugging-face"
key = f"llm/{file_tar}"

# Define the bash command
bash_command = f"""
cd {finetuned_model_path}
tar zcvf {file_tar} *
aws s3 cp {file_tar} \
  s3://{bucket}/{key}
rm {file_tar}
cd {current_dir}
"""

# Run the bash command
process = subprocess.Popen(bash_command, shell=True)
process.wait()

config.json
generation_config.json
mbart-finetuned-cn-to-en-auto-sample.tar.gz
pytorch_model.bin
sentencepiece.bpe.model
special_tokens_map.json
tokenizer_config.json
tokenizer.json
training_args.bin
upload: ./mbart-finetuned-cn-to-en-auto-sample.tar.gz to s3://hugging-face/llm/mbart-finetuned-cn-to-en-auto-sample.tar.gz


0

## Deploy it
### From S3

In [9]:
import boto3  
from sagemaker.huggingface.model import HuggingFaceModel
import sagemaker
from time import gmtime, strftime

from sagemaker.huggingface.model import HuggingFacePredictor

try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']
    
config = dict()
config["S3_MODEL"] = "s3://hugging-face/llm/mbart.tar.gz"
config['MODEL_NAME'] = "mbart"

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [10]:
huggingface_model = HuggingFaceModel(
    model_data=config["S3_MODEL"],
    role=role,
    transformers_version='4.37.0',
    pytorch_version='2.1.0',
    py_version='py310',
)

In [None]:
endpoint_name = config['MODEL_NAME'].upper() + strftime("-%Y%m%d-%H%M%S", gmtime())

predictor = huggingface_model.deploy(
    initial_instance_count=1,
    instance_type='ml.g4dn.4xlarge',
    endpoint_name=endpoint_name,
)

---------!

## Envoke it

In [31]:
import json

predictor = HuggingFacePredictor(
    endpoint_name=endpoint_name
)
endpoint_name = predictor.endpoint_name
runtime_client = boto3.client('sagemaker-runtime')
input_data = {"inputs": "夜观天象"}

response = runtime_client.invoke_endpoint(
            EndpointName=endpoint_name,
            ContentType='application/json',
            Body=json.dumps(input_data)
        )
result = json.loads(response['Body'].read().decode('utf-8'))[0]['generated_text']
print(result)

Night viewing


In [45]:
import json
import boto3 
from sagemaker.huggingface.model import HuggingFacePredictor

import pandas as pd
from tqdm import tqdm

pd.set_option('display.max_colwidth', None)

In [53]:
def translator(text, endpoint_name):
    predictor = HuggingFacePredictor(
        endpoint_name=endpoint_name
    )
    runtime_client = boto3.client('sagemaker-runtime')
    input_data = {"inputs": text}

    response = runtime_client.invoke_endpoint(
                EndpointName=endpoint_name,
                ContentType='application/json',
                Body=json.dumps(input_data)
            )
    result = json.loads(response['Body'].read().decode('utf-8'))[0]['generated_text']
    return result

In [33]:
text = "开空调的情况下，续航掉的太快了，特别是冬天天气冷的时候，不开空调不行，天气一冻，续航就掉的更快"
result = translator(text, endpoint_name)
print(result)

In the case of turning on the air conditioner, the electric range drops too fast, especially when the weather is cold in winter, not turning on the air conditioner is not possible, the weather freezes, and the electric range drops faster.


In [37]:
def translator_batch(df, endpoint_name, col_src='Chinese', col_tgt='English'):
    tqdm.pandas()
    df[col_tgt] = df[col_src].progress_apply(lambda x: translator(x, endpoint_name))
    return df

In [49]:
data_proc = "../data/trans/PROC_2023.csv"
df = pd.read_csv(data_proc)
df = df.head(10)

In [50]:
df = translator_batch(df, endpoint_name, col_tgt="Translation")

100%|██████████| 10/10 [00:12<00:00,  1.20s/it]


In [52]:
df

Unnamed: 0,Chinese,English,Translation
0,开空调的情况下，续航掉的太快了，特别是冬天天气冷的时候，不开空调不行，天气一冻，续航就掉的更快,"In the case of turning on the air conditioner, the electric range drops too fast, especially when the weather is cold in winter, if you don't turn on the air conditioner, the weather freezes, the battery life will fall faster.","In the case of turning on the air conditioner, the electric range drops too fast, especially when the weather is cold in winter, not turning on the air conditioner is not possible, the weather freezes, and the electric range drops faster."
1,车机流畅度差，容易卡死机，车机系统，启动载入很慢，换挡杆前的车机，使用任何功能都有概率死机，发生过3-4次,"The smoothness of the IHU is poor, easy to jam, the car machine system, the start loading is very slow, the car machine before the gear lever, using any function has a probability of crashing, which has occurred 3-4 times.","The smoothness of the IHU is poor, it is easy to jam, the IHU system, the start loading is slow, the IHU in front of the gear lever, the use of any function has a probability of crashing, which has occurred 3-4 times."
2,整车的悬架系统，在过减速带时，速度在20码以下，但是车身的抖动还是很厉害，舒适性为第一的，美系车相比，差距还是比较大的,"The suspension system of the whole car, when crossing the speed bump, the speed is below 20km/h, but the shaking of the body is still very strong, the comfort is the first, compared with the American car, the gap is still relatively large.","The suspension system of the whole car, when crossing the speed bump, the speed is less than 20km/h, but the body shaking is still very bad, comfort is the first, compared to the American car, the gap is still relatively large."
3,大众车的通病，车子的隔音效果不太理想，车速在90码以上，车内的胎噪声就很明显了，必须把音量调大，才能缓解一点（是原厂轮胎，车窗关闭）,"The common problem of Volkswagen, the sound insulation of the car is not ideal, the speed is above 90km/h, the tire noise in the car is obvious, the volume must be turned up, in order to alleviate a little (is the original tires, the windows are closed.","The common problem of mass-produced cars, the sound insulation effect of the car is not ideal, the speed is above 90km/h, the tire noise in the car is obvious, it is necessary to turn up the volume, in order to alleviate a little (is the original tire, the window is closed."
4,车辆外观很不错，但是车标在晚上不能发亮，要是可以发亮的话会更拉风一点,"The appearance of the vehicle is very good, but the logo cannot be shiny at night, if it can be bright, it will be more stunning.","The appearance of the vehicle is very good, but the license plate cannot shine at night, if it can shine, it will be a little more dazzling."
5,对于一款中型车来说，后排的空间真的太小了，别说和迈腾比就是和雅阁比也小了点，特别是腿部空间明显要小一点，对于高个子的来说没有那么友好,"For a mid-size car, the space in the rear is really too small, not to mention that compared with the Magotan an is also a little smaller than the Accord, especially the legroom is obviously a little smaller, which is not so friendly for tall people.","For a medium-sized car, the space in the rear is really too small, not to mention that the ratio of the sedan to the luxury sedan is also a little smaller, especially the legroom is obviously a little smaller, and is not so friendly to tall people."
6,胎噪声音大（行驶在坑洼路面上，车速在50码左右，车窗全关闭情况下听到）（是原厂轮胎）,"Loud tire noise (heard when driving on potholes, at a speed of about 50km/h, with all windows closed (factory tires.","The tire noise is loud (driving on a dirt road, the speed is about 50km/h, heard when the windows are all closed (it is the original tires.)"
7,在行驶中3月份，突然加油，松脚刹，车子不动，打双闪，重启，关停再开，恢复正常；偶然一次10-20时速，城市道路面，黄埔大道，很多车在马路上很危险；,"In March, when driving in March, suddenly speed up, release the foot brake, the car does not move, double flash, restart, shut down and then open, return to normal; Occasionally a speed of 10-20 mph, city road surface, wham poa Avenue, many carson the road is dangerous; .","While driving in March, suddenly refueling, loose foot brakes, the car is not moving, double flashing, restart, stop and start again, and return to normal; Occasionally at 10-20 o'clock, urban road surface, Huangpu Avenue, many cars on the road are very dangerous;"
8,车机反应速度慢,The reaction speed of the IHU is slow.,The reaction speed of the IHU is slow.
9,没有开发出来，开发太慢了，车机一直没有更新的，中控大屏，一直存在，车的硬件本身支持的，但软件未做好，要等它更新，经常；,"Not developed, the development is too slow, the IHU has not been updated, the central control screen, has always existed, the hardware of the car itself supports, but the software is not done, to wait for it to update, often; .","Not developed, the development is too slow, the IHU has not been updated, the central control screen, has always existed, the hardware of the car itself supports, but the software is not done, to wait for it to update, often;"
