# Fine Tune

**Reference:** https://neptune.ai/blog/hugging-face-pre-trained-models-find-the-best

In [42]:
# !pip install -r requirements.txt

In [84]:
import warnings
warnings.filterwarnings("ignore")

## Load Model

In [82]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-50-many-to-one-mmt")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/mbart-large-50-many-to-one-mmt")

In [83]:
sentence = "开空调的情况下，续航掉的太快了，特别是冬天天气冷的时候，不开空调不行，天气一冻，续航就掉的更快"

tokenizer.src_lang = 'zh_CN'
tokenizer.tgt_lang = 'en_XX'

encoded = tokenizer(sentence, return_tensors="pt")
generated_tokens = model.generate(**encoded)
tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)



['If the air conditioner is on, the flight resumes too quickly, especially in winter when the weather is cold. If the air conditioner is not on, the flight resumes faster as soon as the weather is cold']

## Preprocess Data

### Load Data

In [24]:
import pandas as pd
from datasets import Dataset

In [16]:
df_1 = pd.read_excel('../data/trans/2023_NCVQS_text.xlsx', sheet_name=0)
df_1.rename(columns = {'Detail complains breakdown  (Chinese)': 'Chinese', 'Translation': 'English'}, inplace=True)

df_2 = pd.read_excel('../data/trans/2023_NCVQS_text.xlsx', sheet_name=1)

zh_columns = df_2.columns[:4]
en_columns = df_2.columns[4:]

df_2['Chinese'] = df_2[zh_columns].apply(lambda row: ' '.join(row.dropna().astype(str)), axis=1)
df_2['English'] = df_2[en_columns].apply(lambda row: ' '.join(row.dropna().astype(str)), axis=1)

df_2.drop(columns=en_columns, inplace=True)
df_2.drop(columns=zh_columns, inplace=True)

df = pd.concat([df_1, df_2], axis = 0)
df = df.dropna()
df.drop_duplicates(keep='first', subset='Chinese', inplace=True)

In [17]:
print("Length: ", len(df), "\n Num of NaN: ", df.isna().sum().sum())

Length:  447 
 Num of NaN:  0


In [18]:
df.head()

Unnamed: 0,Chinese,English
0,开空调的情况下，续航掉的太快了，特别是冬天天气冷的时候，不开空调不行，天气一冻，续航就掉的更快,"In the case of turning on the air conditioner,..."
1,车机流畅度差，容易卡死机，车机系统，启动载入很慢，换挡杆前的车机，使用任何功能都有概率死机，...,"The smoothness of the IHU is poor, easy to jam..."
2,整车的悬架系统，在过减速带时，速度在20码以下，但是车身的抖动还是很厉害，舒适性为第一的，美...,"The suspension system of the whole car, when c..."
3,大众车的通病，车子的隔音效果不太理想，车速在90码以上，车内的胎噪声就很明显了，必须把音量调...,"The common problem of Volkswagen, the sound in..."
4,车辆外观很不错，但是车标在晚上不能发亮，要是可以发亮的话会更拉风一点,"The appearance of the vehicle is very good, bu..."


In [19]:
tmp = df.copy()

### Transform Data

In [20]:
df = df.assign(translation=df.apply(lambda row:{'zh': row['Chinese'], 'en': row['English']}, axis=1))
df.reset_index(inplace=True)
df.drop(['index', 'Chinese', 'English'], axis=1, inplace=True)

In [21]:
df.head()

Unnamed: 0,translation
0,{'zh': '开空调的情况下，续航掉的太快了，特别是冬天天气冷的时候，不开空调不行，天气一...
1,{'zh': '车机流畅度差，容易卡死机，车机系统，启动载入很慢，换挡杆前的车机，使用任何功...
2,{'zh': '整车的悬架系统，在过减速带时，速度在20码以下，但是车身的抖动还是很厉害，舒...
3,{'zh': '大众车的通病，车子的隔音效果不太理想，车速在90码以上，车内的胎噪声就很明显...
4,"{'zh': '车辆外观很不错，但是车标在晚上不能发亮，要是可以发亮的话会更拉风一点', '..."


In [31]:
df.iloc[0][0]

{'zh': '开空调的情况下，续航掉的太快了，特别是冬天天气冷的时候，不开空调不行，天气一冻，续航就掉的更快',
 'en': "In the case of turning on the air conditioner, the electric range drops too fast, especially when the weather is cold in winter, if you don't turn on the air conditioner, the weather freezes, the battery life will fall faster."}

In [22]:
print("Dimension: ", df.shape)

Dimension:  (447, 1)


In [25]:
dataset = Dataset.from_pandas(df, split='train')
dataset = dataset.train_test_split(test_size=0.05)

In [27]:
print(type(dataset['train']))
print(dataset['train'][:1])

<class 'datasets.arrow_dataset.Dataset'>
{'translation': [{'en': 'The model looks domineering, the lines are good-looking, the interior color is reasonable, and the functions technology is strong. The interior of the car is well made, the dashboard is magnificent, stylish, the power consumption is small, and the acceleration is fast and stable. The model is domineering and stylish, good workmanship, stylish and magnificent dashboard, low power consumption, stable and fast acceleration. No', 'zh': '车型看起来霸气，线条好看，内饰颜色搭配合理，配置科技感强 车内饰做工好，仪表盘大气，时尚，电耗小，而且加速快，平稳 车型大气霸气时尚，做工好，仪表盘时尚大气，电耗低，平稳加速快 无'}]}


In [28]:
dataset

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 424
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 23
    })
})

### Tokenize Datasets

In [106]:
prefix = "" #for mBART and MarianMT
max_input_length = 512
max_target_length = 512

source_lang = "zh"
target_lang = "en"

def preprocess_function(examples):
    inputs = [prefix + ex[source_lang] for ex in examples["translation"]]
    targets = [ex[target_lang] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
   
    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/424 [00:00<?, ? examples/s]

Map:   0%|          | 0/23 [00:00<?, ? examples/s]

## Train and Fine-tune the Model

### Model Setup

In [107]:
from transformers import Seq2SeqTrainingArguments
from transformers import DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainer
import evaluate

In [108]:
from datetime import datetime

# Get the current date
today = datetime.now()

# Format the date as "DDMMYYYY"
formatted_date = today.strftime("%d%m%Y")

print(formatted_date)

06022024


In [109]:
model_name = f"mbart-finetuned-cn2en-{formatted_date}"
model_path = f"../model/{model_name}"

batch_size = 4

args = Seq2SeqTrainingArguments(
   output_dir=model_path,
   evaluation_strategy = "epoch",
   learning_rate=2e-5,
   per_device_train_batch_size=batch_size,
   per_device_eval_batch_size=batch_size,
   weight_decay=0.01,
   save_total_limit=3,
   num_train_epochs=2,
   predict_with_generate=True,
)

In [110]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model) # default setting

In [111]:
import numpy as np
import evaluate

metric = evaluate.load("sacrebleu")
meteor = evaluate.load('meteor')

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    meteor_result = meteor.compute(predictions=decoded_preds, references=decoded_labels)
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result = {'bleu' : result['score']}
    result["gen_len"] = np.mean(prediction_lens)
    result["meteor"] = meteor_result["meteor"]
    result = {k: round(v, 4) for k, v in result.items()}
    return result

[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/jovyan/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [112]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

### Train and Save the Model

In [113]:
trainer.train()

You're using a MBart50TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Bleu,Gen Len,Meteor
1,No log,0.707038,50.2644,52.4348,0.7674
2,No log,0.701942,53.6005,51.1304,0.777


TrainOutput(global_step=212, training_loss=0.6183345002948113, metrics={'train_runtime': 151.5573, 'train_samples_per_second': 5.595, 'train_steps_per_second': 1.399, 'total_flos': 109507776086016.0, 'train_loss': 0.6183345002948113, 'epoch': 2.0})

In [114]:
eval_result = trainer.evaluate(tokenized_datasets['test'])

In [115]:
eval_result

{'eval_loss': 0.7019422054290771,
 'eval_bleu': 53.6005,
 'eval_gen_len': 51.1304,
 'eval_meteor': 0.777,
 'eval_runtime': 11.4205,
 'eval_samples_per_second': 2.014,
 'eval_steps_per_second': 0.525,
 'epoch': 2.0}

In [116]:
trainer.save_model()

### Test

In [117]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

In [118]:
sentence = "开空调的情况下，续航掉的太快了，特别是冬天天气冷的时候，不开空调不行，天气一冻，续航就掉的更快"

tokenizer.src_lang = 'zh_CexN'
tokenizer.tgt_lang = 'en_XX'

encoded = tokenizer(sentence, return_tensors="pt")
generated_tokens = model.generate(**encoded)
tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

['In the case of turning on the air conditioner, the electric range drops too fast, especially when the weather is cold in winter, it is not good to turn on the air conditioner, as soon as the weather freezes, the electric range drops faster.']

> "开空调的情况下，续航掉的太快了，特别是冬天天气冷的时候，不开空调不行，天气一冻，续航就掉的更快"

> 'If the air conditioner is on, the flight resumes too quickly, especially in winter when the weather is cold. If the air conditioner is not on, the flight resumes faster as soon as the weather is cold'

In [138]:
def translate(sentence):
    encoded = tokenizer(sentence, return_tensors="pt")
    generated_tokens = model.generate(**encoded)
    return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

In [139]:
for i in dataset['test']['translation'][:5]:
    print("="*30)
    print(i['zh'])
    print(i['en'])
    print("="*10)
    result = translate(i['zh'])
    print(result)

驾驶室扶手箱盖子坏了，车内气味很难闻 没有发现问题 安全性，整体外观漂亮，车内和尾厢空间大 车内皮革味和塑料味大，车窗的隔音效果太一般了
The lid of the cab armrest box is broken and the smell inside the car is bad. No problems found. Safety, beautiful overall appearance, large space in the car and rear compartment. The smell of leather and plastic inside the car is large, and the sound insulation effect of the windows is too general.
['The handlebar cover is broken, and the odor inside the car is very hard to smell. No problems found. Safety, beautiful overall appearance, large interior and tailgate space. The leather and plastic smells in the car, and the sound insulation effect of the windows is too general.']
安全性好，格调喜欢 没什么大的质量问题 坐起来舒服，质感好 车底出现的咔咔声
Good security and style like. There are no major quality problems It is comfortable to seat and has a good texture. A clicking sound that appears under the car.
['Good safety and I like the style. There are no major quality issues. Comfortable to sit and good texture. The clicking sound at the bottom of th