In [None]:
import os
import torch
import pandas as pd
import h5py
import Levenshtein
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
from Levenshtein import distance
from transformers import MarianMTModel, MarianTokenizer
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from torch.optim.lr_scheduler import StepLR

# Report the installed PyTorch build so the run can be reproduced later.
print(torch.__version__)
# Prefer the first CUDA GPU when one is visible; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
# Show which device every later cell will use.
print(device)

2.0.1
cuda:0


In [None]:
# Hub identifier of the pretrained Marian checkpoint that is fine-tuned below.
pretrain_model = "Helsinki-NLP/opus-mt-zh-en"
# The model and its tokenizer are loaded from the same checkpoint name.
model = MarianMTModel.from_pretrained(pretrain_model)
tokenizer = MarianTokenizer.from_pretrained(pretrain_model)

In [None]:
def levenshtein_distance(str1, str2):
    """Return the Levenshtein edit distance between ``str1`` and ``str2``.

    The training loop sums this value over (prediction, target) pairs to
    report a mean distance per epoch.
    """
    edit_distance = Levenshtein.distance(str1, str2)
    return edit_distance

# Training pairs: only the 'txt' column of each CSV is kept, so both
# TrainData_out (targets) and TrainData_in (sources) end up as pandas Series.
TrainData_out = pd.read_csv('/mount/ml/hw3/data/train-TL.csv')['txt']
TrainData_in = pd.read_csv('/mount/ml/hw3/data/train-ZH.csv')['txt']
# The test set stays a full DataFrame; its 'txt' column is read at inference time.
TestDate = pd.read_csv('/mount/ml/hw3/data/test-ZH-nospace.csv')

In [None]:
# Tokenize every source sentence in one batched call.
# padding='longest' pads each row to the longest sentence in the set, which is
# what the previous two-pass version (find the max length, then re-encode every
# sentence with padding='max_length') computed, without tokenizing twice.
source_encoding = tokenizer(
    TrainData_in.tolist(),
    add_special_tokens=True,  # keep special tokens enabled, as before
    padding='longest',
    return_tensors='pt',      # return PyTorch tensors
)

# (num_sentences, max_length_zh) matrix of token ids.
input_ch = source_encoding['input_ids']
# Longest tokenized source sentence, kept for reference/debugging.
max_length_zh = input_ch.size(1)
# 1 for real tokens, 0 for padding. Taken from the tokenizer instead of
# comparing against a hard-coded pad id (previously the literal 65000), so it
# stays correct if the checkpoint or vocabulary changes.
attention_mask = source_encoding['attention_mask'].to(torch.long)

In [None]:
# Tokenize every target sentence in one batched call.
# `text_target=` makes the tokenizer encode in target-language mode; the
# previous per-sentence `tokenizer.encode(text)` calls ran the targets through
# the source-language mode.
# NOTE(review): checkpoints trained with the old target tokenization will not
# match these ids — retrain rather than resume from them.
target_encoding = tokenizer(
    text_target=TrainData_out.tolist(),
    add_special_tokens=True,  # keep special tokens enabled, as before
    padding='longest',        # pad to the longest target sentence
    return_tensors='pt',      # return PyTorch tensors
)

# (num_sentences, max_length_tgt) matrix of target token ids, padded with the
# tokenizer's pad id.
output_Eng = target_encoding['input_ids']
# Longest tokenized target sentence. Stored under its own name so the source
# length in `max_length_zh` is no longer overwritten by this cell.
max_length_tgt = output_Eng.size(1)

In [None]:
# Each dataset item is a (source ids, source attention mask, target ids) triple.
train_tensors = (input_ch, attention_mask, output_Eng)
trainDataset = TensorDataset(*train_tensors)
# Reshuffled mini-batches of 8 sentence pairs.
trainDataloader = DataLoader(dataset=trainDataset, shuffle=True, batch_size=8)

In [None]:
# --- Fine-tuning configuration ------------------------------------------------
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
# Multiply the learning rate by 0.9 after every epoch.
scheduler = StepLR(optimizer, step_size=1, gamma=0.9)
num_epochs = 90
save_every = 10  # write a checkpoint every `save_every` epochs
save_dir = '/mount/ml/hw3/model_ver'
# Create the checkpoint directory up front so a bad path fails before training.
os.makedirs(save_dir, exist_ok=True)

pad_token_id = tokenizer.pad_token_id
# Label value that the model's cross-entropy loss skips.
ignore_index = -100

model.to(device)
model.train()
epoch_list = []  # 1-based epoch numbers, parallel to MLD_list
MLD_list = []    # mean Levenshtein distance per epoch
for epoch in range(num_epochs):
    total_distance = 0
    total_samples = 0
    for batch in tqdm(trainDataloader, desc=f"Epoch {epoch+1}/{num_epochs}", leave=False):
        # Batch-local names: unpacking into input_ch / attention_mask /
        # output_Eng would overwrite the full-dataset tensors that the
        # TensorDataset cell reads.
        batch_input_ids, batch_attention_mask, batch_target_ids = batch

        # Mask padded target positions so they do not contribute to the loss.
        labels = batch_target_ids.masked_fill(
            batch_target_ids == pad_token_id, ignore_index
        ).to(device)

        optimizer.zero_grad()
        output = model(
            input_ids=batch_input_ids.to(device),
            attention_mask=batch_attention_mask.to(device),
            labels=labels,
        )
        loss = output.loss
        loss.backward()
        optimizer.step()

        # Training metric: edit distance between the teacher-forced argmax
        # predictions and the reference text. Predictions at padded positions
        # are reset to the pad id so skip_special_tokens drops them.
        pred_ids = output.logits.detach().argmax(dim=2)
        pred_ids = pred_ids.masked_fill(labels == ignore_index, pad_token_id)
        predictions = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
        tgt = tokenizer.batch_decode(batch_target_ids, skip_special_tokens=True)
        batch_distance = sum(levenshtein_distance(pred, target) for pred, target in zip(predictions, tgt))
        total_distance += batch_distance
        total_samples += len(predictions)

    # Guard against an empty dataloader (no samples seen this epoch).
    mean_distance = total_distance / max(total_samples, 1)
    scheduler.step()
    print(f"Epoch {epoch+1}/{num_epochs}, Mean Levenshtein Distance: {mean_distance:.4f}")
    MLD_list.append(mean_distance)
    epoch_list.append(epoch+1)

    if (epoch + 1) % save_every == 0:  # save every 10 epochs
        PATH = os.path.join(save_dir, 'model_epoch_{}.n.pth'.format(epoch + 1))
        torch.save(model.state_dict(), PATH)

                                                               

Epoch 1/90, Mean Levenshtein Distance: 13.4365


                                                               

KeyboardInterrupt: 

In [None]:
# Optional: restore fine-tuned weights from a saved checkpoint instead of retraining.
# The code is wrapped in a string literal so it does not execute; remove the
# surrounding triple quotes to enable it.
# NOTE(review): the training loop only saves when (epoch + 1) % 10 == 0, so
# 'model_epoch_5.n.pth' is not produced by it as written — point PATH at a
# checkpoint that exists before enabling this cell.
'''
PATH = '/mount/ml/hw3/model_ver/model_epoch_5.n.pth'
model.load_state_dict(torch.load(PATH))
model = model.to(device)
'''

In [None]:
# Write the tokenizer files to ./tokenizer so they can be reloaded later.
tokenizer.save_pretrained("tokenizer")
model.eval()
result = []  # one translated string per test row, in file order
with torch.no_grad():
    # Translate the test sentences one at a time.
    for source_text in tqdm(TestDate['txt'], desc="翻譯", unit="文本"):
        encoded = tokenizer(source_text, return_tensors="pt", max_length=512, truncation=True)
        generated_ids = model.generate(**encoded.to(device))
        # A single input yields a single-element batch; keep its only entry.
        translations = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        result.append(translations[0])

翻譯: 100%|██████████| 641/641 [02:05<00:00,  5.09文本/s]


In [None]:
# Write the submission file: ids are 1-based and follow the test-set row order.
idx = list(range(1, len(TestDate) + 1))

df = pd.DataFrame({'id': idx, 'txt': result})
df.to_csv('/mount/ml/hw3/output.csv', index=False)
print("All complete")