In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# 先拿掉轉unicode及標點符號轉換，先看結果
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
import os


cuda


In [5]:
SOS_token = 0
EOS_token = 1
MAX_LENGTH = 45

In [6]:
class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [7]:
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

In [8]:
# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    s = unicodeToAscii(s.strip())
    # 標點符號前加空格
    s = re.sub(r"([.!?])", r" \1", s)
    return s

In [9]:
import re
def custom_tokenizer(text):
  tokens = re.findall(r'\b\w+\b|[，、「 」『 』。？：；]', text)
  output = ''
  # print(tokens)
  for i in range(len(tokens)):
    if(tokens[i]!=" "):
      output += tokens[i]
    if i != len(tokens)-1 and tokens[i] != " ":
      output += ' '
  return output

sentence_test = 'm7-kuan2-si7 o5-a2-tsian1 ，'
tokens = custom_tokenizer(sentence_test)
print(tokens)
print(tokens[0])


def zh_custom_tokenizer(sentence):
  tokens = list(sentence)
  alphabet_set = set('abcdefghijklmnopqrstuvwxyzABCDEFGIJKLMNOPQRSTUVWXYZ')
  english = ''
  output_list = []
  cnt = 0
  output = ''
  for i in range(len(tokens)):
    if tokens[i] in alphabet_set:
      english += tokens[i]
      cnt +=1
    else:
      if cnt !=0:
        output_list.append(english)
      if tokens[i] != " ":
        output_list.append(tokens[i])
      cnt = 0
      english = ""
  # print(output_list)
  for j in range(len(output_list)):
    output += output_list[j]
    if j != len(output_list)-1:
      output += ' '
  return output

print(zh_custom_tokenizer("毋管是 蚵仔煎 ，"))
test = tokens.split(' ')
character_to_remove = ''

# 使用列表解析创建新列表，排除特定字符
new_list = [item for item in test if item != character_to_remove]


print(test)
print(new_list)

m7 kuan2 si7 o5 a2 tsian1 ，
m
毋 管 是 蚵 仔 煎 ，
['m7', 'kuan2', 'si7', 'o5', 'a2', 'tsian1', '，']
['m7', 'kuan2', 'si7', 'o5', 'a2', 'tsian1', '，']


In [10]:
def readLangs(lang1, lang2, reverse=False):
    print("Reading lines...")

    # Update the file path
    file_path = '/content/drive/MyDrive/作業3/tl-zh.txt'

    # Use the updated file path
    lines = open(file_path, encoding='utf-8').read().strip().split('\n')

    # Split every line into pairs and normalize
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]
    # print(pairs[0][0])
    for i in range(len(pairs)):
      pairs[i][0] = custom_tokenizer(pairs[i][0])
      pairs[i][1] = zh_custom_tokenizer(pairs[i][1])
    # print(pairs[0][0])

    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

In [11]:
def filterPair(p):
#保留長度條件即可
    p0 = p[0].split(' ')
    character_to_remove = ''

    new_list0 = [item for item in p0 if item != character_to_remove]

    p1 = p[1].split(' ')
    new_list1 = [item for item in p1 if item != character_to_remove]

    return len(new_list0) < MAX_LENGTH and \
        len(new_list1) < MAX_LENGTH

In [12]:
def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

def prepareData(lang1, lang2, reverse=False):
    input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse)

    # Add the '<unknown>' token to the vocabulary
    input_lang.addWord('<unknown>')

    print("Read %s sentence pairs" % len(pairs))
    pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    # for i in pairs:
    #     print(list(i))
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs

In [13]:
input_lang, output_lang, pairs = prepareData('tl','zh' , True)
print(random.choice(pairs))

Reading lines...
Read 63469 sentence pairs
Trimmed to 63469 sentence pairs
Counting words...
Counted words:
zh 4454
tl 2650
['跟 隨 當 年 鄭 成 功 的 跤 步 ，', 'kun1 sui5 tong1 ni5 tenn7 sing5 kong1 e5 kha1 poo7 ，']


In [14]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input):
        embedded = self.dropout(self.embedding(input))
        output, hidden = self.gru(embedded)
        return output, hidden

In [15]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        batch_size = encoder_outputs.size(0)
        decoder_input = torch.empty(batch_size, 1, dtype=torch.long, device=device).fill_(SOS_token)
        decoder_hidden = encoder_hidden
        decoder_outputs = []

        for i in range(MAX_LENGTH):
            decoder_output, decoder_hidden  = self.forward_step(decoder_input, decoder_hidden)
            decoder_outputs.append(decoder_output)

            if target_tensor is not None:
                # Teacher forcing: Feed the target as the next input
                decoder_input = target_tensor[:, i].unsqueeze(1) # Teacher forcing
            else:
                # Without teacher forcing: use its own predictions as the next input
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()  # detach from history as input

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        return decoder_outputs, decoder_hidden, None # We return `None` for consistency in the training loop

    def forward_step(self, input, hidden):
        output = self.embedding(input)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.out(output)
        return output, hidden

In [16]:
class BahdanauAttention(nn.Module):
    def __init__(self, hidden_size):
        super(BahdanauAttention, self).__init__()
        self.Wa = nn.Linear(hidden_size, hidden_size)
        self.Ua = nn.Linear(hidden_size, hidden_size)
        self.Va = nn.Linear(hidden_size, 1)

    def forward(self, query, keys):
        scores = self.Va(torch.tanh(self.Wa(query) + self.Ua(keys)))
        scores = scores.squeeze(2).unsqueeze(1)

        weights = F.softmax(scores, dim=-1)
        context = torch.bmm(weights, keys)

        return context, weights

In [17]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1):
        super(AttnDecoderRNN, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.attention = BahdanauAttention(hidden_size)
        self.gru = nn.GRU(2 * hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        batch_size = encoder_outputs.size(0)
        decoder_input = torch.empty(batch_size, 1, dtype=torch.long, device=device).fill_(SOS_token)
        decoder_hidden = encoder_hidden
        decoder_outputs = []
        attentions = []

        for i in range(MAX_LENGTH):
            decoder_output, decoder_hidden, attn_weights = self.forward_step(
                decoder_input, decoder_hidden, encoder_outputs
            )
            decoder_outputs.append(decoder_output)
            attentions.append(attn_weights)

            if target_tensor is not None:
                # Teacher forcing: Feed the target as the next input
                decoder_input = target_tensor[:, i].unsqueeze(1) # Teacher forcing
            else:
                # Without teacher forcing: use its own predictions as the next input
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()  # detach from history as input

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        attentions = torch.cat(attentions, dim=1)

        return decoder_outputs, decoder_hidden, attentions


    def forward_step(self, input, hidden, encoder_outputs):
        embedded =  self.dropout(self.embedding(input))

        query = hidden.permute(1, 0, 2)
        context, attn_weights = self.attention(query, encoder_outputs)
        input_gru = torch.cat((embedded, context), dim=2)

        output, hidden = self.gru(input_gru, hidden)
        output = self.out(output)

        return output, hidden, attn_weights

def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]

def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(1, -1)

def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)

def get_dataloader(batch_size):
    input_lang, output_lang, pairs = prepareData('tl', 'zh', True)

    n = len(pairs)
    input_ids = np.zeros((n, MAX_LENGTH), dtype=np.int32)
    target_ids = np.zeros((n, MAX_LENGTH), dtype=np.int32)

    for idx, (inp, tgt) in enumerate(pairs):
        inp_ids = indexesFromSentence(input_lang, inp)
        tgt_ids = indexesFromSentence(output_lang, tgt)
        inp_ids.append(EOS_token)
        tgt_ids.append(EOS_token)
        input_ids[idx, :len(inp_ids)] = inp_ids
        target_ids[idx, :len(tgt_ids)] = tgt_ids
    print(input_ids)
    print(target_ids)
    train_data = TensorDataset(torch.LongTensor(input_ids).to(device),
                               torch.LongTensor(target_ids).to(device))

    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
    return input_lang, output_lang, train_dataloader

def train_epoch(dataloader, encoder, decoder, encoder_optimizer,
          decoder_optimizer, criterion):

    total_loss = 0
    for data in dataloader:
        input_tensor, target_tensor = data

        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, target_tensor)

        loss = criterion(
            decoder_outputs.view(-1, decoder_outputs.size(-1)),
            target_tensor.view(-1)
        )
        loss.backward()

        encoder_optimizer.step()
        decoder_optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)

In [18]:
print(pairs[75])

['日 子 柄 無 好 過 ，', 'jit8 tsi2 ping3 bo5 ho2 kue3 ，']


#### Training loop

In [19]:
import time
import math

def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

def evaluate_accuracy(encoder, decoder, dataloader, input_lang, output_lang):
    total_correct = 0
    total_words = 0

    with torch.no_grad():
        for data in dataloader:
            input_tensor, target_tensor = data

            encoder_outputs, encoder_hidden = encoder(input_tensor)
            decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, target_tensor)

            _, topi = decoder_outputs.topk(1)
            correct = topi.squeeze() == target_tensor.view(-1)

            total_correct += correct.sum().item()
            total_words += correct.numel()

    accuracy = total_correct / total_words
    return accuracy

def evaluate_and_print_accuracy(encoder, decoder, train_dataloader, val_dataloader, input_lang, output_lang):
    train_accuracy = evaluate_accuracy(encoder, decoder, train_dataloader, input_lang, output_lang)
    val_accuracy = evaluate_accuracy(encoder, decoder, val_dataloader, input_lang, output_lang)

    print(f'Train Accuracy: {train_accuracy * 100:.2f}%')
    print(f'Validation Accuracy: {val_accuracy * 100:.2f}%')

def train(train_dataloader, encoder, decoder, n_epochs, learning_rate=0.001,
               print_every=100, plot_every=100):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()

    for epoch in range(1, n_epochs + 1):
        loss = train_epoch(train_dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if epoch % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, epoch / n_epochs),
                                        epoch, epoch / n_epochs * 100, print_loss_avg))

        if epoch % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0


In [20]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np

def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

def evaluate(encoder, decoder, sentence, input_lang, output_lang):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, decoder_hidden, decoder_attn = decoder(encoder_outputs, encoder_hidden)

        _, topi = decoder_outputs.topk(1)
        decoded_ids = topi.squeeze()

        decoded_words = []
        for idx in decoded_ids:
            if idx.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            decoded_words.append(output_lang.index2word[idx.item()])
    return decoded_words, decoder_attn

def evaluateRandomly(encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(pairs)
        print('>', pair[0])
        print('=', pair[1])
        output_words, _ = evaluate(encoder, decoder, pair[0], input_lang, output_lang)
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')

hidden_size = 256
batch_size = 64
n_epoch = 10

input_lang, output_lang, train_dataloader = get_dataloader(batch_size)
# print(input_lang[0])
# print(output_lang[0])
encoder = EncoderRNN(input_lang.n_words, hidden_size).to(device)
decoder = AttnDecoderRNN(hidden_size, output_lang.n_words).to(device)

train(train_dataloader, encoder, decoder, n_epochs=n_epoch, learning_rate=0.005, print_every=5, plot_every=5)

encoder.eval()
decoder.eval()
evaluateRandomly(encoder, decoder)

Reading lines...
Read 63469 sentence pairs
Trimmed to 63469 sentence pairs
Counting words...
Counted words:
zh 4454
tl 2650
[[   3    4    5 ...    0    0    0]
 [  15    6   16 ...    0    0    0]
 [   3    4    5 ...    0    0    0]
 ...
 [ 592  562  595 ...    0    0    0]
 [ 432   87 1279 ...    0    0    0]
 [ 221    7  155 ...    0    0    0]]
[[  2   3   4 ...   0   0   0]
 [ 14   5  15 ...   0   0   0]
 [  2   3   4 ...   0   0   0]
 ...
 [294 502 526 ...   0   0   0]
 [396  61 662 ...   0   0   0]
 [ 93   6 152 ...   0   0   0]]
7m 37s (- 15m 14s) (5 33%) 1.0098
15m 10s (- 7m 35s) (10 66%) 1.3690
22m 37s (- 0m 0s) (15 100%) 1.3556
> 就 按 呢 日 籍 教 授 報 警 控 告 對 方 傷 害 。
= tsu7 an2 ne1 jit8 tsik8 kau3 siu7 po3 king2 khong3 ko3 tui3 hong1 siong1 hai7 。
< SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS

> 早 期 就 是 火 炭 窯 產 業 的 大 本 營 ，
= tsa2 ki5 tioh8 si7 hu

In [22]:
input_lang.name

'zh'

In [23]:
import pandas as pd
import torch


def indexesFromSentence_testdata(lang, sentence):
    return [lang.word2index.get(char, 0) for char in sentence]

def tensorFromSentence_testdata(lang, sentence):
    indexes = indexesFromSentence_testdata(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(1, -1)

def evaluate_testdata(encoder, decoder, sentence, input_lang, output_lang):
    with torch.no_grad():
        input_tensor = tensorFromSentence_testdata(input_lang, sentence)

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, decoder_hidden, decoder_attn = decoder(encoder_outputs, encoder_hidden)

        _, topi = decoder_outputs.topk(1)
        decoded_ids = topi.squeeze()

        decoded_words = []
        for idx in decoded_ids:
            if idx.item() == EOS_token:
                # decoded_words.append('<EOS>')
                break
            decoded_words.append(output_lang.index2word[idx.item()])
        # Join the predicted words with a hyphen ("-")
        decoded_sentence = '-'.join(decoded_words)
    return decoded_words, decoder_attn

def predict_and_write_csv(encoder, decoder, test_data, input_lang, output_lang, output_file='submission.csv'):
    # Create an empty list to store predictions
    predictions = []

    # Iterate through the test data
    for idx, sentence in zip(test_data['id'], test_data['txt']):
        # Perform any necessary preprocessing on the test sentence
        preprocessed_sentence = normalizeString(sentence)
        preprocessed_sentence = zh_custom_tokenizer(preprocessed_sentence)
        # print(preprocessed_sentence)
        # print('\n')

        # Use the evaluate function to get predictions
        output_words, _ = evaluate_testdata(encoder, decoder, preprocessed_sentence, input_lang, output_lang)
        print(output_words)
        print('\n')
        # Join the predicted words to form the output sentence
        new_output_words = []
        check_list = {'，','、','「','」','『', '』','。','？','：','；'}
        for i in range(len(output_words)):
          if(output_words[i]!=''):
            if(i != len(output_words)-1):
              new_output_words.append(output_words[i])
              new_output_words.append('-')
            elif(output_words[i] in check_list):
              new_output_words.pop()
              new_output_words.append(output_words[i])

        output_sentence = ''.join(new_output_words)
        print(preprocessed_sentence)
        print(new_output_words)
        print('\n')
        print(output_sentence)
        print('\n')

        # Append the prediction to the list if it's not just special tokens
        if output_sentence.strip() != '<EOS>' and output_sentence.strip() != 'SOS':
            predictions.append({'id': idx, 'txt': output_sentence})
    print(predictions[0])
    # Create a DataFrame from the predictions list
    predictions_df = pd.DataFrame(predictions)

    # Save the DataFrame to a CSV file
    predictions_df.to_csv(output_file, index=False)

# You should replace 'test_data' with the actual variable containing your test data
test_data = pd.read_csv('/content/drive/MyDrive/作業3/translation_data/test-ZH-nospace.csv')
encoder.eval()
decoder.eval()
# Call the function to predict and write to CSV

predict_and_write_csv(encoder, decoder, test_data, input_lang, output_lang)

[1;30;43m串流輸出內容已截斷至最後 5000 行。[0m


規 台 車 四 輪 向 天 ，
['SOS', '-', 'SOS', '-', 'SOS', '-', 'SOS', '-', 'SOS', '-', 'SOS', '-', 'SOS', '-', 'SOS', '-', 'SOS', '-', 'SOS', '-', 'SOS', '-', 'SOS', '-', 'SOS', '-', 'SOS', '-', 'SOS', '-', 'SOS', '-', 'SOS', '-', 'SOS', '-', 'SOS', '-', 'SOS', '-', 'SOS', '-', 'SOS', '-', 'SOS', '-', 'SOS', '-', 'SOS', '-', 'SOS', '-', 'SOS', '-', 'SOS', '-', 'SOS', '-', 'SOS', '-', 'SOS', '-', 'SOS', '-', 'SOS', '-', 'SOS', '-', 'SOS', '-', 'SOS', '-', 'SOS', '-', 'SOS', '-', 'SOS', '-', 'SOS', '-', 'SOS', '-', 'SOS', '-', 'SOS', '-', 'SOS', '-']


SOS-SOS-SOS-SOS-SOS-SOS-SOS-SOS-SOS-SOS-SOS-SOS-SOS-SOS-SOS-SOS-SOS-SOS-SOS-SOS-SOS-SOS-SOS-SOS-SOS-SOS-SOS-SOS-SOS-SOS-SOS-SOS-SOS-SOS-SOS-SOS-SOS-SOS-SOS-SOS-SOS-SOS-SOS-SOS-


['SOS', 'SOS', 'SOS', 'SOS', 'SOS', 'SOS', 'SOS', 'SOS', 'SOS', 'SOS', 'SOS', 'SOS', 'SOS', 'SOS', 'SOS', 'SOS', 'SOS', 'SOS', 'SOS', 'SOS', 'SOS', 'SOS', 'SOS', 'SOS', 'SOS', 'SOS', 'SOS', 'SOS', 'SOS', 'SOS', 'SOS', 'SOS', 'SOS', 'SOS'