# **Load Modules**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import re
from collections import Counter
import json

In [3]:
!cp /content/drive/MyDrive/Khai\ thác\ dữ\ liệu\ truyền\ thông\ xã\ hội\ -\ IE403.N22/Đồ\ án/Code/TextNormSeq2Seq/lib/DataLoader.py .
!cp /content/drive/MyDrive/Khai\ thác\ dữ\ liệu\ truyền\ thông\ xã\ hội\ -\ IE403.N22/Đồ\ án/Code/TextNormSeq2Seq/lib/Dataset.py .
!cp /content/drive/MyDrive/Khai\ thác\ dữ\ liệu\ truyền\ thông\ xã\ hội\ -\ IE403.N22/Đồ\ án/Code/TextNormSeq2Seq/lib/Dict.py .
!cp /content/drive/MyDrive/Khai\ thác\ dữ\ liệu\ truyền\ thông\ xã\ hội\ -\ IE403.N22/Đồ\ án/Code/TextNormSeq2Seq/lib/Tweet.py .
!cp /content/drive/MyDrive/Khai\ thác\ dữ\ liệu\ truyền\ thông\ xã\ hội\ -\ IE403.N22/Đồ\ án/Code/TextNormSeq2Seq/lib/constants.py .

!cp /content/drive/MyDrive/Khai\ thác\ dữ\ liệu\ truyền\ thông\ xã\ hội\ -\ IE403.N22/Đồ\ án/Code/TextNormSeq2Seq/lib/loss.py .
!cp /content/drive/MyDrive/Khai\ thác\ dữ\ liệu\ truyền\ thông\ xã\ hội\ -\ IE403.N22/Đồ\ án/Code/TextNormSeq2Seq/lib/metrics.py .
!cp /content/drive/MyDrive/Khai\ thác\ dữ\ liệu\ truyền\ thông\ xã\ hội\ -\ IE403.N22/Đồ\ án/Code/TextNormSeq2Seq/lib/utils.py .

!cp /content/drive/MyDrive/Khai\ thác\ dữ\ liệu\ truyền\ thông\ xã\ hội\ -\ IE403.N22/Đồ\ án/Code/TextNormSeq2Seq/lib/model.py .
!cp /content/drive/MyDrive/Khai\ thác\ dữ\ liệu\ truyền\ thông\ xã\ hội\ -\ IE403.N22/Đồ\ án/Code/TextNormSeq2Seq/lib/model_factory.py .

!cp /content/drive/MyDrive/Khai\ thác\ dữ\ liệu\ truyền\ thông\ xã\ hội\ -\ IE403.N22/Đồ\ án/Code/TextNormSeq2Seq/lib/evaluator.py .
!cp /content/drive/MyDrive/Khai\ thác\ dữ\ liệu\ truyền\ thông\ xã\ hội\ -\ IE403.N22/Đồ\ án/Code/TextNormSeq2Seq/lib/optim.py .
!cp /content/drive/MyDrive/Khai\ thác\ dữ\ liệu\ truyền\ thông\ xã\ hội\ -\ IE403.N22/Đồ\ án/Code/TextNormSeq2Seq/lib/trainer.py .

In [4]:
import torch
from torch.backends import cudnn
from torch import cuda
import numpy as np
import argparse
import random
import os
import logging
import copy
import easydict
from statistics import mean

In [5]:
from Tweet import Tweet
from DataLoader import create_data, create_datasets
from model_factory import create_model
from evaluator import Evaluator
from trainer import Trainer
import constants

# **Train Model**

## Setup

In [6]:
def change_args(opt):
    torch.backends.cudnn.enabled = False
    cudnn.benchmark = False
    cudnn.deterministic = True
    torch.backends.cudnn.deterministic = True
    random.seed(opt.seed)
    torch.manual_seed(opt.seed)
    torch.cuda.manual_seed(opt.seed)
    torch.cuda.manual_seed_all(opt.seed)
    np.random.seed(opt.seed)
    if opt.save_dir and not os.path.exists(opt.save_dir): os.makedirs(opt.save_dir)
    logging.basicConfig(filename=os.path.join(opt.save_dir, 'output.log') if opt.logfolder else None, level=logging.INFO)
    if opt.self_tok: opt.self_tok=constants.SELF
    opt.cuda = (opt.gpu != -1) # Set cuda
    if torch.cuda.is_available() and not opt.cuda:
        logger.warning("WARNING: You have a CUDA device, so you should probably run with -gpu 1")
    if opt.cuda: cuda.set_device(opt.gpu)
    if opt.share_embeddings:
        if not opt.share_vocab:
            logger.warning('src/tgt vocab should be the same if you use share_embeddings! Changing share_vocab to True.')
            opt.share_vocab = True
    return opt

In [7]:
logger = logging.getLogger("main")

def train_char_model(args):
    logger.info('*** Character model ***')
    opt = copy.deepcopy(args)
    opt.input = 'spelling'
    train_data, valid_data, test_data, vocab, mappings = create_datasets(opt)
    char_model, char_optim = create_model((vocab['src'], vocab['tgt']), opt, is_char_model = True)
    char_evaluator = Evaluator(char_model, opt)
    char_test_evaluator = Evaluator(char_model, opt)
    logger.info(char_model.opt)
    logger.info('Loading test data for character model from "%s"' % opt.testdata)
    logger.info('Loading training data for character model from "%s"' % opt.traindata)
    logger.info(' * Character model vocabulary size. source = %d; target = %d' % (len(vocab['src']), len(vocab['tgt'])))
    logger.info(' * Character model maximum batch size. %d' % opt.batch_size)
    logger.info(char_model)
    if opt.interactive and args.input != 'hybrid':
        while True:
            var = input("Please enter a word to be try spelling model (q to quit): ")
            if var.lower() == 'q': break
            tweets = [Tweet(var.split(), var.split(), '1', '1') for i in range(2)]  # suboptimal but works with minimal changes
            test_data, test_vocab, mappings = create_data(tweets, opt=opt, vocab=vocab, mappings=mappings)
            prediction = char_test_evaluator.eval(test_data)
            print('Prediction is: {}'.format(''.join(prediction)))
    elif opt.eval: # Evaluation only
        logger.info("=======Char eval on test set=============")
        pred_file = os.path.join(opt.save_dir, 'test.pred.char')
        char_test_evaluator.eval(test_data, pred_file=pred_file)
        logger.info("=======Char eval on validation set=============")
        pred_file = os.path.join(opt.save_dir, 'valid.pred.char')
        char_evaluator.eval(valid_data, pred_file=pred_file)
    else: # Training
        char_trainer = Trainer(char_model, char_evaluator, train_data, valid_data ,char_optim, opt)
        char_trainer.train(opt.start_epoch, opt.end_epoch)
        logger.info("=======Eval on test set=============")
        pred_file = os.path.join(opt.save_dir, 'test.pred.char')
        char_test_evaluator.eval(test_data, pred_file=pred_file)
        logger.info("=======Eval on validation set=============")
        pred_file = os.path.join(opt.save_dir, 'valid.pred.char')
        char_evaluator.eval(valid_data, pred_file=pred_file)
        logger.info('*** Finished Character model ***\n')
    return char_model

In [8]:
def main(args):
    opt = args
    opt = change_args(opt)
    logging.basicConfig(filename=os.path.join(opt.save_dir, 'output.log') if opt.logfolder else None, level=logging.INFO)
    unk_model = train_char_model(opt) if(opt.input in ['hybrid', 'spelling']) else None
    if(opt.input =='spelling'): exit()
    train_data, valid_data, test_data, vocab, mappings = create_datasets(opt)
    model, optim = create_model((vocab['src'], vocab['tgt']), opt)
    evaluator = Evaluator(model, opt, unk_model)
    test_evaluator = Evaluator(model, opt, unk_model)
    logger.info(model.opt)
    logger.info('Loading test data from "%s"' % opt.testdata)
    logger.info('Loading training data from "%s"' % opt.traindata)
    logger.info(' * Vocabulary size. source = %d; target = %d' % (len(vocab['src']), len(vocab['tgt'])))
    logger.info(' * Maximum batch size. %d' % opt.batch_size)
    logger.info(model)
    if opt.interactive:
        while True:
            var = input("Please enter the text to be normalized (q to quit): ")
            if var.lower() == 'q': break
            tweets = [Tweet(var.split(), var.split(), '1', '1') for i in range(2)] #suboptimal but works with minimal changes
            test_data, test_vocab, mappings = create_data(tweets, opt=opt, vocab=vocab,mappings=mappings)
            prediction = test_evaluator.eval(test_data)
            print('Prediction is: {}'.format(' '.join(prediction)))
    elif opt.eval: # Evaluation only
        logger.info("=======Eval on test set=============")
        pred_file = os.path.join(opt.save_dir, 'test.pred')
        test_evaluator.eval(test_data, pred_file=pred_file)
        logger.info("=======Eval on validation set=============")
        pred_file = os.path.join(opt.save_dir, 'valid.pred')
        evaluator.eval(valid_data, pred_file=pred_file)
    else: # Training
        trainer = Trainer(model, evaluator, train_data, valid_data ,optim, opt)
        trainer.train(opt.start_epoch, opt.end_epoch)
        logger.info("=======Eval on test set=============")
        pred_file = os.path.join(opt.save_dir, 'test.pred')
        test_evaluator.eval(test_data, pred_file=pred_file)
        logger.info("=======Eval on validation set=============")
        pred_file = os.path.join(opt.save_dir, 'valid.pred')
        evaluator.eval(valid_data, pred_file=pred_file)

## Word Model

In [9]:
word_args = easydict.EasyDict({
    'traindata': '/content/drive/MyDrive/Khai thác dữ liệu truyền thông xã hội - IE403.N22/Đồ án/Data/train_data.json',
    'testdata': '/content/drive/MyDrive/Khai thác dữ liệu truyền thông xã hội - IE403.N22/Đồ án/Data/test_data.json',
    'valsplit': 219,
    'vocab_size': None,
    'lowercase': False,
    'share_vocab': True,
    'eos': True,
    'bos': True,
    'self_tok': False,
    'input': 'word',
    'maxlen': None,
    'correct_unique_mappings': False,
    'char_model': 'None',
    'data_augm': False,
    'rnn_type': 'LSTM',
    'layers': 3,
    'brnn': True,
    'rnn_size': 200,
    'emb_size': 100,
    'attention': True,
    'bias': True,
    'tie_decoder_embeddings': True,
    'share_embeddings': True,
    'dropout': 0.5,
    'backward_split': None,
    'teacher_forcing_ratio': 0.6,
    'noise_ratio': 0,
    'batch_size': 32,
    'start_epoch': 1,
    'end_epoch': 50,
    'optim': 'adam',
    'lr': 0.01,
    'max_grad_norm': 5,
    'learning_rate_decay': 0.05,
    'start_decay_after': 15,
    'gpu': -1,
    'log_interval': 1,
    'save_interval': -1,
    'seed': 3435,
    'logfolder': True,
    'save_dir': '/content/drive/MyDrive/Khai thác dữ liệu truyền thông xã hội - IE403.N22/Đồ án/Code/TextNormSeq2Seq/S2S',
    'load_from': None,
    'eval': False,
    'interactive': False,
    'max_train_decode_len': 165
})

In [10]:
if __name__ == "__main__":
    main(word_args)

## S2SSelf

In [None]:
self_args = easydict.EasyDict({
    'traindata': '/content/drive/MyDrive/Khai thác dữ liệu truyền thông xã hội - IE403.N22/Đồ án/Data/train_data.json',
    'testdata': '/content/drive/MyDrive/Khai thác dữ liệu truyền thông xã hội - IE403.N22/Đồ án/Data/test_data.json',
    'valsplit': 219,
    'vocab_size': None,
    'lowercase': False,
    'share_vocab': True,
    'eos': True,
    'bos': True,
    'self_tok': True,
    'input': 'word',
    'maxlen': None,
    'correct_unique_mappings': False,
    'char_model': None,
    'data_augm': False,
    'rnn_type': 'LSTM',
    'layers': 3,
    'brnn': True,
    'rnn_size': 100,
    'emb_size': 100,
    'attention': True,
    'bias': True,
    'tie_decoder_embeddings': True,
    'share_embeddings': True,
    'dropout': 0.2,
    'backward_split': None,
    'teacher_forcing_ratio': 0.6,
    'noise_ratio': 0,
    'batch_size': 32,
    'start_epoch': 1,
    'end_epoch': 100,
    'optim': 'adam',
    'lr': 0.01,
    'max_grad_norm': 10,
    'learning_rate_decay': 0.05,
    'start_decay_after': 15,
    'gpu': -1,
    'log_interval': 1,
    'save_interval': -1,
    'seed': 3435,
    'logfolder': True,
    'save_dir': '/content/drive/MyDrive/Khai thác dữ liệu truyền thông xã hội - IE403.N22/Đồ án/Code/TextNormSeq2Seq/S2SSelf',
    'load_from': None,
    'eval': False,
    'interactive': False,
    'max_train_decode_len': 165
})

In [None]:
if __name__ == "__main__":
    main(self_args)

## S2SMulti

In [9]:
multi_args = easydict.EasyDict({
    'traindata': '/content/drive/MyDrive/Khai thác dữ liệu truyền thông xã hội - IE403.N22/Đồ án/Data/train_data.json',
    'testdata': '/content/drive/MyDrive/Khai thác dữ liệu truyền thông xã hội - IE403.N22/Đồ án/Data/test_data.json',
    'valsplit': 219,
    'vocab_size': None,
    'lowercase': False,
    'share_vocab': True,
    'eos': True,
    'bos': True,
    'self_tok': False,
    'input': 'word',
    'maxlen': None,
    'correct_unique_mappings': True,
    'char_model': None,
    'data_augm': False,
    'rnn_type': 'LSTM',
    'layers': 3,
    'brnn': True,
    'rnn_size': 200,
    'emb_size': 100,
    'attention': True,
    'bias': True,
    'tie_decoder_embeddings': True,
    'share_embeddings': True,
    'dropout': 0.5,
    'backward_split': None,
    'teacher_forcing_ratio': 0.6,
    'noise_ratio': 0,
    'batch_size': 32,
    'start_epoch': 1,
    'end_epoch': 80,
    'optim': 'adam',
    'lr': 0.01,
    'max_grad_norm': 5,
    'learning_rate_decay': 0.05,
    'start_decay_after': 15,
    'gpu': -1,
    'log_interval': 1,
    'save_interval': -1,
    'seed': 3435,
    'logfolder': True,
    'save_dir': '/content/drive/MyDrive/Khai thác dữ liệu truyền thông xã hội - IE403.N22/Đồ án/Code/TextNormSeq2Seq/S2SMulti',
    'load_from': None,
    'eval': False,
    'interactive': False,
    'max_train_decode_len': 165
})

In [10]:
if __name__ == "__main__":
    main(multi_args)

# **Evaluate**

## Build function

In [23]:
import numpy as np
import json
from statistics import mean
from nltk.translate.bleu_score import sentence_bleu

In [24]:
def find_indices(list_to_check, item_to_find):
    return [idx for idx, value in enumerate(list_to_check) if value == item_to_find]

In [25]:
def add_multiple_target(input, target):
  targets = []
  for inp in input:
    targets_idx = find_indices(input, inp)
    targets.append(list(np.array(target)[targets_idx]))

  return targets

In [26]:
def calculate_f1(input, true, pred):
  correct_norm, total_norm, total_nsw = 0.0, 0.0, 0.0
  for i in range(len(input)):
    if input[i] != pred[i] and true[i] == pred[i]: # số token được chuẩn hóa đúng
      correct_norm += 1
    if input[i] != true[i]: # nsw trong câu
      total_nsw += 1
    if input[i] != pred[i]: # các token đc chuẩn hóa
      total_norm += 1

  p = r = f1 = 0.0
  if(total_norm!=0 and correct_norm!= 0): p = correct_norm / total_norm
  if(total_norm!=0 and total_nsw!= 0): r = correct_norm / total_nsw
  if p != 0 and r != 0: f1 =  (2 * p * r) / (p + r)

  return f1

## Word Model

In [27]:
with open('/content/drive/MyDrive/Khai thác dữ liệu truyền thông xã hội - IE403.N22/Đồ án/Code/TextNormSeq2Seq/S2S/test.pred', 'r') as json_data:
    test_res = json.load(json_data)
with open('/content/drive/MyDrive/Khai thác dữ liệu truyền thông xã hội - IE403.N22/Đồ án/Code/TextNormSeq2Seq/S2S/valid.pred', 'r') as json_data:
    valid_res = json.load(json_data)

In [28]:
valid_input = []
valid_target = []
valid_output = []
for sent in valid_res:
  valid_input.append(sent['input'])
  valid_target.append(sent['target'])
  valid_output.append(sent['output'])

In [29]:
test_input = []
test_target = []
test_output = []
for sent in test_res:
  test_input.append(sent['input'])
  test_target.append(sent['target'])
  test_output.append(sent['output'])

In [30]:
valid_targets = add_multiple_target(valid_input, valid_target)
test_targets = add_multiple_target(test_input, test_target)

print(len(valid_targets))
print(len(test_targets))

219
219


  targets.append(list(np.array(target)[targets_idx]))


In [31]:
f1_valid = []
for i in range(len(valid_input)):
  scores = []
  input = valid_input[i]
  targets = valid_targets[i]
  output = valid_output[i]
  true_token = []
  pred_token = []
  for target in targets:
    scores.append(calculate_f1(input, target, output))

  f1_valid.append(max(scores))

In [32]:
f1_test = []
for i in range(len(test_input)):
  scores = []
  input = test_input[i]
  targets = test_targets[i]
  output = test_output[i]
  true_token = []
  pred_token = []
  for target in targets:
    scores.append(calculate_f1(input, target, output))

  f1_test.append(max(scores))

In [33]:
bleu_valid = []
for i in range(len(valid_input)):
  reference = valid_targets[i]
  candidate = valid_output[i]
  score = sentence_bleu(reference, candidate)
  bleu_valid.append(score)

In [34]:
bleu_test = []
for i in range(len(test_input)):
  reference = test_targets[i]
  candidate = test_output[i]
  score = sentence_bleu(reference, candidate)
  bleu_test.append(score)

In [35]:
print(mean(f1_valid))
print(mean(f1_test))

0.542604058315152
0.561573363233388


In [36]:
print(mean(bleu_valid))
print(mean(bleu_test))

0.6503697827023917
0.6870582884768347


## S2SSelf

In [37]:
with open('/content/drive/MyDrive/Khai thác dữ liệu truyền thông xã hội - IE403.N22/Đồ án/Code/TextNormSeq2Seq/S2SSelf/test.pred', 'r') as json_data:
    test_res = json.load(json_data)
with open('/content/drive/MyDrive/Khai thác dữ liệu truyền thông xã hội - IE403.N22/Đồ án/Code/TextNormSeq2Seq/S2SSelf/valid.pred', 'r') as json_data:
    valid_res = json.load(json_data)

In [38]:
valid_input = []
valid_target = []
valid_output = []
for sent in valid_res:
  valid_input.append(sent['input'])
  valid_target.append(sent['target'])
  valid_output.append(sent['output'])

In [39]:
test_input = []
test_target = []
test_output = []
for sent in test_res:
  test_input.append(sent['input'])
  test_target.append(sent['target'])
  test_output.append(sent['output'])

In [40]:
valid_targets = add_multiple_target(valid_input, valid_target)
test_targets = add_multiple_target(test_input, test_target)

print(len(valid_targets))
print(len(test_targets))

219
219


  targets.append(list(np.array(target)[targets_idx]))


In [41]:
f1_valid = []
for i in range(len(valid_input)):
  scores = []
  input = valid_input[i]
  targets = valid_targets[i]
  output = valid_output[i]
  true_token = []
  pred_token = []
  for target in targets:
    scores.append(calculate_f1(input, target, output))

  f1_valid.append(max(scores))

In [42]:
f1_test = []
for i in range(len(test_input)):
  scores = []
  input = test_input[i]
  targets = test_targets[i]
  output = test_output[i]
  true_token = []
  pred_token = []
  for target in targets:
    scores.append(calculate_f1(input, target, output))

  f1_test.append(max(scores))

In [43]:
bleu_valid = []
for i in range(len(valid_input)):
  reference = valid_targets[i]
  candidate = valid_output[i]
  score = sentence_bleu(reference, candidate)
  bleu_valid.append(score)

In [44]:
bleu_test = []
for i in range(len(test_input)):
  reference = test_targets[i]
  candidate = test_output[i]
  score = sentence_bleu(reference, candidate)
  bleu_test.append(score)

In [45]:
print(mean(f1_valid))
print(mean(f1_test))

0.40878356877551075
0.37647776886930373


In [46]:
print(mean(bleu_valid))
print(mean(bleu_test))

0.5755608227946879
0.5840170150784709


## S2SMulti

In [47]:
with open('/content/drive/MyDrive/Khai thác dữ liệu truyền thông xã hội - IE403.N22/Đồ án/Code/TextNormSeq2Seq/S2SMulti/test.pred', 'r') as json_data:
    test_res = json.load(json_data)
with open('/content/drive/MyDrive/Khai thác dữ liệu truyền thông xã hội - IE403.N22/Đồ án/Code/TextNormSeq2Seq/S2SMulti/valid.pred', 'r') as json_data:
    valid_res = json.load(json_data)

In [48]:
valid_input = []
valid_target = []
valid_output = []
for sent in valid_res:
  valid_input.append(sent['input'])
  valid_target.append(sent['target'])
  valid_output.append(sent['output'])

In [49]:
test_input = []
test_target = []
test_output = []
for sent in test_res:
  test_input.append(sent['input'])
  test_target.append(sent['target'])
  test_output.append(sent['output'])

In [50]:
valid_targets = add_multiple_target(valid_input, valid_target)
test_targets = add_multiple_target(test_input, test_target)

print(len(valid_targets))
print(len(test_targets))

219
219


  targets.append(list(np.array(target)[targets_idx]))


In [51]:
f1_valid = []
for i in range(len(valid_input)):
  scores = []
  input = valid_input[i]
  targets = valid_targets[i]
  output = valid_output[i]
  true_token = []
  pred_token = []
  for target in targets:
    scores.append(calculate_f1(input, target, output))

  f1_valid.append(max(scores))

In [52]:
f1_test = []
for i in range(len(test_input)):
  scores = []
  input = test_input[i]
  targets = test_targets[i]
  output = test_output[i]
  true_token = []
  pred_token = []
  for target in targets:
    scores.append(calculate_f1(input, target, output))

  f1_test.append(max(scores))

In [53]:
bleu_valid = []
for i in range(len(valid_input)):
  reference = valid_targets[i]
  candidate = valid_output[i]
  score = sentence_bleu(reference, candidate)
  bleu_valid.append(score)

In [54]:
bleu_test = []
for i in range(len(test_input)):
  reference = test_targets[i]
  candidate = test_output[i]
  score = sentence_bleu(reference, candidate)
  bleu_test.append(score)

In [55]:
print(mean(f1_valid))
print(mean(f1_test))

0.55625535021113
0.5830218057597293


In [56]:
print(mean(bleu_valid))
print(mean(bleu_test))

0.6526358149375606
0.6971257812210854
