dataset.py
import os
from logging import getLogger

import torch

from data_utils import load_data, build_vocab, text2idx
from enum_type import SpecialTokens
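
# Expected layout of `data_path`, inferred from the loading code below:
#   {train,valid,test}.src / {train,valid,test}.tgt  -- raw parallel text, one case per line
# Preprocessed caches written back to the same directory:
#   {train,valid,test}_raw.bin, or {train,valid,test}_extended.bin in pointer-generator mode
#   vocab.bin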


class Dataset:
    def __init__(self, config):
        self.config = config
        self.dataset_path = config['data_path']
        self.max_vocab_size = config['max_vocab_size']
        self.source_max_length = config['src_len']
        self.target_max_length = config['tgt_len']
        # Pointer-generator extensions are only meaningful when attention is enabled.
        self.is_pgen = config['is_pgen'] and config['is_attention']
        self.test_only = config['test_only']
        self.logger = getLogger()

        self._init_special_token()
        self._get_preset()

        # Reuse the preprocessed binaries when all of them are present;
        # otherwise rebuild everything from the raw text files.
        self.restored_exist = self._detect_restored()
        if self.restored_exist:
            self._from_restored()
        else:
            self._from_scratch()
        if not self.test_only:
            self._info()

    def _get_preset(self):
        for prefix in ['train', 'valid', 'test']:
            setattr(self, f'{prefix}_data', dict())
        self.source_text = []
        self.target_text = []

    def _init_special_token(self):
        self.padding_token = SpecialTokens.PAD
        self.unknown_token = SpecialTokens.UNK
        self.sos_token = SpecialTokens.SOS
        self.eos_token = SpecialTokens.EOS
        self.padding_token_idx = 0
        self.unknown_token_idx = 1
        self.sos_token_idx = 2
        self.eos_token_idx = 3
        self.special_token_list = [
            self.padding_token, self.unknown_token, self.sos_token, self.eos_token
        ]

    def _detect_restored(self):
        """Return True only if every preprocessed binary already exists on disk."""
        for prefix in ['train', 'valid', 'test', 'vocab']:
            if prefix == 'vocab':
                filename = os.path.join(self.dataset_path, 'vocab.bin')
            else:
                suffix = 'extended' if self.is_pgen else 'raw'
                filename = os.path.join(self.dataset_path, f'{prefix}_{suffix}.bin')
            if not os.path.isfile(filename):
                return False
        return True

    def _from_scratch(self):
        self._load_data()
        self._build_vocab()
        self._build_data()
        self._dump_data()

    def _from_restored(self):
        self._load_restored()

    def _load_data(self):
        self.logger.info('Loading data from scratch')
        for prefix in ['train', 'valid', 'test']:
            source_file = os.path.join(self.dataset_path, f'{prefix}.src')
            target_file = os.path.join(self.dataset_path, f'{prefix}.tgt')
            source_text = load_data(source_file, self.source_max_length)
            target_text = load_data(target_file, self.target_max_length)
            self.source_text.append(source_text)
            self.target_text.append(target_text)
        self.logger.info('Load finished')

    def _build_vocab(self):
        self.logger.info('Building vocab')
        # The vocabulary is built over all three splits, sources and targets alike.
        text_data = self.source_text + self.target_text
        self.idx2token, self.token2idx, self.max_vocab_size = build_vocab(
            text_data, self.max_vocab_size, self.special_token_list
        )
        self.logger.info('Build finished')

    def _build_data(self):
        self.logger.info('Building data')
        for i, prefix in enumerate(['train', 'valid', 'test']):
            data = getattr(self, f'{prefix}_data')
            data_dict = text2idx(
                self.source_text[i], self.target_text[i], self.token2idx, self.is_pgen
            )
            data.update(data_dict)
            data['source_text'] = self.source_text[i]
            data['target_text'] = self.target_text[i]
        self.logger.info('Build finished')

    def _dump_data(self):
        self.logger.info('Dumping data')
        suffix = 'extended' if self.is_pgen else 'raw'
        for prefix in ['train', 'valid', 'test']:
            filename = os.path.join(self.dataset_path, f'{prefix}_{suffix}.bin')
            torch.save(getattr(self, f'{prefix}_data'), filename)
        vocab_file = os.path.join(self.dataset_path, 'vocab.bin')
        torch.save([self.idx2token, self.token2idx, self.max_vocab_size], vocab_file)
        self.logger.info('Dump finished')

    def _load_restored(self):
        self.logger.info('Loading data from restored')
        # In test-only mode the train/valid splits are never needed.
        prefixes = ['test'] if self.test_only else ['train', 'valid', 'test']
        suffix = 'extended' if self.is_pgen else 'raw'
        for prefix in prefixes:
            filename = os.path.join(self.dataset_path, f'{prefix}_{suffix}.bin')
            setattr(self, f'{prefix}_data', torch.load(filename))
        vocab_file = os.path.join(self.dataset_path, 'vocab.bin')
        self.idx2token, self.token2idx, self.max_vocab_size = torch.load(vocab_file)
        self.logger.info('Restore finished')

    def _info(self):
        self.logger.info(f"Vocab size: {self.max_vocab_size}")
        info_str = ', '.join(
            f"{prefix}: {len(getattr(self, f'{prefix}_data')['target_text'])} cases"
            for prefix in ['train', 'valid', 'test']
        )
        self.logger.info(info_str + '\n')
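

# A minimal usage sketch. The config keys are exactly the ones read in
# __init__ above; the concrete values (path, lengths, sizes) are placeholders,
# not defaults from the original project.
if __name__ == '__main__':
    import logging

    logging.basicConfig(level=logging.INFO)
    config = {
        'data_path': './data/example',  # hypothetical dataset directory
        'max_vocab_size': 50000,
        'src_len': 400,
        'tgt_len': 100,
        'is_pgen': True,
        'is_attention': True,
        'test_only': False,
    }
    dataset = Dataset(config)
    # After construction, each split holds the index data produced by text2idx
    # plus the original text, e.g. dataset.train_data['source_text'].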