In [1]:
# coding=utf-8
import os
import sys
import joblib
import numpy as np
import matplotlib.pyplot as plt
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Dense, Input, GlobalMaxPooling1D
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Embedding
from tensorflow.keras.models import Model
from tensorflow.keras.initializers import Constant
from tensorflow.keras.callbacks import ModelCheckpoint
import moxing as mox
import argparse

# Earlier hard-coded data location, kept for reference:
# BASE_DIR = 'G:\\trainingdata'


# Alternative configuration, kept for reference: read the dataset and model
# locations from command-line arguments instead of hard-coding them.
# parser = argparse.ArgumentParser(description='CNN Example')
# parser.add_argument('--data_url', type=str, default="./Data",
#                     help='path where the dataset is saved')
# parser.add_argument('--train_url', type=str, default="./Model", help='model path')
# args = parser.parse_args()
# # BASE_DIR is the root directory of the training set; here it is set to the
# # dataset directory of the bucket.
# BASE_DIR = args.data_url

# Root directory holding both the corpus folder and the GloVe vectors file.
BASE_DIR = 'source'


# Path to the text corpus.
TEXT_DATA_DIR = os.path.join(BASE_DIR, '20_newsgroup')
MAX_SEQUENCE_LENGTH = 1000  # `maxlen` used when padding the token sequences
MAX_NUM_WORDS = 20000  # `num_words` cap given to the Tokenizer
EMBEDDING_DIM = 100  # number of columns in the embedding matrix
VALIDATION_SPLIT = 0.2  # fraction of the samples held out for validation

# Build the word -> pretrained vector lookup from the GloVe text file.
print('Indexing word vectors.')
print(TEXT_DATA_DIR)
glove_path = os.path.join(BASE_DIR, 'glove.6B.100d.txt')
embeddings_index = {}
with open(glove_path, mode='r', encoding='utf-8') as glove_file:
    for row in glove_file:
        # Each row is "<token> <v1> <v2> ...": split the token off once and
        # parse the remainder as a whitespace-separated float vector.
        token, raw_vector = row.split(maxsplit=1)
        embeddings_index[token] = np.fromstring(raw_vector, 'f', sep=' ')


texts = []  # document bodies, in the order the files are read
labels_index = {}  # maps each class folder name to its numeric id
labels = []  # numeric class id for the matching entry in `texts`

# On Python 3 the files are decoded as latin-1; on Python 2 open() is called
# without extra keyword arguments.
open_kwargs = {'encoding': 'latin-1'} if sys.version_info >= (3,) else {}

for name in sorted(os.listdir(TEXT_DATA_DIR)):
    path = os.path.join(TEXT_DATA_DIR, name)
    print(path)
    if not os.path.isdir(path):
        continue
    # Ids are handed out in sorted folder order: 0, 1, 2, ...
    label_id = len(labels_index)
    labels_index[name] = label_id
    # Every entry of the class folder is read; no filename filter is applied.
    for fname in sorted(os.listdir(path)):
        with open(os.path.join(path, fname), **open_kwargs) as handle:
            document = handle.read()
        # Skip the header: keep the text from the first blank line onwards.
        header_end = document.find('\n\n')
        if header_end > 0:
            document = document[header_end:]
        texts.append(document)
        labels.append(label_id)
print('Found %s texts.' % len(texts))

# Fit the vocabulary on the corpus and encode every document as word ids.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
# Persist the fitted tokenizer next to the notebook.
joblib.dump(tokenizer, 'token_result.pkl')

# Bring every sequence to the same length and one-hot encode the class ids.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
print(data)
labels = to_categorical(np.asarray(labels))
print(labels)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# Shuffle samples and labels with one shared permutation so rows stay paired.
sample_count = data.shape[0]
indices = np.arange(sample_count)
np.random.shuffle(indices)
data = data[indices]
print(data)
labels = labels[indices]
print(labels)

# The last `num_validation_samples` shuffled rows form the validation set.
num_validation_samples = int(VALIDATION_SPLIT * sample_count)
print(sample_count)
x_train, x_val = data[:-num_validation_samples], data[-num_validation_samples:]
y_train, y_val = labels[:-num_validation_samples], labels[-num_validation_samples:]

print('Preparing embedding matrix.')
# One row per usable word id; row 0 is never assigned by the loop below.
num_words = min(MAX_NUM_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros(shape=(num_words, EMBEDDING_DIM))
for token, row in word_index.items():
    # Ids at or beyond the vocabulary cap have no row in the matrix.
    if row < MAX_NUM_WORDS:
        pretrained = embeddings_index.get(token)
        # Map the pretrained vector onto the corpus word id; words missing
        # from the pretrained index keep their all-zero row.
        if pretrained is not None:
            embedding_matrix[row] = pretrained

        
# The embedding table must have exactly as many rows as `embedding_matrix`
# (`num_words`), otherwise the Constant initializer's value does not match the
# layer's weight shape. `num_words` equals MAX_NUM_WORDS only when the corpus
# vocabulary is at least that large.
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            embeddings_initializer=keras.initializers.Constant(embedding_matrix),
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)
print('Training model.')

# These two tensors are not consumed by the Sequential model below, which
# takes `embedding_layer` directly; they are kept so the names stay defined.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

# Start of the section to fill in: define the CNN / LSTM model here.

# One softmax unit per column of the one-hot label matrix, so the output
# layer always matches the targets passed to fit().
num_classes = labels.shape[1]

# Your codes
model = keras.Sequential([
  embedding_layer,
  keras.layers.Bidirectional(keras.layers.LSTM(128)),
  keras.layers.Dense(16, activation='relu'),
  keras.layers.Dense(num_classes, activation='softmax')
])

model.summary()
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# End of the filled-in section.
# Train on the training split and evaluate on the held-out split each epoch.
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=15,
    batch_size=128,
)

# Save the model on the VM first, then copy it to the output path in the bucket.
Model_DIR = os.path.join(os.getcwd(), 'mytextlstm_model.h5')
model.save(Model_DIR)
print('Saved model to disk' + Model_DIR)
# The second argument must be changed to match the experimenter's own bucket path.
mox.file.copy_parallel(Model_DIR, 'obs://nlp-lab2-cpy/model/mytextlstm_model.h5')

2022-12-18 22:22:03.899109: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libnvinfer.so.6
2022-12-18 22:22:03.956810: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libnvinfer_plugin.so.6
INFO:root:Using MoXing-v2.1.0.5d9c87c8-5d9c87c8
INFO:root:Using OBS-Python-SDK-3.20.9.1


Indexing word vectors.
source/20_newsgroup
source/20_newsgroup/alt.atheism
source/20_newsgroup/comp.graphics
source/20_newsgroup/comp.os.ms-windows.misc
source/20_newsgroup/comp.sys.ibm.pc.hardware
source/20_newsgroup/comp.sys.mac.hardware
source/20_newsgroup/comp.windows.x
source/20_newsgroup/misc.forsale
source/20_newsgroup/rec.autos
source/20_newsgroup/rec.motorcycles
source/20_newsgroup/rec.sport.baseball
source/20_newsgroup/rec.sport.hockey
source/20_newsgroup/sci.crypt
source/20_newsgroup/sci.electronics
source/20_newsgroup/sci.med
source/20_newsgroup/sci.space
source/20_newsgroup/soc.religion.christian
source/20_newsgroup/talk.politics.guns
source/20_newsgroup/talk.politics.mideast
source/20_newsgroup/talk.politics.misc
source/20_newsgroup/talk.religion.misc
Found 19996 texts.
[[  58  576    3 ...    4  930 2050]
 [ 221   31  972 ... 2932  552  324]
 [   0    0    0 ...    3  316 5816]
 ...
 [   0    0    0 ...   71  197  514]
 [   0    0    0 ... 2113 1618 9557]
 [   0    0    

2022-12-18 22:22:50.035720: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2022-12-18 22:22:50.063865: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-18 22:22:50.064817: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1555] Found device 0 with properties: 
pciBusID: 0000:00:0e.0 name: Tesla P100-PCIE-16GB computeCapability: 6.0
coreClock: 1.3285GHz coreCount: 56 deviceMemorySize: 15.90GiB deviceMemoryBandwidth: 681.88GiB/s
2022-12-18 22:22:50.064905: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
2022-12-18 22:22:50.064967: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10
2022-12-18 22:22:50.126366: I tensorflow/stream_executor/plat

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1000, 100)         2000000   
_________________________________________________________________
bidirectional (Bidirectional (None, 256)               234496    
_________________________________________________________________
dense (Dense)                (None, 16)                4112      
_________________________________________________________________
dense_1 (Dense)              (None, 20)                340       
Total params: 2,238,948
Trainable params: 2,238,948
Non-trainable params: 0
_________________________________________________________________
Train on 15997 samples, validate on 3999 samples
Epoch 1/15


2022-12-18 22:22:57.437201: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10
2022-12-18 22:22:58.227834: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudnn.so.7


Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


RuntimeError: Can't decrement id ref count (unable to close file, errno = 122, error message = 'Disk quota exceeded')