<a href="https://colab.research.google.com/github/ExtrApostroPhe/Classification/blob/main/Keras.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import jieba
import re
import nltk
import ssl
import os

import pandas as pd
from google.colab import drive
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Conv1D, Dense, MaxPool1D, Flatten, Input, Dropout, Embedding
from tensorflow.keras.layers import Concatenate
from tensorflow.keras.layers import BatchNormalization
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords 


In [None]:
# Environment setup: NLTK resources, Google Drive mount and working directory.
# NOTE(review): `lemmatizer` and `stw` are not referenced by the other cells
# visible in this notebook — confirm whether they are still needed.
lemmatizer = WordNetLemmatizer()
# Download the corpora used by the stop-word list and the WordNet lemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')
stw = set(stopwords.words('english'))  # English stop words as a set
# Mount Google Drive (forcing a remount) and move into the project folder so
# the relative './train_data/...' paths used later resolve.
drive.mount('/content/drive', force_remount=True)  
os.chdir('/content/drive/My Drive/news_ipm')

# Swap ssl's default HTTPS context factory for the unverified one when the
# running Python provides it.
# NOTE(review): this executes after the nltk.download() calls above, so it
# cannot influence them; it presumably disables certificate verification for
# later HTTPS requests that use the default context — confirm this is intended.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Mounted at /content/drive


In [None]:
def _read_headlines(path, label):
    """Load one headline per line from ``path`` and tag each row with ``label``.

    Args:
        path: Text file containing one headline per line (read as UTF-8).
        label: Class id stored in column 1 for every row.

    Returns:
        DataFrame with column 0 = headline text and column 1 = ``label``.
        Blank lines are skipped.
    """
    # Read the file line by line rather than via pd.read_csv(sep="\n"):
    # recent pandas releases reject a newline separator with a ValueError,
    # and plain line reading does not interpret quote characters in headlines.
    with open(path, encoding='utf-8') as handle:
        lines = [line.rstrip('\r\n') for line in handle]
    frame = pd.DataFrame({0: [line for line in lines if line.strip()]})
    frame[1] = label
    return frame


# One file per news category; the integer is the class id used for training.
data_env = _read_headlines('./train_data/environment.txt', 0)  # only part of the data is loaded here
data_health = _read_headlines('./train_data/health.txt', 1)
data_hk = _read_headlines('./train_data/hk_macao.txt', 2)
data_inno = _read_headlines('./train_data/innovation.txt', 3)
data_military = _read_headlines('./train_data/military.txt', 4)
data_notional = _read_headlines('./train_data/notional_affairs.txt', 5)
data_society = _read_headlines('./train_data/society.txt', 6)

# Stack all categories into one frame; both columns are converted to strings,
# so the labels in Y are '0'..'6' rather than integers.
dataset = pd.concat(
    [data_env, data_health, data_hk, data_inno, data_military, data_notional, data_society],
    axis=0,
    ignore_index=True,
).astype(str)
X = dataset[0]  # headline text
Y = dataset[1]  # class id (as string)
print(X)

0        Volunteer work sees 78.1b trees planted in China
1         After heavy snow, humans take food to antelopes
2       Key meeting stresses proper understanding of C...
3             China releases rare fish into Yangtze River
4       China makes notable progress in water, soil co...
                              ...                        
2655    Chinese mainland reports 47 locally transmitte...
2656        Lhasa group to help preserve Tibetan art form
2657                     Nation aims to narrow wealth gap
2658           Capital Airport boosts Olympic precautions
2659              Nation strengthens grassroots democracy
Name: 0, Length: 2660, dtype: object


In [None]:
def insert_text(texts=None):
    """Return the raw texts as a plain Python list.

    Args:
        texts: Optional iterable of texts. When omitted, the notebook-level
            variable ``X`` is used, which keeps existing ``insert_text()``
            calls working.

    Returns:
        A new list with the items of the source in iteration order.
    """
    source = X if texts is None else texts
    return list(source)

def filter_text(text):
    """Keep only ASCII letters in ``text``, separated by single spaces.

    Every character other than a-z / A-Z is first replaced by a space, then
    each run of whitespace is collapsed to one space. Leading and trailing
    spaces are not stripped.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', text)
    collapsed = re.sub(r'\s+', ' ', letters_only)
    return collapsed

def lower_text(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered

def lemmatization_text(text_cut_list):
    """Lemmatize every token with a fresh ``WordNetLemmatizer``.

    Args:
        text_cut_list: Iterable of token strings.

    Returns:
        List of lemmas, in the same order as the input tokens. Each token is
        passed to ``lemmatize`` with its default arguments.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    lemmas = []
    for token in text_cut_list:
        lemmas.append(lemmatize(token))
    return lemmas

def text_cut(text):
    """Split ``text`` into tokens made of consecutive ASCII letters.

    Args:
        text: Input string.

    Returns:
        List of letter-only tokens in order of appearance; empty list when
        ``text`` contains no letters.
    """
    # The upper bound must be 'Z', not 'z': an "A-z" range would also match
    # the punctuation that sits between 'Z' and 'a' in ASCII ([ \ ] ^ _ `),
    # gluing those characters into tokens.
    return re.findall('[a-zA-Z]+', text)

def stopwords_text(text_cut_list, stop_words=None):
    """Drop stop words and tokens shorter than three characters.

    Args:
        text_cut_list: Iterable of token strings.
        stop_words: Optional collection of words to remove. When omitted,
            NLTK's English stop-word list is used.

    Returns:
        List of the remaining tokens, in their original order.
    """
    if stop_words is None:
        stop_words = stopwords.words("english")
    # A set gives constant-time membership tests instead of scanning the
    # whole stop-word list once per token.
    blocked = set(stop_words)
    return [n for n in text_cut_list if len(n) >= 3 and n not in blocked]

# Run every raw headline through the cleaning pipeline and rebuild X as a
# list of cleaned, space-separated token strings.
article = insert_text()
print(article)

X = []
for content in article:
    # Letters only, single-spaced, lower-cased.
    content = lower_text(filter_text(content))
    print(content)
    # Tokenize -> lemmatize -> drop stop words and very short tokens.
    word_cut = stopwords_text(lemmatization_text(text_cut(content)))
    # Every kept token is followed by one space, so a non-empty result ends
    # with a trailing space.
    content = ''.join(w + ' ' for w in word_cut)
    X.append(content)

volunteer work see tree planted china 
heavy snow human take food antelope 
key meeting stress proper understanding china carbon neutrality goal 
china release rare fish yangtze river 
china make notable progress water soil conservation 
giant panda bounce back thanks reforestation 
ecological corridor help expand giant panda habitat 
migratory bird seen yinchuan 
beijing look forward end heavily polluted day 
wetland protection reaps rich reward 
team assist bird search food 
rare bird spotted china nature reserve 
chinese researcher discover new wild orchid specie 
country vow water cooperation 
migratory flamingo winter bird shanxi 
action plan improve rural living condition 
china top court clarify biodiversity protection law 
inspection focus high emission project 
beijing see increasing migratory bird winter 
inspection uncovers soe violation 
complex cure key curbing desertification 
beijing expects avian guest winter 
scientist urge better monitoring glacier 
benefit flowing pl

In [None]:
MAX_LEN = 20     # fixed sequence length every sample is padded/truncated to
SPLIT_SEED = 42  # fixes the train/test split so reruns give the same partition

tokenizer = Tokenizer()  # create a Tokenizer object
# fit_on_texts numbers every word in the input texts; ids follow word
# frequency: the more frequent the word, the smaller its id.
# NOTE(review): the vocabulary is fitted on all of X, i.e. before the
# train/test split below — confirm this is acceptable for the evaluation.
tokenizer.fit_on_texts(X)
vocab = tokenizer.word_index  # word -> integer id

print(vocab)

# Seeded split: without random_state the partition changes on every run.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.1, random_state=SPLIT_SEED)

x_train_word_ids = tokenizer.texts_to_sequences(x_train)
x_test_word_ids = tokenizer.texts_to_sequences(x_test)
# Sequence mode: samples have different lengths, so each one is brought to a
# fixed length — longer sequences are truncated, shorter ones are padded with
# 0 at the front.
x_train_padded_seqs = pad_sequences(x_train_word_ids, maxlen=MAX_LEN)
x_test_padded_seqs = pad_sequences(x_test_word_ids, maxlen=MAX_LEN)
print(x_train_padded_seqs)

[[   0    0    0 ...  622  615  855]
 [   0    0    0 ...   47  158   49]
 [   0    0    0 ... 1853 1854   38]
 ...
 [   0    0    0 ...  162   63  224]
 [   0    0    0 ...  378   50  789]
 [   0    0    0 ...  226 1691   49]]


In [None]:
from tensorflow.keras.utils import to_categorical

NUM_CLASSES = 7  # class ids 0..6, one per news category

# Pin the one-hot width explicitly: without num_classes, to_categorical
# infers it from the largest label present, so a split that happens to miss
# the highest class would yield fewer columns than the 7-unit softmax layer.
y_train = to_categorical(y_train, num_classes=NUM_CLASSES)
y_test = to_categorical(y_test, num_classes=NUM_CLASSES)
print(x_train_padded_seqs)

[[   0    0    0 ...  622  615  855]
 [   0    0    0 ...   47  158   49]
 [   0    0    0 ... 1853 1854   38]
 ...
 [   0    0    0 ...  162   63  224]
 [   0    0    0 ...  378   50  789]
 [   0    0    0 ...  226 1691   49]]


In [None]:
# Build the CNN classification model (labelled "LeNet-5" by the author).
# Layer stack: embedding -> (conv + max-pool) x2 -> conv -> flatten -> dropout
#              -> batch norm -> dense -> dropout -> dense (softmax)
# NOTE(review): the Conv1D layers are created without an `activation`
# argument — confirm that leaving them at the Keras default is intended.
from tensorflow.keras.callbacks import EarlyStopping

model = Sequential([
    # Embedding layer turns each word id into a 300-dimensional word vector.
    Embedding(len(vocab) + 1, 300, input_length=20),
    Conv1D(256, 5, padding='same'),
    MaxPool1D(3, 3, padding='same'),
    Conv1D(128, 5, padding='same'),
    MaxPool1D(3, 3, padding='same'),
    Conv1D(64, 3, padding='same'),
    Flatten(),
    Dropout(0.1),
    BatchNormalization(),  # (batch) normalization layer
    Dense(256, activation='relu'),
    Dropout(0.1),
    Dense(7, activation='softmax'),
])

# Stop training once val_loss has not improved for 5 consecutive epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, verbose=2)

model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# The held-out split doubles as validation data during training.
history = model.fit(
    x_train_padded_seqs,
    y_train,
    epochs=10,
    batch_size=64,
    validation_data=(x_test_padded_seqs, y_test),
    callbacks=[early_stopping],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10

KeyboardInterrupt: ignored

In [None]:
import tensorflow as tf
import sklearn
import numpy as np
import pandas as pd
import math
import jieba
import pickle
import time
from collections import Counter

class RNNTextClassifier():
    """LSTM text classifier written against TensorFlow's graph/session API.

    The graph is assembled in ``__init__`` (inputs -> embedding -> dynamic
    LSTM -> dense logits -> Adam with global-norm gradient clipping);
    ``fit`` and ``predict`` then feed padded id sequences through
    ``self.sess``.

    NOTE(review): the calls used here (``tf.placeholder``, ``tf.Session``,
    ``tf.layers``) are TensorFlow 1.x-style names — confirm the runtime
    provides them.
    """

    def __init__(self, vocab_size, n_out, embedding_size=128, cell_size=128,
                 grad_clip=5.0, sess=None):
        """Store hyper-parameters and build the graph.

        Args:
            vocab_size: Number of rows in the embedding matrix.
            n_out: Number of output classes (width of the logits).
            embedding_size: Width of each word embedding.
            cell_size: Number of units in the LSTM cell.
            grad_clip: Global-norm threshold used for gradient clipping.
            sess: Session to run the graph in; a new ``tf.Session()`` is
                created when omitted.
        """
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.cell_size = cell_size
        self.grad_clip = grad_clip
        self.n_out = n_out
        # Create the session here rather than in the signature: a
        # ``tf.Session()`` default would be evaluated once, when the class is
        # defined, and then shared by every instance.
        self.sess = tf.Session() if sess is None else sess
        self._pointer = None  # output tensor of the most recently added layer
        self.buildgraph()

    def buildgraph(self):
        """Add all layers and the training op, in order."""
        self.add_input_layer()
        self.add_wordembedding_layer()
        self.add_dynamic_rnn()
        self.add_output_layer()
        self.add_optimizer()

    def add_input_layer(self,):
        """Create the placeholders fed by ``fit`` and ``predict``."""
        self.X = tf.placeholder(tf.int32, [None, None])   # padded word ids
        self.Y = tf.placeholder(tf.int64, [None])         # class ids
        self.X_seq_len = tf.placeholder(tf.int32, [None]) # unpadded lengths
        self.keep_prob = tf.placeholder(tf.float32)
        self.lr = tf.placeholder(tf.float32)
        self._pointer = self.X

    def add_wordembedding_layer(self):
        """Look up embeddings for the current input tensor."""
        embedding = tf.get_variable("encoder",
                                    [self.vocab_size, self.embedding_size],
                                    dtype=tf.float32,
                                    initializer=tf.random_uniform_initializer(-1.0, 1.0))
        embedded = tf.nn.embedding_lookup(embedding, self._pointer)
        # Dropout on the embeddings is intentionally left disabled here.
        self._pointer = embedded

    def lstm_cell(self):
        """Return an LSTM cell wrapped with output dropout (``keep_prob``)."""
        lstm_cell = tf.nn.rnn_cell.LSTMCell(num_units=self.cell_size,
                                            initializer=tf.orthogonal_initializer())
        return tf.nn.rnn_cell.DropoutWrapper(lstm_cell, output_keep_prob=self.keep_prob)

    def add_dynamic_rnn(self):
        """Run the LSTM over the embedded input, honouring ``X_seq_len``."""
        self.outputs, self.last_state = tf.nn.dynamic_rnn(
            cell=self.lstm_cell(),
            inputs=self._pointer,
            sequence_length=self.X_seq_len,
            dtype=tf.float32
        )

    def add_output_layer(self):
        """Project the final state's ``h`` to ``n_out`` logits."""
        self.logits = tf.layers.dense(self.last_state.h, self.n_out)

    def add_optimizer(self):
        """Define loss, accuracy and the clipped-gradient Adam training op."""
        self.loss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=self.logits, labels=self.Y
            )
        )
        self.acc = tf.reduce_mean(
            tf.cast(tf.equal(tf.argmax(self.logits, axis=1), self.Y), dtype=tf.float32))
        # Gradient clipping by global norm before the Adam update.
        params = tf.trainable_variables()
        gradients = tf.gradients(ys=self.loss, xs=params)
        clipped_gradients, _ = tf.clip_by_global_norm(t_list=gradients, clip_norm=self.grad_clip)
        self.train_op = tf.train.AdamOptimizer(self.lr).apply_gradients(zip(clipped_gradients, params))

    def fit(self, X, Y, val_data=None, n_epoch=10, batch_size=128, exp_decay=True,
            isshuffle=True, keep_prob=0.5,
            save_path="c:/users/ll/desktop/model/model.ckpt"):
        """Train the model and return the training log.

        Args:
            X: Sequence of word-id lists (one list per sample).
            Y: Class ids aligned with ``X``.
            val_data: Optional ``(X_val, Y_val)`` pair evaluated after every
                epoch.
            n_epoch: Number of passes over the training data.
            batch_size: Samples per batch.
            exp_decay: Use the exponential learning-rate schedule of
                ``decrease_lr`` when True, a constant rate otherwise.
            isshuffle: Shuffle ``X``/``Y`` together at the start of each epoch.
            keep_prob: Dropout keep probability used during training.
            save_path: Checkpoint path written after training. Defaults to
                the previously hard-coded location; pass ``None`` to skip
                saving.

        Returns:
            Dict with per-step ``loss``/``acc`` and per-epoch
            ``val_loss``/``val_acc`` lists.
        """
        if val_data is None:
            print("Train %d samples" % len(X))
        else:
            print("Train %d samples | Test %d samples" % (len(X), len(val_data[0])))
        log = {'loss': [], 'acc': [], 'val_loss': [], 'val_acc': []}
        global_step = 0
        # Variables are (re)initialised on every call to fit.
        self.sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        for epoch in range(n_epoch):
            if isshuffle:
                X, Y = sklearn.utils.shuffle(X, Y)
            for local_step, ((X_batch, X_batch_lens), Y_batch) in enumerate(
                    zip(self.next_batch(X, batch_size), self.gen_batch(Y, batch_size))):
                lr = self.decrease_lr(exp_decay, global_step, n_epoch, len(X), batch_size)
                _, loss, acc = self.sess.run([self.train_op, self.loss, self.acc],
                                             feed_dict={self.X: X_batch,
                                                        self.Y: Y_batch,
                                                        self.X_seq_len: X_batch_lens,
                                                        self.lr: lr,
                                                        self.keep_prob: keep_prob})
                global_step += 1
                if local_step % 50 == 0:
                    # "step/total": the separator keeps the two numbers from
                    # running together in the progress line.
                    print("Epoch %d | Step %d/%d | Train loss: %.4f | Train acc: %.4f | lr: %.4f" % (
                        epoch+1, local_step, int(len(X)/batch_size), loss, acc, lr
                    ))
                log['loss'].append(loss)
                log['acc'].append(acc)

            if val_data is not None:
                val_loss_list, val_acc_list = [], []
                for (X_test_batch, X_test_batch_lens), Y_test_batch in zip(
                        self.next_batch(val_data[0], batch_size),
                        self.gen_batch(val_data[1], batch_size)):
                    # Dropout is disabled (keep_prob=1.0) for evaluation.
                    v_loss, v_acc = self.sess.run([self.loss, self.acc], feed_dict={
                        self.X: X_test_batch, self.Y: Y_test_batch,
                        self.X_seq_len: X_test_batch_lens, self.keep_prob: 1.0
                    })
                    val_loss_list.append(v_loss)
                    val_acc_list.append(v_acc)
                # Plain mean over batches (batches are not weighted by size).
                val_loss, val_acc = self.list_avg(val_loss_list), self.list_avg(val_acc_list)
                log['val_loss'].append(val_loss)
                log['val_acc'].append(val_acc)
                print("val_data loss: %.4f | val_data acc: %.4f" % (val_loss, val_acc))
        if save_path is not None:
            saver.save(self.sess, save_path)
        return log

    def predict(self, X_test, batch_size=128):
        """Return the arg-max class id for every sample in ``X_test``."""
        batch_pred_list = []
        for (X_test_batch, X_test_batch_lens) in self.next_batch(X_test, batch_size):
            batch_pred = self.sess.run(self.logits, feed_dict={
                self.X: X_test_batch,
                self.X_seq_len: X_test_batch_lens,
                self.keep_prob: 1.0
            })
            batch_pred_list.append(batch_pred)
        return np.argmax(np.vstack(batch_pred_list), 1)

    def pad_sentence_batch(self, sentence_batch, pad_int=0):
        """Right-pad every sentence to the longest length in the batch.

        Args:
            sentence_batch: Non-empty sequence of lists of word ids.
            pad_int: Value appended as padding.

        Returns:
            ``(padded_seqs, seq_lens)`` where ``seq_lens`` holds the original
            (unpadded) lengths.
        """
        max_lens = max([len(sentence) for sentence in sentence_batch])
        padded_seqs = []
        seq_lens = []
        for sentence in sentence_batch:
            padded_seqs.append(sentence + [pad_int] * (max_lens-len(sentence)))
            seq_lens.append(len(sentence))

        return padded_seqs, seq_lens

    def next_batch(self, arr, batch_size):
        """Yield ``(padded_seqs, seq_lens)`` for consecutive slices of ``arr``."""
        for i in range(0, len(arr), batch_size):
            padded_seqs, seq_lens = self.pad_sentence_batch(arr[i:i+batch_size])
            yield padded_seqs, seq_lens

    def gen_batch(self, arr, batch_size):
        """Yield consecutive ``batch_size`` slices of ``arr`` (no padding)."""
        for i in range(0, len(arr), batch_size):
            yield arr[i: i+batch_size]

    def list_avg(self, l):
        """Return the arithmetic mean of ``l`` (``l`` must be non-empty)."""
        return sum(l)/len(l)

    def decrease_lr(self, exp_decay, global_step, n_epoch, len_x, batch_size):
        """Return the learning rate for ``global_step``.

        With ``exp_decay`` the rate decays exponentially from 0.005 at step 0
        to 0.001 after ``n_epoch * len_x / batch_size`` steps; otherwise it is
        a constant 0.001.
        """
        if exp_decay:
            max_lr = 0.005
            min_lr = 0.001
            decay_rate = math.log(min_lr/max_lr) / (-n_epoch*len_x/batch_size)
            lr = max_lr*math.exp(-decay_rate*global_step)
        else:
            lr = 0.001
        return lr
