In [2]:
import re
import json
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.optimizers import Adam
import numpy as np
import os
from pickle import dump, load
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# 文件路径列表
file_paths = [
    'D:\\学习\\Postgra\\Text Technologies for Data Science\\Project\\Query-Processing\\OneDrive_1_2024-2-16\\combined123\\combined_part_1.jsonl',
    'D:\\学习\\Postgra\\Text Technologies for Data Science\\Project\\Query-Processing\\OneDrive_1_2024-2-16\\combined123\\combined_part_2.jsonl',
    'D:\\学习\\Postgra\\Text Technologies for Data Science\\Project\\Query-Processing\\OneDrive_1_2024-2-16\\combined123\\combined_part_3.jsonl',
    'D:\\学习\\Postgra\\Text Technologies for Data Science\\Project\\Query-Processing\\OneDrive_1_2024-2-16\\combined123\\combined_part_4.jsonl',
    'D:\\学习\\Postgra\\Text Technologies for Data Science\\Project\\Query-Processing\\OneDrive_1_2024-2-16\\combined123\\combined_part_5.jsonl'
]

# 打开一个新文件用于写入所有标题
with open('all_titles.txt', 'w', encoding='utf-8') as out_file:
    # 遍历文件路径
    for file_path in file_paths:
        # 打开文件并逐行读取
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                # 将每行的内容从JSON字符串转换为字典
                article = json.loads(line)
                # 替换掉标题中的换行符'\n'，然后写入到输出文件中
                title = article['title'].replace('\n', ' ')
                out_file.write(title + '\n')


In [None]:
def preprocess_text(text):
    """
    预处理文本：转换为小写并去除非字母数字字符
    """
    # 转换为小写
    text = text.lower()
    # 使用正则表达式去除非字母数字字符
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text

# 输入文件路径
input_file_path = 'all_titles.txt'
# 输出文件路径
output_file_path = 'preprocessed_titles.txt'

# 读取输入文件，预处理每一行，然后写入到输出文件
with open(input_file_path, 'r', encoding='utf-8') as in_file, \
     open(output_file_path, 'w', encoding='utf-8') as out_file:
    for line in in_file:
        preprocessed_line = preprocess_text(line)
        out_file.write(preprocessed_line + '\n')


In [14]:
file_path = 'preprocessed_titles.txt'

titles = []
with open(file_path, 'r', encoding='utf-8') as file:
    for i, line in enumerate(file):
        if i < 5000:  # 仅读取前5000行
            title = line.strip().lower()
            titles.append(title)
        else:
            break  # 当读取到5000行时，退出循环

In [15]:
titles

['unbiased intestinal single cell transcriptomics reveals previously uncharacterized enteric nervous system populations in larval zebrafish',
 '',
 'effect of ulv malathion on automotive paint finishes',
 '',
 'a new criteria for determining the best decomposition level and filter for waveletbased datadriven forecasting frameworks validating using three case studies on the camels dataset',
 '',
 'regulation of ntype calcium channels by nociceptin receptors and its possible role in neurological disorders',
 '',
 'association of serum lipids and abnormal lipid score with cancer risk a populationbased prospective study',
 '',
 'exploring the singlecell rnaseq analysis landscape with the scrnatools database',
 '',
 'griseolic acid an inhibitor of cyclic adenosine 35monophosphate phosphodiesterase i taxonomy isolation and characterization',
 '',
 'logic programming and logarithmic space',
 '',
 'the right formula',
 '',
 'pseudo transient continuation and time marching methods for mongeampe

In [16]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(titles)
total_words = len(tokenizer.word_index) + 1

input_sequences = []
for line in titles:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

# 填充序列以保持统一长度
max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))


# 分割为特征和标签
predictors, label = input_sequences[:,:-1], input_sequences[:,-1]
label = to_categorical(label, num_classes=total_words)

In [21]:
# 定义模型
def create_model(total_words, max_sequence_len):
    model = Sequential()
    model.add(Embedding(total_words, 100, input_length=max_sequence_len-1))
    model.add(LSTM(150, return_sequences=True))
    model.add(LSTM(100))
    model.add(Dense(total_words, activation='softmax'))
    # 编译模型
    model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=0.01), metrics=['accuracy'])
    return model


# 创建模型
model = create_model(total_words, max_sequence_len)
model.summary()

# 设置checkpoint
checkpoint_path = 'model_checkpoint.h5'
checkpoint = ModelCheckpoint(checkpoint_path, monitor='loss', verbose=1, save_best_only=True, mode='min')

# 训练模型
model.fit(predictors, label, epochs=100, verbose=1, callbacks=[checkpoint])

# 保存tokenizer
dump(tokenizer, open('tokenizer.pkl', 'wb'))

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 45, 100)           900800    
                                                                 
 lstm_10 (LSTM)              (None, 45, 150)           150600    
                                                                 
 lstm_11 (LSTM)              (None, 100)               100400    
                                                                 
 dense_5 (Dense)             (None, 9008)              909808    
                                                                 
Total params: 2,061,608
Trainable params: 2,061,608
Non-trainable params: 0
_________________________________________________________________
Epoch 1/100
Epoch 1: loss improved from inf to 8.92454, saving model to model_checkpoint.h5
Epoch 2/100
Epoch 2: loss improved from 8.92454 to 7.74757, saving model to model_ch

Epoch 28/100
Epoch 28: loss improved from 1.58796 to 1.52191, saving model to model_checkpoint.h5
Epoch 29/100
Epoch 29: loss improved from 1.52191 to 1.46992, saving model to model_checkpoint.h5
Epoch 30/100
Epoch 30: loss improved from 1.46992 to 1.45258, saving model to model_checkpoint.h5
Epoch 31/100
Epoch 31: loss improved from 1.45258 to 1.41180, saving model to model_checkpoint.h5
Epoch 32/100
Epoch 32: loss improved from 1.41180 to 1.33930, saving model to model_checkpoint.h5
Epoch 33/100
Epoch 33: loss improved from 1.33930 to 1.28797, saving model to model_checkpoint.h5
Epoch 34/100
Epoch 34: loss improved from 1.28797 to 1.26665, saving model to model_checkpoint.h5
Epoch 35/100
Epoch 35: loss improved from 1.26665 to 1.25315, saving model to model_checkpoint.h5
Epoch 36/100
Epoch 36: loss improved from 1.25315 to 1.23982, saving model to model_checkpoint.h5
Epoch 37/100
Epoch 37: loss improved from 1.23982 to 1.22298, saving model to model_checkpoint.h5
Epoch 38/100
Epoch 3

Epoch 59/100
Epoch 59: loss did not improve from 0.94610
Epoch 60/100
Epoch 60: loss did not improve from 0.94610
Epoch 61/100
Epoch 61: loss did not improve from 0.94610
Epoch 62/100
Epoch 62: loss did not improve from 0.94610
Epoch 63/100
Epoch 63: loss did not improve from 0.94610
Epoch 64/100
Epoch 64: loss did not improve from 0.94610
Epoch 65/100
Epoch 65: loss did not improve from 0.94610
Epoch 66/100
Epoch 66: loss did not improve from 0.94610
Epoch 67/100
Epoch 67: loss did not improve from 0.94610
Epoch 68/100
Epoch 68: loss did not improve from 0.94610
Epoch 69/100
Epoch 69: loss did not improve from 0.94610
Epoch 70/100
Epoch 70: loss did not improve from 0.94610
Epoch 71/100
Epoch 71: loss did not improve from 0.94610
Epoch 72/100
Epoch 72: loss did not improve from 0.94610
Epoch 73/100
Epoch 73: loss did not improve from 0.94610
Epoch 74/100
Epoch 74: loss did not improve from 0.94610
Epoch 75/100
Epoch 75: loss did not improve from 0.94610
Epoch 76/100
Epoch 76: loss did

Epoch 94: loss did not improve from 0.94610
Epoch 95/100
Epoch 95: loss did not improve from 0.94610
Epoch 96/100
Epoch 96: loss did not improve from 0.94610
Epoch 97/100
Epoch 97: loss did not improve from 0.94610
Epoch 98/100
Epoch 98: loss did not improve from 0.94610
Epoch 99/100
Epoch 99: loss did not improve from 0.94610
Epoch 100/100
Epoch 100: loss did not improve from 0.94610


In [10]:
# tokenizer = Tokenizer()
# tokenizer.fit_on_texts(titles)
# total_words = len(tokenizer.word_index) + 1

# input_sequences = []
# for line in titles:
#     token_list = tokenizer.texts_to_sequences([line])[0]
#     for i in range(1, len(token_list)):
#         n_gram_sequence = token_list[:i+1]
#         input_sequences.append(n_gram_sequence)

# # 填充序列以保持统一长度
# max_sequence_len = max([len(x) for x in input_sequences])
# input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

# # 分割为特征和标签
# predictors, label = input_sequences[:,:-1], input_sequences[:,-1]
# label = to_categorical(label, num_classes=total_words)

# # 定义模型
# def create_model(total_words, max_sequence_len):
#     model = Sequential()
#     model.add(Embedding(total_words, 100, input_length=max_sequence_len-1))
#     model.add(LSTM(150, return_sequences=True))
#     model.add(LSTM(100))
#     model.add(Dense(total_words, activation='softmax'))
#     # 编译模型
#     model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=0.01), metrics=['accuracy'])
#     return model

# # 创建模型
# model = create_model(total_words, max_sequence_len)
# model.summary()

# # 设置checkpoint
# checkpoint_path = 'model_checkpoint2.h5'
# checkpoint = ModelCheckpoint(checkpoint_path, monitor='loss', verbose=1, save_best_only=True, mode='min')

# # 训练模型
# model.fit(predictors, label, epochs=100, verbose=1, callbacks=[checkpoint])

# dump(tokenizer, open('tokenizer2.pkl', 'wb'))

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 45, 100)           900800    
                                                                 
 lstm_6 (LSTM)               (None, 45, 150)           150600    
                                                                 
 lstm_7 (LSTM)               (None, 100)               100400    
                                                                 
 dense_3 (Dense)             (None, 9008)              909808    
                                                                 
Total params: 2,061,608
Trainable params: 2,061,608
Non-trainable params: 0
_________________________________________________________________
Epoch 1/100


KeyboardInterrupt: 

In [22]:
# 测试模型：
# 加载模型
model = load_model('model_checkpoint.h5')

# 加载tokenizer
tokenizer = load(open('tokenizer.pkl', 'rb'))

In [19]:
def generate_text(seed_text, next_words, model, tokenizer, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predictions = model.predict(token_list, verbose=0)
        predicted = np.argmax(predictions, axis=-1)[0]
        
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " " + output_word
    return seed_text

In [20]:
seed_text = "Deep learning"
next_words = 10  # 生成标题的长度

generated_title = generate_text(seed_text, next_words, model, tokenizer, max_sequence_len)
print(generated_title)

Deep learning and tullyfisher he fingerlings surface affected by sperms school therapy


In [26]:
seed_text = "Deep learning"
next_words = 5  # 生成标题的长度

generated_title = generate_text(seed_text, next_words, model, tokenizer, max_sequence_len)
print(generated_title)

Deep learning or refreshment l koch 18881976
