In [1]:
import pandas as pd
import jieba
from collections import Counter

# Read the two spreadsheets: scholar-written abstracts and AI-generated abstracts.
scholar_data, ai_data = (
    pd.read_excel(workbook)
    for workbook in ('学者撰写的论文摘要.xlsx', 'AI生成的论文摘要.xlsx')
)

# Keep only the abstract-text column ('论文摘要') from each sheet.
scholar_abstracts, ai_abstracts = (
    frame['论文摘要'] for frame in (scholar_data, ai_data)
)

# Load the stopword list
def load_stopwords(filepath):
    """Read a newline-separated stopword file into a set.

    Args:
        filepath: Path to a UTF-8 text file holding one stopword per line.

    Returns:
        set[str]: The distinct non-empty lines of the file. An empty file
        yields an empty set.
    """
    # 'utf-8-sig' drops a leading byte-order mark if one is present; with plain
    # 'utf-8' the BOM stays attached to the first stopword, which then never
    # matches any token.
    with open(filepath, 'r', encoding='utf-8-sig') as file:
        lines = file.read().strip().split('\n')
    # Blank lines (and an empty file) would otherwise add '' to the set.
    return {line for line in lines if line}

# Module-level stopword set; jieba_tokenizer filters its tokens against it.
stopwords = load_stopwords('stopwords.txt')

# Tokenize and remove stopwords
def jieba_tokenizer(text):
    """Segment text with jieba and drop every token found in `stopwords`.

    Args:
        text: The string to segment. Any non-string value (for example the
            float NaN that pandas uses for an empty spreadsheet cell) is
            treated as empty text instead of being passed to jieba.

    Returns:
        str: The surviving tokens joined by single spaces; '' for
        non-string input.
    """
    # jieba.cut only accepts str/bytes; a NaN cell would raise mid-iteration
    # and abort the whole Series.apply call.
    if not isinstance(text, str):
        return ''
    return ' '.join(word for word in jieba.cut(text) if word not in stopwords)

# Replace each raw abstract with its space-joined, stopword-filtered tokens.
scholar_abstracts, ai_abstracts = (
    series.apply(jieba_tokenizer)
    for series in (scholar_abstracts, ai_abstracts)
)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\60938\AppData\Local\Temp\jieba.cache
Loading model cost 0.416 seconds.
Prefix dict has been built successfully.


In [2]:
import torch
import torch.nn as nn
from transformers import BertModel, BertTokenizer

class BEM(nn.Module):
    """Thin wrapper around a pretrained BertModel that returns per-token hidden states."""

    def __init__(self, pretrained_model_name='bert-base-uncased'):
        """Load the pretrained BERT weights named by `pretrained_model_name`."""
        super().__init__()
        self.bert = BertModel.from_pretrained(pretrained_model_name)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Run BERT on the given inputs and return its `last_hidden_state`."""
        bert_out = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        return bert_out.last_hidden_state

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
class RCNN(nn.Module):
    """Bidirectional LSTM followed by a 1-D convolution, pooling and a linear classifier.

    Args:
        embed_size: Feature size of each input time step.
        hidden_size: LSTM hidden size per direction; the convolution and the
            classifier operate on ``hidden_size * 2`` channels.
        num_layers: Number of stacked LSTM layers.
        num_classes: Size of the output logit vector.
        dropout: Dropout between stacked LSTM layers. Ignored when
            ``num_layers == 1``, because nn.LSTM has no inter-layer position
            to apply it to.
    """

    def __init__(self, embed_size, hidden_size, num_layers, num_classes, dropout=0.5):
        super(RCNN, self).__init__()
        # nn.LSTM applies dropout only between stacked layers and emits a
        # UserWarning when a non-zero value is passed for a single layer.
        lstm_dropout = dropout if num_layers > 1 else 0.0
        self.rnn = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True,
                           bidirectional=True, dropout=lstm_dropout)
        self.conv = nn.Conv1d(in_channels=hidden_size * 2, out_channels=hidden_size * 2,
                              kernel_size=3, padding=1)
        self.max_pool = nn.MaxPool1d(kernel_size=2)
        self.fc = nn.Linear(hidden_size * 2, num_classes)

    def forward(self, x):
        """Map embeddings to class logits.

        Args:
            x: Tensor of shape [batch_size, seq_length, embed_size].

        Returns:
            Tensor of shape [batch_size, num_classes].
        """
        h_rnn, _ = self.rnn(x)
        # h_rnn: [batch_size, seq_length, hidden_size*2] -> channels-first for Conv1d
        h_rnn = h_rnn.permute(0, 2, 1)
        # h_conv: [batch_size, hidden_size*2, seq_length] (padding=1 keeps the length)
        h_conv = torch.relu(self.conv(h_rnn))
        # The width-2 max pool needs at least two time steps; for a shorter
        # sequence it would compute an output length of 0 and raise, so the
        # pooling step is skipped there.
        if h_conv.size(2) >= self.max_pool.kernel_size:
            h_pool = self.max_pool(h_conv)
        else:
            h_pool = h_conv
        # h_pool: [batch_size, hidden_size*2, pooled_length] -> time-major for averaging
        h_pool = h_pool.permute(0, 2, 1)
        # Average over the remaining time steps: [batch_size, hidden_size*2]
        h_pool = torch.mean(h_pool, dim=1)
        # output: [batch_size, num_classes]
        output = self.fc(h_pool)
        return output

# Hyperparameters for a stand-alone smoke test of the RCNN head.
embed_size, hidden_size, num_layers, num_classes = 768, 256, 2, 10
rcnn_model = RCNN(
    embed_size=embed_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    num_classes=num_classes,
)

# Random stand-in for the embeddings: [batch_size, seq_length, embed_size].
batch_size, seq_length = 2, 20
dummy_embeddings = torch.randn((batch_size, seq_length, embed_size))

output = rcnn_model(dummy_embeddings)

In [5]:
class BEM_RCNN(nn.Module):
    """BERT encoder (BEM) feeding an RCNN classification head.

    Args:
        pretrained_model_name: Name or path passed through to BEM to load BERT.
        hidden_size: LSTM hidden size per direction inside the RCNN head.
        num_layers: Number of stacked LSTM layers in the RCNN head.
        num_classes: Number of output logits.
        dropout: Dropout value forwarded to the RCNN head.
    """

    def __init__(self, pretrained_model_name='bert-base-uncased', hidden_size=256, num_layers=2, num_classes=10, dropout=0.5):
        super(BEM_RCNN, self).__init__()
        self.bem = BEM(pretrained_model_name)
        # Size the RCNN input from the loaded encoder instead of assuming 768,
        # so checkpoints with a different hidden size do not hit a shape
        # mismatch in the LSTM.
        embed_size = self.bem.bert.config.hidden_size
        self.rcnn = RCNN(embed_size=embed_size, hidden_size=hidden_size, num_layers=num_layers, num_classes=num_classes, dropout=dropout)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Encode the token inputs with BERT, then classify the hidden states.

        Returns:
            The RCNN output for the batch (one row of `num_classes` logits per example).
        """
        embeddings = self.bem(input_ids, attention_mask, token_type_ids)
        output = self.rcnn(embeddings)
        return output

# Build the combined model with its default hyperparameters.
bwem_rcnn_model = BEM_RCNN()

# Dummy batch of token inputs. BERT's token-type embedding lookup needs
# integer indices, so token_type_ids must be long rather than torch.zeros'
# default float32. The attention mask is created as long as well, matching
# what a tokenizer emits.
input_ids = torch.randint(0, 1000, (batch_size, seq_length))
attention_mask = torch.ones(batch_size, seq_length, dtype=torch.long)
token_type_ids = torch.zeros(batch_size, seq_length, dtype=torch.long)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [None]:
import torch.optim as optim

# Training configuration.
num_epochs = 10
learning_rate = 1e-4

model = BEM_RCNN()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# NOTE(review): every epoch trains on the same dummy batch (input_ids /
# attention_mask / token_type_ids), not on the tokenized abstracts.
# The targets never change between epochs, so build them once; there is one
# label per row of the two-example batch.
labels = torch.tensor([0, 1])

for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()

    outputs = model(input_ids, attention_mask, token_type_ids)
    loss = criterion(outputs, labels)

    loss.backward()
    optimizer.step()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

# Persist only the learned parameters (state_dict), not the module object.
torch.save(model.state_dict(), 'bem_rcnn_model.pth')