In [2]:
from transformers import AutoTokenizer,AutoModel,RobertaTokenizer, RobertaModel
from datasets import Dataset, DatasetDict,load_dataset, load_metric, load_from_disk
from torch.utils.data import TensorDataset,DataLoader
import torch
from torch import nn
from torch import optim
import torch.nn.functional as F
import time
from sklearn.metrics import f1_score,roc_auc_score
from sklearn.model_selection import train_test_split
from torch.cuda.amp import autocast
import pandas as pd
import numpy as np
import os
# 设置随机种子
seed = 42
torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True  # 如果使用GPU，确保其可重复性
np.random.seed(seed)  # 也设置 NumPy 随机种子，以确保数据加载等操作的随机性一致

In [3]:
df1 = pd.read_csv('Amzon/1429_1.csv')
df2 = pd.read_csv('Amzon/Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products_May19.csv')
df3 = pd.read_csv('Amzon/Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products.csv')

  df1 = pd.read_csv('Amzon/1429_1.csv')


In [4]:
df = pd.concat([df1, df2, df3])
df.shape

(67992, 27)

In [5]:
df = df[df['reviews.rating'].notnull() & df['reviews.text'].notnull()]

In [6]:
df = df[['reviews.text', 'reviews.rating']]
df.columns = ['text', 'label']
df['label'] = df['label'].astype(int)
df['label'] = df['label'] - 1
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 67958 entries, 0 to 4999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    67958 non-null  object
 1   label   67958 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 1.6+ MB


In [7]:
df_train, df_test = train_test_split(df, test_size=0.1, random_state=seed)
df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=seed)
print(df_train.shape, df_val.shape, df_test.shape)

(61162, 2) (3398, 2) (3398, 2)


In [8]:
dataset_dict = {
    "train": Dataset.from_pandas(df_train).remove_columns('__index_level_0__'),
    "val": Dataset.from_pandas(df_val).remove_columns('__index_level_0__'),
    "test": Dataset.from_pandas(df_test).remove_columns('__index_level_0__')
}
dataset = DatasetDict(dataset_dict)
dataset

del df, df_train, df_val, df_test

In [9]:
# 导入模型和分词器
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english') 
bertmodel = AutoModel.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english')
bertmodel = bertmodel.to('cuda')

Some weights of the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english were not used when initializing DistilBertModel: ['classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'pre_classifier.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
def tokenize_function(example):
   return tokenizer(example["text"],truncation=True,padding="max_length")

In [11]:
lr = 0.001
num_epoches = 30
input_size = 768
output_size = 5

In [12]:
class Net(nn.Module):
    def __init__(self,input_size, hidden_size, num_layers):
        super().__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers=num_layers,batch_first = True)
        self.linear = nn.Linear(hidden_size,output_size)
    def forward(self,x):
        batch_size = x.shape[0]
        h_0 = x.data.new(self.num_layers, batch_size, self.hidden_size).fill_(0).float()
        c_0 = x.data.new(self.num_layers, batch_size, self.hidden_size).fill_(0).float()
        out, hidden = self.lstm(x, (h_0, c_0))
        out = out[:, -1, :]
        out = self.linear(out)
        return out

In [13]:
def train(input_size, hidden_size, num_layers,rnn_model,optimizer):
    total_loss = []
    best_val=0
    best_n=0
    rnn_model.train()
    for epoch in range(num_epoches):
        total_correct=0
        for ite,(X_raw,y) in enumerate(data_loader):
            X_raw=X_raw.to('cuda')
            x = bertmodel(X_raw)[0].to('cuda')
            output = rnn_model(x)
            loss = F.cross_entropy(output.float(),y.to('cuda'))
            total_correct += int((output.cpu().argmax(dim=-1) == y.cpu()).sum())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss.append(loss.item())
            if ite%500==0:
                print("Epoch{} iteration {} loss {}".format(epoch,ite,loss.item()))
        print(f"____total_correct:{total_correct/61162:.4f}______")
        if best_val<(total_correct/61162):
            best_val=(total_correct/61162)
            best_n=0
        else:
            best_n+=1
        if best_n>2:
            break
    print(f"____best_val:{best_val:.4f}______")

In [14]:
def top2_accuracy(output, target):
    # 获取前两个最高分数的索引
    _, top2_indices = torch.topk(output, k=2, dim=-1)
    # 检查目标是否包含在前两个最高分数的索引中
    correct = ((top2_indices == target.view(-1, 1)) | (top2_indices == target.view(-1, 1).repeat(1, 2))).any(dim=1)
    # 计算top-2准确率
    top2_acc = correct.float().sum()
    return top2_acc

In [15]:
def test(rnn_model,optimizer):
    rnn_model.eval()
    y_pred=[]
    total_correct=0
    top2_correct=0
    t=0
    for ite,(X_raw,y) in enumerate(data_loader):
        t0=time.time()
        X_raw=X_raw.to('cuda')
        t1=time.time()
        x = bertmodel(X_raw)[0].to('cuda')
        t2=time.time()
        output = rnn_model(x)
        t3=time.time()
        y_pred.append(output.argmax(dim=-1))
        total_correct += int((output.cpu().argmax(dim=-1) == y.cpu()).sum())
        top2_correct  += top2_accuracy(output, y.to('cuda'))
        t+=t3-t0-(t2-t1)
    print(f"____test_total_correct:{total_correct/3398:.4f}______")
    print(f"____top2_correct:{top2_correct/3398:.4f}______")
    print(f"____f1_score:{f1_score(dataset['test']['label'], torch.hstack(y_pred).cpu().numpy(),average='macro'):.4f}______")
    print(f"____time:{t}______")

In [16]:
batch_size = 16
for num_layers in [1,2,3,4]:
    for hidden_size in [4,8,16,32,64,128,256]:
        rnn_model = Net(input_size, hidden_size, num_layers).to('cuda')
        optimizer = optim.Adam(rnn_model.parameters(),lr,weight_decay=5e-4)
        total_params = sum(p.numel() for p in rnn_model.parameters() if p.requires_grad)
        print(f"Total trainable parameters: {total_params}")

        tokenized_datasets = dataset['train'].map(tokenize_function, batched=True)
        data = TensorDataset(torch.tensor(tokenized_datasets['input_ids']),torch.tensor(dataset['train']['label']))
        data_loader = DataLoader(data, batch_size=batch_size, shuffle=False,pin_memory=False)
        train(input_size, hidden_size, num_layers,rnn_model,optimizer)
        
        del tokenized_datasets, data, data_loader
        
        tokenized_datasets = dataset['test'].map(tokenize_function, batched=True)
        data = TensorDataset(torch.tensor(tokenized_datasets['input_ids']),torch.tensor(dataset['test']['label']))
        data_loader = DataLoader(data, batch_size=batch_size, shuffle=False,pin_memory=False)
        test(rnn_model,optimizer)

Total trainable parameters: 12889


  0%|          | 0/62 [00:00<?, ?ba/s]

Epoch0 iteration 0 loss 1.531180739402771
Epoch0 iteration 500 loss 1.3334306478500366
Epoch0 iteration 1000 loss 0.5830434560775757
Epoch0 iteration 1500 loss 1.0218111276626587
Epoch0 iteration 2000 loss 1.0663148164749146
Epoch0 iteration 2500 loss 0.777960479259491
Epoch0 iteration 3000 loss 0.6810263991355896
Epoch0 iteration 3500 loss 0.9873929619789124
____total_correct:0.6832______
Epoch1 iteration 0 loss 0.5763147473335266
Epoch1 iteration 500 loss 1.3536567687988281
Epoch1 iteration 1000 loss 0.5823315382003784
Epoch1 iteration 1500 loss 1.0223135948181152
Epoch1 iteration 2000 loss 1.0668894052505493
Epoch1 iteration 2500 loss 0.7779914140701294
Epoch1 iteration 3000 loss 0.6808350086212158
Epoch1 iteration 3500 loss 0.9873849153518677
____total_correct:0.6933______
Epoch2 iteration 0 loss 0.5761855840682983
Epoch2 iteration 500 loss 1.3543697595596313
Epoch2 iteration 1000 loss 0.5822912454605103
Epoch2 iteration 1500 loss 1.0223913192749023
Epoch2 iteration 2000 loss 1.066

  0%|          | 0/4 [00:00<?, ?ba/s]

____test_total_correct:0.7004______
____top2_correct:0.9179______
____f1_score:0.1648______
____time:5.1954731941223145______
Total trainable parameters: 26669


  0%|          | 0/62 [00:00<?, ?ba/s]

Epoch0 iteration 0 loss 1.6706026792526245
Epoch0 iteration 500 loss 1.345374345779419
Epoch0 iteration 1000 loss 0.5837663412094116
Epoch0 iteration 1500 loss 1.0117253065109253
Epoch0 iteration 2000 loss 1.0692806243896484
Epoch0 iteration 2500 loss 0.7759770154953003
Epoch0 iteration 3000 loss 0.6824392080307007
Epoch0 iteration 3500 loss 0.9855846762657166
____total_correct:0.6892______
Epoch1 iteration 0 loss 0.5813770294189453
Epoch1 iteration 500 loss 1.3487082719802856
Epoch1 iteration 1000 loss 0.5831009745597839
Epoch1 iteration 1500 loss 1.0195457935333252
Epoch1 iteration 2000 loss 1.0682543516159058
Epoch1 iteration 2500 loss 0.7762359380722046
Epoch1 iteration 3000 loss 0.6823593378067017
Epoch1 iteration 3500 loss 0.9849435091018677
____total_correct:0.6933______
Epoch2 iteration 0 loss 0.5793467164039612
Epoch2 iteration 500 loss 1.3500044345855713
Epoch2 iteration 1000 loss 0.5828560590744019
Epoch2 iteration 1500 loss 1.0202122926712036
Epoch2 iteration 2000 loss 1.06

  0%|          | 0/4 [00:00<?, ?ba/s]

____test_total_correct:0.7004______
____top2_correct:0.9179______
____f1_score:0.1648______
____time:5.202576637268066______
Total trainable parameters: 56917


  0%|          | 0/62 [00:00<?, ?ba/s]

Epoch0 iteration 0 loss 1.5148251056671143
Epoch0 iteration 500 loss 1.3513903617858887
Epoch0 iteration 1000 loss 0.5850570201873779
Epoch0 iteration 1500 loss 1.0109034776687622
Epoch0 iteration 2000 loss 1.0718086957931519
Epoch0 iteration 2500 loss 0.7737839221954346
Epoch0 iteration 3000 loss 0.6819460988044739
Epoch0 iteration 3500 loss 0.9849610924720764
____total_correct:0.6933______
Epoch1 iteration 0 loss 0.5900102853775024
Epoch1 iteration 500 loss 1.3482285737991333
Epoch1 iteration 1000 loss 0.5845285654067993
Epoch1 iteration 1500 loss 1.0148593187332153
Epoch1 iteration 2000 loss 1.0720133781433105
Epoch1 iteration 2500 loss 0.7745486497879028
Epoch1 iteration 3000 loss 0.6821832656860352
Epoch1 iteration 3500 loss 0.9847955703735352
____total_correct:0.6933______
Epoch2 iteration 0 loss 0.5883729457855225
Epoch2 iteration 500 loss 1.347787857055664
Epoch2 iteration 1000 loss 0.5842571258544922
Epoch2 iteration 1500 loss 1.0160644054412842
Epoch2 iteration 2000 loss 1.07

  0%|          | 0/4 [00:00<?, ?ba/s]

____test_total_correct:0.7004______
____top2_correct:0.9179______
____f1_score:0.1648______
____time:5.205257415771484______
