In [1]:
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
 
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap
 
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
 
%matplotlib inline
%config InlineBackend.figure_format='retina'
 
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
 
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
 
rcParams['figure.figsize'] = 12, 8
 
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train_data = "C:/Users/86134/Desktop/毕业设计/data/ASAP_ASPECT/ASAP_ASPECT/train.tsv"
df = pd.read_csv(train_data, header=0,sep='\t')

In [3]:
df.shape

(213371, 3)

In [4]:
df.head(10)

Unnamed: 0,content,AspectTerm,polarity
0,# 大众点评网霸王餐活动#，第一次去布凡面包店，先用百度地图搜一下，从院士路小路进去很好找...,Ambience#Space,1
1,# 大众点评网霸王餐活动#，第一次去布凡面包店，先用百度地图搜一下，从院士路小路进去很好找...,Food#Taste,1
2,# 大众点评网霸王餐活动#，第一次去布凡面包店，先用百度地图搜一下，从院士路小路进去很好找...,Price#Cost_effective,1
3,# 大众点评网霸王餐活动#，第一次去布凡面包店，先用百度地图搜一下，从院士路小路进去很好找...,Price#Discount,1
4,# 大众点评网霸王餐活动#，第一次去布凡面包店，先用百度地图搜一下，从院士路小路进去很好找...,Service#Hospitality,1
5,####和朋友约在皇庭广场见面，差不多每个星期都会过来这边闲逛，里面大多数餐厅已经吃过了，朋...,Ambience#Noise,-1
6,####和朋友约在皇庭广场见面，差不多每个星期都会过来这边闲逛，里面大多数餐厅已经吃过了，朋...,Ambience#Space,0
7,####和朋友约在皇庭广场见面，差不多每个星期都会过来这边闲逛，里面大多数餐厅已经吃过了，朋...,Food#Portion,0
8,####和朋友约在皇庭广场见面，差不多每个星期都会过来这边闲逛，里面大多数餐厅已经吃过了，朋...,Food#Taste,0
9,####和朋友约在皇庭广场见面，差不多每个星期都会过来这边闲逛，里面大多数餐厅已经吃过了，朋...,Price#Discount,0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 213371 entries, 0 to 213370
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   content     213371 non-null  object
 1   AspectTerm  213371 non-null  object
 2   polarity    213371 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 4.9+ MB


In [6]:
def to_sentiment(polarity):
    polarity = int(polarity)
    if polarity == -1:
        return 0
    elif polarity == 0:
        return 1
    elif polarity == 1:
        return 2

 
df['sentiment'] = df.polarity.apply(to_sentiment)
 
class_names = ['negative','neutral', 'positive']


In [7]:
PRE_TRAINED_MODEL_NAME= 'bert-base-chinese'

In [8]:
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [9]:
MAX_LEN = 512

In [10]:
class GPReviewDataset(Dataset):
 
    def __init__(self, reviews, review_aspects, targets, tokenizer, max_len):
        self.reviews = reviews
        self.review_aspects = review_aspects
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len
 
    def __len__(self):
        return len(self.reviews)
 
    def __getitem__(self, item):
        review = str(self.reviews[item])
        review_aspects = str(self.review_aspects[item])
        target = self.targets[item]
        mask_location = review + "[SEP]" + review_aspects
        #sequence = review + "[SEP]" +review_aspects + "[MASK]"
        sequence ="[MASK]"+review_aspects+"[SEP]"+ review ##把mask放到最前面
        
        encoding = self.tokenizer.encode_plus(
            sequence,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        id_list = len(sequence)
 
        return {
            'review_text':sequence,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.int64),
            'id_list':id_list
        }

In [11]:
df_train, df_test = train_test_split(
    df,
    test_size=0.1,
    random_state=RANDOM_SEED
)
df_val, df_test = train_test_split(
    df_test,
    test_size=0.5,
    random_state=RANDOM_SEED
)
 
df_train.shape, df_val.shape, df_test.shape

((192033, 4), (10669, 4), (10669, 4))

In [12]:
def create_data_loader(df, tokenizer, max_len, batch_size):
    ds = GPReviewDataset(
        reviews=df.content.to_numpy(),
        review_aspects = df.AspectTerm.to_numpy(),
        targets=df.sentiment.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
    )
 
    return DataLoader(
        ds,
        batch_size=batch_size,
        num_workers=0
    )
 
BATCH_SIZE = 2
 
train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)

In [13]:
data = next(iter(train_data_loader))
data.keys()

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


dict_keys(['review_text', 'input_ids', 'attention_mask', 'targets', 'id_list'])

In [14]:
print(data['input_ids'].shape)
print(data['attention_mask'].shape)
print(data['targets'].shape)
print(data['id_list'].shape)

torch.Size([2, 512])
torch.Size([2, 512])
torch.Size([2])
torch.Size([2])


In [15]:
data['id_list']

tensor([287, 299])

In [16]:
bert_model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [17]:
class SentimentClassifier(nn.Module):
 
    def __init__(self, n_classes):
        super(SentimentClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME, output_hidden_states=True, output_attentions=True, return_dict=False)
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
        
 
    def forward(self, input_ids, attention_mask,id_list):
        last_hidden_state,_,_,_ = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        output = self.drop(last_hidden_state)
        #data['id_list']
        real_output = output[:,1, :]
        #output = output[id_list,:]
        #output = self.classifier(output)
        #return output
        return self.out(real_output)

In [18]:
model = SentimentClassifier(len(class_names))
model = model.to(device)

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [19]:
#input_ids = data['input_ids'].to(device)
#attention_mask = data['attention_mask'].to(device)
 
#print(input_ids.shape) # batch size x seq length
#print(attention_mask.shape) # batch size x seq length

In [20]:
#torch.nn.functional.softmax(model(input_ids, attention_mask), dim=1)

In [21]:
EPOCHS = 40
 
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
total_steps = len(train_data_loader) * EPOCHS
 
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)
 
loss_fn = nn.CrossEntropyLoss().to(device)



In [22]:
def train_epoch(
    model,
    data_loader,
    loss_fn,
    optimizer,
    device,
    scheduler,
    n_examples
  ):
    model = model.train()
 
    losses = []
    correct_predictions = 0
 
    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        #input_ids = input_ids.squeeze(0)
        attention_mask = d["attention_mask"].to(device)
        #attention_mask = attention_mask.squeeze(0)
        #print(input_ids.shape) # batch size x seq length
        #print(attention_mask.shape) # batch size x seq length
        targets = d["targets"].to(device)
        id_list = d["id_list"].to(device)
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            id_list=id_list
        )
 
        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, targets)
 
        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())
 
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
 
    return correct_predictions.double() / n_examples, np.mean(losses)

In [23]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
    model = model.eval()
 
    losses = []
    correct_predictions = 0
 
    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)
            id_list = d["id_list"].to(device)
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                id_list=id_list
            )
            _, preds = torch.max(outputs, dim=1)
 
            loss = loss_fn(outputs, targets)
 
            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())
 
    return correct_predictions.double() / n_examples, np.mean(losses)

In [None]:
%%time
 
history = defaultdict(list)
best_accuracy = 0
 
for epoch in range(EPOCHS):
 
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)
 
    train_acc, train_loss = train_epoch(
        model,
        train_data_loader,
        loss_fn,
        optimizer,
        device,
        scheduler,
        len(df_train)
    )
 
    print(f'Train loss {train_loss} accuracy {train_acc}')
 
    val_acc, val_loss = eval_model(
        model,
        val_data_loader,
        loss_fn,
        device,
        len(df_val)
    )
 
    print(f'Val   loss {val_loss} accuracy {val_acc}')
    print()
 
    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)
 
    if val_acc > best_accuracy:
        torch.save(model.state_dict(), 'best_model_state.bin')
        best_accuracy = val_acc

Epoch 1/40
----------


In [23]:
##没有什么用的分割线
input_ids = data['input_ids'].to(device)
attention_mask = data['attention_mask'].to(device)
 
print(input_ids.shape) # batch size x seq length
print(attention_mask.shape) # batch size x seq length

torch.Size([4, 160])
torch.Size([4, 160])


In [24]:
history['train_acc']

[tensor(0.7008, device='cuda:0', dtype=torch.float64),
 tensor(0.8497, device='cuda:0', dtype=torch.float64),
 tensor(0.9025, device='cuda:0', dtype=torch.float64),
 tensor(0.9326, device='cuda:0', dtype=torch.float64),
 tensor(0.9566, device='cuda:0', dtype=torch.float64),
 tensor(0.9717, device='cuda:0', dtype=torch.float64),
 tensor(0.9811, device='cuda:0', dtype=torch.float64),
 tensor(0.9840, device='cuda:0', dtype=torch.float64),
 tensor(0.9868, device='cuda:0', dtype=torch.float64),
 tensor(0.9887, device='cuda:0', dtype=torch.float64)]

In [25]:
history['val_acc']

[tensor(0.7119, device='cuda:0', dtype=torch.float64),
 tensor(0.7458, device='cuda:0', dtype=torch.float64),
 tensor(0.7712, device='cuda:0', dtype=torch.float64),
 tensor(0.7712, device='cuda:0', dtype=torch.float64),
 tensor(0.7712, device='cuda:0', dtype=torch.float64),
 tensor(0.7881, device='cuda:0', dtype=torch.float64),
 tensor(0.7797, device='cuda:0', dtype=torch.float64),
 tensor(0.7797, device='cuda:0', dtype=torch.float64),
 tensor(0.7712, device='cuda:0', dtype=torch.float64),
 tensor(0.7712, device='cuda:0', dtype=torch.float64)]

In [26]:
#调用训练好的模型
model = SentimentClassifier(len(class_names))
model.load_state_dict(torch.load('best_model_state.bin'))
model = model.to(device)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [27]:
test_acc, _ = eval_model(
    model,
    test_data_loader,
    loss_fn,
    device,
    len(df_test)
)
test_acc.item()

0.8220338983050848

In [41]:
def get_predictions(model, data_loader):
    model = model.eval()
 
    review_texts = []
    predictions = []
    prediction_probs = []
    real_values = []
 
    with torch.no_grad():
        for d in data_loader:
 
            texts = d["review_text"]
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)
            id_list = d["id_list"].to(device)
 
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                id_list=id_list
            )
            _, preds = torch.max(outputs, dim=1)
 
            review_texts.extend(texts)
            predictions.extend(preds)
            prediction_probs.extend(outputs)
            real_values.extend(targets)
 
    predictions = torch.stack(predictions).cpu()
    prediction_probs = torch.stack(prediction_probs).cpu()
    real_values = torch.stack(real_values).cpu()
    return review_texts, predictions, prediction_probs, real_values

In [42]:
y_review_texts, y_pred, y_pred_probs, y_test = get_predictions(
    model,
    test_data_loader
)

In [44]:
idx = 1
 
review_text = y_review_texts[idx]
true_sentiment = y_test[idx]
pred_df = pd.DataFrame({
    'class_names': class_names,
    'values': y_pred_probs[idx]
})
 
print("\n".join(wrap(review_text)))
print()
print(f'True sentiment: {class_names[true_sentiment]}')

[MASK]user interface[SEP]The minute you fire it up it's all good, very
easy user interface.

True sentiment: positive


In [78]:
review_text = "Hate you!!!"

In [79]:
encoded_review = tokenizer.encode_plus(
    review_text,
    max_length=MAX_LEN,
    add_special_tokens=True,
    return_token_type_ids=False,
    pad_to_max_length=True,
    return_attention_mask=True,
    return_tensors='pt',
)



In [80]:
input_ids = encoded_review['input_ids'].to(device)
attention_mask = encoded_review['attention_mask'].to(device)
 
output = model(input_ids, attention_mask)
_, prediction = torch.max(output, dim=1)
 
print(f'Review text: {review_text}')
print(f'Sentiment  : {class_names[prediction]}')

Review text: Hate you!!!
Sentiment  : negative
