# Predict the sentiment of all sentences by BERT

## Read data (sentences of the full text of the FYP-Ts of 23 cities)

In [15]:
import pandas as pd

all_data = pd.read_csv('all_23_cities.csv')
print(all_data.shape)
all_data.head(1)

(19676, 29)


Unnamed: 0,Tier,Region,City,ID,Year,Token,Car,Shared mobility,Bike,Walking,...,Regional development balance,Alternative fuel vehicles,Electric vehicles,Autonomous vehicles,Parcel delivery,sum,Notes,SA_Li,SA_Guo,Gap
0,1,East,Beijing,1,2010,“十一五”期间是北京市机动车增长最快、交通投入最大、交通结构改善最明显、交通管理最有效、市民...,,,,,...,,,,,,0,机动车,,,


## Data preprocessing (filter sentences by tokens)

In [16]:
all_data_token = all_data[all_data['sum'] > 0]
all_data_same_senti_label = all_data[all_data['Gap'] == 0]
all_data_token_same_senti_label = all_data_token[all_data_token['Gap'] == 0]

In [17]:
print(all_data_token.shape)
print(all_data_same_senti_label.shape)
print(all_data_token_same_senti_label.shape)

(6557, 29)
(5699, 29)
(5394, 29)


In [18]:
all_data_token_same_senti_label = all_data_token_same_senti_label.astype({"SA_Li": 'int64', "SA_Guo": 'int64', 'Gap': 'int64'})
# all_data_token_same_senti_label.dtypes

## Train test data split

In [19]:
train = all_data_token_same_senti_label.sample(n=600, random_state=33)
# exclude the training set
test = pd.concat([all_data_token, train, train]).drop_duplicates(keep=False)  # sentence level
# test = pd.concat([all_data, train, train]).drop_duplicates(keep=False).sample(n=1000, random_state=33)  # article level
print(train.shape)
print(test.shape)
print(all_data_token.shape)

(600, 29)
(5957, 29)
(6557, 29)


In [20]:
print(train['Token'].values[0])
print('- - - - - - - - - - - - -')
print(test['Token'].values[0])
train_x = train['Token'].tolist()
train_y = list(map(lambda x:x+1, train['SA_Li'].tolist()))
test_x = test['Token'].tolist()
test_y = test['SA_Li'].tolist()

结合贵阳市环城快铁、轨道交通和快速公交等交通系统的建设，促进不同交通系统专网融合以及交通专网与“天网”“公网”深度融合。
- - - - - - - - - - - - -
在市委、市政府的坚强领导下，交通系统深入学习实践科学发展观，振奋精神、开拓创新，克服重重困难，加快构建以“人文交通、科技交通、绿色交通”为特征的新北京交通体系，应对了5年净增200多万辆机动车的挑战，基本满足了市民不断增长和变化的交通需求，适应了首都经济社会发展的需要。


## Use transformer to define the fine-tune model based on BERT

In [21]:
import torch
from transformers import BertModel, BertTokenizer
import torch.nn as nn

model_name = 'hfl/chinese-bert-wwm'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# A bert fine-tuning strategy is used to adjust the parameters of the BERT and the linear layer together during back propagation to make BERT more suitable for the classification task.
class BertClassfication(nn.Module):
    def __init__(self):
        super(BertClassfication,self).__init__()
        self.model_name = 'hfl/chinese-bert-wwm'
        self.model = BertModel.from_pretrained(self.model_name)
        self.tokenizer = BertTokenizer.from_pretrained(self.model_name)
        self.fc = nn.Linear(768,3)     # depend on the structure of BERT, 2-layer, 768-hidden, 12-heads, 110M parameters
        # nn.Linear(in_features, out_features)

    def forward(self,x):               # The input is a list here.
        batch_tokenized = self.tokenizer.batch_encode_plus(x, add_special_tokens=True,
                                max_length=148, pad_to_max_length=True)     
        input_ids = torch.tensor(batch_tokenized['input_ids'])
        attention_mask = torch.tensor(batch_tokenized['attention_mask'])
        hiden_outputs = self.model(input_ids,attention_mask=attention_mask)
        outputs = hiden_outputs[0][:,0,:]     
        output = self.fc(outputs)
        return output
model = BertClassfication()

## Start training with batches

In [22]:
batch_size = 64
batch_count = int(len(train) / batch_size)
batch_train_inputs, batch_train_targets = [], []

for i in range(batch_count):
    batch_train_inputs.append(train_x[i*batch_size : (i+1)*batch_size])
    batch_train_targets.append(train_y[i*batch_size : (i+1)*batch_size])

# 初始化训练参数
bertclassfication = BertClassfication()
lossfuction = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(bertclassfication.parameters(),lr=2e-5)
epoch = 5
batch_count = batch_count
print_every_batch = 5

In [23]:
for _ in range(epoch):
    print_avg_loss = 0
    for i in range(batch_count):
        inputs = batch_train_inputs[i]
        targets = torch.tensor(batch_train_targets[i])
        optimizer.zero_grad()
        outputs = bertclassfication(inputs)
        loss = lossfuction(outputs, targets)
        loss.backward()
        optimizer.step()

        print_avg_loss += loss.item()
        if i % print_every_batch == (print_every_batch-1):
            print("Batch: %d, Loss: %.4f" % ((i+1), print_avg_loss/print_every_batch))
            print_avg_loss = 0

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.




Batch: 5, Loss: 0.8246
Batch: 5, Loss: 0.3328
Batch: 5, Loss: 0.2178
Batch: 5, Loss: 0.0980
Batch: 5, Loss: 0.0432


## Predict with the fine-tune model

### Try a demo

In [24]:
# sentiment_dict = {0:'Negative', 1:'Neutral', 2:'Positive'}

result = bertclassfication([train_x[0]])
_, predict = torch.max(result,1)
print(train_y[0])

2


### The results of the training set by BERT

In [25]:
output_train = []

for i in train_x:
    result = bertclassfication([i])
    _, predict = torch.max(result,1)
    output_train.append(int(predict))

In [26]:
train['SA_BERT'] = output_train
train['SA_BERT'] -= 1
# train.to_csv('training_set.csv', encoding='utf_8_sig', index=None)

### The results of the test set by BERT

In [27]:
output_test = []

for i in test_x:
    result = bertclassfication([i])
    _, predict = torch.max(result,1)
    output_test.append(int(predict))

In [None]:
test['SA_BERT'] = output_test
test['SA_BERT'] -= 1
# test.to_csv('sentence_level_test.csv', encoding='utf_8_sig', index=None)
# test.to_csv('article_level_test.csv', encoding='utf_8_sig', index=None)