# watermark

An IPython magic extension for printing date and time stamps, version numbers, and hardware information.

In [None]:
# Load (or reload) the watermark extension, then print the Python version (-v)
# and the installed versions of the listed packages (-p) so the environment
# used for this run is recorded next to the results.
%reload_ext watermark
%watermark -v -p numpy,pandas,torch,transformers

In [None]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import transformers
from transformers import BertModel, BertForSequenceClassification, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch

from datetime import datetime as dt
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
torch.cuda.is_available()  # Check whether a GPU can be used for this run

In [None]:
# Render matplotlib figures inline in the notebook, at retina resolution.
%matplotlib inline
%config InlineBackend.figure_format='retina'

# Global seaborn theme applied to every figure in the notebook.
sns.set(style='whitegrid', palette='muted', font_scale=1.2)

# Custom colour cycle; the set_palette call below installs it as the active palette.
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]

sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))

# Default figure size (width, height).
rcParams['figure.figsize'] = 12, 8

# Seed NumPy and PyTorch so that repeated runs draw the same random numbers.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Use the first visible GPU when CUDA is available, otherwise fall back to the CPU.
# The trailing bare expression displays the selected device as the cell output.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

In [None]:
# Checkpoint name used for both the tokenizer (below) and the BERT encoder.
# NOTE(review): the run tag below mentions "chinese" while this checkpoint is the
# English uncased model — confirm the checkpoint matches the language of the data.
PRE_TRAINED_MODEL_NAME = 'bert-base-uncased'
#PRE_TRAINED_MODEL_NAME = 'bert-base-multilingual-cased'

# Tag for this experiment: it prefixes the saved model file name and names the classes.
EMOTION = 'cped_abu_chinese_bert'
# Maximum sequence length handed to the tokenizer (sentences are truncated / padded to it).
MAX_LEN = 300
# Class names in label order: index 0 -> 'not_<tag>', index 1 -> '<tag>'.
CLASS_NAMES = [f'not_{EMOTION.lower()}', f'{EMOTION.lower()}']

# Tokenizer matching the checkpoint above.
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [None]:
def _load_labelled_sentences(csv_path, label):
    """Read a two-column CSV and return a frame with ``content`` and ``label``.

    The first CSV column is a throw-away column and is dropped; the second one
    holds the sentence text. Every row receives the same integer ``label``.
    """
    frame = pd.read_csv(csv_path, encoding="utf-8_sig")
    frame.columns = ['tmp', 'content']
    # drop() returns a new frame, so its result has to be kept; otherwise the
    # throw-away column silently stays in the data.
    return frame.drop(columns=['tmp']).assign(label=label)


# Label 1 = sentences carrying the target emotion, label 0 = neutral sentences.
class_df = _load_labelled_sentences("CEmo2000_clean.csv", label=1)
normal_df = _load_labelled_sentences("cped_neutral_4000.csv", label=0)

df = pd.concat([class_df, normal_df], ignore_index=True)

# Label 1 is the positive class and label 0 the negative class.
print("Number of positive samples:", int((df['label'] == 1).sum()))
print("Number of negative samples:", int((df['label'] == 0).sum()))

# Show a preview only instead of dumping the whole frame into the notebook.
df.head()

In [None]:
class CancerEmoDataset(Dataset):
    """Map-style dataset that tokenizes one sentence per item.

    Each item is a dict with the raw sentence, its token ids, the matching
    attention mask (both flattened to 1-D) and the label as a long tensor.
    """

    def __init__(self, sentences, targets, tokenizer, max_len):
        # Only references are stored; tokenization happens lazily per item.
        self.sentences, self.targets = sentences, targets
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Number of sentences in the dataset."""
        return len(self.sentences)

    def __getitem__(self, item):
        """Tokenize sentence ``item`` and bundle it with its label."""
        text = str(self.sentences[item])
        label = self.targets[item]

        # Truncate or pad every sentence to exactly ``max_len`` positions and
        # ask for PyTorch tensors plus an attention mask (no token type ids).
        tokenizer_options = dict(
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(text, **tokenizer_options)

        # Flatten to 1-D so the DataLoader can stack items into a batch.
        token_ids = torch.flatten(encoded['input_ids'])
        mask = torch.flatten(encoded['attention_mask'])

        return {
            'sentence': text,
            'input_ids': token_ids,
            'attention_mask': mask,
            'targets': torch.tensor(label, dtype=torch.long),
        }

In [None]:
from sklearn.utils import shuffle

# 80 % of the rows go to training; the remaining 20 % is halved into test and
# validation. random_state pins both splits to RANDOM_SEED even when this cell
# is re-run on its own, and stratify keeps the label ratio equal in every subset.
df_train, df_holdout = train_test_split(
    df,
    train_size=0.8,
    random_state=RANDOM_SEED,
    stratify=df['label'],
)
df_test, df_val = train_test_split(
    df_holdout,
    train_size=0.5,
    random_state=RANDOM_SEED,
    stratify=df_holdout['label'],
)

print(df_train.shape, df_val.shape, df_test.shape)

In [None]:
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False, num_workers=4):
    """Wrap a data frame in a ``CancerEmoDataset`` and return a ``DataLoader``.

    Args:
        df: frame with a ``content`` column (text) and a ``label`` column.
        tokenizer: tokenizer handed to the dataset.
        max_len: sequence length every sentence is truncated / padded to.
        batch_size: number of items per batch.
        shuffle: reshuffle the items every epoch. Defaults to ``False`` (the
            previous behaviour); pass ``True`` for a training loader.
        num_workers: number of loader worker processes. Defaults to 4 (the
            previous hard-coded value); use 0 where worker processes are
            not available.

    Returns:
        A ``DataLoader`` over the dataset.
    """
    ds = CancerEmoDataset(
        sentences=df['content'].to_numpy(),
        targets=df['label'].to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len,
    )

    return DataLoader(
        ds,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
    )

In [None]:
BATCH_SIZE = 8  # originally 12

# One loader per split, all with the same sequence length and batch size.
# NOTE(review): the training loader is created without shuffling, so every epoch
# iterates df_train in the same order — consider reshuffling the training data
# each epoch.
train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)

In [None]:
class SentimentClassifier(nn.Module):
    """BERT encoder followed by dropout and one linear classification layer."""

    def __init__(self, n_classes):
        super().__init__()
        # Pretrained encoder; its hidden size fixes the input width of the head.
        self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return the unnormalised class scores for a batch of token ids."""
        encoder_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        # Classify from the encoder's pooled output, with dropout applied first.
        regularised = self.drop(encoder_output.pooler_output)
        logits = self.out(regularised)
        return logits


In [None]:
# Build the classifier with one output per entry in CLASS_NAMES and move it to
# the selected device in a single step.
model = SentimentClassifier(len(CLASS_NAMES)).to(device)

In [None]:
# Number of full passes over the training data.
EPOCHS = 8
# NOTE(review): BERT fine-tuning commonly uses rates around 2e-5 to 5e-5 —
# confirm that 2e-6 is intentional.
LEARNING_RATE = 2e-6

# NOTE(review): AdamW here is the class imported from transformers; correct_bias
# is specific to that implementation, and newer transformers releases reportedly
# deprecate it in favour of torch.optim.AdamW — confirm against the installed version.
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, correct_bias=False)
# The scheduler is stepped once per batch, so the schedule spans
# (batches per epoch) x (number of epochs) steps.
total_steps = len(train_data_loader) * EPOCHS

# Linear learning-rate schedule over total_steps with no warm-up steps.
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)

# Cross-entropy over the raw class scores returned by the model.
loss_fn = nn.CrossEntropyLoss().to(device)

In [None]:
def train_epoch(
    model,
    data_loader,
    loss_fn,
    optimizer,
    device,
    scheduler,
    n_examples
):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)``
            and returning one score per class for every row.
        data_loader: iterable of batches; each batch is a dict with the
            tensors ``input_ids``, ``attention_mask`` and ``targets``.
        loss_fn: callable ``loss_fn(outputs, targets)`` returning a scalar loss.
        optimizer: optimizer stepped once per batch.
        device: device the batch tensors are moved to.
        scheduler: learning-rate scheduler stepped once per batch.
        n_examples: number of examples used as the accuracy denominator.

    Returns:
        ``(accuracy, mean_loss)`` where ``accuracy`` is a 0-dim float64 tensor
        and ``mean_loss`` is the mean of the per-batch losses (NaN when the
        loader yields no batch).
    """
    model = model.train()
    losses = []
    # Start from a 0-dim tensor (not a Python int) so that .double() below
    # also works when the loader yields no batch at all.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    # Discard gradients that an interrupted earlier run may have left behind;
    # otherwise they would be added to the first batch of this epoch.
    optimizer.zero_grad()

    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["targets"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        # The predicted class is the index of the highest score in each row.
        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, targets)

        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())

        loss.backward()
        # Clip the global gradient norm to 1.0 before the parameter update.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    print(f"[Training] Correct predictions: {correct_predictions.double()}, Total examples: {n_examples}")
    mean_loss = np.mean(losses) if losses else float("nan")
    return correct_predictions.double() / n_examples, mean_loss

In [None]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Evaluate ``model`` on ``data_loader`` without updating any weights.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)``
            and returning one score per class for every row.
        data_loader: iterable of batches; each batch is a dict with the
            tensors ``input_ids``, ``attention_mask`` and ``targets``.
        loss_fn: callable ``loss_fn(outputs, targets)`` returning a scalar loss.
        device: device the batch tensors are moved to.
        n_examples: number of examples used as the accuracy denominator.

    Returns:
        ``(accuracy, mean_loss)`` where ``accuracy`` is a 0-dim float64 tensor
        and ``mean_loss`` is the mean of the per-batch losses (NaN when the
        loader yields no batch).
    """
    model = model.eval()

    losses = []
    # Start from a 0-dim tensor (not a Python int) so that .double() below
    # also works when the loader yields no batch at all.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            # The predicted class is the index of the highest score in each row.
            _, preds = torch.max(outputs, dim=1)

            loss = loss_fn(outputs, targets)

            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())

    print(f"[Validation] Correct predictions: {correct_predictions.double()}, Total examples: {n_examples}")
    mean_loss = np.mean(losses) if losses else float("nan")
    return correct_predictions.double() / n_examples, mean_loss

#### 開始訓練

In [None]:
%%time
def save_model(best_accuracy, best_state_dict):
    global filename
    acc_string = "{:.2f}".format(best_accuracy)
    filename = f"{EMOTION}-{dt.now().strftime('%Y-%m-%d-%H-%M-%S')}-{acc_string}.pkl"
    torch.save(best_state_dict, filename)

filename = ""

history = defaultdict(list)

best_accuracy = 0.0
best_state_dict = {}

for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    train_acc, train_loss = train_epoch(
        model,
        train_data_loader,    
        loss_fn, 
        optimizer, 
        device, 
        scheduler, 
        len(df_train)
    )

    print(f'[Training] Loss: {train_loss} Accuracy: {train_acc}')

    val_acc, val_loss = eval_model(
        model,
        val_data_loader,
        loss_fn, 
        device, 
        len(df_val)
    )

    print(f'[Validation] Loss: {val_loss} Accuracy: {val_acc}')
    print()

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)

    if val_acc > best_accuracy:
        best_accuracy = val_acc
        best_state_dict = model.state_dict()

save_model(best_accuracy, best_state_dict)

#### 訓練完成，確認模型名稱

In [None]:
# Display the file name that save_model stored for the trained weights.
filename

In [None]:
def get_predictions(model, data_loader, device=None):
    """Run ``model`` over ``data_loader`` and collect its predictions.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)``
            and returning one score per class for every row.
        data_loader: iterable of batches; each batch is a dict with
            ``sentence`` (list of str) and the tensors ``input_ids``,
            ``attention_mask`` and ``targets``.
        device: device the inputs are moved to. Defaults to the device of the
            model's first parameter (CPU for a parameter-free model), so the
            function no longer depends on a global variable.

    Returns:
        ``(sentences, predictions, prediction_probs, real_values)``:
        the raw sentences (list), the predicted class per sentence (1-D
        tensor), the softmax probabilities per class (2-D tensor) and the
        targets from the loader (1-D tensor). All tensors live on the CPU.
        An empty loader yields an empty list and empty tensors.
    """
    model = model.eval()
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    sentences = []
    pred_batches = []
    prob_batches = []
    target_batches = []
    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            _, preds = torch.max(outputs, dim=1)

            sentences.extend(d["sentence"])
            pred_batches.append(preds.cpu())
            # The model returns raw scores; softmax turns them into the
            # probabilities that the name prediction_probs promises.
            prob_batches.append(torch.softmax(outputs, dim=1).cpu())
            target_batches.append(d["targets"].cpu())

    if not pred_batches:
        # torch.cat refuses an empty list, so build the empty results directly.
        empty_labels = torch.empty(0, dtype=torch.long)
        return sentences, empty_labels, torch.empty(0, 0), empty_labels.clone()

    # Concatenate whole batches once instead of stacking element by element.
    predictions = torch.cat(pred_batches)
    prediction_probs = torch.cat(prob_batches)
    real_values = torch.cat(target_batches)
    return sentences, predictions, prediction_probs, real_values

### 測試區域
filename 也可以填入其他現有的模型進行測試

In [None]:
#filename = 'model_name.pkl'

In [None]:
# Restore the saved weights into the existing model; map_location loads the
# stored tensors onto the CPU before load_state_dict copies them into the model.
# NOTE(review): torch.load unpickles the file — only point `filename` at
# checkpoints that come from a trusted source.
model.load_state_dict(torch.load(filename,map_location=torch.device('cpu')))
# Switch to evaluation mode for inference.
model.eval()

In [None]:
import json

# File with the samples to predict. The encoding is given explicitly because the
# platform default is not UTF-8 everywhere; 'utf-8-sig' reads UTF-8 with or
# without a byte-order mark.
with open('測試集.json', 'r', encoding='utf-8-sig') as f:
    test = json.load(f)

# Build the frame after the file has been closed.
ping_df = pd.DataFrame(test)

In [None]:
# NEED: t_index is consumed by the following cell.
import random, re

# Row labels of ping_df in their original order. Inserting every index at
# position 0 produced the reverse order, so the predictions no longer lined up
# row by row with ping_df when both are compared during scoring.
t_index = list(range(len(ping_df)))

In [None]:
t_df = pd.DataFrame(ping_df['content'])

# Select the rows listed in t_index (in that order) with one vectorised lookup
# instead of concatenating one single-row Series per index; this also works
# when t_index is empty.
sample_t_df = (
    t_df.loc[t_index, 'content']
    .reset_index(drop=True)
    .to_frame(name='content')
)

# Placeholder labels: the dataset class expects a 'label' column, but the true
# labels are not used for prediction.
label = np.zeros(len(t_index), dtype = int)
sample_t_df.insert(1, 'label', label)
sample_t_df.columns  = ['content', 'label']
sample_t_df.tail(40)

In [None]:
# Take the 'content' values out of the frame and put them back as the
# left-most column; the remaining columns keep their order.
con = sample_t_df['content'].tolist()
sample_t_df = sample_t_df.drop(columns='content')
sample_t_df.insert(loc=0, column='content', value=con)
sample_t_df.tail(40)

In [None]:
BATCH_SIZE = 8

sample_t_df_loader = create_data_loader(sample_t_df, tokenizer, MAX_LEN, BATCH_SIZE)
# NEED: run the model over every sentence of the test frame.
y_sentences, y_pred, y_pred_probs, y_test = get_predictions(
    model,
    sample_t_df_loader
)
# NEED: collect the outputs in one frame with the columns test, pred, content.
# The tensors are converted to NumPy arrays explicitly so that the columns hold
# plain integers rather than depending on how pandas converts a tensor.
# 'test' only echoes the placeholder labels that were fed to the loader.
results = pd.DataFrame({
    'test': y_test.numpy(),
    'pred': y_pred.numpy(),
    'content': y_sentences,
})

#type(results) : pandas.core.frame.DataFrame

In [None]:
# Scoring: fraction of rows where the model prediction equals the reference.
from sklearn.metrics import accuracy_score
# The first argument is the reference (y_true), the second the prediction.
# NOTE(review): this treats ping_df['pred'] as the reference labels — confirm
# that this column of the test file holds the ground truth.
# NOTE(review): presumably the two columns are compared row by row, so results
# must be in the same row order as ping_df — verify.
# The disabled variant compares against results['test'], which only contains
# the placeholder zeros used to build the loader.
#score = accuracy_score(results['test'],results['pred'])
score = accuracy_score(ping_df['pred'],results['pred'])
print(score)

### 進行評分

In [None]:
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float('nan')


# ping_df['pred'] is the reference (it is what the accuracy cell passes as
# y_true); results['pred'] is what the model predicted. Only the first
# len(results) reference rows are scored.
y_reference = ping_df['pred'].to_numpy()[:len(results)]
y_model = results['pred'].to_numpy()

agree = y_reference == y_model
reference_is_positive = y_reference == 1

tp = int(np.sum(agree & reference_is_positive))
tn = int(np.sum(agree & ~reference_is_positive))
# A positive reference that the model missed is a false NEGATIVE, and a
# non-positive reference that the model got wrong is a false POSITIVE; the two
# counters were swapped before, which exchanged precision and recall.
fn = int(np.sum(~agree & reference_is_positive))
fp = int(np.sum(~agree & ~reference_is_positive))

# Each ratio is NaN instead of a ZeroDivisionError when its denominator is
# empty (e.g. the model never predicts the positive class).
print("accuracy : " + str(_safe_ratio(tp + tn, tp + fp + fn + tn)))
print("precision : " + str(_safe_ratio(tp, tp + fp)))
print("recall : "+ str(_safe_ratio(tp, tp + fn)))

#results['pred'].head(40)
#ping_df['pred'].head(40)

In [None]:
# Not used in this cell; kept in case later cells rely on these names.
cc = 0
count = 0

# One record per prediction. The explicit int() is required because json cannot
# serialise the integer type stored in the frame.
datalist = [
    dict(pred = int(pred), content = content)
    for pred, content in zip(results['pred'], results['content'])
]

# open() does not create missing folders, so make sure the target folder exists.
os.makedirs('Prediction', exist_ok=True)
with open('Prediction/model_prediction.json', 'w', encoding='utf-8') as f:
    json.dump(datalist, f, ensure_ascii=False, indent=4)