In [None]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from transformers import Trainer, TrainingArguments

from IPython.display import clear_output
import logging
logging.basicConfig(level=logging.ERROR)

from cf_matrix import make_confusion_matrix

In [None]:
def split(df, need_emoji = True):
    """Partition reviews and labels into train / validation / test lists.

    Parameters
    ----------
    df : mapping of column name -> values
        Must provide a 'label' column plus 'review' (read when ``need_emoji`` is
        truthy) or 'no_emoji_review' (read otherwise).
    need_emoji : bool, default True
        Selects which text column supplies the inputs.

    Returns
    -------
    tuple
        ``(X_train, X_val, X_test, y_train, y_val, y_test)``.
    """
    text_column = 'review' if need_emoji else 'no_emoji_review'
    texts = list(df[text_column])
    targets = list(df['label'])
    # Hold out 20% of the rows first, then cut that pool in half, which yields
    # 80% train, 10% development and 10% test. Both calls use random_state = 0.
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        texts, targets, test_size = 0.2, random_state = 0
    )
    X_val, X_test, y_val, y_test = train_test_split(
        X_holdout, y_holdout, test_size = 0.5, random_state = 0
    )
    return X_train, X_val, X_test, y_train, y_val, y_test

def performance(X_test, y_test, classifier):
    """Classify every review in ``X_test`` and plot the confusion matrix against ``y_test``.

    Parameters
    ----------
    X_test : iterable of str
        Reviews, passed to ``classifier`` one at a time.
    y_test : sequence of int
        Gold labels (0 = negative, 1 = positive). Its length is used as the
        total in the progress display.
    classifier : callable
        Called as ``classifier(review)``; the first element of the result must
        be a mapping whose ``'label'`` value is one of the two strings in
        ``convert`` below.

    Raises
    ------
    KeyError
        If the classifier emits a label string that is not a key of ``convert``.
    """
    convert = {'positive (stars 4 and 5)': 1, 'negative (stars 1, 2 and 3)': 0}
    total = len(y_test)
    y_pred = []
    for review in X_test:
        y_pred.append(convert[classifier(review)[0]['label']])
        # Redraw a single progress line instead of emitting one line per review.
        clear_output(wait = True)
        print("{}/{}".format(len(y_pred), total))
    # Pin the label order so the matrix is always 2x2 with rows/columns [0, 1].
    # Without it, data containing a single class collapses the matrix to 1x1,
    # which no longer lines up with the four cell names passed below.
    cf_matrix = confusion_matrix(y_test, y_pred, labels = [0, 1])
    labels = ['TN', 'FP', 'FN', 'TP']
    categories = ['Negative', 'Positive']
    make_confusion_matrix(cf_matrix, group_names = labels, categories = categories, cmap = 'binary')

In [None]:
# Load the processed reviews, discard rows with missing values, and drop the
# "Unnamed: 0" column (presumably a leftover index column from when the CSV
# was written - confirm against the preprocessing step).
df = (
    pd.read_csv('Data/processed_data.csv')
    .dropna()
    .drop(columns = "Unnamed: 0")
)
print(df.shape[0])
df.head()

In [None]:
# Keep only the rows flagged as containing emoji, then split that subset twice:
# suffix _1 holds the original review text, suffix _2 the emoji-stripped text.
df_emoji = df.loc[df['has_emoji'] == 1]
(X_train_1, X_val_1, X_test_1,
 y_train_1, y_val_1, y_test_1) = split(df_emoji, need_emoji = True)
(X_train_2, X_val_2, X_test_2,
 y_train_2, y_val_2, y_test_2) = split(df_emoji, need_emoji = False)

In [None]:
class WeiboSentDataset(Dataset):
    """Dataset that pairs per-example encodings with their labels.

    ``encodings`` is a mapping from field name to an indexable collection with
    one entry per example; ``labels`` is an indexable collection of the same
    length.
    """

    def __init__(self, encodings, labels):
        """Store the encodings mapping and the label sequence as given."""
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with the label under 'labels'."""
        example = {}
        for field, values in self.encodings.items():
            example[field] = torch.tensor(values[idx])
        example['labels'] = torch.tensor(self.labels[idx])
        return example

In [None]:
# Tokenizer loaded from the pretrained checkpoint named below.
tokenizer = AutoTokenizer.from_pretrained(
    'uer/chinese_roberta_L-12_H-768',
)

In [None]:
# Tokenize the train / validation / test texts of the emoji-preserving split,
# one tokenizer call per split, with truncation and padding enabled.
X_train_encodings, X_val_encodings, X_test_encodings = (
    tokenizer(texts, truncation=True, padding=True)
    for texts in (X_train_1, X_val_1, X_test_1)
)

In [None]:
# Wrap each split's encodings together with its labels.
train_dataset = WeiboSentDataset(encodings=X_train_encodings, labels=y_train_1)
val_dataset = WeiboSentDataset(encodings=X_val_encodings, labels=y_val_1)
test_dataset = WeiboSentDataset(encodings=X_test_encodings, labels=y_test_1)

In [None]:
# Hyper-parameters for the fine-tuning run.
training_args = TrainingArguments(
    # Output and logging locations
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,                # logging interval, in steps
    # Schedule
    num_train_epochs=1,              # single pass over the training data
    warmup_steps=500,                # warmup steps for the learning rate scheduler
    # Optimisation
    learning_rate=5e-5,
    weight_decay=0.01,
    # Batch sizes, per device
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
)

In [None]:
# No earlier cell defines `model`, so on a fresh kernel the Trainer call below
# raised NameError. Instantiate the sequence-classification model here, from the
# same checkpoint the tokenizer was loaded from so token ids match the model's
# vocabulary.
# NOTE(review): the checkpoint is an assumption - confirm it is the intended
# starting point. num_labels=2 matches the binary 0/1 labels this notebook
# scores against.
model = AutoModelForSequenceClassification.from_pretrained(
    'uer/chinese_roberta_L-12_H-768',
    num_labels=2,
)

trainer = Trainer(
    model=model,                         # model instantiated above
    args=training_args,                  # training arguments from the TrainingArguments cell
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

In [None]:
# Run fine-tuning with the model, arguments and datasets configured on `trainer`.
trainer.train()