In [7]:
#!pip install scikit-learn
#!pip install transformers
#!pip install -U ray
#!pip install -U ray[tune]
#!pip install datasets
#!pip install seaborn

In [8]:
import numpy as np
import pandas as pd

import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from transformers import Trainer, TrainingArguments

from ray import tune
from ray.tune import CLIReporter
from ray.tune.suggest.bayesopt import BayesOptSearch

from datasets import load_metric

from cf_matrix import make_confusion_matrix

from IPython.display import clear_output
import logging
logging.basicConfig(level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore")

## Load and Split Data

In [9]:
# remove emoji from data
def no_emoji(X):
    """Strip bracketed emoji tags (e.g. ``[哈哈]``) from every string in ``X``.

    A depth counter follows the square brackets: ``[`` opens a level, ``]``
    closes one, and a character is kept only while no bracket is open.
    Consequences of that rule:

    * nested brackets are removed together with everything inside them;
    * a ``]`` with no matching ``[`` is kept as ordinary text;
    * a ``[`` that is never closed drops the rest of the string.

    Parameters
    ----------
    X : list of str
        Texts to clean. The list is overwritten element by element.

    Returns
    -------
    list of str
        The same list object that was passed in, now holding the cleaned texts.
    """
    for idx, text in enumerate(X):
        kept = []
        depth = 0
        for ch in text:
            if ch == "[":
                depth += 1
            elif depth == 0:
                kept.append(ch)
            # closing is checked separately so that a stray "]" at depth 0 is kept above
            if ch == "]" and depth > 0:
                depth -= 1
        X[idx] = ''.join(kept)

    return X


# split data to train, validation, test
def split(df, need_emoji = True, random_state = 0):
    """Split the ``review`` / ``label`` columns of ``df`` into train, validation and test lists.

    The split is done in two stages with the same ``random_state``: 40% of the
    rows are held out first, then that hold-out is halved, giving
    60% train / 20% validation / 20% test.

    Parameters
    ----------
    df : DataFrame-like with ``'review'`` and ``'label'`` columns
    need_emoji : bool
        When falsy, every text split is passed through ``no_emoji`` before
        being returned; labels are untouched.
    random_state : int
        Seed forwarded to both ``train_test_split`` calls.

    Returns
    -------
    tuple
        ``(X_train, X_val, X_test, y_train, y_val, y_test)``
    """
    reviews = list(df['review'])
    labels = list(df['label'])

    # stage 1: 60% train, 40% held out
    X_train, X_rest, y_train, y_rest = train_test_split(
        reviews, labels, test_size = 0.4, random_state = random_state)
    # stage 2: the hold-out is halved into validation and test (20% each overall)
    X_val, X_test, y_val, y_test = train_test_split(
        X_rest, y_rest, test_size = 0.5, random_state = random_state)

    if need_emoji:
        return X_train, X_val, X_test, y_train, y_val, y_test
    return no_emoji(X_train), no_emoji(X_val), no_emoji(X_test), y_train, y_val, y_test

### weibo

In [10]:
# load weibo data
# read the csv, discard rows containing missing values, then drop the "Unnamed: 0" column
df_weibo = (
    pd.read_csv('data/processed_weibo_data.csv')
    .dropna()
    .drop(columns = "Unnamed: 0")
)

In [11]:
# split weibo data to train, validation, test
# data with emoji
(X_train_wb_1, X_val_wb_1, X_test_wb_1,
 y_train_wb_1, y_val_wb_1, y_test_wb_1) = split(df_weibo, need_emoji = True)
# same call (default random_state) but with the bracketed emoji tags stripped from the texts
(X_train_wb_0, X_val_wb_0, X_test_wb_0,
 y_train_wb_0, y_val_wb_0, y_test_wb_0) = split(df_weibo, need_emoji = False)

In [12]:
# first five training texts: with emoji tags, then with them removed
[X_train_wb_1[:5], X_train_wb_0[:5]]

[['猛然发现，「财经郎眼」之前的广告是修正牌消糜栓。[衰]',
  '看爪就知道是美女！[亲亲][爱你]',
  '你现在是在去怀柔的路上吗？[哈哈][哈哈] ',
  '时间过得太快，老感觉不够用！[泪]',
  '吃饱喝足谁也不服[哈哈][哈哈][哈哈]'],
 ['猛然发现，「财经郎眼」之前的广告是修正牌消糜栓。',
  '看爪就知道是美女！',
  '你现在是在去怀柔的路上吗？ ',
  '时间过得太快，老感觉不够用！',
  '吃饱喝足谁也不服']]

### dianping

In [13]:
# load dianping data
# read the csv, discard rows containing missing values, then drop the "Unnamed: 0" column
df_dianping = (
    pd.read_csv('data/processed_dianping_data.csv')
    .dropna()
    .drop(columns = "Unnamed: 0")
)

In [14]:
# split dianping data to train, validation, test
# dianping data does not have emoji
(X_train_dp_0, X_val_dp_0, X_test_dp_0,
 y_train_dp_0, y_val_dp_0, y_test_dp_0) = split(df_dianping, need_emoji = True)

In [15]:
# first five dianping training texts
X_train_dp_0[:5]

['中午在旺角逛街，12点多就去了附近的潮楼。那里应该算是早茶的地方的，但是中午人依然很多。还好到的早，不怎么需要等位的。领位的服务生，忙是忙，一时忽略了我们，还打招呼，不好意思。\n绫鱼丸，新鲜鱼肉做的，能吃的出有鱼刺的成分，但是绝对不会卡到的那种，很鲜；\n皮蛋瘦肉粥，好浓的，以至于我和V都说，要么回家以后也把粥煮成这样，但是味道肯定调不好的；\n虾饺，里面真的有3个虾仁，很Q的，要不是吃不下，肯定还会再点的；\n凤爪，比较酥烂，食材本身研制的比较入味，所以很好吃，V说比较适合她妈吃；\n肠粉，里面的料很好，也有虾仁的成分面还有新鲜蔬菜，结合的好香；\nXO酱萝卜糕，因为便宜就点来吃吃，绝对超过了萝卜的口感，超级好吃；\n牛肉丸，做的比较嫩，但是确很Q，跟芹菜结合的，有种特殊的香味。\n最后结帐$123，出乎意料，本来还以为要超200了。这顿真的吃的很舒服。\n',
 '鸟照烧 大家都懂了对伐 20块钱的中饭 还是蛮灵的\n就是感觉很 穷人的感觉，几片腌萝卜 还有鸟照烧 。\n米饭还是蛮好吃的\xa0\xa0MISO有点咸。\n店很古老了....\n',
 '也是从网上看到这个地方的推荐，找了半天都没有找到。瓷器口进去大概是个T型号的路，从最下面进去，往T的丁字口往右走就是江边。快到丁字口的路上，在右边有个非常窄的巷子，不注意就错过。进去才发现别有洞天，里面位置挺宽敞的，里面有个小牌子说是CCTV2推荐的。我们专程点了毛血旺，网上人说这个很地道。这边的毛血旺的确是最老方式的毛血旺，不过味道不太好，感觉重庆城里随便一个摊位上的都比这好吃。还好，点了一个折耳根还下饭，要不然真不知道吃什么。建议，没有什么必要就不要找这个地方吧。\n',
 '一直很怀念那儿的鸡，不知道是怎么做的，但是真的很嫩，每次去人基本上都客满的，还有各种炒菜，只是店小了点，值得一试\n',
 '南非世界杯，2点钟德国的半决赛是在这里看的。\n我觉得麦当劳比肯德基好，最近几年扩张后劲十足，反观肯德基似乎孱弱并且诟病不断，推出不少纯噱头难吃的要死的骗钱食品。\n加之麦咖啡进驻，麦当劳让越来越多人喜欢。\n']

## Load Models

In [18]:
# Shared tokenizer: this one instance is passed to every pipeline built in the cells below.
tokenizer = AutoTokenizer.from_pretrained('uer/chinese_roberta_L-12_H-768')

In [19]:
# model with weibo emoji
roberta_weibo_emoji = AutoModelForSequenceClassification.from_pretrained('trained_model/roberta_weibo_emoji')
# classifier: use the first GPU when CUDA is available, otherwise fall back to CPU (-1)
# so the cell also runs on a machine without a GPU
weibo_emoji_classifier = pipeline(
    'sentiment-analysis',
    model = roberta_weibo_emoji,
    tokenizer = tokenizer,
    device = 0 if torch.cuda.is_available() else -1,
)

In [20]:
# model without weibo emoji
roberta_weibo_no_emoji = AutoModelForSequenceClassification.from_pretrained('trained_model/roberta_weibo_no_emoji')
# classifier: use the first GPU when CUDA is available, otherwise fall back to CPU (-1)
# so the cell also runs on a machine without a GPU
weibo_no_emoji_classifier = pipeline(
    'sentiment-analysis',
    model = roberta_weibo_no_emoji,
    tokenizer = tokenizer,
    device = 0 if torch.cuda.is_available() else -1,
)

In [21]:
# uer dianping model
uer_roberta_dianping = AutoModelForSequenceClassification.from_pretrained('uer/roberta-base-finetuned-dianping-chinese')
# classifier: use the first GPU when CUDA is available, otherwise fall back to CPU (-1)
# so the cell also runs on a machine without a GPU
uer_dianping_classifier = pipeline(
    'sentiment-analysis',
    model = uer_roberta_dianping,
    tokenizer = tokenizer,
    device = 0 if torch.cuda.is_available() else -1,
)

## Assess Performance

In [None]:
# assess performance of model, and print a confusion matrix
def performance(X_test, y_test, classifier, threshold = 0.5):
    """Classify every review in ``X_test`` and plot a confusion matrix against ``y_test``.

    Parameters
    ----------
    X_test : iterable of str
        Reviews to classify, one ``classifier`` call per review.
    y_test : sequence of int
        Ground-truth labels (0 = negative, 1 = positive), aligned with ``X_test``.
    classifier : callable
        Called as ``classifier(review)``; the first element of its result must be
        a dict with a ``'label'`` and a ``'score'`` entry.
    threshold : float
        A positive prediction whose score is below ``threshold`` is demoted to
        negative. Negative predictions are never changed by the threshold.

    Raises
    ------
    KeyError
        If the classifier emits a label name that is not in the mapping below.

    Notes
    -----
    Progress is printed as ``done/total`` after each review, replacing the
    previous cell output. The result is handed to ``make_confusion_matrix``;
    nothing is returned.
    """
    # Both label naming schemes used by the classifiers in this notebook are
    # accepted, so the function works for every model regardless of which
    # definition cell was executed last.
    convert = {
        'LABEL_1': 1,
        'LABEL_0': 0,
        'positive (stars 4 and 5)': 1,
        'negative (stars 1, 2 and 3)': 0,
    }
    y_pred = []

    for review in X_test:
        prediction = classifier(review)[0]
        label = convert[prediction['label']]
        if label == 1 and prediction['score'] < threshold:
            label = 0
        y_pred.append(label)
        clear_output(wait = True)
        print("{}/{}".format(len(y_pred), len(y_test)))

    # labels=[0, 1] pins the matrix to 2x2 even when only one class shows up in
    # y_test / y_pred, so it always lines up with the four group names below.
    cf_matrix = confusion_matrix(y_test, y_pred, labels = [0, 1])
    labels = ['TN', 'FP', 'FN', 'TP']
    categories = ['Negative', 'Positive']

    make_confusion_matrix(cf_matrix, group_names = labels, categories = categories, cmap = 'binary')

### weibo emoji model

In [None]:
# emoji model performance on weibo data with emoji
performance(
    X_test = X_test_wb_1,
    y_test = y_test_wb_1,
    classifier = weibo_emoji_classifier,
)

In [None]:
# emoji model performance on weibo data with emoji removed
performance(
    X_test = X_test_wb_0,
    y_test = y_test_wb_0,
    classifier = weibo_emoji_classifier,
)

In [None]:
# emoji model performance on dianping data
performance(
    X_test = X_test_dp_0,
    y_test = y_test_dp_0,
    classifier = weibo_emoji_classifier,
)

### weibo no emoji model

In [None]:
# non emoji model performance on weibo data with emoji
performance(
    X_test = X_test_wb_1,
    y_test = y_test_wb_1,
    classifier = weibo_no_emoji_classifier,
)

In [None]:
# non emoji model performance on weibo data with emoji removed
performance(
    X_test = X_test_wb_0,
    y_test = y_test_wb_0,
    classifier = weibo_no_emoji_classifier,
)

In [None]:
# non emoji model performance on dianping data
performance(
    X_test = X_test_dp_0,
    y_test = y_test_dp_0,
    classifier = weibo_no_emoji_classifier,
)

### dianping model

In [None]:
# assess performance of model, and print a confusion matrix
def performance(X_test, y_test, classifier, threshold = 0.5):
    """Classify every review in ``X_test`` and plot a confusion matrix against ``y_test``.

    Parameters
    ----------
    X_test : iterable of str
        Reviews to classify, one ``classifier`` call per review.
    y_test : sequence of int
        Ground-truth labels (0 = negative, 1 = positive), aligned with ``X_test``.
    classifier : callable
        Called as ``classifier(review)``; the first element of its result must be
        a dict with a ``'label'`` and a ``'score'`` entry.
    threshold : float
        A positive prediction whose score is below ``threshold`` is demoted to
        negative. Negative predictions are never changed by the threshold.

    Raises
    ------
    KeyError
        If the classifier emits a label name that is not in the mapping below.

    Notes
    -----
    Progress is printed as ``done/total`` after each review, replacing the
    previous cell output. The result is handed to ``make_confusion_matrix``;
    nothing is returned.
    """
    # Both label naming schemes used by the classifiers in this notebook are
    # accepted, so the function works for every model regardless of which
    # definition cell was executed last.
    convert = {
        'positive (stars 4 and 5)': 1,
        'negative (stars 1, 2 and 3)': 0,
        'LABEL_1': 1,
        'LABEL_0': 0,
    }
    y_pred = []

    for review in X_test:
        prediction = classifier(review)[0]
        label = convert[prediction['label']]
        if label == 1 and prediction['score'] < threshold:
            label = 0
        y_pred.append(label)
        clear_output(wait = True)
        print("{}/{}".format(len(y_pred), len(y_test)))

    # labels=[0, 1] pins the matrix to 2x2 even when only one class shows up in
    # y_test / y_pred, so it always lines up with the four group names below.
    cf_matrix = confusion_matrix(y_test, y_pred, labels = [0, 1])
    labels = ['TN', 'FP', 'FN', 'TP']
    categories = ['Negative', 'Positive']

    make_confusion_matrix(cf_matrix, group_names = labels, categories = categories, cmap = 'binary')

In [None]:
# dianping model performance on weibo data with emoji
performance(
    X_test = X_test_wb_1,
    y_test = y_test_wb_1,
    classifier = uer_dianping_classifier,
)

In [None]:
# dianping model performance on weibo data with emoji removed
performance(
    X_test = X_test_wb_0,
    y_test = y_test_wb_0,
    classifier = uer_dianping_classifier,
)

In [None]:
# dianping model performance on dianping data
performance(
    X_test = X_test_dp_0,
    y_test = y_test_dp_0,
    classifier = uer_dianping_classifier,
)

## Investigate Misclassified Data

In [None]:
def metrics_idx(data, target, classifier, threshold = 0.5):
    """Return the positions of TP / TN / FP / FN predictions of ``classifier`` on ``data``.

    Parameters
    ----------
    data : iterable of str
        Texts to classify, one ``classifier`` call per text.
    target : sequence of int
        Ground-truth labels (0 = negative, 1 = positive), aligned with ``data``.
    classifier : callable
        Called as ``classifier(text)``; the first element of its result must be
        a dict with a ``'label'`` and a ``'score'`` entry.
    threshold : float, default 0.5
        A positive prediction whose score is below ``threshold`` is demoted to
        negative. Negative predictions are never changed by the threshold.

    Returns
    -------
    tuple of numpy.ndarray
        ``(tp_idx, tn_idx, fp_idx, fn_idx)``: integer index arrays into ``data``.

    Raises
    ------
    KeyError
        If the classifier emits a label name that is not in the mapping below.
    ValueError
        If ``data`` and ``target`` do not have the same length.
    """
    # accept both label naming schemes used by the classifiers in this notebook
    convert = {
        'LABEL_1': 1,
        'LABEL_0': 0,
        'positive (stars 4 and 5)': 1,
        'negative (stars 1, 2 and 3)': 0,
    }
    y_pred = []

    for review in data:
        prediction = classifier(review)[0]
        label = convert[prediction['label']]
        if label == 1 and prediction['score'] < threshold:
            label = 0
        y_pred.append(label)

    # build each array once instead of re-converting the lists for every mask
    pred = np.asarray(y_pred, dtype = int)
    truth = np.asarray(target)
    if pred.shape != truth.shape:
        # without this check a length-1 target would silently broadcast
        raise ValueError(
            "data and target must have the same length, got {} and {}".format(
                pred.shape, truth.shape))

    # explicit comparisons rather than arithmetic encodings, so that a target
    # value other than 0/1 can never be miscounted as one of the four outcomes
    tp_idx = np.where((pred == 1) & (truth == 1))[0]
    tn_idx = np.where((pred == 0) & (truth == 0))[0]
    fp_idx = np.where((pred == 1) & (truth == 0))[0]
    fn_idx = np.where((pred == 0) & (truth == 1))[0]

    return tp_idx, tn_idx, fp_idx, fn_idx

In [None]:
# weibo review with emoji on weibo emoji model
# tuple layout: (tp_idx, tn_idx, fp_idx, fn_idx)
y_test_wb_1_emoji_metrics = metrics_idx(
    data = X_test_wb_1,
    target = y_test_wb_1,
    classifier = weibo_emoji_classifier,
)

In [None]:
# weibo review without emoji on weibo emoji model
# tuple layout: (tp_idx, tn_idx, fp_idx, fn_idx)
y_test_wb_0_emoji_metrics = metrics_idx(
    data = X_test_wb_0,
    target = y_test_wb_0,
    classifier = weibo_emoji_classifier,
)

In [None]:
# true positives on the emoji texts that are not true positives once emoji are removed
wb_tp_emoji_diff = np.setdiff1d(
    y_test_wb_1_emoji_metrics[0],   # tp_idx, emoji kept
    y_test_wb_0_emoji_metrics[0],   # tp_idx, emoji removed
)
# show (at most) the first 20 of those texts, in their emoji form
np.asarray(X_test_wb_1)[wb_tp_emoji_diff[:20]]

In [None]:
# true negatives on the emoji texts that are not true negatives once emoji are removed
wb_tn_emoji_diff = np.setdiff1d(
    y_test_wb_1_emoji_metrics[1],   # tn_idx, emoji kept
    y_test_wb_0_emoji_metrics[1],   # tn_idx, emoji removed
)
# show (at most) the first 20 of those texts, in their emoji form
np.asarray(X_test_wb_1)[wb_tn_emoji_diff[:20]]

In [None]:
# reverse direction: true positives without emoji that are not true positives with emoji
wb_tp_emoji_diff_rev = np.setdiff1d(
    y_test_wb_0_emoji_metrics[0],   # tp_idx, emoji removed
    y_test_wb_1_emoji_metrics[0],   # tp_idx, emoji kept
)
# show (at most) the first 20 of those texts, in their emoji form
np.asarray(X_test_wb_1)[wb_tp_emoji_diff_rev[:20]]

In [None]:
# reverse direction: true negatives without emoji that are not true negatives with emoji
wb_tn_emoji_diff_rev = np.setdiff1d(
    y_test_wb_0_emoji_metrics[1],   # tn_idx, emoji removed
    y_test_wb_1_emoji_metrics[1],   # tn_idx, emoji kept
)
# show (at most) the first 20 of those texts, in their emoji form
np.asarray(X_test_wb_1)[wb_tn_emoji_diff_rev[:20]]

## Sketch

In [None]:
# probe: emoji-trained weibo model, sentence followed by an emoji tag
weibo_emoji_classifier("小何，你觉得呢？[懒得理你]")

In [None]:
# probe: emoji-trained weibo model, same sentence without the emoji tag
weibo_emoji_classifier("小何，你觉得呢？")

In [None]:
# probe: weibo model trained without emoji, sentence followed by an emoji tag
weibo_no_emoji_classifier("小何，你觉得呢？[懒得理你]")

In [None]:
# probe: weibo model trained without emoji, same sentence without the emoji tag
weibo_no_emoji_classifier("小何，你觉得呢？")

In [None]:
# probe: uer dianping model, sentence followed by an emoji tag
uer_dianping_classifier("小何，你觉得呢？[懒得理你]")

In [None]:
# probe: uer dianping model, same sentence without the emoji tag
uer_dianping_classifier("小何，你觉得呢？")

Around 5% of the texts have a length of 1 or less once the emoji are removed, so they may be ambiguous to classify. We therefore filter out texts shorter than 10 characters and re-assess.

In [None]:
# remove texts with length shorter than 10
# NOTE(review): the original loop read X_test_1 / y_test_1 / X_test_2 / y_test_2, which are
# not defined anywhere in the visible notebook (NameError on a fresh kernel).
# They are assumed to be the weibo test split with emoji (..._wb_1) and with emoji
# removed (..._wb_0) respectively. TODO confirm.
MIN_TEXT_LEN = 10   # minimum length of the emoji-free text for a sample to be kept

X_test_1_filtered, y_test_1_filtered = [], []
X_test_2_filtered, y_test_2_filtered = [], []

# The length test uses the emoji-free text only; the with-emoji sample at the same
# position is kept or dropped along with it so the two filtered sets stay aligned.
for text_emoji, label_emoji, text_plain, label_plain in zip(
        X_test_wb_1, y_test_wb_1, X_test_wb_0, y_test_wb_0):
    if len(text_plain) >= MIN_TEXT_LEN:
        X_test_1_filtered.append(text_emoji)
        y_test_1_filtered.append(label_emoji)
        X_test_2_filtered.append(text_plain)
        y_test_2_filtered.append(label_plain)