In [15]:
import ast
import os

import numpy as np
import pandas as pd

# Switch the working directory to the experiment folder so that the relative
# paths used by the cells below resolve against it.
# NOTE(review): hardcoded absolute path that looks Colab/Google-Drive specific;
# presumably Drive must already be mounted for this to succeed - confirm.
os.chdir("/content/drive/MyDrive/Notebooks/PAPER_EXPERIMENT")

# Load Dataset and Vocabulary

In [24]:
# Read the three dataset splits (train / dev / test) into DataFrames, in that order.
train_reviews, dev_reviews, test_reviews = [
    pd.read_csv(csv_path)
    for csv_path in (
        "MultiLabel_Classification/data/train.csv",
        "MultiLabel_Classification/data/dev.csv",
        "MultiLabel_Classification/data/test.csv",
    )
]

In [25]:
# The "review_tokenized" column comes back from the CSV as strings; turn each
# one back into a Python object.
# ast.literal_eval is used instead of eval: it only accepts Python literals
# (lists, strings, numbers, ...), so a malformed or malicious cell raises an
# error instead of executing arbitrary code.
for reviews in (train_reviews, dev_reviews, test_reviews):
    reviews["review_tokenized"] = reviews["review_tokenized"].apply(ast.literal_eval)

In [26]:
# Load the vocabulary: one entry per line, where the line position is later
# used as the word's index.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale.
# NOTE(review): split("\n") is kept on purpose so existing indices do not
# shift; if the file ends with a newline the last entry is an empty string.
with open("Word2Vec/vocab.txt", "r", encoding="utf-8") as fp:
    vocab = fp.read().split("\n")

# Convert Text to Index and Padding

In [27]:
# Word -> position-in-vocab lookup table. If a word occurs more than once in
# `vocab`, its last position wins.
word_index = {word: position for position, word in enumerate(vocab)}

In [28]:
# Replace every token by its vocabulary index.
#
# Train and dev use a strict lookup: a token missing from the vocabulary raises
# KeyError.
# NOTE(review): presumably the vocabulary was built to cover both splits - confirm.
train_reviews["review_index"] = \
    train_reviews["review_tokenized"].apply(lambda x: [word_index[w] for w in x])
dev_reviews["review_index"] = \
    dev_reviews["review_tokenized"].apply(lambda x: [word_index[w] for w in x])

# The test split falls back to the "[UNK]" index for out-of-vocabulary tokens.
# The lookup goes through the dict (constant time) instead of testing
# membership in the `vocab` list, which scanned the whole vocabulary for every
# token. The dict keys are exactly the vocab entries, so the mapping is the same.
unk_index = word_index["[UNK]"]
test_reviews["review_index"] = \
    test_reviews["review_tokenized"].apply(lambda x: [word_index.get(w, unk_index) for w in x])

In [29]:
# Fixed number of token indices per review; sequences are truncated or padded
# to this length by the padding step below.
max_seq_len = 300


def padding(sentence, max_seq_len):
    """Return `sentence` as a list of exactly `max_seq_len` items.

    Sequences that already have `max_seq_len` or more items are cut down to
    their first `max_seq_len` items. Shorter ones are extended on the right
    with the index stored under "[PAD]" in the global `word_index`.
    A new list is returned; the input list is not modified.
    """
    missing = max_seq_len - len(sentence)
    if missing <= 0:
        # Long enough already: keep only the leading max_seq_len items.
        return sentence[:max_seq_len]
    return sentence + [word_index["[PAD]"]] * missing


# Bring the index sequence of every review, in every split, to max_seq_len items.
for reviews in (train_reviews, dev_reviews, test_reviews):
    reviews["review_index"] = reviews["review_index"].apply(
        lambda seq: padding(seq, max_seq_len)
    )

# Save Numpy Dataset

In [30]:
# Names of the label columns; their order defines the column order of y_*.
labels = ["Price", "Location", "Service", "Ambience", "Food"]

# One label matrix per split, taken from the label columns above.
y_train, y_dev, y_test = [
    reviews[labels].values
    for reviews in (train_reviews, dev_reviews, test_reviews)
]

In [31]:
# Stack the per-review index lists of each split into a NumPy array.
X_train, X_dev, X_test = [
    np.array(reviews["review_index"].tolist())
    for reviews in (train_reviews, dev_reviews, test_reviews)
]

In [32]:
# Write every feature / label array to disk with np.save, which appends the
# ".npy" extension to the given path.
for target_path, array in (
    ("MultiLabel_Classification/data/X_train", X_train),
    ("MultiLabel_Classification/data/X_dev", X_dev),
    ("MultiLabel_Classification/data/X_test", X_test),
    ("MultiLabel_Classification/data/y_train", y_train),
    ("MultiLabel_Classification/data/y_dev", y_dev),
    ("MultiLabel_Classification/data/y_test", y_test),
):
    np.save(target_path, array)

# Run LSAN Model

In [4]:
# Run the classification script as a shell command, passing the checkpoint
# path and the prediction-result path for the LSAN run.
# NOTE(review): apart from the two output paths, this command is identical to
# the one in the "Run CNN-LSAN Model" section, so the model variant is
# presumably selected inside classification.py (edited between runs?) -
# confirm, since that would be hidden state the notebook does not record.
!python MultiLabel_Classification/classification.py \
    --save_path=MultiLabel_Classification/checkpoints/checkpoint_LSAN.pth \
    --predict_result_path=MultiLabel_Classification/result/lsan_result.json

loading data...
load done

Running EPOCH 1 lr=0.00100

100% 287/287 [04:08<00:00,  1.16it/s]
epoch  1 train end : avg_loss = 0.0035
precision: 0.8332, recall: 0.8848, f1-score: 0.8441

100% 39/39 [00:10<00:00,  3.87it/s]
epoch  1 test end : avg_loss = 0.0021
precision: 0.9223, recall: 0.8985, f1-score: 0.9089


Running EPOCH 2 lr=0.00070

100% 287/287 [04:08<00:00,  1.16it/s]
epoch  2 train end : avg_loss = 0.0019
precision: 0.9207, recall: 0.9173, f1-score: 0.9180

100% 39/39 [00:10<00:00,  3.86it/s]
epoch  2 test end : avg_loss = 0.0018
precision: 0.9252, recall: 0.9222, f1-score: 0.9231


Running EPOCH 3 lr=0.00049

100% 287/287 [04:08<00:00,  1.16it/s]
epoch  3 train end : avg_loss = 0.0017
precision: 0.9284, recall: 0.9280, f1-score: 0.9274

100% 39/39 [00:10<00:00,  3.86it/s]
epoch  3 test end : avg_loss = 0.0017
precision: 0.9342, recall: 0.9199, f1-score: 0.9264


Running EPOCH 4 lr=0.00034

100% 287/287 [04:08<00:00,  1.16it/s]
epoch  4 train end : avg_loss = 0.0015
precision:

# Run CNN-LSAN Model

In [1]:
import os
# The execution count restarts at 1 here, so this cell was run in a fresh
# kernel; it re-establishes the working directory before running the script.
os.chdir("/content/drive/MyDrive/Notebooks/PAPER_EXPERIMENT")


# Run the classification script as a shell command, passing the checkpoint
# path and the prediction-result path for the CNN-LSAN run.
# NOTE(review): apart from the two output paths, this command is identical to
# the one in the "Run LSAN Model" section, so the model variant is presumably
# selected inside classification.py - confirm how the CNN variant is enabled.
!python MultiLabel_Classification/classification.py \
    --save_path=MultiLabel_Classification/checkpoints/checkpoint_LSAN_CNN.pth \
    --predict_result_path=MultiLabel_Classification/result/lsan_cnn_result.json

loading data...
load done

Running EPOCH 1 lr=0.00100

100% 287/287 [05:55<00:00,  1.24s/it]
epoch  1 train end : avg_loss = 0.0024
precision: 0.9022, recall: 0.8894, f1-score: 0.8920

100% 39/39 [00:14<00:00,  2.65it/s]
epoch  1 test end : avg_loss = 0.0018
precision: 0.9291, recall: 0.9116, f1-score: 0.9193


Running EPOCH 2 lr=0.00070

100% 287/287 [05:59<00:00,  1.25s/it]
epoch  2 train end : avg_loss = 0.0018
precision: 0.9271, recall: 0.9122, f1-score: 0.9180

100% 39/39 [00:14<00:00,  2.67it/s]
epoch  2 test end : avg_loss = 0.0017
precision: 0.9424, recall: 0.9022, f1-score: 0.9209


Running EPOCH 3 lr=0.00049

100% 287/287 [05:59<00:00,  1.25s/it]
epoch  3 train end : avg_loss = 0.0016
precision: 0.9352, recall: 0.9215, f1-score: 0.9270

100% 39/39 [00:14<00:00,  2.68it/s]
epoch  3 test end : avg_loss = 0.0017
precision: 0.9429, recall: 0.9029, f1-score: 0.9211


Running EPOCH 4 lr=0.00034

100% 287/287 [05:59<00:00,  1.25s/it]
epoch  4 train end : avg_loss = 0.0014
precision:

---

# Prediction

In [2]:
import os
# Re-establish the working directory so the relative paths below resolve.
os.chdir("/content/drive/MyDrive/Notebooks/PAPER_EXPERIMENT")


# Run the prediction script as a shell command against the CNN-LSAN checkpoint.
# NOTE(review): presumably this is what writes the y_test_pred.npy file loaded
# in the next section - confirm against predict.py.
!python MultiLabel_Classification/predict.py \
    --model_path=MultiLabel_Classification/checkpoints/checkpoint_LSAN_CNN.pth

100% 288/288 [01:33<00:00,  3.06it/s]
100% 39/39 [00:12<00:00,  3.11it/s]
100% 39/39 [00:12<00:00,  3.07it/s]


# Check the Prediction Result

In [3]:
import numpy as np
from sklearn.metrics import f1_score

In [None]:
# This section runs in a fresh kernel whose visible cells only import os,
# numpy and sklearn, so pandas is imported here; otherwise `pd` is undefined.
import pandas as pd

# Raw test split (review text plus label columns).
x_test = pd.read_csv("MultiLabel_Classification/data/test.csv")
# Predicted labels for the test split.
# NOTE(review): presumably written by predict.py - confirm.
y_test_pred = np.load("MultiLabel_Classification/data/y_test_pred.npy")
# Ground-truth label matrix saved by the "Save Numpy Dataset" section.
y_test = np.load("MultiLabel_Classification/data/y_test.npy")

In [29]:
# Number of label positions per test row where prediction and truth disagree.
mismatches_per_row = (y_test_pred != y_test).sum(axis=1)

# Row positions predicted entirely correctly / with at least one wrong label.
right_idx = np.flatnonzero(mismatches_per_row == 0)
wrong_idx = np.flatnonzero(mismatches_per_row > 0)

In [33]:
# Show one randomly chosen misclassified test review with its true and
# predicted labels.
# NOTE: no random seed is set, so each run shows a different sample.
idx = np.random.randint(0, len(wrong_idx))
sample_row = wrong_idx[idx]
print(idx)
print(x_test.iloc[sample_row]["review"])
# Read the ground truth from y_test, the array the wrong rows were computed
# against, instead of slicing the CSV row by column position (iloc[2:]).
# Positional slicing depends on the CSV column layout and could show the
# labels in a different order than the prediction vector.
print("True Labels: ", y_test[sample_row])
print("Predited Labels: ", y_test_pred[sample_row])

319
心爱的宝贝生日，做妈妈的自己早早就物色蛋糕店了，挑来挑去，好不容易挑了这家金之信蛋糕连锁店（墩和），看网上的评价反映不错，提前一天下单，下好单就打电话过去预约第二天自提。昨天下午四点钟左右，就乘坐464路直接到墩和总站下车，位置在电话里提醒挺好的，挺方便，就在附近208车站附近，位置很显眼，不一会儿就看到了，进入店内就在柜台看到自己的下单定的蛋糕，当时一看蛋糕外形很漂亮，有点小错误，就是听名字谐音写错了，不过服务员很快就拿到制作蛋糕的房间，重新用新的朱古力牌写名字，嗯，字写得很漂亮！很愉快的自提蛋糕过程，提完蛋糕就直接出门旁边坐464路回家了。赞一个！到了晚上，特别邀请了很多小伙伴们过来吃蛋糕，蛋糕的外形很漂亮，个个小伙伴都很喜欢，个个迫不及待地想吹蜡烛和切蛋糕了，哈哈，气氛很好，特别开心，终于许完愿吹了蜡烛切好蛋糕，小伙伴们终于如愿以偿地吃到蛋糕了，“好吃”“我还要一块”“还要”，看看，一不会儿，一个大蛋糕地就剩下一小块了，不错，过了一个很开心的生日，祝愿小宝贝健康成长，天天开心！下次还再来尝这里的蛋糕吧。
True Labels:  [0 1 0 0 1]
Predited Labels:  [0 1 1 0 1]
