## Dataset

In [24]:
import pandas as pd

train_df = pd.read_csv('../data/train_imdb_reviews.csv')
test_df = pd.read_csv('../data/test_imdb_reviews.csv')

train_df['data_type'] = 'train'
test_df['data_type'] = 'test'

all_df = pd.concat([train_df, test_df], axis=0)

In [25]:
all_data = all_df.reset_index(drop=True)

print(len(all_data))
all_data.head()

46299


Unnamed: 0,movie,score,title,review,sentiment,data_type
0,沙丘,5.0,beautiful,scenes beautiful thats part liked movie movie ...,0,train
1,從前，有個好萊塢,10.0,tarantinos best,never wanted end said loved ending made weep l...,1,train
2,不可能的任務：致命清算 第一章,7.0,theres nothing else see,first ive got say im huge fan franchise saw mo...,1,train
3,拿破崙,1.0,sad slow epic,great anticipation went see napoleon twenty mi...,0,train
4,之前的我們,8.0,irony poor connection talked facetime,past lives first great doomed love story audie...,1,train


In [26]:
from gensim.models.doc2vec import TaggedDocument
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import strip_tags
from nltk import word_tokenize

def tokenize(doc):
    return simple_preprocess(strip_tags(doc), deacc=True, min_len=2, max_len=15)

all_data['tagged_docs'] = all_data.apply(lambda r: TaggedDocument(tokenize(str(r['title'])+' '+r['review']), [str(r.name)]), axis=1)
all_data['doc_key'] = all_data.index.astype(str)

In [27]:
all_data.head()

Unnamed: 0,movie,score,title,review,sentiment,data_type,tagged_docs,doc_key
0,沙丘,5.0,beautiful,scenes beautiful thats part liked movie movie ...,0,train,"([beautiful, scenes, beautiful, thats, part, l...",0
1,從前，有個好萊塢,10.0,tarantinos best,never wanted end said loved ending made weep l...,1,train,"([tarantinos, best, never, wanted, end, said, ...",1
2,不可能的任務：致命清算 第一章,7.0,theres nothing else see,first ive got say im huge fan franchise saw mo...,1,train,"([theres, nothing, else, see, first, ive, got,...",2
3,拿破崙,1.0,sad slow epic,great anticipation went see napoleon twenty mi...,0,train,"([sad, slow, epic, great, anticipation, went, ...",3
4,之前的我們,8.0,irony poor connection talked facetime,past lives first great doomed love story audie...,1,train,"([irony, poor, connection, talked, facetime, p...",4


In order to do this unsupervised classification task, I need to define some of the keywords by myself.\
The detailed method and reason to do this can be referred to its original paper: Evaluating Unsupervised Text Classification: Zero-shot and Similarity-based Approaches (https://arxiv.org/abs/2211.16285)

In [28]:
label_df = pd.read_csv('../data/labels.csv')
label_df['keywords'] = label_df['keywords'].apply(lambda x: x.split(' '))
label_df['num_keywords'] = label_df['keywords'].apply(lambda x: len(x))

print(label_df.head())

   class_id class_name                                           keywords  \
0         1   positive  [nice, masterpiece, beautiful, excellent, awes...   
1         2   negative  [horrible, bad, disgusting, terrible, awful, b...   

   num_keywords  
0             8  
1             8  


In [29]:
label_df['class_name'] = label_df['class_name'].astype('str')

## Unsupervised Model

In [9]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [30]:
from lbl2vec import Lbl2TransformerVec, Lbl2Vec
from transformers import AutoModel
from tqdm import tqdm
from gensim.models import Doc2Vec

lblModel = Lbl2Vec(keywords_list=list(label_df['keywords']), tagged_documents=all_data['tagged_docs'][all_data['data_type']=='train'], label_names=list(label_df['class_name']), epochs=10, min_count=20, verbose=True)

# baseModel = AutoModel.from_pretrained('princeton-nlp/unsup-simcse-roberta-base').to(device)
# lblModel = Lbl2TransformerVec(transformer_model=baseModel, keywords_list=list(label_df['keywords']), documents=all_data[all_data['data_type']=='train']['review'], device=device)

In [31]:
lblModel.fit()

2024-03-14 20:37:06,639 - Lbl2Vec - INFO - Train document and word embeddings
2024-03-14 20:37:06,639 - Lbl2Vec - INFO - Train document and word embeddings
2024-03-14 20:38:04,401 - Lbl2Vec - INFO - Train label embeddings
2024-03-14 20:38:04,401 - Lbl2Vec - INFO - Train label embeddings


In [32]:
lblModel.save('lbl2vec_titlereview_model')

In [33]:
model_docs_lbl_similarities = lblModel.predict_model_docs()

2024-03-14 20:38:05,601 - Lbl2Vec - INFO - Get document embeddings from model
2024-03-14 20:38:05,601 - Lbl2Vec - INFO - Get document embeddings from model
2024-03-14 20:38:05,634 - Lbl2Vec - INFO - Calculate document<->label similarities
2024-03-14 20:38:05,634 - Lbl2Vec - INFO - Calculate document<->label similarities


In [34]:
model_docs_lbl_similarities.head()

Unnamed: 0,doc_key,most_similar_label,highest_similarity_score,positive,negative
0,0,negative,0.510638,0.24138,0.510638
1,1,negative,0.529083,0.320638,0.529083
2,2,negative,0.26254,0.164078,0.26254
3,3,negative,0.285527,0.117318,0.285527
4,4,negative,0.172333,0.124646,0.172333


In [35]:
y_train = all_data['sentiment']
y_pred = model_docs_lbl_similarities['most_similar_label']

print(y_pred[:5])

0    negative
1    negative
2    negative
3    negative
4    negative
Name: most_similar_label, dtype: object


In [36]:
for i in range(len(y_pred)):
    if y_pred[i] == 'positive':
        y_pred[i] = 1
    else:
        y_pred[i] = 0

print(y_pred[:5])

0    0
1    0
2    0
3    0
4    0
Name: most_similar_label, dtype: object


In [37]:
tp = 0
tn = 0
fp = 0
fn = 0

print(len(y_pred), len(y_train))

for i in range(len(y_pred)):
    if y_pred[i] == 1 and y_train.iloc[i] == 1:
        tp += 1
    elif y_pred[i] == 0 and y_train.iloc[i] == 0:
        tn += 1
    elif y_pred[i] == 1 and y_train.iloc[i] == 0:
        fp += 1
    elif y_pred[i] == 0 and y_train.iloc[i] == 1:
        fn += 1

print(tp, tn, fp, fn)
print(tp + tn + fp + fn)

41669 46299
15012 13595 779 12283
41669


In [38]:
def auc(tp, fp, tn, fn):
    return (tp / (tp + fn) + tn / (tn + fp)) / 2

print("true positives: ", tp)
print("true negatives: ", tn)
print("false positives: ", fp)
print("false negatives: ", fn)

acc = (tp + tn) / (tp + tn + fp + fn)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1 = 2 * (precision * recall) / (precision + recall)

print("Accuracy: ", acc)
print("Precision: ", precision)
print("Recall: ", recall)
print("F1: ", f1)
print("AUC: ", auc(tp, fp, tn, fn))

true positives:  15012
true negatives:  13595
false positives:  779
false negatives:  12283
Accuracy:  0.6865295543449567
Precision:  0.9506681020834653
Recall:  0.5499908408133358
F1:  0.6968388803787774
AUC:  0.7478978831866874


## Evaluate on test dataset

In [1]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [4]:
from lbl2vec import Lbl2TransformerVec
from tqdm import tqdm

model = Lbl2TransformerVec.load('lbl2vec_model')

In [39]:
import pandas as pd

test_df = pd.read_csv('../data/test_imdb_reviews.csv')

X_test = test_df['title'].astype(str) + ' ' + test_df['review'].astype(str)
y_test = test_df['sentiment']

In [40]:
new_docs_lbl_similarities = lblModel.predict_new_docs(tagged_docs=all_data['tagged_docs'][all_data['data_type']=='test'])

2024-03-14 20:38:24,425 - Lbl2Vec - INFO - Calculate document embeddings
2024-03-14 20:38:24,425 - Lbl2Vec - INFO - Calculate document embeddings
2024-03-14 20:38:27,393 - Lbl2Vec - INFO - Calculate document<->label similarities
2024-03-14 20:38:27,393 - Lbl2Vec - INFO - Calculate document<->label similarities


In [41]:
new_docs_lbl_similarities.head()

Unnamed: 0,doc_key,most_similar_label,highest_similarity_score,positive,negative
0,41669,positive,0.35773,0.35773,0.155046
1,41670,positive,0.491405,0.491405,0.353151
2,41671,negative,0.2076,0.185149,0.2076
3,41672,positive,0.412254,0.412254,0.308475
4,41673,positive,0.363429,0.363429,0.350282


In [42]:
y_test_pred = new_docs_lbl_similarities['most_similar_label']

for i in range(len(y_test_pred)):
    if y_test_pred[i] == 'positive':
        y_test_pred[i] = 1
    else:
        y_test_pred[i] = 0

print(y_test_pred[:5])

0    1
1    1
2    0
3    1
4    1
Name: most_similar_label, dtype: object


In [43]:
test_tp = 0
test_tn = 0
test_fp = 0
test_fn = 0

for i in range(len(y_test_pred)):
    if y_test_pred[i] == 1 and y_test.iloc[i] == 1:
        test_tp += 1
    elif y_test_pred[i] == 0 and y_test.iloc[i] == 0:
        test_tn += 1
    elif y_test_pred[i] == 1 and y_test.iloc[i] == 0:
        test_fp += 1
    elif y_test_pred[i] == 0 and y_test.iloc[i] == 1:
        test_fn += 1

print("true positives: ", test_tp)
print("true negatives: ", test_tn)
print("false positives: ", test_fp)
print("false negatives: ", test_fn)

test_acc = (test_tp + test_tn) / (test_tp + test_tn + test_fp + test_fn)
test_precision = test_tp / (test_tp + test_fp)
test_recall = test_tp / (test_tp + test_fn)
test_f1 = 2 * (test_precision * test_recall) / (test_precision + test_recall)
test_auc = auc(test_tp, test_fp, test_tn, test_fn)

print("Accuracy: ", test_acc)
print("Precision: ", test_precision)
print("Recall: ", test_recall)
print("F1: ", test_f1)
print("AUC: ", test_auc)

true positives:  1828
true negatives:  1446
false positives:  117
false negatives:  1239
Accuracy:  0.7071274298056156
Precision:  0.9398457583547558
Recall:  0.5960221715030974
F1:  0.7294493216280925
AUC:  0.7605830627189192
