## Dataset

In [46]:
import pandas as pd

train_df = pd.read_csv('../data/train_imdb_reviews.csv')
test_df = pd.read_csv('../data/test_imdb_reviews.csv')

In [47]:
# This cell is only for experiments of balancing dataset sentiments

positive_count = train_df[train_df["sentiment"] == 1].shape[0]
negative_count = train_df[train_df["sentiment"] == 0].shape[0]
print("Total reviews:", train_df.shape[0])
print("Positive reviews:", positive_count)
print("Negative reviews:", negative_count)
print("ratio:", positive_count/negative_count)

print("-------------------------")

positive = train_df[train_df["sentiment"] == 1][:negative_count]
negative = train_df[train_df["sentiment"] == 0]
print("Positive reviews:", positive.shape[0])
print("Negative reviews:", negative.shape[0])
print("ratio:", positive.shape[0]/negative.shape[0])

train_df = pd.concat([positive, negative])

Total reviews: 41669
Positive reviews: 27295
Negative reviews: 14374
ratio: 1.8989147071100598
-------------------------
Positive reviews: 14374
Negative reviews: 14374
ratio: 1.0


In [48]:
train_df['data_type'] = 'train'
test_df['data_type'] = 'test'

all_df = pd.concat([train_df, test_df], axis=0)

In [49]:
all_data = all_df.reset_index(drop=True)

print(len(all_data))
all_data.head()

33378


Unnamed: 0,movie,score,title,review,sentiment,data_type
0,從前，有個好萊塢,10.0,tarantinos best,never wanted end said loved ending made weep l...,1,train
1,不可能的任務：致命清算 第一章,7.0,theres nothing else see,first ive got say im huge fan franchise saw mo...,1,train
2,之前的我們,8.0,irony poor connection talked facetime,past lives first great doomed love story audie...,1,train
3,從前，有個好萊塢,8.0,tarantinos love letter golden age,upon time hollywood perfect movie awesome one ...,1,train
4,旺卡,9.0,must see keep tradition alive,enjoyed every bit movie acting especially stor...,1,train


In [51]:
from gensim.models.doc2vec import TaggedDocument
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import strip_tags
from nltk import word_tokenize

def tokenize(doc):
    return simple_preprocess(strip_tags(doc), deacc=True, min_len=2, max_len=15)

all_data['tagged_docs'] = all_data.apply(lambda r: TaggedDocument(tokenize(r['review']), [str(r.name)]), axis=1)
all_data['doc_key'] = all_data.index.astype(str)

In [52]:
all_data.head()

Unnamed: 0,movie,score,title,review,sentiment,data_type,tagged_docs,doc_key
0,從前，有個好萊塢,10.0,tarantinos best,never wanted end said loved ending made weep l...,1,train,"([never, wanted, end, said, loved, ending, mad...",0
1,不可能的任務：致命清算 第一章,7.0,theres nothing else see,first ive got say im huge fan franchise saw mo...,1,train,"([first, ive, got, say, im, huge, fan, franchi...",1
2,之前的我們,8.0,irony poor connection talked facetime,past lives first great doomed love story audie...,1,train,"([past, lives, first, great, doomed, love, sto...",2
3,從前，有個好萊塢,8.0,tarantinos love letter golden age,upon time hollywood perfect movie awesome one ...,1,train,"([upon, time, hollywood, perfect, movie, aweso...",3
4,旺卡,9.0,must see keep tradition alive,enjoyed every bit movie acting especially stor...,1,train,"([enjoyed, every, bit, movie, acting, especial...",4


In order to do this unsupervised classification task, I need to define some of the keywords by myself.\
The detailed method and reason to do this can be referred to its original paper: Evaluating Unsupervised Text Classification: Zero-shot and Similarity-based Approaches (https://arxiv.org/abs/2211.16285)

In [53]:
label_df = pd.read_csv('../data/labels.csv')
label_df['keywords'] = label_df['keywords'].apply(lambda x: x.split(' '))
label_df['num_keywords'] = label_df['keywords'].apply(lambda x: len(x))

print(label_df.head())

   class_id class_name                                           keywords  \
0         1   positive  [nice, masterpiece, beautiful, excellent, awes...   
1         2   negative  [horrible, bad, disgusting, terrible, awful, b...   

   num_keywords  
0             8  
1             8  


In [54]:
label_df['class_name'] = label_df['class_name'].astype('str')

## Unsupervised Model

In [55]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [56]:
from lbl2vec import Lbl2TransformerVec, Lbl2Vec
from transformers import AutoModel
from tqdm import tqdm
from gensim.models import Doc2Vec

lblModel = Lbl2Vec(keywords_list=list(label_df['keywords']), tagged_documents=all_data['tagged_docs'][all_data['data_type']=='train'], label_names=list(label_df['class_name']), epochs=10, min_count=20, verbose=True)

# baseModel = AutoModel.from_pretrained('princeton-nlp/unsup-simcse-roberta-base').to(device)
# lblModel = Lbl2TransformerVec(transformer_model=baseModel, keywords_list=list(label_df['keywords']), documents=all_data[all_data['data_type']=='train']['review'], device=device)

In [57]:
lblModel.fit()

2024-03-15 00:37:51,748 - Lbl2Vec - INFO - Train document and word embeddings
2024-03-15 00:37:51,748 - Lbl2Vec - INFO - Train document and word embeddings
2024-03-15 00:37:51,748 - Lbl2Vec - INFO - Train document and word embeddings
2024-03-15 00:38:26,655 - Lbl2Vec - INFO - Train label embeddings
2024-03-15 00:38:26,655 - Lbl2Vec - INFO - Train label embeddings
2024-03-15 00:38:26,655 - Lbl2Vec - INFO - Train label embeddings


In [58]:
lblModel.save('models/lbl2vec_balance_model')

In [59]:
model_docs_lbl_similarities = lblModel.predict_model_docs()

2024-03-15 00:38:27,418 - Lbl2Vec - INFO - Get document embeddings from model
2024-03-15 00:38:27,418 - Lbl2Vec - INFO - Get document embeddings from model
2024-03-15 00:38:27,418 - Lbl2Vec - INFO - Get document embeddings from model
2024-03-15 00:38:27,432 - Lbl2Vec - INFO - Calculate document<->label similarities
2024-03-15 00:38:27,432 - Lbl2Vec - INFO - Calculate document<->label similarities
2024-03-15 00:38:27,432 - Lbl2Vec - INFO - Calculate document<->label similarities


In [60]:
model_docs_lbl_similarities.head()

Unnamed: 0,doc_key,most_similar_label,highest_similarity_score,positive,negative
0,0,positive,0.626312,0.626312,0.625776
1,1,negative,0.317811,0.208327,0.317811
2,2,positive,0.311061,0.311061,0.236579
3,3,positive,0.223695,0.223695,0.187032
4,4,positive,0.466779,0.466779,0.211912


In [61]:
y_train = all_data['sentiment']
y_pred = model_docs_lbl_similarities['most_similar_label']

print(y_pred[:5])

0    positive
1    negative
2    positive
3    positive
4    positive
Name: most_similar_label, dtype: object


In [62]:
for i in range(len(y_pred)):
    if y_pred[i] == 'positive':
        y_pred[i] = 1
    else:
        y_pred[i] = 0

print(y_pred[:5])

0    1
1    0
2    1
3    1
4    1
Name: most_similar_label, dtype: object


In [63]:
tp = 0
tn = 0
fp = 0
fn = 0

print(len(y_pred), len(y_train))

for i in range(len(y_pred)):
    if y_pred[i] == 1 and y_train.iloc[i] == 1:
        tp += 1
    elif y_pred[i] == 0 and y_train.iloc[i] == 0:
        tn += 1
    elif y_pred[i] == 1 and y_train.iloc[i] == 0:
        fp += 1
    elif y_pred[i] == 0 and y_train.iloc[i] == 1:
        fn += 1

print(tp, tn, fp, fn)
print(tp + tn + fp + fn)

28748 33378
11151 8245 6129 3223
28748


In [64]:
def auc(tp, fp, tn, fn):
    return (tp / (tp + fn) + tn / (tn + fp)) / 2

print("true positives: ", tp)
print("true negatives: ", tn)
print("false positives: ", fp)
print("false negatives: ", fn)

acc = (tp + tn) / (tp + tn + fp + fn)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1 = 2 * (precision * recall) / (precision + recall)

print("Accuracy: ", acc)
print("Precision: ", precision)
print("Recall: ", recall)
print("F1: ", f1)
print("AUC: ", auc(tp, fp, tn, fn))

true positives:  11151
true negatives:  8245
false positives:  6129
false negatives:  3223
Accuracy:  0.6746904132461389
Precision:  0.6453125
Recall:  0.775775706136079
F1:  0.7045555064130916
AUC:  0.6746904132461389


## Evaluate on test dataset

In [1]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [4]:
from lbl2vec import Lbl2TransformerVec
from tqdm import tqdm

model = Lbl2TransformerVec.load('lbl2vec_model')

In [65]:
import pandas as pd

test_df = pd.read_csv('../data/test_imdb_reviews.csv')

X_test = test_df['review'].astype(str)
y_test = test_df['sentiment']

In [66]:
new_docs_lbl_similarities = lblModel.predict_new_docs(tagged_docs=all_data['tagged_docs'][all_data['data_type']=='test'])

2024-03-15 00:39:03,176 - Lbl2Vec - INFO - Calculate document embeddings
2024-03-15 00:39:03,176 - Lbl2Vec - INFO - Calculate document embeddings
2024-03-15 00:39:03,176 - Lbl2Vec - INFO - Calculate document embeddings


2024-03-15 00:39:06,331 - Lbl2Vec - INFO - Calculate document<->label similarities
2024-03-15 00:39:06,331 - Lbl2Vec - INFO - Calculate document<->label similarities
2024-03-15 00:39:06,331 - Lbl2Vec - INFO - Calculate document<->label similarities


In [68]:
new_docs_lbl_similarities.head()

Unnamed: 0,doc_key,most_similar_label,highest_similarity_score,positive,negative
0,28748,positive,0.421605,0.421605,0.164033
1,28749,positive,0.576993,0.576993,0.394001
2,28750,positive,0.239823,0.239823,0.183607
3,28751,positive,0.622303,0.622303,0.499031
4,28752,positive,0.460922,0.460922,0.405098


In [69]:
y_test_pred = new_docs_lbl_similarities['most_similar_label']

for i in range(len(y_test_pred)):
    if y_test_pred[i] == 'positive':
        y_test_pred[i] = 1
    else:
        y_test_pred[i] = 0

print(y_test_pred[:5])

0    1
1    1
2    1
3    1
4    1
Name: most_similar_label, dtype: object


In [70]:
test_tp = 0
test_tn = 0
test_fp = 0
test_fn = 0

for i in range(len(y_test_pred)):
    if y_test_pred[i] == 1 and y_test.iloc[i] == 1:
        test_tp += 1
    elif y_test_pred[i] == 0 and y_test.iloc[i] == 0:
        test_tn += 1
    elif y_test_pred[i] == 1 and y_test.iloc[i] == 0:
        test_fp += 1
    elif y_test_pred[i] == 0 and y_test.iloc[i] == 1:
        test_fn += 1

print("true positives: ", test_tp)
print("true negatives: ", test_tn)
print("false positives: ", test_fp)
print("false negatives: ", test_fn)

test_acc = (test_tp + test_tn) / (test_tp + test_tn + test_fp + test_fn)
test_precision = test_tp / (test_tp + test_fp)
test_recall = test_tp / (test_tp + test_fn)
test_f1 = 2 * (test_precision * test_recall) / (test_precision + test_recall)
test_auc = auc(test_tp, test_fp, test_tn, test_fn)

print("Accuracy: ", test_acc)
print("Precision: ", test_precision)
print("Recall: ", test_recall)
print("F1: ", test_f1)
print("AUC: ", test_auc)

true positives:  2447
true negatives:  936
false positives:  627
false negatives:  620
Accuracy:  0.7306695464362851
Precision:  0.7960312296681847
Recall:  0.797848059993479
F1:  0.7969386093470119
AUC:  0.6983482142577759
