In [329]:
from model import CNN
from LSTM import RNN
from torch.utils.data import DataLoader
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from torch.nn import functional as F
import numpy as np
import math
import time
import torch
import os
import copy
def train_func(sub_train_, model, mode, optimizer, scheduler, t):
    # Train the model
    train_loss = 0
    train_acc = 0
    pseudo_aspect_train_acc = 0
    aspect_train_acc = 0
    data = DataLoader(sub_train_, batch_size=batch_size, shuffle=True,
                      collate_fn=generate_batch)
    model.train()
    for i, (text, cls, gt1,lengths) in enumerate(data):
        # print(f'size of text: {text.size()}')
        optimizer.zero_grad()
        if mode !='pretrain':
            cls = target_score(cls, t)
        text, cls, gt1 = text.to(device), cls.to(device), gt1.to(device)
        #output = model(text)
        output = model(text, lengths)
        # loss = criterion(output, cls)
        loss = kl_criterion(torch.log(F.softmax(output, dim=-1)), cls)
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        pseudo_aspect_train_acc += (output.argmax(1) == cls.argmax(1)).sum().item()
        aspect_train_acc += (output.argmax(1) == gt1).sum().item()

            
    # Adjust the learning rate
    scheduler.step()

    return train_loss / len(sub_train_), aspect_train_acc / len(sub_train_), pseudo_aspect_train_acc / len(sub_train_)

def test(data_, model, mode):
    loss = 0
    acc = 0
    pseudo_aspect_test_acc = 0
    aspect_test_acc = 0
    data = DataLoader(data_, batch_size=128, collate_fn=generate_batch)
    pred_distribution = []
    model.eval()
    gt=[]
    for text, cls, gt1,lengths in data:
        #print(text.shape,lengths)
        text, cls, gt1 = text.to(device), cls.to(device), gt1.to(device)
        with torch.no_grad():
            #output = model(text)
            output = model(text, lengths)
            cls = target_score(cls)
            loss = kl_criterion(torch.log(F.softmax(output, dim=-1)), cls)
            pseudo_aspect_test_acc += (output.argmax(1) == cls.argmax(1) ).sum().item()
            aspect_test_acc += (output.argmax(1)  == gt1).sum().item()
            #pred_distribution.append(torch.Tensor([softmax(o) for o in output]))
            pred_distribution.append(output)
            gt.append(gt1)
    pred_distribution = torch.cat(pred_distribution, dim=0)
    pred = pred_distribution.argmax(dim=1)
    gt = torch.cat(gt, dim=0) 
    
    p = precision_score(gt, pred,average='macro')
    r = recall_score(gt, pred, average='macro')
    f1_mac = f1_score(gt, pred, average='macro')
    p_w = precision_score(gt, pred,average='weighted')
    r_w = recall_score(gt, pred, average='weighted')
    f1_w = f1_score(gt, pred, average='weighted')
    print('mac {:.5f} {:.5f} {:.5f}'.format(p, r, f1_mac))
    print('weighted {:.5f} {:.5f} {:.5f}'.format(p_w, r_w, f1_w))
            

    return loss / len(data_), aspect_test_acc / len(data_), pseudo_aspect_test_acc / len(data_), pred_distribution


def generate_batch(batch):
    label = torch.cat([entry[1].unsqueeze(0) for entry in batch])
    text = []
    lengths = []
    for entry in batch:
        length = len(entry[0])
        lengths.append(length)
        tmp = F.pad(torch.tensor(entry[0]), (0,100-len(entry[0])), 'constant', 0).unsqueeze(0)
        text.append(tmp)
        for i in range(100):
            if tmp[0][i] >= len(wv):
                print(tmp[i])

    gt1 = torch.from_numpy(np.array([entry[2] for entry in batch]))
    
    text = torch.cat(text)

    return text, label, gt1,lengths

def read_vec(file):
    with open(file) as f:
        embs = f.readlines()
    wv = dict()
    wv['unk']=np.zeros(200)
    for line in embs[1:]:
        line = line.strip().split()
        word = line[0]
        vec = np.array([float(x) for x in line[1:]])
        wv[word] = vec
    return wv

def target_score(logits, t=1.2):
    preds = torch.nn.Softmax(dim=-1)(logits)  # batch * class
    weight = preds**t #/ torch.sum(preds, dim=0)
    return (weight.t() / torch.sum(weight, dim=1)).t()

def softmax(x):
    summ = sum(math.e**(xi) for xi in x if xi>0)
    y = [math.e**(xi)/summ if xi >0 else 0 for xi in x ]
    return y

vec_file = 'wv113.txt' #"restaurant.200d.txt" #
wv = read_vec(vec_file)
word2idx = {w: i for i,w in enumerate(wv)}
idx2word = {i: w for i,w in enumerate(wv)}
aspect_kw = {'location': ['street', 'block', 'avenue', 'river', 'convenient'],
             'drinks': ['drinks', 'beverage', 'wines', 'margarita', 'sake'],
             'food': ['food', 'spicy', 'sushi', 'pizza', 'tasty'],
             'ambience': ['romantic', 'atmosphere', 'room', 'seating', 'small'],
             'service': ['tips', 'manager', 'wait', 'waitress', 'servers'],
             }

In [330]:
learning_rate = 0.005
batch_size = 16
thres = 0.0
output_size = len(aspect_kw)
embedding_length = 200
N_EPOCHS_PRE = 50
N_EPOCHS = 200
self_training = True
label_file = 'pseudo4_10'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

min_valid_loss = float('inf')

with open(os.path.join(label_file)) as f:
    test_cont = f.readlines()
    asp_gt, asp_labels, docs, scores = list(), list(), list() ,list()
    for line in test_cont:
        asp_gti, pseudo, confi0, confi1, confi2, confi3, confi4, doc = line.split('\t')
        asp_gt.append(int(asp_gti))
        asp_labels.append(int(pseudo))
        scores.append([float(confi0),float(confi1),float(confi2),float(confi3),float(confi4)])
        docs.append(doc)

total_dataset_aspect = []
train_dataset_aspect = []
high_conf = []
for i, t in enumerate(docs):
    s_index = [word2idx[w] if w in wv else 0 for w in t.split(' ')]
    total_dataset_aspect.append([s_index, torch.tensor(scores[i]),asp_gt[i]])
    if max(torch.tensor(scores[i]))>thres:
        high_conf.append(i)
        train_dataset_aspect.append([s_index, torch.tensor(scores[i]),asp_gt[i]])
print(len(total_dataset_aspect))
print(train_dataset_aspect[0],len(train_dataset_aspect))


aspect_embedding = torch.zeros((len(wv), embedding_length))
for i in range(len(wv)):
    aspect_embedding[i] = torch.tensor(wv[idx2word[i]])
    
#pretrain_model = CNN(batch_size, output_size, 1, 20, [2,3,4,5], 1, 0, 0.0, len(wv), embedding_length, aspect_embedding)
pretrain_model = RNN(len(wv), embedding_length, embedding_length//2, output_size, 4, True,0.2, aspect_embedding)

criterion = torch.nn.CrossEntropyLoss().to(device)
kl_criterion = torch.nn.KLDivLoss()
pretrain_optimizer = torch.optim.Adam(pretrain_model.parameters(), lr=learning_rate)
pretrain_scheduler = torch.optim.lr_scheduler.StepLR(pretrain_optimizer, 1, gamma=0.9)
pretrain_model.to(device)

643
[[713, 0], tensor([0.1867, 0.1947, 0.2348, 0.1882, 0.1957]), 2] 642


RNN(
  (embedding): Embedding(8633, 200, padding_idx=0)
  (rnn): LSTM(200, 100, num_layers=4, dropout=0.2, bidirectional=True)
  (fc): Linear(in_features=200, out_features=5, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [314]:
update_confi = None
total_dist = None
for epoch in range(40):
    start_time = time.time()
    train_loss, aspect_train_acc,  pseudo_aspect_train_acc = \
    train_func(train_dataset_aspect, pretrain_model, 'aspect', pretrain_optimizer, pretrain_scheduler, t=1)
    #valid_loss, aspect_test_acc, pseudo_aspect_test_acc, _ = test(train_dataset_aspect, pretrain_model, 'aspect')
    _, aspect_test_acc_total, pseudo_aspect_test_acc_total, total_dist = test(total_dataset_aspect, pretrain_model, 'aspect')
    print('Aspect Model {:.7f}'.format(train_loss))# aspect_test_acc, pseudo_aspect_test_acc)
    print('Total validation {:.3f}, total {:.3f}'.format(aspect_test_acc_total, pseudo_aspect_test_acc_total))
    #total_dist = target_score(total_dist)
    update_confi, choice = torch.max(total_dist,axis=1)
    if epoch >0:
        label_change = (1 - torch.sum(last_choice == choice).item() / len(choice))*100
        print(epoch, label_change)
        #if aspect_test_acc_total>0.82:
        #    break
    last_choice = choice
    

# pretrain thres 0.0 lr 0.004
# Aspect Model 0.0012521617059418903 0.6635514018691588 0.7227414330218068
# Total validation 0.6640746500777605 0.7216174183514774
# all
# 0.7573872472783826


# pretrain thres 0.0 lr 0.005 t =1
# 0.7527216174183 
# 0.8118 t=1.4

# 110.txt
# pretrain lr 0.005 t=50 pretrain_model = RNN(len(wv), embedding_length, embedding_length, output_size, 4, True,0.2, aspect_embedding)
# 0.8709175738724728


mac 0.66044 0.62292 0.63524
weighted 0.84791 0.84603 0.84422
Aspect Model 0.0000002
Total validation 0.846, total 0.953
mac 0.67497 0.62238 0.64227
weighted 0.84618 0.84603 0.84369
Aspect Model 0.0000002
Total validation 0.846, total 0.963
1 2.488335925349927
mac 0.72558 0.60260 0.64007
weighted 0.84631 0.84137 0.83429
Aspect Model 0.0000002
Total validation 0.841, total 0.936
2 5.44323483670296
mac 0.71391 0.60451 0.64371
weighted 0.84525 0.84292 0.83755
Aspect Model 0.0000002
Total validation 0.843, total 0.949
3 3.732503888024885
mac 0.70662 0.63478 0.66473
weighted 0.84784 0.85070 0.84731
Aspect Model 0.0000001
Total validation 0.851, total 0.958
4 3.8880248833592534


KeyboardInterrupt: 

In [315]:
pretrain_choice = choice


In [316]:
aspect_model = copy.deepcopy(pretrain_model)
last_choice = pretrain_choice
aspect_lr = 0.0005
thres = 0.7
thres_num = 642
aspect_optimizer = torch.optim.Adam(aspect_model.parameters(), lr=aspect_lr)
aspect_scheduler = torch.optim.lr_scheduler.StepLR(aspect_optimizer, 1, gamma=0.9)
new_update_confi = update_confi
update_index = torch.argsort(new_update_confi, descending=True)[:thres_num]
for _ in range(50):
    sub_dataset_aspect = []
    '''
    update_index =[i for i in range(len(new_update_confi)) if new_update_confi[i]>thres and new_update_confi[i]<thres+0.10]
    print(len(update_index))
    for i, t in enumerate(docs):
        s_index = [word2idx[w] if w in wv else 0 for w in t.split(' ')]
        if i in update_index:# and i not in high_conf:
            sub_dataset_aspect.append([s_index, total_dist[i],asp_gt[i]])
    '''
    # reorder
    for i in update_index:
        t = docs[i]
        s_index = [word2idx[w] if w in wv else 0 for w in t.split(' ')]
        sub_dataset_aspect.append([s_index, total_dist[i],asp_gt[i]])
    print(len(sub_dataset_aspect))    
    train_loss, aspect_train_acc,  pseudo_aspect_train_acc = train_func(sub_dataset_aspect, aspect_model, 'aspect', aspect_optimizer, aspect_scheduler,t=1.4)
    #valid_loss, aspect_test_acc, pseudo_aspect_test_acc, _ = test(sub_dataset_aspect, aspect_model, 'aspect')
    _, aspect_test_acc_total, pseudo_aspect_test_acc_total, total_dist = test(total_dataset_aspect, aspect_model, 'aspect')
    #print('Aspect Model', train_loss, aspect_test_acc, pseudo_aspect_test_acc)
    print('Total validation {:.5f}, label acc {:.5f}'.format(aspect_test_acc_total, pseudo_aspect_test_acc_total))
    new_update_confi, choice = torch.max(total_dist,axis=1)
    label_change = (1 - torch.sum(last_choice == choice).item() / len(choice))*100
    print(label_change)
    if label_change< 1.0:
        break
    last_choice = choice

642
mac 0.70233 0.62243 0.65221
weighted 0.84992 0.85381 0.84955
Total validation 0.85381, label acc 0.94090
2.799377916018664
642
mac 0.74087 0.62972 0.66172
weighted 0.84265 0.84292 0.83795
Total validation 0.84292, label acc 0.94557
6.687402799377917
642
mac 0.71214 0.63495 0.65665
weighted 0.84602 0.84603 0.84201
Total validation 0.84603, label acc 0.93468
3.4214618973561484
642
mac 0.70764 0.61857 0.64399
weighted 0.84064 0.84137 0.83644
Total validation 0.84137, label acc 0.93779
2.488335925349927
642
mac 0.75028 0.62168 0.64989
weighted 0.85145 0.84914 0.84351
Total validation 0.84914, label acc 0.93313
2.799377916018664
642
mac 0.70975 0.61463 0.63939
weighted 0.84497 0.84137 0.83496
Total validation 0.84137, label acc 0.92379
4.043545878693622
642
mac 0.76748 0.62619 0.66361
weighted 0.85213 0.84914 0.84273
Total validation 0.84914, label acc 0.92846
1.7107309486780742
642
mac 0.77796 0.63917 0.67883
weighted 0.85981 0.85848 0.85290
Total validation 0.85848, label acc 0.93935


KeyboardInterrupt: 

In [158]:
torch.save(pretrain_model,'skip113.pt')


In [263]:
top_index = torch.argsort(new_update_confi, descending=True)[:10]
print(new_update_confi[:10])
print(top_index)

tensor([0.9044, 0.9669, 0.5842, 0.3265, 0.5674, 0.3346, 0.3634, 0.3765, 0.5489,
        0.7045])
tensor([471, 274, 445,  19, 351,   1, 618, 264, 416, 326])


In [73]:
x =torch.tensor([[0.1818, 0.1740, 0.2007, 0.1992, 0.2443]])
target_score(x,t=40)

tensor([[0.0554, 0.0406, 0.1180, 0.1111, 0.6749]])

pretrain_model.load('skip113.pt')

In [317]:
valid_loss, aspect_test_acc, pseudo_aspect_test_acc, _ = test(train_dataset_aspect, pretrain_model, 'aspect')
print(aspect_test_acc)   

mac 0.70035 0.68033 0.68567
weighted 0.84653 0.84424 0.84270
0.8442367601246106


In [331]:
restore_model = torch.load('skip113.pt')

In [332]:
valid_loss, aspect_test_acc, pseudo_aspect_test_acc, _ = test(train_dataset_aspect, restore_model, 'aspect')
print(aspect_test_acc) 

mac 0.73499 0.71394 0.72175
weighted 0.86414 0.86449 0.86369
0.8644859813084113


In [333]:
data = DataLoader(train_dataset_aspect, batch_size=128, collate_fn=generate_batch)
for text, cls, gt,lengths in data:
    output = aspect_model(text, lengths)
    pred = output.argmax(1)
    label = cls.argmax(1)
    for i in range(len(gt1)):
        if pred[i] == gt[i] and gt[i]!= label[i]:
            for j in text[i]:
                if j != 0:
                    print(idx2word[int(j)])
            print('gt {} label {} pred {}'.format(gt[i], label[i], pred[i]))

green
tea
creme
brulee
is
a
must
gt 2 label 1 pred 2
do
n't
leave
the
restaurant
without
it
gt 2 label 4 pred 2
normally
,
places
ask
how
hot
you
want
it
,
but
they
did
n't
gt 4 label 2 pred 4
great
open
and
friendly
ambience
gt 3 label 4 pred 3
two
thumbs
up
gt 2 label 3 pred 2
the
cooks
have
been
at
the
restaurant
for
years
and
cook
family
recipes
gt 2 label 4 pred 2
it
's
unpretentious
and
gt 3 label 2 pred 3
my
wife
also
ordered
a
of
hot
water
(
she
had
a
sore
throat
)
and
i
guess
that
since
it
was
only
water
,
it
was
n't
a
priority
for
them
to
actually
bring
it
gt 4 label 2 pred 4
green
tea
creme
brulee
gets
better
each
time
i
have
it
gt 2 label 1 pred 2
and
the
service
was
simply
-
quite
a
delight
gt 4 label 2 pred 4
sit
in
the
gt 3 label 0 pred 3
i
thought
the
restaurant
was
nice
and
clean
gt 3 label 2 pred 3
the
dancing
,
white
river
and
rolls
are
gt 2 label 0 pred 2
for
a
restaurant
with
such
a
good
reputation
and
that
is
usually
so
packed
,
there
was
no
reason
for
such
a
lack

IndexError: index 2 is out of bounds for dimension 0 with size 2

In [None]:
def test(data_, model, mode):
    loss = 0
    acc = 0
    pseudo_aspect_test_acc = 0
    aspect_test_acc = 0
    data = DataLoader(data_, batch_size=128, collate_fn=generate_batch)
    pred_distribution = []
    model.eval()
    gt=[]
    for text, cls, gt1,lengths in data:
        #print(text.shape,lengths)
        text, cls, gt1 = text.to(device), cls.to(device), gt1.to(device)
        with torch.no_grad():
            #output = model(text)
            output = model(text, lengths)
            cls = target_score(cls)
            loss = kl_criterion(torch.log(F.softmax(output, dim=-1)), cls)
            pseudo_aspect_test_acc += (output.argmax(1) == cls.argmax(1) ).sum().item()
            aspect_test_acc += (output.argmax(1)  == gt1).sum().item()
            #pred_distribution.append(torch.Tensor([softmax(o) for o in output]))
            pred_distribution.append(output)
            gt.append(gt1)
    pred_distribution = torch.cat(pred_distribution, dim=0)
    pred = pred_distribution.argmax(dim=1)
    gt = torch.cat(gt, dim=0) 
    
    p = precision_score(gt, pred,average='macro')
    r = recall_score(gt, pred, average='macro')
    f1_mac = f1_score(gt, pred, average='macro')
    p_w = precision_score(gt, pred,average='weighted')
    r_w = recall_score(gt, pred, average='weighted')
    f1_w = f1_score(gt, pred, average='weighted')
    print('mac {:.5f} {:.5f} {:.5f}'.format(p, r, f1_mac))
    print('wei