In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torchtext.vocab import GloVe
import torchcrf
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
import ast
from collections import defaultdict



In [2]:
truth_dataset = pd.read_csv('../datafiles/true_tags.csv')
truth_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2062 entries, 0 to 2061
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Unnamed: 0               2062 non-null   int64 
 1   App id                   2062 non-null   object
 2   Review id                2062 non-null   object
 3   Sentence id              2062 non-null   int64 
 4   Sentence content         2062 non-null   object
 5   Feature (Positive)       291 non-null    object
 6   Feature (Neutral)        638 non-null    object
 7   Feature (Negative)       110 non-null    object
 8   Feature (All Annotated)  971 non-null    object
 9   clean_content            2062 non-null   object
 10  tags                     2062 non-null   object
dtypes: int64(2), object(9)
memory usage: 177.3+ KB


In [3]:
truth_dataset['clean_content'] = truth_dataset['clean_content'].apply(ast.literal_eval)
truth_dataset['tags'] = truth_dataset['tags'].apply(ast.literal_eval)
truth_dataset = truth_dataset[truth_dataset['clean_content'].apply(len) > 0]

In [4]:
truth_dataset['App id'].value_counts()

App id
B004LOMB2Q                    367
B005ZXWMUS                    341
B0094BB4TW                    327
B004SIIBGU                    294
com.spotify.music             226
com.twitter.android           183
com.whatsapp                  169
com.zentertain.photoeditor    154
Name: count, dtype: int64

In [185]:
len(truth_dataset[truth_dataset['App id']=='B004LOMB2Q']['clean_content'].to_list())

367

In [5]:
all_sentences = truth_dataset['clean_content'].to_list()
all_tags = truth_dataset['tags'].to_list()

print(len(all_sentences))
print(len(all_tags))

2061
2061


In [6]:
word_to_ix = {word: i+1 for i, word in enumerate(set([w for s in all_sentences for w in s]))}
word_to_ix['<PAD>']=0
word_to_ix['<UNK>']=len(word_to_ix)
tag_to_ix = {'<PAD>': 0, 'B': 1, 'I': 2, 'O': 3}
ix_to_tag = {ix: tag for tag, ix in tag_to_ix.items()}

BATCH_SIZE = 32

In [7]:
def prepare_data(sentences, tags, word_to_ix, tag_to_ix, pad_idx=0):
    max_len = max(len(s) for s in sentences)
    
    sentences_idx = [[word_to_ix[word] for word in sent] + [pad_idx] * (max_len - len(sent)) for sent in sentences]
    tags_idx = [[tag_to_ix[tag] for tag in tag_seq] + [pad_idx] * (max_len - len(tag_seq)) for tag_seq in tags]
    
    sentences_tensor = torch.tensor(sentences_idx, dtype=torch.long)
    tags_tensor = torch.tensor(tags_idx, dtype=torch.long)
    
    return TensorDataset(sentences_tensor, tags_tensor)

In [None]:
dev_sentences, test_sentences, dev_tags, test_tags = train_test_split(all_sentences, all_tags, test_size=0.2, random_state=42)
train_sentences, val_sentences, train_tags, val_tags = train_test_split(dev_sentences, dev_tags, test_size=0.2, random_state=42)

train_data = prepare_data(train_sentences, train_tags, word_to_ix, tag_to_ix)
val_data = prepare_data(val_sentences, val_tags, word_to_ix, tag_to_ix)
test_data = prepare_data(test_sentences, test_tags, word_to_ix, tag_to_ix)

train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_data, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE)

In [8]:
cross_domain_data = []
for app in truth_dataset['App id'].value_counts().keys():
    print(app)
    dev_sentences = truth_dataset[truth_dataset['App id']!=app]['clean_content'].to_list()
    dev_tags = truth_dataset[truth_dataset['App id']!=app]['tags'].to_list()
    test_sentences = truth_dataset[truth_dataset['App id']==app]['clean_content'].to_list()
    test_tags = truth_dataset[truth_dataset['App id']==app]['tags'].to_list()

    print(len(dev_sentences), len(test_sentences))

    train_sentences, val_sentences, train_tags, val_tags = train_test_split(dev_sentences, dev_tags, test_size=0.2, random_state=42)

    train_data = prepare_data(train_sentences, train_tags, word_to_ix, tag_to_ix)
    val_data = prepare_data(val_sentences, val_tags, word_to_ix, tag_to_ix)
    test_data = prepare_data(test_sentences, test_tags, word_to_ix, tag_to_ix)

    train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_data, batch_size=BATCH_SIZE)
    test_loader = DataLoader(test_data, batch_size=BATCH_SIZE)

    cross_domain_data.append({'app': app, 'train_loader': train_loader, 'val_loader': val_loader, 'test_loader': test_loader, 'test_sentences': test_sentences})

B004LOMB2Q
1694 367
B005ZXWMUS
1720 341
B0094BB4TW
1734 327
B004SIIBGU
1767 294
com.spotify.music
1835 226
com.twitter.android
1878 183
com.whatsapp
1892 169
com.zentertain.photoeditor
1907 154


In [17]:
class BiLSTMEncoder(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, glove_embeddings):
        super().__init__()
        self.embedding = nn.Embedding.from_pretrained(glove_embeddings, freeze=False)
        self.encoder = nn.LSTM(embedding_dim, hidden_dim // 2, num_layers=1, bidirectional=True, batch_first=True)
    
    def forward(self, inputs):
        embeds = self.embedding(inputs)
        out, hidden = self.encoder(embeds)
        return out, hidden

In [18]:
class LSTMDecoder(nn.Module):
    def __init__(self, hidden_dim, tagset_size) -> None:
        super().__init__()
        self.decoder = nn.LSTM(hidden_dim, hidden_dim, num_layers=1, batch_first=True)
        self.fc = nn.Linear(hidden_dim, tagset_size)
    
    def forward(self, decoder_inputs):
        out, hidden = self.decoder(decoder_inputs)
        emissions = self.fc(out)
        return emissions

In [24]:
class SelfAttention(nn.Module):
    def __init__(self, hidden_dim):
        super().__init__()
        self.attention = nn.Linear(hidden_dim, 1)
    
    def forward(self, input):
        attention_weights = torch.tanh(self.attention(input))
        attention_weights = torch.softmax(attention_weights, dim=1)
        out = input * attention_weights
        return out

In [26]:
class Seq2SeqModel(nn.Module):
    def __init__(self, tagset_size, encoder, decoder, attention):
        super(Seq2SeqModel, self).__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.attention = attention
        self.crf = torchcrf.CRF(tagset_size, batch_first=True)

    def forward(self, sentence):
        out, _ = self.encoder(sentence)
        out = self.attention(out)
        emissions = self.decoder(out)
        
        return emissions
    
    def loss(self, emissions, tags, mask=None):
        return -self.crf(emissions, tags, mask=mask, reduction='mean')
    
    def decode(self, emissions, mask=None):
        return self.crf.decode(emissions, mask=mask)

In [12]:
HIDDEN_DIM = 512
VOCAB_SIZE = len(word_to_ix)
TAGSET_SIZE = len(tag_to_ix)
EMBEDDING_DIM = 300

In [13]:
glove = GloVe(name='6B', dim=EMBEDDING_DIM)

glove_embeddings = torch.zeros(VOCAB_SIZE, EMBEDDING_DIM)

In [14]:
for word, idx in word_to_ix.items():
    if word in glove.stoi:
        glove_embeddings[idx] = glove[word]
    else:
        glove_embeddings[idx] = torch.randn(EMBEDDING_DIM)

print(glove_embeddings.shape)

torch.Size([2818, 300])


In [27]:
encoder = BiLSTMEncoder(EMBEDDING_DIM, HIDDEN_DIM, glove_embeddings)
decoder = LSTMDecoder(HIDDEN_DIM, TAGSET_SIZE)
attention = SelfAttention(HIDDEN_DIM)
model = Seq2SeqModel(TAGSET_SIZE, encoder, decoder, attention)

optimizer = optim.Adam(model.parameters(), lr=0.001)

In [196]:
def train_model(model, train_loader, optimizer):
    model.train()
    total_loss = 0
    for sentences_batch, tags_batch in train_loader:
        mask = (sentences_batch != 0)
        optimizer.zero_grad()
        
        emissions = model(sentences_batch)

        loss = model.loss(emissions, tags_batch, mask)
        loss.backward()
        
        optimizer.step()
        total_loss += loss.item()
    
    print(f"Train Loss: {total_loss / len(train_loader)}")

In [197]:
def evaluate_model(model, val_loader):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for sentences_batch, tags_batch in val_loader:
            mask = (sentences_batch != 0)
            emissions = model(sentences_batch)
            loss = model.loss(emissions, tags_batch, mask)
            total_loss += loss.item()
    print(f"Validation Loss: {total_loss / len(val_loader)}")

In [198]:
def train_eval_loop(model, train_loader, val_loader, optimizer, epochs=10):
    for epoch in range(epochs):
        print(f'Epoch: {epoch}')
        for phase in ['train', 'val']:
            if phase=='train':
                train_model(model, train_loader, optimizer)
            else:
                evaluate_model(model, val_loader)

In [169]:
train_eval_loop(model, train_loader, val_loader, optimizer, 25)

Epoch: 0
Train Loss: 8.235164937518892
Validation Loss: 4.840473370118574
Epoch: 1
Train Loss: 3.823301831881205
Validation Loss: 4.1454877419905225
Epoch: 2
Train Loss: 2.4943369783106304
Validation Loss: 3.948097337375988
Epoch: 3
Train Loss: 2.048884467000053
Validation Loss: 4.039437185634267
Epoch: 4
Train Loss: 1.6912370856319154
Validation Loss: 4.471763329072432
Epoch: 5
Train Loss: 1.3907384088351613
Validation Loss: 4.363630013032393
Epoch: 6
Train Loss: 1.1941032116966588
Validation Loss: 4.195348609577525
Epoch: 7
Train Loss: 1.112944009935572
Validation Loss: 4.361788099462336
Epoch: 8
Train Loss: 0.9677009975005474
Validation Loss: 5.029920512979681
Epoch: 9
Train Loss: 0.8925407479206721
Validation Loss: 4.3115399859168315
Epoch: 10
Train Loss: 0.7250430214972723
Validation Loss: 4.797048449516296
Epoch: 11
Train Loss: 0.6777466975507282
Validation Loss: 4.943498860705983
Epoch: 12
Train Loss: 0.5961136810836338
Validation Loss: 5.108651811426336
Epoch: 13
Train Loss: 0.

In [170]:
def test_model(model, test_loader):
    model.eval()
    all_predictions = []
    masks = []
    with torch.no_grad():
        for sentences_batch, _ in test_loader:
            mask = (sentences_batch != 0)
            emissions = model(sentences_batch)
            predictions = model.decode(emissions, mask=mask)
            pred_tags = [[ix_to_tag[t] for t in seq] for seq in predictions]
            
            all_predictions.extend(pred_tags)
            masks.extend(mask)
    
    return all_predictions, masks

In [171]:
test_predictions, masks = test_model(model, test_loader)

In [98]:
def test_exact_match(true_tags, pred_tags, tag_to_ix, masks):
    y_true = []
    y_pred = []
    
    for true_seq, pred_seq, mask_seq in zip(true_tags, pred_tags, masks):
        # print(len(true_seq), len(pred_seq))
        for i, m in enumerate(mask_seq):
            if m:
                y_true.append(true_seq[i])
                y_pred.append(pred_seq[i])
    
    # y_true = [tag_to_ix[tag] for tag in y_true]
    # y_pred = [tag_to_ix[tag] for tag in y_pred]
    
    print(set(y_true), set(y_pred))

    precision = precision_score(y_true, y_pred, average="macro")
    recall = recall_score(y_true, y_pred, average="macro")
    f1 = f1_score(y_true, y_pred, average="macro")
    
    return precision, recall, f1

In [172]:
exact_precision, exact_recall, exact_f1 = test_exact_match(test_tags, test_predictions, tag_to_ix, masks)
print(f"Exact Match - Precision: {exact_precision}, Recall: {exact_recall}, F1: {exact_f1}")

{'O', 'B', 'I'} {'O', 'B', 'I'}
Exact Match - Precision: 0.7142572950373802, Recall: 0.706020111884896, F1: 0.7091976436876989


In [173]:
def calculate_metrics_1(predictions, true_tags, true_tokens):
    
    # print(len(predictions), len(true_tags), len(test_sentences))
    levels = 3
    
    def extract_entities(seq, sentence):

        entities = []
        current_entity = None
        
        for i, tag in enumerate(seq):
            if tag == 'B':
                if current_entity:
                    entities.append(current_entity)
                current_entity = [sentence[i]]
            elif tag == 'I':
                if current_entity is None:
                    current_entity = [sentence[i]]
                else:
                    current_entity.append(sentence[i])
            elif tag == 'O':
                if current_entity:
                    entities.append(current_entity)
                    current_entity = None
        
        if current_entity:
            entities.append(current_entity)
        
        return entities
    
    def is_match(f1, f2, n):
        """
        Check if two features match at level n.
        Conditions:
        1. One feature is equal to or is a subset of the other
        2. Absolute length difference is at most n
        """
        f1=set(f1)
        f2=set(f2)
        
        is_subset = f1.issubset(f2) or f2.issubset(f1)
        length_diff = abs(len(f1) - len(f2))
        
        return is_subset and length_diff <= n


    all_true_entites = []
    all_pred_entites = []
    
    for pred_seq, true_seq, token_seq in zip(predictions, true_tags, true_tokens):
        true_entities = extract_entities(true_seq, token_seq)
        pred_entities = extract_entities(pred_seq, token_seq)
        # print(pred_entities)
        # print(true_entities)

        all_true_entites.append(true_entities)
        all_pred_entites.append(pred_entities)

    total_true = len(all_true_entites)
    total_pred = len(all_pred_entites)
    metrics = {}
    # print(total_pred, total_true, all_levels_TPs)

    for level in range(levels):
        tp = 0
        fp = 0
        fn = 0

        for true_entities, pred_entities in zip(all_true_entites, all_pred_entites):   
            matched_true = set()
            for pred_entity in pred_entities:
                found_match = False
                
                for i, true_entity in enumerate(true_entities):
                    if i not in matched_true and is_match(pred_entity, true_entity, level):
                        tp += 1
                        matched_true.add(i)
                        found_match = True
                        break
                
                if not found_match:
                    fp += 1

            fn += len(true_entities) - len(matched_true)
            # print(fn)
    
        print(level, tp, fp, fn)
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        
        level_name = ['exact', 'n-1', 'n-2'][level]
        metrics.update({
            f'{level_name}_precision': precision,
            f'{level_name}_recall': recall,
            f'{level_name}_f1': f1
        })
    
    return metrics

In [132]:
def calculate_metrics_2(predictions, true_tags):
    
    def extract_entities(seq):

        entities = []
        current_entity = None
        
        for i, tag in enumerate(seq):
            if tag == 'B':
                if current_entity:
                    entities.append((current_entity[0], i-1))
                current_entity = (i,)
            elif tag == 'I':
                if current_entity is None:
                    current_entity = (i,)
            elif tag == 'O':
                if current_entity:
                    entities.append((current_entity[0], i-1))
                    current_entity = None
        
        if current_entity:
            entities.append((current_entity[0], len(seq)-1))
        
        return entities
    
    true_positives = 0
    false_positives = 0
    false_negatives = 0
    
    for pred_seq, true_seq in zip(predictions, true_tags):
        pred_entities = set(extract_entities(pred_seq))
        true_entities = set(extract_entities(true_seq))
        
        true_positives += len(pred_entities & true_entities)
        false_positives += len(pred_entities - true_entities)
        false_negatives += len(true_entities - pred_entities)
    
    precision = true_positives / (true_positives + false_positives) if true_positives + false_positives > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if true_positives + false_negatives > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0
    
    return {
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [174]:
calculate_metrics_1(test_predictions, test_tags, test_sentences)
# calculate_metrics_2(test_predictions, test_tags)

0 127 175 153
1 161 141 119
2 180 122 100


{'exact_precision': 0.4205298013245033,
 'exact_recall': 0.45357142857142857,
 'exact_f1': 0.436426116838488,
 'n-1_precision': 0.5331125827814569,
 'n-1_recall': 0.575,
 'n-1_f1': 0.5532646048109965,
 'n-2_precision': 0.5960264900662252,
 'n-2_recall': 0.6428571428571429,
 'n-2_f1': 0.6185567010309279}

In [202]:
def train_test_model_across_domains(epochs):
    results=[]
    
    for data in cross_domain_data:
        print(f'For app: {data['app']}')
        model = Seq2SeqModel(EMBEDDING_DIM, HIDDEN_DIM, VOCAB_SIZE, TAGSET_SIZE, glove_embeddings)
        optimizer = optim.Adam(model.parameters(), lr=0.001)

        train_eval_loop(model, data['train_loader'], data['val_loader'], optimizer, epochs)
        test_predictions, masks = test_model(model, test_loader)
        f1_scores = calculate_metrics_1(test_predictions, test_tags, test_sentences)
        results.append({data['app']: f1_scores})
    
    return results

In [203]:
results = train_test_model_across_domains(10)

For app: com.zentertain.photoeditor
Epoch: 0
Train Loss: 7.805342613264572
Validation Loss: 5.477112770080566
Epoch: 1
Train Loss: 4.253778269124585
Validation Loss: 3.832847161726518
Epoch: 2
Train Loss: 3.0995182658350746
Validation Loss: 3.445531715046276
Epoch: 3
Train Loss: 2.4657742616742158
Validation Loss: 3.3044710592790083
Epoch: 4
Train Loss: 2.040808748367221
Validation Loss: 3.3759177164597944
Epoch: 5
Train Loss: 1.7147681519042615
Validation Loss: 3.5015265074643223
Epoch: 6
Train Loss: 1.3128584664921428
Validation Loss: 3.7212621081959116
Epoch: 7
Train Loss: 1.1011630206607108
Validation Loss: 3.85394607890736
Epoch: 8
Train Loss: 0.9755697153335394
Validation Loss: 3.9998093084855513
Epoch: 9
Train Loss: 0.7582010599069817
Validation Loss: 4.10980393669822
0 74 14 17
1 80 8 11
2 83 5 8
For app: com.zentertain.photoeditor
Epoch: 0
Train Loss: 8.06432899208956
Validation Loss: 5.992537325078791
Epoch: 1
Train Loss: 3.9946117927861766
Validation Loss: 4.281915664672852


[{'B004LOMB2Q': {'exact_precision': 0.8409090909090909,
   'exact_recall': 0.8131868131868132,
   'exact_f1': 0.8268156424581005,
   'n-1_precision': 0.9090909090909091,
   'n-1_recall': 0.8791208791208791,
   'n-1_f1': 0.8938547486033518,
   'n-2_precision': 0.9431818181818182,
   'n-2_recall': 0.9120879120879121,
   'n-2_f1': 0.9273743016759776}},
 {'B005ZXWMUS': {'exact_precision': 0.8809523809523809,
   'exact_recall': 0.8131868131868132,
   'exact_f1': 0.8457142857142858,
   'n-1_precision': 0.9523809523809523,
   'n-1_recall': 0.8791208791208791,
   'n-1_f1': 0.9142857142857143,
   'n-2_precision': 0.9642857142857143,
   'n-2_recall': 0.8901098901098901,
   'n-2_f1': 0.9257142857142856}},
 {'B0094BB4TW': {'exact_precision': 0.8539325842696629,
   'exact_recall': 0.8351648351648352,
   'exact_f1': 0.8444444444444446,
   'n-1_precision': 0.898876404494382,
   'n-1_recall': 0.8791208791208791,
   'n-1_f1': 0.8888888888888888,
   'n-2_precision': 0.9325842696629213,
   'n-2_recall': 

In [131]:
inference_sentence = 'The app crashes when I try to share photos with my contacts from another social network'
tokens = inference_sentence.split()
sentence_idx = [word_to_ix.get(word, word_to_ix['<UNK>']) for word in tokens]
sentence_tensor = torch.tensor([sentence_idx], dtype=torch.long)

model.eval()
emissions = model(sentence_tensor)
pred_tags_ix = model.decode(emissions)
pred_tags = [ix_to_tag[t] for t in pred_tags_ix[0]]
print(pred_tags)

['O', 'O', 'O', 'O', 'B', 'I', 'O', 'I', 'I', 'O', 'O', 'O', 'O', 'O', 'O', 'B']
