In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torchtext.vocab import GloVe
import torchcrf
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import precision_score, recall_score, f1_score
import ast

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

### S2S Architecture

In [3]:
class BiLSTMEncoder(nn.Module):
    """Embed token indices with trainable pretrained vectors and encode them with a BiLSTM.

    Args:
        embedding_dim: Width of each embedding vector (LSTM input size).
        hidden_dim: Total encoder output width; each direction uses
            ``hidden_dim // 2`` units.
        glove_embeddings: 2-D float tensor of pretrained vectors, one row per
            vocabulary index.
    """

    def __init__(self, embedding_dim, hidden_dim, glove_embeddings):
        super().__init__()
        # `from_pretrained` wraps the tensor it is given, and the embedding is
        # trainable (freeze=False). Work on a private copy so that fine-tuning
        # one model cannot alter the caller's matrix, which is reused to
        # initialise the model of every other fold.
        pretrained = glove_embeddings.detach().clone()
        self.embedding = nn.Embedding.from_pretrained(pretrained, freeze=False)
        self.encoder = nn.LSTM(embedding_dim, hidden_dim // 2, num_layers=1, bidirectional=True, batch_first=True)

    def forward(self, inputs):
        """Return ``(out, hidden)`` exactly as produced by the underlying ``nn.LSTM``.

        Args:
            inputs: Long tensor of token indices, batch first.
        """
        embeds = self.embedding(inputs)
        out, hidden = self.encoder(embeds)
        return out, hidden

In [4]:
class LSTMDecoder(nn.Module):
    """Unidirectional LSTM followed by a linear layer that scores every tag per timestep.

    Args:
        hidden_dim: Size of both the LSTM input and its hidden state.
        tagset_size: Number of tag scores produced for each timestep.
    """

    def __init__(self, hidden_dim, tagset_size) -> None:
        super().__init__()
        self.decoder = nn.LSTM(
            input_size=hidden_dim,
            hidden_size=hidden_dim,
            num_layers=1,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=tagset_size)

    def forward(self, decoder_inputs):
        """Run the LSTM over ``decoder_inputs`` and project its outputs to tag scores."""
        # Only the per-timestep outputs are needed; the final LSTM state is discarded.
        lstm_out, _ = self.decoder(decoder_inputs)
        return self.fc(lstm_out)

In [5]:
class SelfAttention(nn.Module):
    """Re-weight each timestep of a sequence by a learned scalar score.

    Args:
        hidden_dim: Feature width of the incoming sequence.
    """

    def __init__(self, hidden_dim):
        super().__init__()
        # One scalar score per timestep.
        self.attention = nn.Linear(hidden_dim, 1)

    def forward(self, input):
        """Scale ``input`` by softmax-normalised (over dim 1) tanh scores."""
        scores = self.attention(input).tanh()
        # The softmax runs over dim=1, so every position along that axis
        # (including any padded ones) takes part in the normalisation.
        weights = scores.softmax(dim=1)
        return input * weights

In [6]:
class Seq2SeqModel(nn.Module):
    """Encoder -> attention -> decoder tagger with a CRF layer for loss and decoding.

    Args:
        tagset_size: Number of tags handled by the CRF.
        encoder: Module returning ``(outputs, state)`` for a batch of token indices.
        decoder: Module mapping the attended encoder outputs to emission scores.
        attention: Module applied to the encoder outputs before decoding.
    """

    def __init__(self, tagset_size, encoder, decoder, attention):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.attention = attention
        self.crf = torchcrf.CRF(tagset_size, batch_first=True)

    def forward(self, sentence):
        """Return the emission scores for ``sentence``; the CRF is not applied here."""
        encoded, _ = self.encoder(sentence)
        attended = self.attention(encoded)
        return self.decoder(attended)

    def loss(self, emissions, tags, mask=None):
        """Return the negated value of the CRF layer called with ``reduction='mean'``."""
        log_likelihood = self.crf(emissions, tags, mask=mask, reduction='mean')
        return -log_likelihood

    def decode(self, emissions, mask=None):
        """Return whatever ``self.crf.decode`` yields for the given emissions and mask."""
        return self.crf.decode(emissions, mask=mask)

### Utility functions

In [64]:
def transform_data(reviews):
    """Build the word and tag lookup tables for a tokenised corpus.

    Words are numbered from 1 in order of first appearance, so the mapping is
    identical on every run. (Numbering via ``set`` iteration depends on
    Python's per-process string hashing and can change between kernel sessions.)

    Args:
        reviews: Iterable of token sequences.

    Returns:
        ``(word_to_ix, tag_to_ix, ix_to_tag)`` where ``word_to_ix`` maps
        ``'<PAD>'`` to 0, every corpus word to 1..N and ``'<UNK>'`` to the next
        free index; ``tag_to_ix`` is the fixed ``<PAD>``/B/I/O mapping and
        ``ix_to_tag`` its inverse.
    """
    # dict.fromkeys de-duplicates while keeping insertion (first-seen) order.
    unique_words = dict.fromkeys(word for review in reviews for word in review)
    word_to_ix = {word: index for index, word in enumerate(unique_words, start=1)}
    word_to_ix['<PAD>'] = 0
    word_to_ix['<UNK>'] = len(word_to_ix)
    tag_to_ix = {'<PAD>': 0, 'B': 1, 'I': 2, 'O': 3}
    ix_to_tag = {ix: tag for tag, ix in tag_to_ix.items()}
    return word_to_ix, tag_to_ix, ix_to_tag

In [8]:
def prepare_data(sentences, tags, word_to_ix, tag_to_ix, pad_idx=0):
    """Convert token and tag sequences into a right-padded ``TensorDataset``.

    Args:
        sentences: List of token sequences.
        tags: List of tag sequences, aligned with ``sentences``.
        word_to_ix: Mapping from word to index. Words missing from it fall back
            to its ``'<UNK>'`` entry when one exists.
        tag_to_ix: Mapping from tag to index.
        pad_idx: Index appended to every sequence shorter than the longest sentence.

    Returns:
        ``TensorDataset(sentences_tensor, tags_tensor)``, both of dtype ``long``.

    Raises:
        ValueError: If ``sentences`` is empty.
        KeyError: If a word is unknown and ``word_to_ix`` has no ``'<UNK>'``
            entry, or if a tag is missing from ``tag_to_ix``.
    """
    if len(sentences) == 0:
        raise ValueError("prepare_data() needs at least one sentence")

    max_len = max(len(s) for s in sentences)
    unk_idx = word_to_ix.get('<UNK>')

    def word_index(word):
        # Out-of-vocabulary words map to <UNK> instead of aborting the run.
        idx = word_to_ix.get(word, unk_idx)
        if idx is None:
            raise KeyError(word)
        return idx

    sentences_idx = [
        [word_index(word) for word in sent] + [pad_idx] * (max_len - len(sent))
        for sent in sentences
    ]
    tags_idx = [
        [tag_to_ix[tag] for tag in tag_seq] + [pad_idx] * (max_len - len(tag_seq))
        for tag_seq in tags
    ]

    sentences_tensor = torch.tensor(sentences_idx, dtype=torch.long)
    tags_tensor = torch.tensor(tags_idx, dtype=torch.long)

    return TensorDataset(sentences_tensor, tags_tensor)

In [9]:
def get_glove(EMBEDDING_DIM, VOCAB_SIZE, word_to_ix):
    """Build the initial embedding matrix for the vocabulary from GloVe '6B' vectors.

    Args:
        EMBEDDING_DIM: Dimension requested from GloVe and width of the matrix.
        VOCAB_SIZE: Number of rows in the returned matrix.
        word_to_ix: Mapping from word to its row index.

    Returns:
        A ``(VOCAB_SIZE, EMBEDDING_DIM)`` tensor. Rows of words present in
        ``glove.stoi`` hold the GloVe vector; rows of every other word in
        ``word_to_ix`` hold a ``torch.randn`` draw.
    """
    glove = GloVe(name='6B', dim=EMBEDDING_DIM)
    matrix = torch.zeros(VOCAB_SIZE, EMBEDDING_DIM)
    for token, row in word_to_ix.items():
        known = token in glove.stoi
        matrix[row] = glove[token] if known else torch.randn(EMBEDDING_DIM)
    return matrix

In [10]:
def initialize_model(EMBEDDING_DIM, HIDDEN_DIM, glove_embeddings, TAGSET_SIZE, device):
    """Assemble a fresh Seq2SeqModel and move it to ``device``.

    The sub-modules are created in the order encoder, decoder, attention and
    then wrapped by ``Seq2SeqModel``.
    """
    model = Seq2SeqModel(
        TAGSET_SIZE,
        BiLSTMEncoder(EMBEDDING_DIM, HIDDEN_DIM, glove_embeddings),
        LSTMDecoder(HIDDEN_DIM, TAGSET_SIZE),
        SelfAttention(HIDDEN_DIM),
    )
    # nn.Module.to moves the parameters in place and returns the module itself.
    return model.to(device)

In [None]:
def initialize_optimizer(model, lr=0.001):
    """Create an Adam optimizer over all parameters of ``model`` with learning rate ``lr``."""
    trainable = model.parameters()
    optimizer = optim.Adam(trainable, lr=lr)
    return optimizer

In [12]:
def train_model(model, train_loader, optimizer, epoch):
    """Train ``model`` for one pass over ``train_loader`` and report the mean batch loss.

    Batches are moved to the device the model's parameters live on, so the
    function does not depend on a notebook-level ``device`` variable.

    Args:
        model: Module that is callable on a batch of token indices and exposes
            ``loss(emissions, tags, mask)``.
        train_loader: Iterable of ``(sentences_batch, tags_batch)`` tensor pairs.
        optimizer: Optimizer over the model's parameters.
        epoch: Epoch label used only in the printed message.

    Returns:
        Mean loss over the batches as a float, or ``nan`` when the loader
        yields no batches.
    """
    model.train()
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    total_loss = 0
    num_batches = 0
    for sentences_batch, tags_batch in train_loader:
        sentences_batch = sentences_batch.to(model_device)
        tags_batch = tags_batch.to(model_device)

        # Index 0 is the padding index; padded positions are excluded from the loss.
        mask = (sentences_batch != 0)
        optimizer.zero_grad()

        emissions = model(sentences_batch)

        loss = model.loss(emissions, tags_batch, mask)
        loss.backward()

        optimizer.step()
        total_loss += loss.item()
        num_batches += 1

    # An empty loader reports nan rather than dividing by zero.
    mean_loss = total_loss / num_batches if num_batches else float("nan")
    print(f"Epoch: {epoch}, Train Loss: {mean_loss}")
    return mean_loss

In [13]:
def evaluate_model(model, val_loader, epoch):
    """Compute and report the mean batch loss of ``model`` on ``val_loader`` without gradients.

    Batches are moved to the device the model's parameters live on, so the
    function does not depend on a notebook-level ``device`` variable.

    Args:
        model: Module that is callable on a batch of token indices and exposes
            ``loss(emissions, tags, mask)``.
        val_loader: Iterable of ``(sentences_batch, tags_batch)`` tensor pairs.
        epoch: Epoch label used only in the printed message.

    Returns:
        Mean loss over the batches as a float, or ``nan`` when the loader
        yields no batches.
    """
    model.eval()
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    total_loss = 0
    num_batches = 0
    with torch.no_grad():
        for sentences_batch, tags_batch in val_loader:
            sentences_batch = sentences_batch.to(model_device)
            tags_batch = tags_batch.to(model_device)
            # Index 0 is the padding index; padded positions are excluded from the loss.
            mask = (sentences_batch != 0)

            emissions = model(sentences_batch)
            loss = model.loss(emissions, tags_batch, mask)
            total_loss += loss.item()
            num_batches += 1

    # An empty loader reports nan rather than dividing by zero.
    mean_loss = total_loss / num_batches if num_batches else float("nan")
    print(f"Epoch: {epoch}, Validation Loss: {mean_loss}")
    return mean_loss

In [44]:
def test_model(model, test_loader, ix_to_tag):
    model.eval()
    all_predictions = []
    masks = []
    with torch.no_grad():
        for sentences_batch, _ in test_loader:
            sentences_batch = sentences_batch.to(device)
            mask = (sentences_batch != 0)
            
            emissions = model(sentences_batch)
            predictions = model.decode(emissions, mask=mask)
            pred_tags = [[ix_to_tag[t] for t in seq] for seq in predictions]
            # print(pred_tags)
            all_predictions.extend(pred_tags)
            masks.extend(mask)
    
    return all_predictions, masks

In [None]:
def calculate_metrics(predictions, true_tags, true_tokens, levels=1):
    """Compute entity-level precision, recall and F1 from B/I/O tag sequences.

    Entities are token lists extracted from each tag sequence. A predicted
    entity matches a not-yet-matched true entity of the same sentence when
    their token sets are in a subset relation and differ in size by at most
    ``level`` (level 0 means identical token sets).

    Args:
        predictions: Predicted tag sequences, one per sentence.
        true_tags: Gold tag sequences, one per sentence.
        true_tokens: Token sequences, one per sentence.
        levels: Number of tolerance levels to evaluate (1..3): 1 gives
            ``exact`` only, 2 adds ``n-1``, 3 adds ``n-2``.

    Returns:
        Dict with ``<level>_precision``, ``<level>_recall`` and ``<level>_f1``
        for every evaluated level.

    Raises:
        ValueError: If ``levels`` is outside 1..3.
    """
    level_names = ['exact', 'n-1', 'n-2']
    if not 1 <= levels <= len(level_names):
        raise ValueError(f"levels must be between 1 and {len(level_names)}, got {levels}")

    def extract_entities(seq, sentence):
        """Collect the token lists of all B/I spans in ``seq``."""
        entities = []
        current_entity = None

        for i, tag in enumerate(seq):
            if tag == 'B':
                if current_entity:
                    entities.append(current_entity)
                current_entity = [sentence[i]]
            elif tag == 'I':
                # An 'I' without a preceding 'B' opens a new entity.
                if current_entity is None:
                    current_entity = [sentence[i]]
                else:
                    current_entity.append(sentence[i])
            else:
                # 'O' and any other tag (e.g. a predicted '<PAD>') end the
                # current entity, so a stray tag cannot glue two spans together.
                if current_entity:
                    entities.append(current_entity)
                    current_entity = None

        if current_entity:
            entities.append(current_entity)

        return entities

    def is_match(f1, f2, n):
        """True when the token sets are nested and differ in size by at most ``n``."""
        f1 = set(f1)
        f2 = set(f2)

        is_subset = f1.issubset(f2) or f2.issubset(f1)
        length_diff = abs(len(f1) - len(f2))

        return is_subset and length_diff <= n

    all_true_entites = []
    all_pred_entites = []

    for pred_seq, true_seq, token_seq in zip(predictions, true_tags, true_tokens):
        all_true_entites.append(extract_entities(true_seq, token_seq))
        all_pred_entites.append(extract_entities(pred_seq, token_seq))

    metrics = {}

    for level in range(levels):
        tp = 0
        fp = 0
        fn = 0

        for true_entities, pred_entities in zip(all_true_entites, all_pred_entites):
            # Each true entity may be claimed by at most one prediction.
            matched_true = set()
            for pred_entity in pred_entities:
                found_match = False

                for i, true_entity in enumerate(true_entities):
                    if i not in matched_true and is_match(pred_entity, true_entity, level):
                        tp += 1
                        matched_true.add(i)
                        found_match = True
                        break

                if not found_match:
                    fp += 1

            fn += len(true_entities) - len(matched_true)

        print(level, tp, fp, fn)
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        level_name = level_names[level]
        metrics.update({
            f'{level_name}_precision': precision,
            f'{level_name}_recall': recall,
            f'{level_name}_f1': f1
        })

    return metrics

In [None]:
def prepare_cross_domain_data(df, category_key, word_to_ix, tag_to_ix, BATCH_SIZE):
    """Build one leave-one-category-out split per distinct value of ``df[category_key]``.

    For each category, the rows of that category form the test set and all
    other rows form the training set.

    Returns:
        A list of dicts with the keys ``app_name_category``, ``train_loader``
        (shuffled), ``test_loader`` (unshuffled), ``test_tags`` and
        ``test_sentences``.
    """
    folds = []
    for held_out in df[category_key].value_counts().keys():
        train_rows = df[df[category_key] != held_out]
        test_rows = df[df[category_key] == held_out]

        train_sentences = train_rows['clean_content'].to_list()
        train_tags = train_rows['tags'].to_list()
        test_sentences = test_rows['clean_content'].to_list()
        test_tags = test_rows['tags'].to_list()

        train_data = prepare_data(train_sentences, train_tags, word_to_ix, tag_to_ix)
        test_data = prepare_data(test_sentences, test_tags, word_to_ix, tag_to_ix)

        fold = {
            'app_name_category': held_out,
            'train_loader': DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True),
            'test_loader': DataLoader(test_data, batch_size=BATCH_SIZE),
            'test_tags': test_tags,
            'test_sentences': test_sentences,
        }
        folds.append(fold)
    return folds

In [None]:
def cross_domain_training_eval(EMBEDDING_DIM, HIDDEN_DIM, glove_embeddings, TAGSET_SIZE, cross_domain_data, ix_to_tag, epochs, device):
    """Train a fresh model on every cross-domain split and score it on the held-out part.

    Args:
        EMBEDDING_DIM, HIDDEN_DIM, TAGSET_SIZE: Model sizes passed to ``initialize_model``.
        glove_embeddings: Initial embedding matrix; it is not modified.
        cross_domain_data: Splits as produced by ``prepare_cross_domain_data``.
        ix_to_tag: Mapping from tag index to tag string.
        epochs: Number of training passes per split.
        device: Device on which each model is placed.

    Returns:
        A list with one ``{category: metrics}`` dict per split.
    """
    results = []

    for data in cross_domain_data:
        # Double quotes outside, single inside: reusing the same quote type
        # inside an f-string is a SyntaxError before Python 3.12.
        print(f"For app name/category: {data['app_name_category']}")
        # Each split starts from its own copy of the pretrained vectors so that
        # fine-tuning in one split cannot leak into the next.
        model = initialize_model(EMBEDDING_DIM, HIDDEN_DIM, glove_embeddings.clone(), TAGSET_SIZE, device)
        optimizer = initialize_optimizer(model, lr=0.001)

        for i in range(epochs):
            train_model(model, data['train_loader'], optimizer, i)
        test_predictions, _ = test_model(model, data['test_loader'], ix_to_tag)
        f1_scores = calculate_metrics(test_predictions, data['test_tags'], data['test_sentences'])
        results.append({data['app_name_category']: f1_scores})

    return results

### Training and Evaluation on Dataset 1

In [None]:
# Load the preprocessed dataset-1 file into the global `dataset`, which the
# following cells (and out_domain_re_bert_test) read, then print its column summary.
dataset = pd.read_csv('../datafiles/dataset_1_preprocessed.csv')
dataset.info()

In [67]:
# The CSV stores the token and tag lists as their string representation; turn
# them back into Python lists. Values that are already parsed are left alone,
# so re-running this cell is harmless.
dataset['clean_content'] = dataset['clean_content'].apply(
    lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
)
dataset['tags'] = dataset['tags'].apply(
    lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
)
# Drop reviews without tokens. `.copy()` detaches the result from the original
# frame so later column assignments do not trigger SettingWithCopyWarning.
dataset = dataset[dataset['clean_content'].apply(len) > 0].copy()

In [None]:
# Hyperparameters for the dataset-1 experiments; out_domain_re_bert_test reads
# them as globals.
# Mini-batch size handed to the DataLoaders.
BATCH_SIZE = 32
# Decoder hidden width; the encoder uses HIDDEN_DIM // 2 units per direction.
HIDDEN_DIM = 512
# Dimension requested from GloVe '6B' and used as the encoder LSTM input size.
EMBEDDING_DIM = 300
# Number of training passes over each split's training loader.
epochs=10
def out_domain_re_bert_test():
    """Run one leave-one-app-out experiment on the globally loaded ``dataset``.

    Reads the notebook globals ``dataset``, ``EMBEDDING_DIM``, ``HIDDEN_DIM``,
    ``BATCH_SIZE``, ``epochs`` and ``device`` at call time, and splits the data
    by the ``'App id'`` column.

    Returns:
        The list of per-app metric dicts from ``cross_domain_training_eval``.
    """
    reviews = dataset['clean_content'].to_list()
    word_to_ix, tag_to_ix, ix_to_tag = transform_data(reviews)
    VOCAB_SIZE = len(word_to_ix)
    TAGSET_SIZE = len(tag_to_ix)
    glove_embeddings = get_glove(EMBEDDING_DIM, VOCAB_SIZE, word_to_ix)
    out_domain_data = prepare_cross_domain_data(dataset, 'App id', word_to_ix, tag_to_ix, BATCH_SIZE)
    results = cross_domain_training_eval(EMBEDDING_DIM, HIDDEN_DIM, glove_embeddings, TAGSET_SIZE, out_domain_data, ix_to_tag, epochs, device)
    return results

In [None]:
# Repeat the complete leave-one-app-out experiment 15 times and keep every
# run's results. Results are appended run by run, so an interrupted loop still
# leaves the finished runs in `all_results_dataset_1`.
# NOTE(review): no random seed is set in the visible cells, so runs presumably
# differ through weight initialisation and batch shuffling — confirm intended.
all_results_dataset_1=[]
for run in range(15):
    run_results = out_domain_re_bert_test()
    all_results_dataset_1.append({'run': run+1, 'results': run_results})
# The final dataset-1 figures are the average of the per-run results collected
# above; the averaging itself is not performed in this cell.

### Training and Evaluation on Dataset 2

In [None]:
# Load the preprocessed dataset-2 file and print its column summary.
# This rebinds the global `dataset`: out_domain_re_bert_test also reads that
# global at call time, so from here on it would operate on dataset 2.
dataset = pd.read_csv('../datafiles/dataset_2_preprocessed.csv')
dataset.info()

In [71]:
# The CSV stores the token and tag lists as their string representation; turn
# them back into Python lists. Values that are already parsed are left alone,
# so re-running this cell is harmless.
dataset['clean_content'] = dataset['clean_content'].apply(
    lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
)
dataset['tags'] = dataset['tags'].apply(
    lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
)
# Drop reviews without tokens. `.copy()` detaches the result from the original
# frame so later column assignments do not trigger SettingWithCopyWarning.
dataset = dataset[dataset['clean_content'].apply(len) > 0].copy()

In [None]:
# Hyperparameters for the dataset-2 experiments; out_domain_tfrex_test reads
# them as globals. These assignments overwrite the dataset-1 values.
# Mini-batch size handed to the DataLoaders.
BATCH_SIZE = 32
# Decoder hidden width; the encoder uses HIDDEN_DIM // 2 units per direction.
HIDDEN_DIM = 512
# Dimension requested from GloVe '6B' and used as the encoder LSTM input size.
EMBEDDING_DIM = 300
# Number of training passes over each split's training loader.
# NOTE(review): dataset 1 trains for 10 epochs but this is 1 — confirm that the
# difference is intentional and not a leftover from a quick test run.
epochs=1
def out_domain_tfrex_test():
    """Run one leave-one-category-out experiment on the globally loaded ``dataset``.

    Reads the notebook globals ``dataset``, ``EMBEDDING_DIM``, ``HIDDEN_DIM``,
    ``BATCH_SIZE``, ``epochs`` and ``device`` at call time, and splits the data
    by the ``'category'`` column.

    Returns:
        The list of per-category metric dicts from ``cross_domain_training_eval``.
    """
    reviews = dataset['clean_content'].to_list()
    word_to_ix, tag_to_ix, ix_to_tag = transform_data(reviews)
    VOCAB_SIZE = len(word_to_ix)
    TAGSET_SIZE = len(tag_to_ix)
    glove_embeddings = get_glove(EMBEDDING_DIM, VOCAB_SIZE, word_to_ix)
    out_domain_data = prepare_cross_domain_data(dataset, 'category', word_to_ix, tag_to_ix, BATCH_SIZE)
    results = cross_domain_training_eval(EMBEDDING_DIM, HIDDEN_DIM, glove_embeddings, TAGSET_SIZE, out_domain_data, ix_to_tag, epochs, device)
    return results

In [None]:
# Repeat the complete leave-one-category-out experiment 15 times and keep every
# run's results. Results are appended run by run, so an interrupted loop still
# leaves the finished runs in `all_results_dataset_2`.
# NOTE(review): no random seed is set in the visible cells, so runs presumably
# differ through weight initialisation and batch shuffling — confirm intended.
all_results_dataset_2=[]
for run in range(15):
    run_results = out_domain_tfrex_test()
    all_results_dataset_2.append({'run': run+1, 'results': run_results})
# The final dataset-2 figures are the average of the per-run results collected
# above; the averaging itself is not performed in this cell.