In [None]:
# Run environment_setup.sh from the current working directory via the shell
# (IPython `!` escape) before anything else in the notebook executes.
!./environment_setup.sh

## Imports

In [None]:
import itertools
import json
import os
import sys
import time

import pandas as pd
import tensorflow_hub as hub
import torch

sys.path.append('../src/')
from candidate_extraction.candidate_extraction import Extractor, supported_extraction_methods
from intent_label_generation.intent_label_generation import IntentLabelGenerator, supported_label_generation_methods
from intent_label_generation.readable_intents import readable_intents
from utils.io import load, save_dataframe, save_json, save

## Config

In [None]:
# Pipeline stage to execute; the generation and evaluation cells branch on it.
mode = 'generate'

# --- Dataset and result locations (relative to the notebook directory) ---
dataset = 'clinc'
data_dir = os.path.join("..", "data", "raw", dataset)
embeddings_path = os.path.join(
    "..", "data", "interim", "embeddings", f"{dataset}_embeddings.pkl"
)
results_dir = os.path.join("..", "results")

# --- Cluster model location ---
cluster_models_dir = os.path.join("..", "models")
cluster_model_filename = f"{dataset}_deep_aligned_cluster_model.pkl"
cluster_model_path = os.path.join(cluster_models_dir, cluster_model_filename)
print(cluster_model_path)

# Cartesian product of every extraction method with every label generation
# method; each element is an (extraction_method, generation_method) tuple.
all_choices = [supported_extraction_methods, supported_label_generation_methods]
combinations = list(
    itertools.product(supported_extraction_methods, supported_label_generation_methods)
)

## Load Data

In [None]:
# Locate the train/dev/test splits, read each one as a tab-separated table,
# and stack them into a single frame with a fresh 0..n-1 index.
all_files = [
    os.path.join(data_dir, f"{split}.tsv")
    for split in ('train', 'dev', 'test')
]
dfs = [pd.read_csv(path, sep='\t', index_col=None) for path in all_files]
df = pd.concat(dfs, ignore_index=True)

# Raw utterances as a plain Python list.
texts = df['text'].to_list()
# Number of distinct values in the 'label' column (missing values, if any,
# are counted as one value, exactly like len(unique())).
num_labels = df['label'].nunique(dropna=False)

In [None]:
if mode == 'generate':
    # Runs candidate extraction followed by intent label generation for every
    # (extraction_method, generation_method) combination, then reports timing.
    start_time = time.time()

    # The cluster model does not depend on the combination being run, so it is
    # loaded and validated once up front instead of once per loop iteration.
    cluster_model = None
    if os.path.exists(cluster_model_path):
        print("Loading existing cluster model...")
        # NOTE(review): `load` comes from utils.io and the path ends in .pkl, so
        # this presumably unpickles the file -- only point it at trusted files.
        cluster_model = load(cluster_model_path)
        num_clusters = len(set(cluster_model.labels_))
        assert num_clusters == num_labels, (
            f"cluster model has {num_clusters} clusters but the data has {num_labels} labels"
        )
    else:
        print("No cluster model found.")

    if cluster_model is not None:
        for extraction_method, generation_method in combinations:
            config = {'extraction_method': extraction_method, 'generation_method': generation_method}
            print(config)

            # Candidate Extraction
            # NOTE(review): `df` is rebound on every iteration, so each
            # combination after the first receives the previous combination's
            # output frame -- confirm this carry-over is intended.
            extractor = Extractor(dataset, config['extraction_method'])
            df = extractor.extract(df)

            # Intent Label Generation
            generator = IntentLabelGenerator(dataset, config['extraction_method'], config['generation_method'])
            df = generator.generate(df, cluster_model.labels_, False)

    end_time = time.time()
    total_time = end_time - start_time
    # Only report timing when the pipeline actually ran; max(..., 1) keeps the
    # average well-defined when there are no combinations at all.
    if cluster_model is not None:
        print("Total time: %.2f. Average: %.2f" % (total_time, total_time / max(len(combinations), 1)))

## Semantic Evaluation

In [None]:
def _save_best_config(metric, scores, combinations, dataset, results_dir, best_dir):
    """Pick the top-scoring combination for one metric and save it under best_dir.

    Args:
        metric: Short metric name used in the output file names
            (``best_<metric>_config.json`` and ``best_<metric>_labels.csv``).
        scores: One average score per entry of ``combinations``, in the same order.
        combinations: Sequence of (extraction_method, generation_method) tuples.
        dataset: Dataset name; prefix of each per-combination results directory.
        results_dir: Directory holding the per-combination results directories.
        best_dir: Directory the best config and labels are written to.

    Returns:
        Tuple ``(best_idx, config, config_dir, labels)`` for the winning
        combination, where ``labels`` is the frame read from its ``df_labels.csv``.

    Raises:
        ValueError: If ``scores`` is empty.
    """
    if not scores:
        raise ValueError(f"no scores available to select the best '{metric}' config")
    # First index of the maximum, i.e. ties resolve to the earliest combination.
    best_idx = scores.index(max(scores))
    extraction_method, generation_method = combinations[best_idx]
    config = {'extraction_method': extraction_method,
              'generation_method': generation_method}
    config_dir = os.path.join(results_dir, '_'.join([dataset, extraction_method, generation_method]))
    labels = pd.read_csv(os.path.join(config_dir, 'df_labels.csv'), index_col=False)

    save_json(os.path.join(best_dir, f'best_{metric}_config.json'), config)
    save_dataframe(os.path.join(best_dir, f'best_{metric}_labels.csv'), labels)
    return best_idx, config, config_dir, labels


if mode == 'evaluate':
    # Imported here so the evaluation dependencies are only needed in this mode.
    from evaluation.semantic_label_evaluation import SemanticLabelEvaluator
    embedding_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
    avg_cos_sims = []
    avg_bart_scores = []
    for extraction_method, generation_method in combinations:
        config_str = '_'.join([dataset, extraction_method, generation_method])
        config_dir = os.path.join(results_dir, config_str)
        df = pd.read_csv(os.path.join(config_dir, 'generation_df.csv'))
        evaluator = SemanticLabelEvaluator(dataset, extraction_method, generation_method)
        evaluator.evaluate(df, embedding_model)

        # The averages are read back from the combination's results directory
        # (presumably written there by evaluator.evaluate -- verify).
        with open(os.path.join(config_dir, 'avg_sim.json'), 'r') as f:
            avg_sim = json.load(f)
        avg_cos_sims.append(avg_sim['avg'])
        with open(os.path.join(config_dir, 'avg_bart.json'), 'r') as f:
            avg_bart = json.load(f)
        avg_bart_scores.append(avg_bart['avg'])

    # Best configs and labels are saved to results/<dataset>; make sure the
    # directory exists in case the save helpers do not create it themselves.
    best_dir = os.path.join(results_dir, dataset)
    os.makedirs(best_dir, exist_ok=True)

    # Get and save best config using cosine similarity
    best_cos_idx, best_cos_config, best_cos_config_dir, best_cos_labels = _save_best_config(
        'cos', avg_cos_sims, combinations, dataset, results_dir, best_dir)
    best_cos_combination = combinations[best_cos_idx]

    # Get and save best config using bart scores
    best_bart_idx, best_bart_config, best_bart_config_dir, best_bart_labels = _save_best_config(
        'bart', avg_bart_scores, combinations, dataset, results_dir, best_dir)
    best_bart_combination = combinations[best_bart_idx]