In [None]:
import itertools
import json
import os
import sys
import time

import pandas as pd
import tensorflow_hub as hub

sys.path.append('../src/')
from intent_label_generation.intent_label_generation import supported_label_generation_methods
from utils.io import generate_config_str, save_dataframe, save_json

## Config

In [None]:
# Data
dataset = ['stack_overflow']
data_dir = os.path.join("..", "data", "raw")
supervision = "unsupervised"

# Embedding models and clustering algorithms swept for the chosen supervision mode.
if supervision != "unsupervised":
    embedding_models = ["bert-base-uncased"]
    clustering_algorithms = ["deep-aligned"]
else:
    embedding_models = ["bert-base-uncased", "all-mpnet-base-v2", "use"]
    clustering_algorithms = ["kmeans", "dbscan", "iter_dbscan"]

# One list per config field; each combination tuple follows this order:
# (dataset, embedding model, clustering algorithm, clustering measure,
#  extraction method, generation method).
all_choices = [
    dataset,
    embedding_models,
    clustering_algorithms,
    ["silhouette"],
    ["liu", "prompting"],
    supported_label_generation_methods,
]
combinations = list(itertools.product(*all_choices))

if supervision == "unsupervised":
    # Positions of the combinations that are excluded from the sweep.
    # NOTE(review): these positions are only meaningful for the current order
    # and length of every list in `all_choices` (including
    # `supported_label_generation_methods`) -- confirm they still select the
    # intended combinations whenever any of those lists changes.
    indexes = [8,9,10,11,20,21,22,23]
    # Remove from the highest position down so that earlier removals do not
    # shift the positions that are still pending.
    for index in sorted(indexes, reverse=True):
        combinations.pop(index)

In [None]:
# Show every remaining combination next to its position in the list.
for position, combo in enumerate(combinations):
    print(position, combo)

## Semantic Evaluation

In [None]:
# Collapse the one-element dataset list from the config cell to its name.
# The guard keeps this cell idempotent: an unguarded `dataset[0]` on a second
# run would index into the string and leave only its first character.
if not isinstance(dataset, str):
    dataset = dataset[0]
results_dir = os.path.join("..", "results", dataset)

# Project-local imports; they rely on the `sys.path` entry added in the first cell.
from intent_label_generation.readable_intents import readable_intents
from semantic_representation.semantic_representation import SemanticRepresentation
from evaluation.semantic_label_evaluation import SemanticLabelEvaluator


def _build_config(combination):
    """Map one combination tuple onto the named config fields."""
    return {'dataset': combination[0],
            'embedding_model_name': combination[1],
            'clustering_algorithm': combination[2],
            'clustering_measure': combination[3],
            'extraction_method': combination[4],
            'generation_method': combination[5]}


def _read_avg(path):
    """Return the 'avg' entry of the JSON file at `path`."""
    with open(path, 'r') as f:
        return json.load(f)['avg']


def _load_best(scores):
    """Find the highest-scoring combination and load its labels.

    `scores` is aligned index-for-index with `dataset_combos`; on ties the
    earliest combination wins.

    Returns a tuple (index, combination, config, config_dir, labels), where
    `labels` is the `df_labels.csv` of the winning config read as a DataFrame.
    """
    idx = scores.index(max(scores))
    combination = dataset_combos[idx]
    config = _build_config(combination)
    config_dir = os.path.join(results_dir, generate_config_str(config))
    labels = pd.read_csv(os.path.join(config_dir, 'df_labels.csv'), index_col=False)
    return idx, combination, config, config_dir, labels


embedding_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
avg_cos_sims = []
avg_bart_scores = []
dataset_combos = [c for c in combinations if c[0] == dataset]
if not dataset_combos:
    # Fail early with a clear message instead of an opaque max() error below.
    raise ValueError(f"No combinations found for dataset {dataset!r}")

for choices in dataset_combos:
    config = _build_config(choices)
    config_str = generate_config_str(config)
    print(config_str)
    config_dir = os.path.join(results_dir, config_str)
    df = pd.read_csv(os.path.join(config_dir, 'generation_df.csv'))
    print(df)

    avg_sim_path = os.path.join(config_dir, 'avg_sim.json')
    avg_bart_path = os.path.join(config_dir, 'avg_bart.json')
    # Both score files are read below, so evaluate when either one is missing.
    # NOTE(review): assumes `evaluate` writes both avg_sim.json and
    # avg_bart.json into the config directory -- confirm against the evaluator.
    if not (os.path.exists(avg_sim_path) and os.path.exists(avg_bart_path)):
        evaluator = SemanticLabelEvaluator(config)
        evaluator.evaluate(df, embedding_model)

    avg_cos_sims.append(_read_avg(avg_sim_path))
    avg_bart_scores.append(_read_avg(avg_bart_path))

# Make sure the output directory for the "best" artefacts exists before saving.
best_dir = os.path.join(results_dir, supervision)
os.makedirs(best_dir, exist_ok=True)

# Get best config using cosine similarity
(best_cos_idx, best_cos_combination, best_cos_config,
 best_cos_config_dir, best_cos_labels) = _load_best(avg_cos_sims)

# Save best config and labels to results
save_json(os.path.join(best_dir, f'best_cos_config_{supervision}.json'), best_cos_config)
save_dataframe(os.path.join(best_dir, f'best_cos_labels_{supervision}.csv'), best_cos_labels)

# Get best config using bart scores
(best_bart_idx, best_bart_combination, best_bart_config,
 best_bart_config_dir, best_bart_labels) = _load_best(avg_bart_scores)

# Save best config and labels to results
save_json(os.path.join(best_dir, f'best_bart_config_{supervision}.json'), best_bart_config)
save_dataframe(os.path.join(best_dir, f'best_bart_labels_{supervision}.csv'), best_bart_labels)