In [None]:
import itertools
import json
import os
import sys
import time

import pandas as pd
import tensorflow_hub as hub


sys.path.append('../src/')
from semantic_representation.semantic_representation import supported_embedding_models
from clustering.clustering import supported_clustering_algorithms
from clustering.clustering_evaluation import supported_clustering_metrics
from candidate_extraction.candidate_extraction import supported_extraction_methods
from intent_label_generation.intent_label_generation import supported_label_generation_methods
from utils.io import generate_config_str, save_dataframe, save_json
from supported_config import params as default_params

In [None]:
# Forwarded to ClusteringModel(config, gpu_fit=gpu_fit) in the generation cell.
# NOTE(review): presumably toggles GPU-based fitting of the clustering model — confirm.
gpu_fit = True

## Config

In [None]:
# Which stage to run: the cells below branch on 'generate' and 'evaluate'.
mode = 'evaluate'

# Data: dataset names plus the (relative) input and output directories.
dataset = ['stack_overflow']
data_dir = os.path.join("..", "data", "raw")
results_dir = os.path.join("..", "results")

# One list of options per configuration axis. The order matters: later cells
# read each combination by position (choices[0] .. choices[5]).
all_choices = [
    dataset,                                            # 0: dataset
    ["bert-base-uncased", "all-mpnet-base-v2", "use"],  # 1: embedding_model_name
    ["dbscan"],                                         # 2: clustering_algorithm
    ["silhouette"],                                     # 3: clustering_measure
    ["liu", "prompting"],                               # 4: extraction_method
    supported_label_generation_methods,                 # 5: generation_method
]
# Cartesian product of all axes: one tuple per configuration.
combinations = [*itertools.product(*all_choices)]

# Disabled: manual pruning of combinations by positional index.
#indexes = [4,5,6,7,12,13,14,15]


# With DA
#indexes = [8,9,10,11,16,17,18,19,20,21,22,23,28,29,30,31]
# Remove DA
#indexes.extend([4,5,6,7])
#for index in sorted(indexes, reverse=True):
#    del combinations[index]

In [None]:
# Show every configuration next to its positional index in `combinations`.
for position, combo in enumerate(combinations):
    print(position, combo)

In [None]:
if mode == 'generate':
    # Generation pipeline: for every configuration, embed the raw texts, cluster
    # them, extract candidates and generate intent labels. Configurations whose
    # generation_df.csv already exists under results_dir are skipped.
    from semantic_representation.semantic_representation import SemanticRepresentation
    from candidate_extraction.candidate_extraction import Extractor
    from clustering.clustering import ClusteringModel
    from clustering.clustering_evaluation import ClusteringEvaluation
    from intent_label_generation.intent_label_generation import IntentLabelGenerator
    start_time = time.time()
    for i, choices in enumerate(combinations):
        config = {'dataset': choices[0],
              'embedding_model_name': choices[1],
              'clustering_algorithm': choices[2],
              'clustering_measure': choices[3],
              'extraction_method': choices[4],
              'generation_method': choices[5]}
        config_str = generate_config_str(config)
        print(f"\nConfiguration {i}: {config_str}")
        if os.path.exists(os.path.join(results_dir, config_str, "generation_df.csv")):
            print("\tResults already exist")
            continue
        # Read in raw data
        data_filename = f"{config['dataset']}.csv"
        df = pd.read_csv(os.path.join(data_dir, data_filename))
        # Semantic Representation
        embedding_model = SemanticRepresentation(config)
        embeddings = embedding_model.embed(df["text"].tolist())
        clustering_algorithm = config['clustering_algorithm']
        if clustering_algorithm not in ["deep-aligned", "iter_dbscan"]:
            # Clustering
            cluster_model = ClusteringModel(config, gpu_fit=gpu_fit)
            algorithm_params = default_params[clustering_algorithm]
            if clustering_algorithm == "kmeans":
                # Work on a copy: writing the clamped range straight into the
                # shared defaults would leak it into later configurations.
                algorithm_params = dict(algorithm_params)
                default_min_k = algorithm_params["range"][0]
                # k cannot exceed the number of samples in this dataset.
                max_k = min(algorithm_params["range"][-1]+1, len(df))
                algorithm_params["range"] = range(default_min_k, max_k)
            cluster_model.run_clustering(embeddings, algorithm_params)
        cluster_evaluation = ClusteringEvaluation(config, gpu_eval=False)
        cluster_model = cluster_evaluation.get_best_params(embeddings)
        # Candidate Extraction
        extractor = Extractor(config)
        df = extractor.extract(df)
        # Intent Label Generation
        generator = IntentLabelGenerator(config)
        if clustering_algorithm == "iter_dbscan":
            # For iter_dbscan the best "model" is indexed like a table with a cluster_id column.
            labels = cluster_model["cluster_id"].tolist()
        else:
            labels = cluster_model.labels_
        df = generator.generate(df, labels, True)

    end_time = time.time()
    total_time = end_time - start_time
    # max(..., 1) avoids a ZeroDivisionError when there are no combinations.
    print("Total time: %.2f. Average: %.2f" % (total_time, total_time/max(len(combinations), 1)))

## Semantic Evaluation

In [None]:
# Rename the "label" column of every configuration's generation_df.csv to
# "intent", rewriting the file in place.
# NOTE(review): presumably the evaluation step expects an "intent" column — confirm.
for choices in combinations:
    config = {'dataset': choices[0],
          'embedding_model_name': choices[1],
          'clustering_algorithm': choices[2],
          'clustering_measure': choices[3],
          'extraction_method': choices[4],
          'generation_method': choices[5]}
    config_str = generate_config_str(config)
    # Build the path from results_dir so this cell follows the config cell
    # like every other cell does.
    filepath = os.path.join(results_dir, config_str, "generation_df.csv")
    df = pd.read_csv(filepath, index_col=0)
    if "label" not in df.columns:
        # Nothing to rename (e.g. the cell was already run); leave the file untouched.
        continue
    df = df.rename(columns={"label": "intent"})
    df.to_csv(filepath)

In [None]:
# Collapse the dataset list from the config cell to its first entry. The
# isinstance guard keeps this cell idempotent: unguarded, a second run would
# index into the string itself ('stack_overflow' -> 's').
if isinstance(dataset, (list, tuple)):
    dataset = dataset[0]
if mode == 'evaluate':
    # Semantic evaluation: score the generated labels of every configuration of
    # `dataset`, then store the best configuration (and its labels) per metric
    # under results_dir/<dataset>.
    from intent_label_generation.readable_intents import readable_intents
    from semantic_representation.semantic_representation import SemanticRepresentation
    from evaluation.semantic_label_evaluation import SemanticLabelEvaluator

    config_keys = ('dataset', 'embedding_model_name', 'clustering_algorithm',
                   'clustering_measure', 'extraction_method', 'generation_method')

    def _config_from(choices):
        """Map a combination tuple onto the named configuration fields."""
        return dict(zip(config_keys, choices))

    dataset_combos = [c for c in combinations if c[0] == dataset]
    if not dataset_combos:
        # Fail early with a clear message instead of an opaque max() error
        # after the embedding model has been loaded.
        raise ValueError(f"No configurations found for dataset {dataset!r}")

    embedding_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
    avg_cos_sims = []
    avg_bart_scores = []
    for choices in dataset_combos:
        config = _config_from(choices)
        config_str = generate_config_str(config)
        print(config_str)
        config_dir = os.path.join(results_dir, config_str)
        df = pd.read_csv(os.path.join(config_dir, 'generation_df.csv'))
        # Preview only; dumping the full frame floods the cell output.
        print(df.head())
        evaluator = SemanticLabelEvaluator(config)
        evaluator.evaluate(df, embedding_model)

        # NOTE(review): avg_sim.json / avg_bart.json are presumably written into
        # config_dir by evaluator.evaluate — confirm.
        with open(os.path.join(config_dir, 'avg_sim.json'), 'r') as f:
            avg_sim = json.load(f)
            avg_cos_sims.append(avg_sim['avg'])
        with open(os.path.join(config_dir, 'avg_bart.json'), 'r') as f:
            avg_bart = json.load(f)
            avg_bart_scores.append(avg_bart['avg'])

    # Make sure the output directory exists before anything is saved into it.
    best_dir = os.path.join(results_dir, dataset)
    os.makedirs(best_dir, exist_ok=True)

    # Get best config using cosine similarity
    best_cos_idx = avg_cos_sims.index(max(avg_cos_sims))
    best_cos_combination = dataset_combos[best_cos_idx]
    best_cos_config = _config_from(best_cos_combination)
    best_cos_config_dir = os.path.join(results_dir, generate_config_str(best_cos_config))
    best_cos_labels = pd.read_csv(os.path.join(best_cos_config_dir, 'df_labels.csv'), index_col=False)

    # Save best config and labels to results
    save_json(os.path.join(best_dir, 'best_cos_config.json'), best_cos_config)
    save_dataframe(os.path.join(best_dir, 'best_cos_labels.csv'), best_cos_labels)

    # Get best config using bart scores
    best_bart_idx = avg_bart_scores.index(max(avg_bart_scores))
    best_bart_combination = dataset_combos[best_bart_idx]
    best_bart_config = _config_from(best_bart_combination)
    best_bart_config_dir = os.path.join(results_dir, generate_config_str(best_bart_config))
    best_bart_labels = pd.read_csv(os.path.join(best_bart_config_dir, 'df_labels.csv'), index_col=False)

    # Save best config and labels to results
    save_json(os.path.join(best_dir, 'best_bart_config.json'), best_bart_config)
    save_dataframe(os.path.join(best_dir, 'best_bart_labels.csv'), best_bart_labels)