Description:
Code for creating the Answer Similarity Graph.It also has the code for loading,saving,creating subgraphs with HTML and .txt(used by Louvain).

In [1]:
import time
import typing
from typing import Union

import pyspark.sql.functions as F
from pyspark.sql import SparkSession

spark = (SparkSession.builder
    .appName("Simple Graphs")
    .config('spark.executor.instances','2')
    .config('spark.executor.memory','12G')
    .config('spark.executor.cores','4')
    .config('spark.driver.memory','6G')
    .config('spark.dynamicAllocation.enabled','false') # musai??
    .master('spark://master:7077')
    .config("spark.jars.packages", "graphframes:graphframes:0.8.4-spark3.5-s_2.12")
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .getOrCreate()
)
from graphframes import GraphFrame

DATASETS_FOLDER='/home/ubuntu/data/'
OUTPUT_FILE_FOLDER = 'results'

SAVED_GRAPHS_FOLDER='/home/ubuntu/graphs/'
ANSWER_SIMILARITY_GRAPH_NAME = 'AS Graph'

GRAPH_NODE_SIZE = 5

:: loading settings :: url = jar:file:/usr/local/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/ubuntu/.ivy2/cache
The jars for the packages stored in: /home/ubuntu/.ivy2/jars
graphframes#graphframes added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-1a2a0e08-7165-486c-9dab-3f45ca8dec6c;1.0
	confs: [default]
	found graphframes#graphframes;0.8.4-spark3.5-s_2.12 in spark-packages
	found org.slf4j#slf4j-api;1.7.16 in central
:: resolution report :: resolve 127ms :: artifacts dl 4ms
	:: modules in use:
	graphframes#graphframes;0.8.4-spark3.5-s_2.12 from spark-packages in [default]
	org.slf4j#slf4j-api;1.7.16 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   2   |   0   |   0   |   0   ||   2   |   0   |
	---------------------------------

In [2]:
from pyvis.network import Network
import networkx as nx
from IPython.display import display, HTML
import os

def read_dataset_with_pyspark(dataset_folder: str, dataset_name: str, dataset_extension: str = ".parquet"):
    dataset_path = dataset_folder + dataset_name
    if dataset_extension !='.parquet':
        dataset_path+=dataset_extension

    if dataset_extension == '.json':
        return spark.read.json(dataset_path, multiLine=True)
    elif dataset_extension == '.parquet':
        return spark.read.parquet(dataset_path)
    return None



def save_answer_similarity_graph(dataset_folder: str, dataset_name: str,
                                 threshold: float = 0.5, precision: float = 1e-9,
                                 max_no_nodes: Union[int, None] = 400,
                                 full_graph_folder: str = '',
                                 full_graph_name: str = ''):
    graph = get_answer_similarity_graph(dataset_folder, dataset_name,
                                        max_no_nodes=max_no_nodes,
                                        threshold=threshold, precision=precision,
                                        verbose_console=False, verbose_file=False,
                                        output_file=None
                                        )

    vertices_path = full_graph_folder + '/' + full_graph_name + ' Vertices'
    edges_path = full_graph_folder + '/' + full_graph_name + ' Edges'

    graph.vertices.write.parquet(vertices_path)
    graph.edges.write.parquet(edges_path)


def load_answer_similarity_graph(full_graph_folder: str,
                                 full_graph_name: str) -> GraphFrame:
    vertices_path = full_graph_folder + '/' + full_graph_name + ' Vertices'
    edges_path = full_graph_folder + '/' + full_graph_name + ' Edges'

    vertices = spark.read.parquet(vertices_path)
    edges = spark.read.parquet(edges_path)

    return GraphFrame(vertices, edges)


def load_graph(full_graph_folder: str,
                                 full_graph_name: str) -> GraphFrame:
    vertices_path = full_graph_folder + '/' + full_graph_name + ' Vertices'
    edges_path = full_graph_folder + '/' + full_graph_name + ' Edges'

    vertices = spark.read.parquet(vertices_path)
    edges = spark.read.parquet(edges_path)

    return GraphFrame(vertices, edges)

# Answer similarities graph (AS)

## Code for word/answer similarities and wordnet

In [3]:
# import nltk
# nltk.download('wordnet')

from nltk.corpus import wordnet as wn

from typing import Union


def compute_wordnet_metrics(word: str) -> Union[dict, None]:
    synset = wn.synsets(word)

    if len(synset) == 0:
        return None

    main_item = synset[0]

    word_metrics = {
        'Synonyms': set([syn.name().split('.')[0] for syn in wn.synsets(word)]),
        'Hyponyms': set([syn.name().split('.')[0] for syn in main_item.hyponyms()]),
        'Hypernyms': set([syn.name().split('.')[0] for syn in main_item.hypernyms()]),
        'Meronyms': set([syn.name().split('.')[0] for syn in main_item.part_meronyms()])
    }

    antonyms = []
    for syn in synset:
        for lem in syn.lemmas():
            if lem.antonyms():
                antonyms.append(lem.antonyms()[0].name().split('.')[0])

    word_metrics['Antonyms'] = set(antonyms)

    return word_metrics


def compute_similarity_scores(word1: str, word2: str) -> Union[dict, None]:
    if len(wn.synsets(word1)) == 0 or len(wn.synsets(word2)) == 0:
        return None

    syn1 = wn.synsets(word1)[0]
    syn2 = wn.synsets(word2)[0]

    return {
        'path': syn1.path_similarity(syn2),
        'wup': syn1.wup_similarity(syn2),
    }


def compute_word_similarities(word1: str, word2: str) -> float:
    """
    Take the metrics of both words and for each set count the common no of words over the total words.
    I thought of this as applying the Jaccard metric
    :param word1:
    :param word2:
    :return: the total number of common words / the total number of words
    """
    word1_metrics = compute_wordnet_metrics(word1)
    word2_metrics = compute_wordnet_metrics(word2)

    if word1_metrics is None or word2_metrics is None:
        return 0

    count_common_words = 0
    count_all_words = 0
    for key in word1_metrics.keys():
        count_common_words += len(word1_metrics[key] & word2_metrics[key])
        count_all_words += len(word1_metrics[key] | word2_metrics[key])

    return count_common_words / count_all_words


def appears_in_wordnet(word: str) -> bool:
    return len(wn.synsets(word)) > 0


## AS graph creation

In [4]:
def get_question_similarity_based_on_answer(question1_answer: str, question2_answer: str) -> float:
    """
    Compute the similarity between 2 answers(assuming they are 1 word).The values are in range[0,1]
    using both the similarity scores between WUP and a custom metric(similar with Jacobi) using the neighborhood
    of the answer/word
    """
    similarity_scores = compute_similarity_scores(question1_answer, question2_answer)
    wup_score = (similarity_scores['wup'] if similarity_scores is not None else 0)

    answers_neighborhood_similarity = compute_word_similarities(question1_answer, question2_answer)

    return wup_score + answers_neighborhood_similarity
    

def get_answer_similarity_graph(dataset_folder: str, dataset_name: str,
                                max_no_nodes: Union[int, None] = None,
                                threshold: float = 0.5, precision: float = 1e-9,
                                verbose_console: bool = True, verbose_file: bool = False,
                                output_file: Union[None, typing.TextIO] = None
                                ):
    dataset = read_dataset_with_pyspark(dataset_folder, dataset_name)

    vertices = []
    vertices_scheme = ['id', 'question', 'answer', 'level', 'type']
    edges = []
    edges_scheme = ['src', 'dst', 'relationship']

    all_answers = set()
    for index, question in enumerate(dataset.toLocalIterator()):
        answer_text = question['answer']
        if len(answer_text.split(' ')) == 1 and appears_in_wordnet(answer_text) and answer_text not in all_answers:
            all_answers.add(answer_text)

            answer_node_id = 'a' + str(index)

            vertices.append((answer_node_id, question['question'], answer_text, question['level'], question['type']))

        if max_no_nodes is not None and len(vertices) >= max_no_nodes:
            break

    no_valid_questions=len(vertices)
    if verbose_console:
        print(f"No valid questions: {no_valid_questions}")

    if verbose_file:
        print(f"No valid questions: {no_valid_questions}",file=output_file)
    

    no_edges_from_similarity, no_edges_from_level, no_edges_from_type = 0, 0, 0
    for index1, node1 in enumerate(vertices):
        index2 = index1 + 1
        node1_id = node1[0]
        while index2 < len(vertices):
            node2 = vertices[index2]
            node2_id = node2[0]

            # If the 2 questions have similar answers
            question_similarity = get_question_similarity_based_on_answer(node1[2], node2[2])
            if question_similarity - threshold >= precision:
                edges.append((node1_id, node2_id, 'similar'))
                no_edges_from_similarity += 1

            index2 += 1
        # print(f"At {index1}")

    no_questions=dataset.count()
    if verbose_console:
        print(f"\nNo questions: {no_questions}")

    if verbose_file:
        print(f"\nNo questions: {no_questions}", file=output_file)

    vertices = spark.createDataFrame(vertices, vertices_scheme)
    edges = spark.createDataFrame(edges, edges_scheme)
    graph = GraphFrame(vertices, edges).dropIsolatedVertices()

    no_nodes, no_edges = None, None
    if verbose_console or verbose_file:
        no_nodes = graph.vertices.count()
        no_edges = graph.edges.count()

    if verbose_console:
        print(f"No nodes: {no_nodes}")
        print(f"Total no edges: {no_edges}")
        print(
            f"No edges from similarity >= {threshold} : {no_edges_from_similarity} ({(no_edges_from_similarity / no_edges):.2f} %)")

    if verbose_file:
        print(f"No nodes: {no_nodes}", file=output_file)
        print(f"Total no edges: {no_edges}", file=output_file)
        print(
            f"No edges from similarity >= {threshold} : {no_edges_from_similarity} ({(no_edges_from_similarity / no_edges):.2f} %)",
            file=output_file)

    return graph

## AS Graph info

In [5]:
def show_answer_similarity_graph(dataset_folder: str, dataset_name: str,
                                 threshold: float = 0.5, precision: float = 1e-9,
                                 max_no_nodes: Union[int, None] = 400,
                                 verbose_console: bool = True):
    graph = get_answer_similarity_graph(dataset_folder, dataset_name, max_no_nodes=max_no_nodes,
                                        threshold=threshold, precision=precision,
                                        verbose_console=verbose_console,
                                        verbose_file=False, output_file=None)

    df_connected_components = graph.connectedComponents(algorithm="graphx").collect()

    nx_graph = nx.empty_graph()
    for row in df_connected_components:
        node_id = row['id']
        group = row['component']

        question, answer, level, question_type = row['question'], row['answer'], row['level'], \
                                                 row['type']
        title = f'Question:{question}\nAnswer:{answer}\nLevel:{level}\nType:{question_type}'
        shape = 'triangle'

        nx_graph.add_node(node_id, size=GRAPH_NODE_SIZE, title=title, group=group, shape=shape)

    for edge in graph.edges.collect():
        node1 = edge['src']
        node2 = edge['dst']
        nx_graph.add_edge(node1, node2)

    nt = Network('900px', '1200px', select_menu=True, cdn_resources='remote')
    nt.from_nx(nx_graph)

    html = nt.generate_html()
    with open(OUTPUT_FILE_FOLDER + "/" + ANSWER_SIMILARITY_GRAPH_NAME + " Visualization.html", mode='w',
              encoding='utf-8') as html_file:
        html_file.write(html)
    display(HTML(html))



def show_answer_similarity_graph_df(dataset_folder: str, dataset_name: str,
                                    threshold: float = 0.5, precision: float = 1e-9,
                                    max_no_nodes: Union[int, None] = 400,
                                    save_graph: bool = False,
                                    verbose_console: bool = True, verbose_file: bool = False,
                                    from_graph=None):
    """
    4.Nodes as the answers,and create edges based on the similarity between answers.In this case,only consider answers with 1 word in order
    #   compute similarity using WordNet
    """

    output_file = None
    if verbose_file:
        output_file = open(
            OUTPUT_FILE_FOLDER + "/" + dataset_name + ' ' + ANSWER_SIMILARITY_GRAPH_NAME + ".txt", 'w')

    graph=None
    if from_graph is None:
        graph = get_answer_similarity_graph(dataset_folder, dataset_name,
                                            max_no_nodes=max_no_nodes,
                                            threshold=threshold, precision=precision,
                                            verbose_console=verbose_console, verbose_file=verbose_file,
                                            output_file=output_file
                                            )
    else:
        graph=from_graph

    no_vertices,no_edges=graph.vertices.count(),graph.edges.count()
    if verbose_console:
        print(f"\nNo vertices: {no_vertices}")
        print(f"No edges: {no_edges}")

    if verbose_file:
        print(f"\nNo vertices: {no_vertices}",file=output_file)
        print(f"No edges: {no_edges}",file=output_file)

    df_connected_components = graph.connectedComponents(algorithm="graphx")

    # df_connected_components.show()

    df_connected_components_grouped = df_connected_components.groupBy("component")

    no_components=df_connected_components.select("component").distinct().count()
    if verbose_console:
        print(f"\nNo components: {no_components}")

    if verbose_file:
        print(f"\nNo components: {no_components}",file=output_file)

    df_connected_components_sizes = df_connected_components_grouped.agg(F.count("*").alias("size"))

    df_connected_components_max_min = df_connected_components_sizes.agg(F.max(F.col('size')).alias('max_no_nodes'),
                                                                        F.min(F.col('size')).alias(
                                                                            'min_no_nodes')).first()
    max_size_connected_component, min_size_connected_component = df_connected_components_max_min[0], \
                                                                 df_connected_components_max_min[1]

    if verbose_console:
        print(f"\nMax no nodes in a component: {max_size_connected_component}")
        print(f"Min no nodes in a component: {min_size_connected_component}")

    if verbose_file:
        print(f"\nMax no nodes in a component: {max_size_connected_component}", file=output_file)
        print(f"Min no nodes in a component: {min_size_connected_component}", file=output_file)

In [15]:
def main():
    start_time = time.time()
    
    dataset_name='hotpot_train_v1.1'
    threshold=0.9
    max_no_nodes=500

    show_answer_similarity_graph(DATASETS_FOLDER, dataset_name,
                                     threshold=threshold,
                                     max_no_nodes=max_no_nodes,
                                     verbose_console=True)

    print(f"Execution time: {time.time() - start_time} s")


main()

[Stage 610:>                                                        (0 + 1) / 1]

No valid questions: 500


                                                                                


No questions: 90447
No nodes: 231
Total no edges: 436
No edges from similarity >= 0.9 : 436 (1.00 %)


                                                                                

Execution time: 57.497283935546875 s


In [19]:
def main2():
    start_time = time.time()
    
    dataset_name='hotpot_train_v1.1'

    # Irrelevant since we load from the graph
    max_no_patterns_to_print=20
    threshold=0.9
    max_no_nodes=None

    graph=load_graph('/home/ubuntu/AS Graphs','AS Graph Train')
    
    show_answer_similarity_graph_df(DATASETS_FOLDER, dataset_name,
                                    threshold=threshold,
                                    max_no_nodes=max_no_nodes,
                                    save_graph=True,
                                    verbose_console=True, verbose_file=True,
                                    from_graph=graph
                                   )

    print(f"Execution time: {time.time() - start_time} s")


main2()


Max no nodes in a component: 194
Min no nodes in a component: 2
Execution time: 5.877483129501343 s


## AS Subgraphs and application of PageRank and Label Propagation

In [7]:
import codecs

ANSWER_SIMILARITY_SUBGRAPH_FOLDER = 'AS Subgraphs'

GRAPH_NODE_SIZE = 5

PAGE_RANK_NODE_SCALING_SIZE = 5
AS_LENGTH_EDGE = 500

def create_txt_for_AS_subgraph(subgraph, component_id: int):
    no_vertices = subgraph.vertices.count()
    no_edges = subgraph.edges.count()
    subgraph_name = f"Component {component_id} - {no_vertices}.txt"

    directory = 'results/Louvain/AS Subgraphs/'
    # Create folders if they don't exist
    if not os.path.exists(directory):
        os.makedirs(directory)
    file_path = directory + subgraph_name

    start_index = 1
    answer_index = dict()

    with codecs.open(file_path, 'w', "utf-8") as file:
        file.write(f'p {no_vertices} {no_edges}\n')

        for row in subgraph.vertices.collect():
            node_id = row['id']

            if node_id not in answer_index:
                answer_index[node_id] = start_index
                start_index += 1

            question, answer, level, question_type = row['question'], row['answer'], row['level'], \
                                                     row['type']
            node_index = answer_index[node_id]

            file.write(f'v {node_index} {answer} {level} {question_type} {question}\n')

        for row in subgraph.edges.collect():
            node_id1 = row['src']
            node_id2 = row['dst']

            file.write(f'e {answer_index[node_id1]} {answer_index[node_id2]} 1\n')


def create_html_for_AS_subgraph(subgraph, component_id: int):
    no_vertices = subgraph.vertices.count()

    nx_graph = nx.empty_graph()
    for row in subgraph.vertices.collect():
        node_id = row['id']

        question, answer, level, question_type = row['question'], row['answer'], row['level'], \
                                                 row['type']

        page_rank = row['pagerank']
        group = row['label']

        title = f'Page Rank:{page_rank}\nQuestion:{question}\nAnswer:{answer}\nLevel:{level}\nType:{question_type}'
        label = f'Answer:{answer}\nPage Rank:{page_rank:.2f}\nLabel:{group}'
        shape = 'square'

        node_size = int(round(PAGE_RANK_NODE_SCALING_SIZE * page_rank))

        nx_graph.add_node(node_id, size=node_size, title=title, label=label, shape=shape, group=group)

    for edge in subgraph.edges.collect():
        node1 = edge['src']
        node2 = edge['dst']
        weight = edge['weight']

        nx_graph.add_edge(node1, node2, label=weight)

    nt = Network('900px', '1700px', select_menu=True, cdn_resources='remote')
    nt.from_nx(nx_graph)

    for edge in subgraph.edges.collect():
        node1 = edge['src']
        node2 = edge['dst']
        nx_graph.add_edge(node1, node2, length=AS_LENGTH_EDGE)

    nt = Network('900px', '1700px', select_menu=True, cdn_resources='remote')
    nt.from_nx(nx_graph)

    # nt.enable_physics(True)

    nt.set_options("""
                    {
                      "interaction": {
                        "hover": true,          
                        "navigationButtons": true,
                        "keyboard": true        
                      },
                      "manipulation": {
                        "enabled": true         
                      },
                      "physics": {
                        "enabled": true,
                        "repulsion": {
                          "nodeDistance": 450,
                          "springLength": 500
                        }
                      },
                      "layout": {
                        "hierarchical": {
                          "enabled": false
                        }
                      }
                    }
                    """
                   )

    html_name = f"Component {component_id} - {no_vertices} nodes.html"
    html_directory = 'results/Louvain/AS Subgraphs/'
    # Create folders if they don't exist
    if not os.path.exists(html_directory):
        os.makedirs(html_directory)

    html_path = html_directory + html_name

    html = nt.generate_html()
    with open(html_path, mode='w',
              encoding='utf-8') as html_file:
        html_file.write(html)
    # display(HTML(html))


def show_answer_similarity_subgraphs(dataset_folder: str, dataset_name: str,
                                     threshold: float = 0.5, precision: float = 1e-9,
                                     max_no_nodes: Union[int, None] = 400,
                                     subgraphs_min_no_nodes: int = 4, subgraphs_max_no_nodes:int=1000,
                                     subgraphs_min_no_edges: int = 4,
                                     page_rank_beta: float = 0.15, page_rank_max_no_iterations: int = 10,
                                     lpa_max_no_iterations: int = 10,
                                     save_full_graph: bool = False,
                                     full_graph_folder: Union[None, str] = None,
                                     full_graph_name: Union[None, str] = None,
                                     verbose_console: bool = True, verbose_file: bool = False,
                                     from_graph=None):
    output_file = None
    if verbose_file:
        output_file = open(
            OUTPUT_FILE_FOLDER + "/" + dataset_name + ' ' + ANSWER_SIMILARITY_GRAPH_NAME + ".txt", 'w')

    graph=None
    if from_graph is None:
        graph = get_answer_similarity_graph(dataset_folder, dataset_name,
                                            max_no_nodes=max_no_nodes,
                                            threshold=threshold, precision=precision,
                                            verbose_console=verbose_console, verbose_file=verbose_file,
                                            output_file=output_file
                                            )
    else:
        graph=from_graph

    if save_full_graph and full_graph_folder is not None and full_graph_name is not None:
        vertices_path = full_graph_folder + '/' + full_graph_name + ' Vertices'
        edges_path = full_graph_folder + '/' + full_graph_name + ' Edges'

        graph.vertices.write.parquet(vertices_path)
        graph.edges.write.parquet(edges_path)

    df_connected_components = graph.connectedComponents(algorithm="graphx")

    # df_connected_components.show()

    df_connected_components_grouped = df_connected_components.groupBy("component")

    df_connected_components_sizes = df_connected_components_grouped.agg(F.count("*").alias("size"))

    df_connected_components_max_min = df_connected_components_sizes.agg(F.max(F.col('size')).alias('max_no_nodes'),
                                                                        F.min(F.col('size')).alias(
                                                                            'min_no_nodes')).first()
    max_size_connected_component, min_size_connected_component = df_connected_components_max_min[0], \
                                                                 df_connected_components_max_min[1]

    if verbose_console:
        print(f"Max no nodes in a component: {max_size_connected_component}")
        print(f"Min no nodes in a component: {min_size_connected_component}")

    df_connected_components = df_connected_components.select(['id', 'component'])

    # df_max_components = df_connected_components_sizes.filter(F.col("size") == max_size_connected_component).select(
    #     "component").collect()

    for component in df_connected_components_sizes.collect():
        component_id, component_size = component['component'], component['size']

        if component_size < subgraphs_min_no_nodes or component_size > subgraphs_max_no_nodes:
            continue

        # print(f"Nodes for the component {component_id}")

        subgraph_vertices = graph.vertices.join(df_connected_components,
                                                on='id').filter(
            F.col('component') == component_id).dropDuplicates(['id'])

        # TODO:Attempt more efficient way to get the nodes and edges...
        # subgraph_vertices.show(truncate=False)
        # subgraph = graph.filterVertices(F.col('id')).dropIsolatedVertices()

        vertices_ids_src = subgraph_vertices.select('id').withColumnRenamed('id', 'src')
        vertices_ids_dst = subgraph_vertices.select('id').withColumnRenamed('id', 'dst')

        subgraph_edges = graph.edges.join(vertices_ids_src, on='src', how='inner').join(vertices_ids_dst, on='dst',
                                                                                        how='inner')

        no_edges = subgraph_edges.count()
        if no_edges < subgraphs_min_no_edges:
            continue

        subgraph = GraphFrame(subgraph_vertices, subgraph_edges)
        labels_df = subgraph.labelPropagation(maxIter=lpa_max_no_iterations).select(['id', 'label'])

        subgraph_vertices = subgraph_vertices.join(labels_df, on='id')
        subgraph = GraphFrame(subgraph_vertices, subgraph_edges)

        subgraph = subgraph.pageRank(resetProbability=page_rank_beta, maxIter=page_rank_max_no_iterations)

        create_html_for_AS_subgraph(subgraph, component_id)
        create_txt_for_AS_subgraph(subgraph,component_id)

In [8]:
def main3():
    dataset_folder = 'datasets'
    # dataset_name = 'hotpot_dev_distractor_v1'
    dataset_name = 'hotpot_train_v1.1'

    start_time = time.time()

    graph = load_graph('/home/ubuntu/AS Graphs/','AS Graph Train')

    show_answer_similarity_subgraphs(dataset_folder, dataset_name,
                                     threshold=0.9,
                                     max_no_nodes=250,
                                     subgraphs_min_no_nodes=25, subgraphs_max_no_nodes=250,
                                     subgraphs_min_no_edges=4,
                                     page_rank_beta=0.15, page_rank_max_no_iterations=25,
                                     lpa_max_no_iterations=25,
                                     save_full_graph=False,
                                     full_graph_folder='results/AS Graphs',
                                     full_graph_name='AS Graph1',
                                     verbose_console=True,
                                     from_graph=graph)

main3()

25/01/16 23:06:27 WARN CacheManager: Asked to cache already cached data.


Max no nodes in a component: 194
Min no nodes in a component: 2


                                                                                

## Loading and saving AS graphs

In [9]:
def main():
    start_time = time.time()
    
    dataset_folder=DATASETS_FOLDER
    dataset_name='hotpot_train_v1.1'
    
    full_graph_folder='/home/ubuntu/AS Graphs/'
    full_graph_name='AS Graph Train'
    
    # save_answer_similarity_graph(dataset_folder, dataset_name,
    #                              threshold=0.9, max_no_nodes=None,
    #                              full_graph_folder=full_graph_folder,
    #                              full_graph_name=full_graph_name,
    #                              )
    
    print(f"Execution time: {time.time() - start_time} s")

# main()

[Stage 118:>                                                        (0 + 1) / 1]

KeyboardInterrupt: 