In [1]:
# Code for processing dataset files, e.g. collecting the ids of questions that share a supporting paragraph (used for clustering)

import codecs
import json
import time
import typing
from typing import Union

import os

import pyspark.sql.functions as F
from pyspark.sql import SparkSession

# Spark session shared by every function in this file.
# It connects to the master at spark://master:7077 and pins resources explicitly:
# 2 executors with 6 cores and 4G of memory each, plus 4G for the driver.
spark = (SparkSession.builder
    .appName("HotpotQA Clustering")
    .config('spark.executor.instances','2')
    .config('spark.executor.memory','4G')
    .config("spark.driver.memory", "4G")
    .config('spark.executor.cores','6')
    .config('spark.dynamicAllocation.enabled','false') # is this mandatory??
    .master('spark://master:7077')
    # Maven coordinates of the GraphFrames package that Spark resolves at start-up.
    .config("spark.jars.packages", "graphframes:graphframes:0.8.4-spark3.5-s_2.12")
    .getOrCreate())
from graphframes import GraphFrame

# Folder (relative to the working directory) that receives the generated JSON files.
OUTPUT_FILE_FOLDER = 'processed'
# Folder holding the input datasets; main() passes it on as the dataset folder.
DATASETS_FOLDER='/home/ubuntu/data/'

# Suffix appended to the dataset name to form the name of the output JSON file.
QUESTION_PARAGRAPH_GRAPH_NAME = 'QP Graph Q1 Q2 P'


def read_dataset_with_pyspark(dataset_folder: str, dataset_name: str, dataset_extension: str = ".parquet"):
    """Load a Parquet dataset into a Spark DataFrame.

    Args:
        dataset_folder: Directory that contains the dataset. A trailing path
            separator is optional.
        dataset_name: Name of the dataset inside ``dataset_folder``.
        dataset_extension: Kept for backward compatibility only. It is NOT
            appended to the path; the dataset is addressed by its bare name.

    Returns:
        The DataFrame returned by ``spark.read.parquet`` for the resolved path.
    """
    # os.path.join adds the separator only when it is missing, so both
    # '/data' and '/data/' resolve to the same dataset path.
    dataset_path = os.path.join(dataset_folder, dataset_name)

    # ``multiLine`` is a JSON/CSV reader option that the Parquet reader does not
    # consume, so it is no longer passed.
    return spark.read.parquet(dataset_path)


def _build_question_paragraph_lists(rows, max_no_nodes: Union[int, None] = None):
    """Turn dataset rows into the vertex and edge tuples of the question/paragraph graph.

    Args:
        rows: Iterable of records indexable by the keys 'question', 'answer',
            'level', 'type', '_id' and 'supporting_facts'. The first element of
            every supporting fact is used as the paragraph title.
        max_no_nodes: When not None, only the first ``max_no_nodes`` rows are used.

    Returns:
        A ``(vertices, edges)`` pair of lists. Vertices are 7-tuples
        ``(id, node_type, node_text, question_answer, question_level,
        question_type, dataset_id)``; edges are ``(question_id, paragraph_id)``.
    """
    vertices = []
    edges = []
    # Maps a paragraph title to its running index so that every title yields
    # exactly one 'p<index>' vertex, however many questions reference it.
    paragraph_index_by_title = {}

    for index, question in enumerate(rows):
        if max_no_nodes is not None and index >= max_no_nodes:
            break

        question_node_id = 'q' + str(index)
        vertices.append(
            (question_node_id, 'q', question['question'], question['answer'], question['level'], question['type'],
             question['_id']))

        # dict.fromkeys de-duplicates the titles while keeping first-seen order.
        # A set would make paragraph ids depend on hash randomisation and so
        # differ from run to run.
        supporting_titles = dict.fromkeys(
            supporting_fact[0] for supporting_fact in question['supporting_facts'])

        for supporting_title in supporting_titles:
            paragraph_index = paragraph_index_by_title.get(supporting_title)
            if paragraph_index is None:
                paragraph_index = len(paragraph_index_by_title)
                paragraph_index_by_title[supporting_title] = paragraph_index
                vertices.append(('p' + str(paragraph_index), 'p', supporting_title, None, None, None, None))

            # The question requires the information in this context/paragraph
            edges.append((question_node_id, 'p' + str(paragraph_index)))

    return vertices, edges


def get_question_paragraph_graph(dataset_folder: str, dataset_name: str,
                                 max_no_nodes: Union[int, None] = None,
                                 verbose_console: bool = True) -> GraphFrame:
    """Build a bipartite GraphFrame linking questions to their supporting paragraphs.

    Question vertices get ids 'q0', 'q1', ... in dataset order; paragraph
    vertices get ids 'p0', 'p1', ... in order of first appearance, one per
    distinct title. Each edge goes from a question to a paragraph it relies on.

    Args:
        dataset_folder: Folder containing the dataset.
        dataset_name: Name of the dataset to load.
        max_no_nodes: Optional cap on the number of questions turned into nodes.
        verbose_console: When True, print the dataset size and the vertex and
            edge counts of the resulting graph.

    Returns:
        The GraphFrame built from the collected vertices and edges.
    """
    dataset = read_dataset_with_pyspark(dataset_folder, dataset_name)

    # The whole dataset is brought to the driver; the graph lists are then
    # assembled in plain Python.
    rows = dataset.collect()
    vertex_rows, edge_rows = _build_question_paragraph_lists(rows, max_no_nodes)

    vertices = spark.createDataFrame(
        vertex_rows,
        ['id', 'node_type', 'node_text', 'question_answer', 'question_level', 'question_type', 'dataset_id'])
    edges = spark.createDataFrame(edge_rows, ["src", "dst"])
    graph = GraphFrame(vertices, edges)

    if verbose_console:
        no_nodes = graph.vertices.count()
        no_edges = graph.edges.count()
        # The rows are already on the driver, so their length replaces an
        # extra dataset.count() Spark job.
        print(f"Total no questions: {len(rows)}")
        print(f"No vertices: {no_nodes}")
        print(f"No edges: {no_edges}")

    return graph


def show_question_paragraph_graph_df(dataset_folder: str, dataset_name: str):
    """
    Group questions by shared supporting paragraph and write the groups to JSON.

    3.Direct connection by using the titles of the supporting contexts as an edge between questions

    The question/paragraph graph is searched for pairs of distinct questions
    that point to the same paragraph. For every such paragraph title, the
    dataset ids of the questions involved are collected and written to
    ``<OUTPUT_FILE_FOLDER>/<dataset_name> <QUESTION_PARAGRAPH_GRAPH_NAME>.json``
    as a ``{title: [dataset_id, ...]}`` mapping. Paragraphs referenced by only
    one question do not appear in the output.

    Args:
        dataset_folder: Folder containing the dataset.
        dataset_name: Name of the dataset; also used in the output file name.
    """
    # Create the output folder up front so a missing directory fails fast
    # instead of after the whole Spark job has run.
    os.makedirs(OUTPUT_FILE_FOLDER, exist_ok=True)
    output_path = os.path.join(OUTPUT_FILE_FOLDER, dataset_name + ' ' + QUESTION_PARAGRAPH_GRAPH_NAME + ".json")

    graph = get_question_paragraph_graph(dataset_folder, dataset_name,
                                         max_no_nodes=None)

    # Motif: two questions pointing at the same paragraph. "q1.id < q2.id"
    # keeps one ordering of every pair and excludes q1 == q2.
    df_questions_same_relevant_paragraph = (
        graph.find("(q1)-[]->(p);(q2)-[]->(p)")
        .filter("q1.node_type='q'").filter("q2.node_type='q'")
        .filter("p.node_type='p'")
        .filter("q1.id < q2.id")
        .dropDuplicates(['q1', 'q2', 'p']))

    question_ids_by_title = dict()
    for pattern in df_questions_same_relevant_paragraph.collect():
        question_ids = question_ids_by_title.setdefault(pattern['p']['node_text'], set())
        question_ids.add(pattern['q1']['dataset_id'])
        question_ids.add(pattern['q2']['dataset_id'])

    # Sets are not JSON serializable; convert them to lists before dumping.
    serializable_info = {title: list(ids) for title, ids in question_ids_by_title.items()}

    with open(output_path, 'w', encoding="utf-8") as file:
        json.dump(serializable_info, file)


def main():
    """Run the shared-paragraph extraction on the 'hotpot_train_v1.1' dataset."""
    show_question_paragraph_graph_df(
        dataset_folder=DATASETS_FOLDER,
        dataset_name='hotpot_train_v1.1',
    )


# Only run the pipeline when executed directly, not when imported.
if __name__ == '__main__':
    main()


:: loading settings :: url = jar:file:/usr/local/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/ubuntu/.ivy2/cache
The jars for the packages stored in: /home/ubuntu/.ivy2/jars
graphframes#graphframes added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-b051b4dc-749d-4c5d-910b-4ece7bf4789f;1.0
	confs: [default]
	found graphframes#graphframes;0.8.4-spark3.5-s_2.12 in spark-packages
	found org.slf4j#slf4j-api;1.7.16 in central
:: resolution report :: resolve 106ms :: artifacts dl 4ms
	:: modules in use:
	graphframes#graphframes;0.8.4-spark3.5-s_2.12 from spark-packages in [default]
	org.slf4j#slf4j-api;1.7.16 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   2   |   0   |   0   |   0   ||   2   |   0   |
	---------------------------------

Total no questions: 90447
No vertices: 196017
No edges: 180894


25/01/15 14:31:21 WARN TaskSetManager: Stage 12 contains a task of very large size (1418 KiB). The maximum recommended task size is 1000 KiB.
25/01/15 14:31:22 WARN TaskSetManager: Stage 13 contains a task of very large size (1418 KiB). The maximum recommended task size is 1000 KiB.
                                                                                