Code for the Question Answer Graph (QA), the Question Paragraph Graph (QP) and the Question Answer Paragraph Graph (QAP).

# Code for setting up GraphFrames and useful functions

In [1]:
# Used mv /home/ubuntu/jupyter/graphframes-0.8.4-spark3.5-s_2.12.jar /usr/local/spark/jars/ to move graphframes jar
# Tried adding it to spark-defaults.conf but don't have write permission

In [2]:
# %pip install wordnet

In [3]:
# %pip install nltk

In [1]:
import time
import typing
from typing import Union

import pyspark.sql.functions as F
from pyspark.sql import SparkSession

from pyvis.network import Network
import networkx as nx
from IPython.display import display, HTML

In [2]:
# Build (or reuse, via getOrCreate) the Spark session used by every cell below.
spark = (SparkSession.builder
    .appName("Simple Graphs")
    .config('spark.executor.instances','2')
    .config('spark.executor.memory','8G')
    .config('spark.executor.cores','6')
    .config('spark.driver.memory','4G')
    .config('spark.dynamicAllocation.enabled','false') # is this really required??
    .master('spark://master:7077')
    # Ask Spark to resolve the GraphFrames package when the session starts.
    .config("spark.jars.packages", "graphframes:graphframes:0.8.4-spark3.5-s_2.12")
    .getOrCreate()
)
# NOTE(review): imported only after the session exists - presumably so the package
# requested above is available to Python; confirm before moving it to the import cell.
from graphframes import GraphFrame

# Folder the datasets are read from.
DATASETS_FOLDER='/home/ubuntu/data'
# Folder (relative path) that receives the text reports and HTML visualizations.
OUTPUT_FILE_FOLDER = 'results'
# Human-readable graph names; used when composing output file names.
QUESTION_ANSWER_GRAPH_NAME = 'QA Graph'
QUESTION_PARAGRAPH_GRAPH_NAME = 'QP Graph'
QUESTION_ANSWER_PARAGRAPH_GRAPH_NAME = 'QAP Graph'
ANSWER_SIMILARITY_GRAPH_NAME = 'AS Graph'

# NOTE(review): presumably the folder handed to save_graph/load_graph - verify against callers.
GRAPH_SAVE_FOLDER='/home/ubuntu/graphs/'

# Size given to every node in the pyvis/networkx visualizations.
GRAPH_NODE_SIZE = 5

:: loading settings :: url = jar:file:/usr/local/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/ubuntu/.ivy2/cache
The jars for the packages stored in: /home/ubuntu/.ivy2/jars
graphframes#graphframes added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-467253e6-f52d-4b9e-b134-c7b38550afeb;1.0
	confs: [default]
	found graphframes#graphframes;0.8.4-spark3.5-s_2.12 in spark-packages
	found org.slf4j#slf4j-api;1.7.16 in central
:: resolution report :: resolve 112ms :: artifacts dl 4ms
	:: modules in use:
	graphframes#graphframes;0.8.4-spark3.5-s_2.12 from spark-packages in [default]
	org.slf4j#slf4j-api;1.7.16 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   2   |   0   |   0   |   0   ||   2   |   0   |
	---------------------------------

In [3]:
import json

def read_dataset_with_pyspark(dataset_folder: str, dataset_name: str, dataset_extension: str = ".parquet"):
    """Load a dataset into a Spark DataFrame.

    Args:
        dataset_folder: folder containing the dataset; an empty string means
            ``dataset_name`` is used as the path on its own.
        dataset_name: name of the dataset inside ``dataset_folder``.
        dataset_extension: selects the reader, either ``'.json'`` (read as
            multi-line JSON) or ``'.parquet'``. It is NOT appended to the path.

    Returns:
        The DataFrame produced by ``spark.read.json`` or ``spark.read.parquet``.

    Raises:
        ValueError: if ``dataset_extension`` is not one of the supported values.
    """
    supported_extensions = ('.json', '.parquet')
    # Fail fast: previously an unknown extension returned None, which only
    # surfaced later as an AttributeError inside the callers.
    if dataset_extension not in supported_extensions:
        raise ValueError(
            f"Unsupported dataset extension {dataset_extension!r}; "
            f"expected one of {supported_extensions}"
        )

    dataset_path = ("" if dataset_folder == "" else dataset_folder + "/") + dataset_name

    if dataset_extension == '.json':
        return spark.read.json(dataset_path, multiLine=True)
    return spark.read.parquet(dataset_path)
    

def save_graph(graph,full_graph_folder,full_graph_name):
    """Write a graph to disk as two Parquet datasets.

    The vertices go to ``<folder>/<name> Vertices`` and the edges to
    ``<folder>/<name> Edges``, in that order.

    Args:
        graph: object exposing ``vertices`` and ``edges`` DataFrames.
        full_graph_folder: destination folder (string).
        full_graph_name: base name shared by both datasets (string).
    """
    path_prefix = "/".join((full_graph_folder, full_graph_name))
    destinations = {
        "vertices": path_prefix + " Vertices",
        "edges": path_prefix + " Edges",
    }

    for attribute_name, destination in destinations.items():
        getattr(graph, attribute_name).write.parquet(destination)
    

def load_graph(full_graph_folder,full_graph_name):
    """Rebuild a GraphFrame from the two Parquet datasets written by ``save_graph``.

    Args:
        full_graph_folder: folder holding the datasets (string).
        full_graph_name: base name shared by the two datasets (string).

    Returns:
        A ``GraphFrame`` built from ``<folder>/<name> Vertices`` and
        ``<folder>/<name> Edges``.
    """
    path_prefix = "/".join((full_graph_folder, full_graph_name))
    vertex_frame, edge_frame = [
        spark.read.parquet(path_prefix + suffix)
        for suffix in (" Vertices", " Edges")
    ]
    return GraphFrame(vertex_frame, edge_frame)
    

# Question Answer Graph (QA)

In [4]:
def get_question_answer_graph(dataset_folder: str, dataset_name: str,
                              max_no_nodes: Union[int, None] = None,
                              verbose_console: bool = True, verbose_file: bool = False,
                              output_file: Union[None, typing.TextIO] = None
                              ) -> GraphFrame:
    """Build the bipartite Question-Answer graph of a dataset.

    Every dataset row yields a question vertex ``q<row index>``. Every distinct
    answer text yields one answer vertex ``a<k>``, numbered in order of first
    appearance. Each question has one directed edge to its answer.

    Args:
        dataset_folder: folder containing the dataset.
        dataset_name: dataset to read (through ``read_dataset_with_pyspark``).
        max_no_nodes: stop reading rows once at least this many vertices were
            collected; ``None`` means no limit. The check happens before a row
            is processed and a row can add two vertices, so the final vertex
            count may exceed the limit by one.
        verbose_console: print the graph statistics to stdout.
        verbose_file: print the graph statistics to ``output_file``.
        output_file: text stream used when ``verbose_file`` is set.

    Returns:
        GraphFrame with vertex columns ``id, node_type, node_text,
        question_level, question_type`` and edge columns ``src, dst``.
    """
    source_rows = read_dataset_with_pyspark(dataset_folder, dataset_name)

    vertex_columns = ['id', 'node_type', 'node_text', 'question_level', 'question_type']
    edge_columns = ["src", "dst"]
    vertex_rows, edge_rows = [], []

    # answer text -> running number of the answer vertex
    answer_numbers = {}

    for position, record in enumerate(source_rows.toLocalIterator()):
        limit_reached = max_no_nodes is not None and len(vertex_rows) >= max_no_nodes
        if limit_reached:
            break

        answer_text = record['answer']
        first_occurrence = answer_text not in answer_numbers
        answer_number = answer_numbers.setdefault(answer_text, len(answer_numbers))

        answer_vertex_id = f'a{answer_number}'
        question_vertex_id = 'q' + str(position)

        # Answer vertices are shared, so only add one the first time its text is seen.
        if first_occurrence:
            vertex_rows.append((answer_vertex_id, 'a', answer_text, None, None))

        vertex_rows.append(
            (question_vertex_id, 'q', record['question'], record['level'], record['type'])
        )

        # question -> its answer
        edge_rows.append((question_vertex_id, answer_vertex_id))

    graph = GraphFrame(
        spark.createDataFrame(vertex_rows, vertex_columns),
        spark.createDataFrame(edge_rows, edge_columns),
    )

    if verbose_console or verbose_file:
        vertex_total = graph.vertices.count()
        edge_total = graph.edges.count()
        question_total = source_rows.count()
        summary_lines = (
            f"Total no questions: {question_total}",
            f"No vertices: {vertex_total}",
            f"No edges: {edge_total}",
        )

        if verbose_console:
            for line in summary_lines:
                print(line)

        if verbose_file:
            for line in summary_lines:
                print(line, file=output_file)

    return graph


def show_question_answer_graph(dataset_folder: str, dataset_name: str,
                               max_no_nodes=1000,
                               verbose_console=True):
    """Render the Question-Answer graph as an interactive HTML page.

    Builds the QA graph, computes its connected components, converts it to a
    networkx graph, writes the pyvis HTML to
    ``OUTPUT_FILE_FOLDER/<QUESTION_ANSWER_GRAPH_NAME> Visualization.html`` and
    displays the same HTML inline.

    NOTE(review): this name is defined a second time further down in the same
    cell (that version uses ``.collect()`` instead of ``.toLocalIterator()``);
    the later definition is the one in effect once the cell has run.

    Args:
        dataset_folder: folder containing the dataset.
        dataset_name: dataset to read.
        max_no_nodes: vertex limit forwarded to ``get_question_answer_graph``.
        verbose_console: forwarded to ``get_question_answer_graph``.

    Raises:
        ValueError: if a vertex has a node type other than ``'q'`` or ``'a'``.
    """
    graph = get_question_answer_graph(dataset_folder, dataset_name, max_no_nodes, verbose_console, False, None)

    # Each row carries the vertex columns plus the 'component' it belongs to.
    df_connected_components = graph.connectedComponents(algorithm="graphx").toLocalIterator()

    nx_graph = nx.empty_graph()
    for row in df_connected_components:
        node_id = row['id']
        group = row['component']
        node_type = row['node_type']
        if node_type == 'q':
            question, level, question_type = row['node_text'], row['question_level'], \
                                             row['question_type']
            title = f'Question:{question}\nLevel:{level}\nType:{question_type}'
            shape = 'square'
        elif node_type == 'a':
            answer = row['node_text']
            title = f'Answer:{answer}'
            shape = 'circle'
        else:
            # Raise instead of calling exit(1): a helper function must not try
            # to terminate the interpreter / notebook kernel.
            raise ValueError(f"Error!Node type unknown! (id={node_id!r}, node_type={node_type!r})")
        nx_graph.add_node(node_id, size=GRAPH_NODE_SIZE, title=title, group=group, shape=shape)

    for edge in graph.edges.collect():
        nx_graph.add_edge(edge['src'], edge['dst'])

    nt = Network('900px', '1700px', select_menu=True, cdn_resources='remote')
    nt.from_nx(nx_graph)

    html = nt.generate_html()
    with open(OUTPUT_FILE_FOLDER + "/" + QUESTION_ANSWER_GRAPH_NAME + " Visualization.html", mode='w',
              encoding='utf-8') as html_file:
        html_file.write(html)
    display(HTML(html))


def show_question_answer_graph_df(dataset_folder: str, dataset_name: str,
                                  max_no_patterns_to_print: int = 20,
                                  verbose_console: bool = True, verbose_file: bool = False):
    """
    Idea:Create a bipartite graph where in 1 group we have questions and in the other group we have answers.
    Link them.This shows us the One-To-Many relationships and Many-To-One relationships.

    Reports, for the full dataset: the graph size, the min/max degree of answer
    and question vertices, pairs of questions sharing an answer, and pairs of
    questions sharing answer, level and type.

    Args:
        dataset_folder: folder containing the dataset.
        dataset_name: dataset to read; also used in the report file name.
        max_no_patterns_to_print: maximum number of question pairs printed per section.
        verbose_console: print the report to stdout.
        verbose_file: also write the report to
            ``OUTPUT_FILE_FOLDER/<dataset_name> <QUESTION_ANSWER_GRAPH_NAME>.txt``.

    Returns:
        None. When both verbose flags are false nothing is computed.
    """

    # Do nothing ,since we don't return anything
    if not verbose_console and not verbose_file:
        return

    output_file = None
    if verbose_file:
        output_file = open(OUTPUT_FILE_FOLDER + "/" + dataset_name + ' ' + QUESTION_ANSWER_GRAPH_NAME + ".txt", 'w',
                           encoding="utf-8")

    def report(*lines):
        # Send the same lines to every enabled destination.
        if verbose_console:
            for line in lines:
                print(line)
        if verbose_file:
            for line in lines:
                print(line, file=output_file)

    # try/finally so the report file is closed even when a Spark step fails.
    try:
        graph = get_question_answer_graph(dataset_folder, dataset_name, max_no_nodes=None,
                                          verbose_console=verbose_console,
                                          verbose_file=verbose_file, output_file=output_file)

        def degree_range(node_type):
            # Row(max_degree, min_degree) over the vertices of the given type.
            return graph.degrees.join(graph.vertices, on="id").filter(
                F.col('node_type') == node_type).dropDuplicates().agg(
                F.max(F.col('degree')).alias('max_degree'),
                F.min(F.col('degree')).alias('min_degree')
            ).first()

        degree_answer = degree_range('a')
        report(f"\nMax no questions with the same answer: {degree_answer[0]}",
               f"Min no questions with the same answer: {degree_answer[1]}")

        degree_question = degree_range('q')
        report(f"\nMax no answers for a question: {degree_question[0]}",
               f"Min no answers for a question: {degree_question[1]}")

        # Get the questions with the same answer
        report("\nFinding 2 questions with the same answer")

        # "q1.id < q2.id" keeps each unordered pair once and drops q1 == q2.
        df_questions_same_answer = graph.find("(q1)-[]->(a);(q2)-[]->(a)"). \
            filter("q1.node_type='q'").filter("q2.node_type='q'"). \
            filter("a.node_type='a'"). \
            filter("q1.id < q2.id")

        # The last duplicate conditions find pairs (q1,q2,a) with unique a,so we can see more interesting patters
        # Otherwise,yes and no questions are dominant
        df_questions_unique_same_answer = df_questions_same_answer.dropDuplicates(['a'])

        for pattern in df_questions_unique_same_answer.take(max_no_patterns_to_print):
            report(f"\nQuestion 1: {pattern['q1']['node_text']}",
                   f"Question 2: {pattern['q2']['node_text']}",
                   f"Answer: {pattern['a']['node_text']}")

        # Get the questions that have the same answer,level and type
        report("\nQuestions with the same answer,type and level")

        df_questions_same_question_same_answer = df_questions_same_answer. \
            filter("q1.question_level=q2.question_level"). \
            filter("q1.question_type=q2.question_type"). \
            dropDuplicates(['a'])

        for pattern in df_questions_same_question_same_answer.take(max_no_patterns_to_print):
            report(f"\nType: {pattern['q1']['question_type']} , Level: {pattern['q1']['question_level']}",
                   f"Question 1: {pattern['q1']['node_text']}",
                   f"Question 2: {pattern['q2']['node_text']}",
                   f"Answer: {pattern['a']['node_text']}")
    finally:
        if output_file is not None:
            output_file.close()



def show_question_answer_graph(dataset_folder: str, dataset_name: str,
                               max_no_nodes=1000,
                               verbose_console=True):
    """Render the Question-Answer graph as an interactive HTML page.

    Builds the QA graph, computes its connected components, converts it to a
    networkx graph, writes the pyvis HTML to
    ``OUTPUT_FILE_FOLDER/<QUESTION_ANSWER_GRAPH_NAME> Visualization.html`` and
    displays the same HTML inline.

    NOTE(review): this is the second definition of this name in the cell; it
    replaces the earlier one, which differs only in using
    ``.toLocalIterator()`` instead of ``.collect()``.

    Args:
        dataset_folder: folder containing the dataset.
        dataset_name: dataset to read.
        max_no_nodes: vertex limit forwarded to ``get_question_answer_graph``.
        verbose_console: forwarded to ``get_question_answer_graph``.

    Raises:
        ValueError: if a vertex has a node type other than ``'q'`` or ``'a'``.
    """
    graph = get_question_answer_graph(dataset_folder, dataset_name, max_no_nodes, verbose_console, False, None)

    # Each row carries the vertex columns plus the 'component' it belongs to.
    df_connected_components = graph.connectedComponents(algorithm="graphx").collect()

    nx_graph = nx.empty_graph()
    for row in df_connected_components:
        node_id = row['id']
        group = row['component']
        node_type = row['node_type']
        if node_type == 'q':
            question, level, question_type = row['node_text'], row['question_level'], \
                                             row['question_type']
            title = f'Question:{question}\nLevel:{level}\nType:{question_type}'
            shape = 'square'
        elif node_type == 'a':
            answer = row['node_text']
            title = f'Answer:{answer}'
            shape = 'circle'
        else:
            # Raise instead of calling exit(1): a helper function must not try
            # to terminate the interpreter / notebook kernel.
            raise ValueError(f"Error!Node type unknown! (id={node_id!r}, node_type={node_type!r})")
        nx_graph.add_node(node_id, size=GRAPH_NODE_SIZE, title=title, group=group, shape=shape)

    for edge in graph.edges.collect():
        nx_graph.add_edge(edge['src'], edge['dst'])

    nt = Network('900px', '1700px', select_menu=True, cdn_resources='remote')
    nt.from_nx(nx_graph)

    html = nt.generate_html()
    with open(OUTPUT_FILE_FOLDER + "/" + QUESTION_ANSWER_GRAPH_NAME + " Visualization.html", mode='w',
              encoding='utf-8') as html_file:
        html_file.write(html)
    display(HTML(html))


In [5]:
def main():
    """Run the QA-graph text report on 'hotpot_train_v1.1' and print the elapsed time.

    NOTE(review): a later cell defines another function called ``main``, which
    replaces this one once that cell has run.
    """
    # Alternative dataset: 'hotpot_dev_distractor_v1'
    started_at = time.time()

    show_question_answer_graph_df(
        DATASETS_FOLDER,
        'hotpot_train_v1.1',
        max_no_patterns_to_print=100,
        verbose_console=True,
        verbose_file=True,
    )

    elapsed_seconds = time.time() - started_at
    print(f"Execution time: {elapsed_seconds} s")

# main()

25/01/15 21:14:49 WARN TaskSetManager: Stage 13 contains a task of very large size (1028 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

Total no questions: 90447
No vertices: 143684
No edges: 90447


25/01/15 21:14:54 WARN TaskSetManager: Stage 23 contains a task of very large size (1028 KiB). The maximum recommended task size is 1000 KiB.
                                                                                


Max no questions with the same answer: 2748
Min no questions with the same answer: 1


25/01/15 21:14:56 WARN TaskSetManager: Stage 32 contains a task of very large size (1028 KiB). The maximum recommended task size is 1000 KiB.
                                                                                


Max no answers for a question: 1
Min no answers for a question: 1

Finding 2 questions with the same answer


25/01/15 21:14:57 WARN TaskSetManager: Stage 41 contains a task of very large size (1028 KiB). The maximum recommended task size is 1000 KiB.
25/01/15 21:14:58 WARN TaskSetManager: Stage 42 contains a task of very large size (1028 KiB). The maximum recommended task size is 1000 KiB.
                                                                                


Question 1: Which magazine was started first Arthur's Magazine or First for Women?
Question 2: Which magazine was created first, Chatelaine or Arthur's Magazine?
Answer: Arthur's Magazine

Question 1: What retailer is the second-largest in the United States and has a commercial featuring the American artist who was 1st runner-up in the 2005 USA Weekend Magazine's songwriting competition?
Question 2: What is the name of the business that George Dayton founded and later Bruce Dayton expanded?
Answer: Target Corporation

Question 1: Who is the Las Vegas businessman and reality television personality, currently retired, that co-owns a pawn shop Chumlee currently works at?
Question 2: What is the name of the grandfather of one of the panelist on "Pawnography", as well as the co-owner of a Las-Vegas store?
Answer: Richard Benjamin Harrison

Question 1: What party did the man who has served as a US representative , and had  a Minnesota politician and member of the Minnesota House of Represen

25/01/15 21:15:10 WARN TaskSetManager: Stage 60 contains a task of very large size (1028 KiB). The maximum recommended task size is 1000 KiB.
25/01/15 21:15:10 WARN TaskSetManager: Stage 61 contains a task of very large size (1028 KiB). The maximum recommended task size is 1000 KiB.
25/01/15 21:15:11 WARN TaskSetManager: Stage 63 contains a task of very large size (1028 KiB). The maximum recommended task size is 1000 KiB.


Type: comparison , Level: medium
Question 1: Which magazine was started first Arthur's Magazine or First for Women?
Question 2: Which magazine was created first, Chatelaine or Arthur's Magazine?
Answer: Arthur's Magazine

Type: bridge , Level: medium
Question 1: Who is the Las Vegas businessman and reality television personality, currently retired, that co-owns a pawn shop Chumlee currently works at?
Question 2: What is the name of the grandfather of one of the panelist on "Pawnography", as well as the co-owner of a Las-Vegas store?
Answer: Richard Benjamin Harrison

Type: bridge , Level: medium
Question 1: What is one of the state Democratic party affiliates of a different name that has a member who is president of the Saint Paul City Council?
Question 2: Which party is the politician who authorized the Minnesota Civil War Commemoration Task Force a member of?
Answer: Minnesota Democratic–Farmer–Labor Party

Type: bridge , Level: hard
Question 1: Which institute is this Indian theore

                                                                                

In [5]:
def main_show_QA():
    """Visualize the QA graph of 'hotpot_train_v1.1' (limit: 250 nodes) and print the elapsed time."""
    started_at = time.time()

    show_question_answer_graph(
        DATASETS_FOLDER,
        'hotpot_train_v1.1',
        max_no_nodes=250,
        verbose_console=True,
    )

    elapsed_seconds = time.time() - started_at
    print(f"Execution time: {elapsed_seconds} s")

main_show_QA()

                                                                                

Total no questions: 90447
No vertices: 251
No edges: 130


                                                                                

Execution time: 28.358465671539307 s


# Question Paragraph Graph (QP)

In [6]:
def get_question_paragraph_graph(dataset_folder: str, dataset_name: str,
                                 max_no_nodes: Union[int, None] = None,
                                 verbose_console: bool = True, verbose_file: bool = False,
                                 output_file: Union[None, typing.TextIO] = None) -> GraphFrame:
    """Build the bipartite Question-Paragraph graph of a dataset.

    Every dataset row yields a question vertex ``q<row index>``. Every distinct
    supporting-fact title yields one paragraph vertex ``p<k>``, numbered in
    order of first appearance. Each question has one directed edge to every
    distinct title among its supporting facts.

    Args:
        dataset_folder: folder containing the dataset.
        dataset_name: dataset to read (through ``read_dataset_with_pyspark``).
        max_no_nodes: despite the name, this caps the number of QUESTIONS
            processed (rows with index >= ``max_no_nodes`` are skipped), not
            the total number of vertices; ``None`` means no limit.
        verbose_console: print the graph statistics to stdout.
        verbose_file: print the graph statistics to ``output_file``.
        output_file: text stream used when ``verbose_file`` is set.

    Returns:
        GraphFrame with vertex columns ``id, node_type, node_text,
        question_answer, question_level, question_type`` and edge columns
        ``src, dst``.
    """
    dataset = read_dataset_with_pyspark(dataset_folder, dataset_name)

    vertices = []
    vertices_scheme = ['id', 'node_type', 'node_text', 'question_answer', 'question_level', 'question_type']
    edges = []
    edges_scheme = ["src", "dst"]

    # TODO:The number of nodes is suspiciously no of questions * 3...Which implies that there aren't any 2 paragraphs with the same title...
    #  Probably not true,meaning the GraphFrame doesn't eliminate duplicate nodes with the same id...Fix

    # paragraph title -> running number of the paragraph vertex
    all_paragraph_titles = dict()

    # Stream the rows (like the other graph builders) instead of collect()-ing
    # the whole dataset onto the driver; with a small max_no_nodes the loop
    # stops long before the dataset is exhausted.
    for index, question in enumerate(dataset.toLocalIterator()):
        if max_no_nodes is not None and index >= max_no_nodes:
            break

        question_node_id = 'q' + str(index)

        # Several supporting facts may point into the same paragraph; keep each title once.
        supporting_titles = set()
        for supporting_fact in question['supporting_facts']:
            supporting_titles.add(supporting_fact[0])

        vertices.append(
            (question_node_id, 'q', question['question'], question['answer'], question['level'], question['type']))

        for supporting_title in supporting_titles:
            add_paragraph = supporting_title not in all_paragraph_titles
            if add_paragraph:
                all_paragraph_titles[supporting_title] = len(all_paragraph_titles)

            title_node_id = 'p' + str(all_paragraph_titles[supporting_title])

            if add_paragraph:
                vertices.append((title_node_id, 'p', supporting_title, None, None, None))

            # The question requires the information in this context/paragraph
            edges.append((question_node_id, title_node_id))

    # TODO:Better refactoring?

    vertices = spark.createDataFrame(vertices, vertices_scheme)
    edges = spark.createDataFrame(edges, edges_scheme)
    graph = GraphFrame(vertices, edges)

    if verbose_console or verbose_file:
        no_nodes = graph.vertices.count()
        no_edges = graph.edges.count()
        no_instances = dataset.count()
        summary_lines = (
            f"Total no questions: {no_instances}",
            f"No vertices: {no_nodes}",
            f"No edges: {no_edges}",
        )

        if verbose_console:
            for line in summary_lines:
                print(line)

        if verbose_file:
            for line in summary_lines:
                print(line, file=output_file)

    return graph


def show_question_paragraph_graph(dataset_folder: str, dataset_name: str,
                                  max_no_nodes: Union[int, None] = None,
                                  verbose_console: bool = True):
    """Render the Question-Paragraph graph as an interactive HTML page.

    Builds the QP graph, computes its connected components, converts it to a
    networkx graph, writes the pyvis HTML to
    ``OUTPUT_FILE_FOLDER/<QUESTION_PARAGRAPH_GRAPH_NAME> Visualization.html``
    and displays the same HTML inline.

    Args:
        dataset_folder: folder containing the dataset.
        dataset_name: dataset to read.
        max_no_nodes: limit forwarded to ``get_question_paragraph_graph``.
        verbose_console: forwarded to ``get_question_paragraph_graph``.

    Raises:
        ValueError: if a vertex has a node type other than ``'q'`` or ``'p'``.
    """
    graph = get_question_paragraph_graph(dataset_folder, dataset_name, max_no_nodes, verbose_console, False, None)

    # Each row carries the vertex columns plus the 'component' it belongs to.
    df_connected_components = graph.connectedComponents(algorithm="graphx").collect()

    nx_graph = nx.empty_graph()
    for row in df_connected_components:
        node_id = row['id']
        group = row['component']
        node_type = row['node_type']
        if node_type == 'q':
            question, answer, level, question_type = row['node_text'], row['question_answer'], row['question_level'], \
                                                     row['question_type']
            title = f'Question:{question}\nAnswer:{answer}\nLevel:{level}\nType:{question_type}'
            shape = 'square'
        elif node_type == 'p':
            title = row['node_text']
            shape = 'circle'
        else:
            # Raise instead of calling exit(1): a helper function must not try
            # to terminate the interpreter / notebook kernel.
            raise ValueError(f"Error!Node type unknown! (id={node_id!r}, node_type={node_type!r})")
        nx_graph.add_node(node_id, size=GRAPH_NODE_SIZE, title=title, group=group, shape=shape)

    for edge in graph.edges.collect():
        nx_graph.add_edge(edge['src'], edge['dst'])

    nt = Network('900px', '1700px', select_menu=True, cdn_resources='remote')
    nt.from_nx(nx_graph)

    # nt.show_buttons(filter_=['physics'])
    # nt.show('Question Context Graph Visualization.html', notebook=False)

    # Enable Physics for better graph layout
    # nt.enable_physics(True)

    # Add user interaction controls (UI box)
    # nt.set_options("""
    #                 {
    #                   "interaction": {
    #                     "hover": true,          // Enables hover over nodes
    #                     "navigationButtons": true, // Adds zoom and pan controls to UI
    #                     "keyboard": true        // Enables keyboard shortcuts
    #                   },
    #                   "manipulation": {
    #                     "enabled": true         // Enables node and edge manipulation
    #                   }
    #                 }
    #                 """
    #                )

    html = nt.generate_html()
    with open(OUTPUT_FILE_FOLDER + "/" + QUESTION_PARAGRAPH_GRAPH_NAME + " Visualization.html", mode='w',
              encoding='utf-8') as html_file:
        html_file.write(html)
    display(HTML(html))



def show_question_paragraph_graph_df(dataset_folder: str, dataset_name: str,
                                     max_no_patterns_to_print: int = 20,
                                     verbose_console: bool = True, verbose_file: bool = False):
    """
    3.Direct connection by using the titles of the supporting contexts as an edge between questions

    Reports, for the full dataset: the graph size, the min/max degree of
    question and paragraph vertices, pairs of questions sharing one relevant
    paragraph, and pairs of questions sharing the same two paragraphs.

    Args:
        dataset_folder: folder containing the dataset.
        dataset_name: dataset to read; also used in the report file name.
        max_no_patterns_to_print: maximum number of patterns printed per section.
        verbose_console: print the report to stdout.
        verbose_file: also write the report to
            ``OUTPUT_FILE_FOLDER/<dataset_name> <QUESTION_PARAGRAPH_GRAPH_NAME>.txt``.

    Returns:
        None. When both verbose flags are false nothing is computed.
    """

    # Do nothing ,since we don't return anything
    if not verbose_console and not verbose_file:
        return

    output_file = None
    if verbose_file:
        output_file = open(OUTPUT_FILE_FOLDER + "/" + dataset_name + ' ' + QUESTION_PARAGRAPH_GRAPH_NAME + ".txt", 'w',
                           encoding="utf-8")

    def report(*lines):
        # Send the same lines to every enabled destination.
        if verbose_console:
            for line in lines:
                print(line)
        if verbose_file:
            for line in lines:
                print(line, file=output_file)

    # The report file used to be left open; try/finally guarantees it is closed
    # (and therefore flushed), even when a Spark step fails.
    try:
        graph = get_question_paragraph_graph(dataset_folder, dataset_name,
                                             max_no_nodes=None,
                                             verbose_console=verbose_console, verbose_file=verbose_file,
                                             output_file=output_file)

        def degree_range(node_type):
            # Row(max_degree, min_degree) over the vertices of the given type.
            # TODO:Make this count more efficient by first filtering
            return graph.degrees.join(graph.vertices, on="id").filter(
                F.col('node_type') == node_type).dropDuplicates(['id']).agg(
                F.max(F.col('degree')).alias('max_degree'),
                F.min(F.col('degree')).alias('min_degree')
            ).first()

        question_degree_info = degree_range('q')
        report(f"\nMax no paragraphs that are relevant to a question: {question_degree_info[0]}",
               f"Min no paragraphs that are relevant to a question: {question_degree_info[1]}")

        paragraphs_degree_info = degree_range('p')
        report(f"\nMax no question that consider a paragraph: {paragraphs_degree_info[0]}",
               f"Min no questions that consider a paragraph: {paragraphs_degree_info[1]}")

        report("\nFinding 2 questions that have a common relevant paragraph")

        # "q1.id < q2.id" keeps each unordered pair once and drops q1 == q2.
        df_questions_same_relevant_paragraph = graph.find("(q1)-[]->(p);(q2)-[]->(p)"). \
            filter("q1.node_type='q'").filter("q2.node_type='q'"). \
            filter("p.node_type='p'"). \
            filter("q1.id < q2.id"). \
            dropDuplicates(['q1', 'q2', 'p'])

        for pattern in df_questions_same_relevant_paragraph.take(max_no_patterns_to_print):
            report(f"\nQuestion 1: {pattern['q1']['node_text']}",
                   f"Question 2: {pattern['q2']['node_text']}",
                   f"Paragraph title: {pattern['p']['node_text']}")

        report("\nFinding 2 questions that have the same 2 paragraphs as common")

        # The id orderings keep each (question pair, paragraph pair) combination once.
        df_two_questions_same_two_paragraph = graph.find("(q1)-[]->(p1);(q1)-[]->(p2);(q2)-[]->(p1);(q2)-[]->(p2)"). \
            filter("q1.node_type='q'").filter("q2.node_type='q'"). \
            filter("p1.node_type='p'").filter("p2.node_type='p'"). \
            filter("q1.id<q2.id"). \
            filter("p1.id<p2.id"). \
            dropDuplicates(['q1', 'q2', 'p1', 'p2']). \
            orderBy('q1')

        for pattern in df_two_questions_same_two_paragraph.take(max_no_patterns_to_print):
            report(f"\nQuestion 1: {pattern['q1']['node_text']}",
                   f"Question 2: {pattern['q2']['node_text']}",
                   f"Paragraph 1 Title: {pattern['p1']['node_text']}",
                   f"Paragraph 2 Title: {pattern['p2']['node_text']}")
    finally:
        if output_file is not None:
            output_file.close()

In [7]:
def main():
    """Run the QP-graph text report on 'hotpot_train_v1.1' and print the elapsed time.

    NOTE(review): an earlier cell also defines a function called ``main``;
    running this cell replaces it.
    """
    started_at = time.time()

    show_question_paragraph_graph_df(
        DATASETS_FOLDER,
        'hotpot_train_v1.1',
        max_no_patterns_to_print=100,
        verbose_console=True,
        verbose_file=True,
    )

    elapsed_seconds = time.time() - started_at
    print(f"Execution time: {elapsed_seconds} s")

# main()

25/01/15 21:23:59 WARN TaskSetManager: Stage 83 contains a task of very large size (1258 KiB). The maximum recommended task size is 1000 KiB.


Total no questions: 90447
No vertices: 196017
No edges: 180894


25/01/15 21:24:00 WARN TaskSetManager: Stage 93 contains a task of very large size (1258 KiB). The maximum recommended task size is 1000 KiB.
                                                                                


Max no paragraphs that are relevant to a question: 2
Min no paragraphs that are relevant to a question: 2


25/01/15 21:24:02 WARN TaskSetManager: Stage 102 contains a task of very large size (1258 KiB). The maximum recommended task size is 1000 KiB.
                                                                                


Max no question that consider a paragraph: 36
Min no questions that consider a paragraph: 1

Finding 2 questions that have a common relevant paragraph


25/01/15 21:24:03 WARN TaskSetManager: Stage 111 contains a task of very large size (1258 KiB). The maximum recommended task size is 1000 KiB.
25/01/15 21:24:03 WARN TaskSetManager: Stage 112 contains a task of very large size (1258 KiB). The maximum recommended task size is 1000 KiB.
                                                                                


Question 1: Were Erich Fromm and R. S. Thomas both priests?
Question 2: Were both Charles Nordhoff and Erich Fromm American writers?
Paragraph title: Erich Fromm

Question 1: The 2008–09 Genoa C.F.C. season, was its best season in recent history and finished fifth in Serie A, Topscorer Diego Milito hit the back of the net how many times?
Question 2: What is the name of this Argentine former footballer, who played with Internazionale Milano and made a single goal in the 2010 Coppa Italia Final?
Paragraph title: Diego Milito

Question 1: in the 2010 FIFA World Cup Final, Spain defeated the Netherlands 1–0 with a goal from a player who was born in which year ?
Question 2: The 2010 FIFA World Cup Final had a single goal by a player who plays for what team?
Paragraph title: Andrés Iniesta

Question 1: What is the nationality of the striker who scored 19 league goals for Blackburn Rovers in the 2007-08 season ?
Question 2: What nationality is the striker who scored 19 goals in the 2007–08 B

25/01/15 21:24:06 WARN TaskSetManager: Stage 136 contains a task of very large size (1258 KiB). The maximum recommended task size is 1000 KiB.
25/01/15 21:24:07 WARN TaskSetManager: Stage 137 contains a task of very large size (1258 KiB). The maximum recommended task size is 1000 KiB.
                                                                                


Question 1: The Oberoi family is part of a hotel company that has a head office in what city?
Question 2: Which Indian family founded the Trident Hotel brand?
Paragraph 1 Title: The Oberoi Group
Paragraph 2 Title: Oberoi family

Question 1: In which 2015 British-American romantic drama film directed by Todd Haynes did John Magaro star in?
Question 2: The American film, television and stage actor John Magaro appeared in which film directed by Todd Haynes alongside with Rooney Mara?
Paragraph 1 Title: Carol (film)
Paragraph 2 Title: John Magaro

Question 1: In which 2015 British-American romantic drama film directed by Todd Haynes did John Magaro star in?
Question 2: who wrote the movie which John Magaro starred alongside
Paragraph 1 Title: Carol (film)
Paragraph 2 Title: John Magaro

Question 1: Which actress, known for her role as Denise Huxtable, directed the video to "It Never Rains (In Southern California)"?
Question 2:  "It Never Rains (In Sourthern California)" was directed by Li

In [8]:
def main_show_QP():
    """
    Draw the Question-Paragraph graph for the 'hotpot_train_v1.1' dataset and print the elapsed time.

    The node budget is forwarded to show_question_paragraph_graph as max_no_nodes.
    """
    dataset_name = 'hotpot_train_v1.1'
    # Single place to tune the node budget; previously this local was unused
    # and the call below repeated the literal.
    max_no_nodes = 250

    start_time = time.time()

    show_question_paragraph_graph(DATASETS_FOLDER, dataset_name,
                                  max_no_nodes=max_no_nodes,
                                  verbose_console=True)

    print(f"Execution time: {time.time() - start_time} s")

main_show_QP()

                                                                                

Total no questions: 90447
No vertices: 748
No edges: 500


Execution time: 17.743735313415527 s


# Question Answer Paragraph Graph (QAP)

In [9]:
def _build_qap_nodes_and_edges(questions, max_no_nodes=None):
    """
    Turn an iterable of question records into the vertex and edge tuples of the QAP graph.

    Each record is indexed with the keys 'answer', 'question', 'level', 'type' and
    'supporting_facts'; the first element of every supporting fact is used as the
    paragraph title.

    Node ids:
        'q<i>'  - i is the position of the question in the iteration,
        'a<n>'  - n-th distinct answer text, in order of first appearance,
        'p<n>'  - n-th distinct paragraph title, in order of first appearance.

    :param questions: iterable of question records (anything supporting record['key'])
    :param max_no_nodes: soft cap; it is checked before each question is processed, so the
                         final vertex count can exceed it by the nodes of the last question
    :return: (vertices, edges) where a vertex is
             (id, node_type, node_text, question_level, question_type) and an edge is (src, dst)
    """
    vertices = []
    edges = []

    answer_indices = dict()
    paragraph_indices = dict()

    for index, question in enumerate(questions):
        if max_no_nodes is not None and len(vertices) >= max_no_nodes:
            break

        question_node_id = 'q' + str(index)

        # Answers are shared between questions: only the first occurrence creates a node.
        answer_text = question['answer']
        if answer_text not in answer_indices:
            answer_indices[answer_text] = len(answer_indices)
            vertices.append(('a' + str(answer_indices[answer_text]), 'a', answer_text, None, None))
        answer_node_id = 'a' + str(answer_indices[answer_text])

        # Edge Q->A
        vertices.append((question_node_id, 'q', question['question'], question['level'], question['type']))
        edges.append((question_node_id, answer_node_id))

        # Deduplicate the titles while keeping their first-seen order. A set of strings
        # iterates in a hash-dependent order, which made the 'p<n>' ids differ between runs.
        supporting_titles = dict.fromkeys(
            supporting_fact[0] for supporting_fact in question['supporting_facts'])

        for supporting_title in supporting_titles:
            if supporting_title not in paragraph_indices:
                paragraph_indices[supporting_title] = len(paragraph_indices)
                vertices.append(
                    ('p' + str(paragraph_indices[supporting_title]), 'p', supporting_title, None, None))
            title_node_id = 'p' + str(paragraph_indices[supporting_title])

            # Edges A->P and P->Q
            edges.append((answer_node_id, title_node_id))
            edges.append((title_node_id, question_node_id))

    return vertices, edges


def get_question_answer_paragraph_graph(dataset_folder: str, dataset_name: str,
                                        max_no_nodes: Union[int, None] = None,
                                        verbose_console: bool = True, verbose_file: bool = False,
                                        output_file: Union[None, typing.TextIO] = None) -> GraphFrame:
    """
    Build the Question-Answer-Paragraph (QAP) graph as a GraphFrame.

    The graph has three node types ('q', 'a', 'p') and the directed edges
    Question->Answer, Answer->Paragraph and Paragraph->Question. Only the paragraphs
    listed in a question's supporting facts are added.

    :param dataset_folder: forwarded to read_dataset_with_pyspark
    :param dataset_name: forwarded to read_dataset_with_pyspark
    :param max_no_nodes: soft cap on the number of vertices (None means no cap); the cap is
                         checked once per question, so the result may slightly exceed it
    :param verbose_console: print the question, vertex and edge counts to the console
    :param verbose_file: print the same counts to output_file
    :param output_file: text stream used when verbose_file is True
    :return: GraphFrame with vertex columns
             (id, node_type, node_text, question_level, question_type) and edge columns (src, dst)
    """
    # TODO:Add all the paragraphs ,not just the ones supporting paragraphs?

    dataset = read_dataset_with_pyspark(dataset_folder, dataset_name)

    vertices_scheme = ['id', 'node_type', 'node_text', 'question_level', 'question_type']
    edges_scheme = ["src", "dst"]

    # The rows are streamed to the driver; the node/edge lists are built in plain Python.
    vertices, edges = _build_qap_nodes_and_edges(dataset.toLocalIterator(), max_no_nodes)

    vertices = spark.createDataFrame(vertices, vertices_scheme)
    edges = spark.createDataFrame(edges, edges_scheme)
    graph = GraphFrame(vertices, edges)

    # The counts each trigger a Spark job, so they are computed only when they are reported.
    no_nodes, no_edges, no_instances = None, None, None
    if verbose_console or verbose_file:
        no_nodes = graph.vertices.count()
        no_edges = graph.edges.count()
        no_instances = dataset.count()

    if verbose_console:
        print(f"Total no questions: {no_instances}")
        print(f"No vertices: {no_nodes}")
        print(f"No edges: {no_edges}")

    if verbose_file:
        print(f"Total no questions: {no_instances}", file=output_file)
        print(f"No vertices: {no_nodes}", file=output_file)
        print(f"No edges: {no_edges}", file=output_file)

    return graph


def _qap_node_title_and_shape(row):
    """
    Return the (hover title, node shape) pair used to draw one QAP vertex.

    :param row: vertex record exposing 'node_type', 'node_text' and, for questions,
                'question_level' and 'question_type'
    :raises ValueError: if the node type is not one of 'q', 'a', 'p'
    """
    node_type = row['node_type']
    if node_type == 'q':
        question, level, question_type = row['node_text'], row['question_level'], row['question_type']
        return f'Question:{question}\nLevel:{level}\nType:{question_type}', 'square'
    if node_type == 'a':
        answer = row['node_text']
        return f'Answer:{answer}', 'triangle'
    if node_type == 'p':
        return row['node_text'], 'circle'
    # Raise instead of calling exit(): exiting from a notebook cell takes the kernel down
    # and hides which node was at fault.
    raise ValueError(f"Error!Node type unknown! node_type={node_type!r}")


def show_question_answer_paragraph_graph(dataset_folder: str, dataset_name: str,
                                         max_no_nodes: Union[int, None] = None,
                                         verbose_console: bool = True):
    """
    Build the QAP graph and render it as an interactive pyvis network.

    Nodes are grouped by their connected component; questions, answers and paragraphs
    get the shapes square, triangle and circle. The HTML is written to
    OUTPUT_FILE_FOLDER/"<QAP graph name> Visualization.html" and also displayed inline.

    :param dataset_folder: forwarded to get_question_answer_paragraph_graph
    :param dataset_name: forwarded to get_question_answer_paragraph_graph
    :param max_no_nodes: node budget forwarded to get_question_answer_paragraph_graph
    :param verbose_console: forwarded to get_question_answer_paragraph_graph
    :raises ValueError: if a vertex has an unknown node type
    """
    graph = get_question_answer_paragraph_graph(dataset_folder, dataset_name, max_no_nodes, verbose_console, False,
                                                None)

    # connectedComponents() returns the vertex columns plus a 'component' column.
    df_connected_components = graph.connectedComponents(algorithm="graphx").collect()

    nx_graph = nx.empty_graph()
    for row in df_connected_components:
        title, shape = _qap_node_title_and_shape(row)
        nx_graph.add_node(row['id'], size=GRAPH_NODE_SIZE, title=title, group=row['component'], shape=shape)

    for edge in graph.edges.collect():
        nx_graph.add_edge(edge['src'], edge['dst'])

    nt = Network('900px', '1700px', select_menu=True, cdn_resources='remote')
    nt.from_nx(nx_graph)

    html = nt.generate_html()
    with open(OUTPUT_FILE_FOLDER + "/" + QUESTION_ANSWER_PARAGRAPH_GRAPH_NAME + " Visualization.html", mode='w',
              encoding='utf-8') as html_file:
        html_file.write(html)
    display(HTML(html))



def show_question_answer_paragraph_graph_df(dataset_folder: str, dataset_name: str,
                                            max_no_patterns_to_print: int = 20,
                                            verbose_console: bool = True, verbose_file: bool = False):
    """
    Report pairs of questions that share both their answer and a supporting paragraph.

    The tripartite QAP graph (nodes: questions, answers, paragraphs; edges: Question->Answer,
    Answer->Paragraph, Paragraph->Question) is built for the whole dataset and searched with
    a GraphFrames motif for two distinct questions q1, q2 with the same answer a and a
    paragraph p that is linked to a and to both questions.

    :param dataset_folder: forwarded to get_question_answer_paragraph_graph
    :param dataset_name: forwarded to get_question_answer_paragraph_graph; also part of the
                         report file name
    :param max_no_patterns_to_print: maximum number of matches that are printed
    :param verbose_console: print the report to the console
    :param verbose_file: write the report to
                         OUTPUT_FILE_FOLDER/"<dataset_name> <QAP graph name>.txt"
    """
    # Do nothing ,since we don't return anything
    if not verbose_console and not verbose_file:
        return

    output_file = None
    df_two_questions_answer_paragraph = None
    try:
        if verbose_file:
            # utf-8 is set explicitly: the question texts contain non-ASCII characters and the
            # platform default encoding is not guaranteed to handle them.
            output_file = open(
                OUTPUT_FILE_FOLDER + "/" + dataset_name + ' ' + QUESTION_ANSWER_PARAGRAPH_GRAPH_NAME + ".txt", 'w',
                encoding='utf-8')

        def emit(message: str) -> None:
            """Send one report line to every enabled output."""
            if verbose_console:
                print(message)
            if verbose_file:
                print(message, file=output_file)

        graph = get_question_answer_paragraph_graph(dataset_folder, dataset_name,
                                                    max_no_nodes=None,
                                                    verbose_console=verbose_console, verbose_file=verbose_file,
                                                    output_file=output_file
                                                    )

        # Console-only progress line; it is printed regardless of verbose_console, as before.
        print(
            "Finding 2 questions that have the same answer and a paragraph that is relevant for both the 2 questions")

        # The motif terms describe the same pattern as before, only reordered so the search is
        # anchored on the shared paragraph instead of on the shared answer. Very common answers
        # link a large number of questions, so pairing questions through the answer first
        # produces a huge intermediate result; that is the likely cause of the
        # "No space left on device" shuffle failure seen with the previous ordering.
        # NOTE(review): assumes GraphFrames joins the terms left to right - confirm for the
        # installed version. The set of matches does not depend on the term order.
        df_two_questions_answer_paragraph = graph.find(
            "(p)-[]->(q1);(p)-[]->(q2);(q1)-[]->(a);(q2)-[]->(a);(a)-[]->(p)").filter(
            "q1.id < q2.id").filter("q1.node_type='q'").filter("q2.node_type='q'"). \
            filter("a.node_type='a'").filter("p.node_type='p'"). \
            dropDuplicates(['q1', 'q2', 'a', 'p']). \
            cache()  # count() and take() below would otherwise each re-run the whole search

        no_patters = df_two_questions_answer_paragraph.count()
        if no_patters == 0:
            emit("No patters found!")
        else:
            emit(f"Total no patters found: {no_patters}")

            for pattern in df_two_questions_answer_paragraph.take(max_no_patterns_to_print):
                emit(f"\nQuestion 1: {pattern['q1']['node_text']}")
                emit(f"Question 2: {pattern['q2']['node_text']}")
                emit(f"Answer: {pattern['a']['node_text']}")
                emit(f"Paragraph title: {pattern['p']['node_text']}")
                if verbose_file:
                    output_file.flush()
    finally:
        # Release the cached result and the report file even when a Spark job fails.
        if df_two_questions_answer_paragraph is not None:
            df_two_questions_answer_paragraph.unpersist()
        if output_file is not None:
            output_file.close()


In [8]:
def save_QAP():
    """
    Build the uncapped QAP graph for 'hotpot_train_v1.1', pass it to save_graph under the
    name 'QAP Graph f1' in GRAPH_SAVE_FOLDER, and print the elapsed time.
    """
    # "f1" tags the full QAP graph of the train dataset.
    started_at = time.time()

    qap_graph = get_question_answer_paragraph_graph(
        DATASETS_FOLDER, 'hotpot_train_v1.1',
        max_no_nodes=None,
        verbose_console=True,
        verbose_file=False,
        output_file=None,
    )
    save_graph(qap_graph, GRAPH_SAVE_FOLDER, 'QAP Graph f1')

    elapsed = time.time() - started_at
    print(f"Execution time: {elapsed} s")

# save_QAP()

25/01/16 09:44:15 WARN TaskSetManager: Stage 32 contains a task of very large size (1273 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

Total no questions: 90447
No vertices: 249254
No edges: 452235


25/01/16 09:44:17 WARN TaskSetManager: Stage 41 contains a task of very large size (1273 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

Execution time: 29.63927721977234 s


                                                                                

In [12]:
def main():
    """
    Run the QAP motif report on 'hotpot_train_v1.1' with console and file output enabled,
    printing at most 20 matches, then print the elapsed time.
    """
    started_at = time.time()

    show_question_answer_paragraph_graph_df(
        DATASETS_FOLDER, 'hotpot_train_v1.1',
        max_no_patterns_to_print=20,
        verbose_console=True,
        verbose_file=True,
    )

    elapsed = time.time() - started_at
    print(f"Execution time: {elapsed} s")

main()

25/01/16 11:31:13 WARN TaskSetManager: Lost task 5.2 in stage 93.0 (TID 339) (172.16.30.34 executor 0): java.io.IOException: No space left on device
	at java.base/java.io.FileOutputStream.writeBytes(Native Method)
	at java.base/java.io.FileOutputStream.write(FileOutputStream.java:354)
	at org.apache.spark.storage.TimeTrackingOutputStream.write(TimeTrackingOutputStream.java:59)
	at org.apache.spark.io.MutableCheckedOutputStream.write(MutableCheckedOutputStream.scala:43)
	at java.base/java.io.BufferedOutputStream.flushBuffer(BufferedOutputStream.java:81)
	at java.base/java.io.BufferedOutputStream.write(BufferedOutputStream.java:127)
	at net.jpountz.lz4.LZ4BlockOutputStream.flushBufferedData(LZ4BlockOutputStream.java:225)
	at net.jpountz.lz4.LZ4BlockOutputStream.write(LZ4BlockOutputStream.java:178)
	at java.base/java.io.BufferedOutputStream.flushBuffer(BufferedOutputStream.java:81)
	at java.base/java.io.BufferedOutputStream.write(BufferedOutputStream.java:127)
	at java.base/java.io.DataOu

Total no questions: 90447
No vertices: 249254
No edges: 452235
Finding 2 questions that have the same answer and a paragraph that is relevant for both the 2 questions


25/01/16 12:03:40 WARN TaskSetManager: Stage 117 contains a task of very large size (1273 KiB). The maximum recommended task size is 1000 KiB.
25/01/16 12:03:40 WARN TaskSetManager: Stage 118 contains a task of very large size (1273 KiB). The maximum recommended task size is 1000 KiB.
25/01/16 12:03:41 WARN TaskSetManager: Stage 120 contains a task of very large size (1273 KiB). The maximum recommended task size is 1000 KiB.
25/01/16 12:35:38 WARN TaskSetManager: Lost task 2.0 in stage 144.0 (TID 527) (172.16.30.34 executor 0): java.io.IOException: No space left on device
	at java.base/java.io.FileOutputStream.writeBytes(Native Method)
	at java.base/java.io.FileOutputStream.write(FileOutputStream.java:354)
	at org.apache.spark.storage.TimeTrackingOutputStream.write(TimeTrackingOutputStream.java:59)
	at org.apache.spark.io.MutableCheckedOutputStream.write(MutableCheckedOutputStream.scala:43)
	at java.base/java.io.BufferedOutputStream.flushBuffer(BufferedOutputStream.java:81)
	at java.ba

Py4JJavaError: An error occurred while calling o282.count.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 144.0 failed 4 times, most recent failure: Lost task 1.3 in stage 144.0 (TID 568) (172.16.30.120 executor 1): java.io.IOException: No space left on device
	at java.base/java.io.FileOutputStream.writeBytes(Native Method)
	at java.base/java.io.FileOutputStream.write(FileOutputStream.java:354)
	at org.apache.spark.storage.TimeTrackingOutputStream.write(TimeTrackingOutputStream.java:59)
	at org.apache.spark.io.MutableCheckedOutputStream.write(MutableCheckedOutputStream.scala:43)
	at java.base/java.io.BufferedOutputStream.flushBuffer(BufferedOutputStream.java:81)
	at java.base/java.io.BufferedOutputStream.write(BufferedOutputStream.java:127)
	at net.jpountz.lz4.LZ4BlockOutputStream.flushBufferedData(LZ4BlockOutputStream.java:225)
	at net.jpountz.lz4.LZ4BlockOutputStream.write(LZ4BlockOutputStream.java:178)
	at java.base/java.io.BufferedOutputStream.flushBuffer(BufferedOutputStream.java:81)
	at java.base/java.io.BufferedOutputStream.write(BufferedOutputStream.java:127)
	at java.base/java.io.DataOutputStream.write(DataOutputStream.java:107)
	at org.apache.spark.sql.catalyst.expressions.UnsafeRow.writeToStream(UnsafeRow.java:519)
	at org.apache.spark.sql.execution.UnsafeRowSerializerInstance$$anon$1.writeValue(UnsafeRowSerializer.scala:69)
	at org.apache.spark.storage.DiskBlockObjectWriter.write(DiskBlockObjectWriter.scala:312)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:171)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:104)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:54)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:829)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2844)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2780)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2779)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2779)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1242)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1242)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1242)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3048)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2982)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2971)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
Caused by: java.io.IOException: No space left on device
	at java.base/java.io.FileOutputStream.writeBytes(Native Method)
	at java.base/java.io.FileOutputStream.write(FileOutputStream.java:354)
	at org.apache.spark.storage.TimeTrackingOutputStream.write(TimeTrackingOutputStream.java:59)
	at org.apache.spark.io.MutableCheckedOutputStream.write(MutableCheckedOutputStream.scala:43)
	at java.base/java.io.BufferedOutputStream.flushBuffer(BufferedOutputStream.java:81)
	at java.base/java.io.BufferedOutputStream.write(BufferedOutputStream.java:127)
	at net.jpountz.lz4.LZ4BlockOutputStream.flushBufferedData(LZ4BlockOutputStream.java:225)
	at net.jpountz.lz4.LZ4BlockOutputStream.write(LZ4BlockOutputStream.java:178)
	at java.base/java.io.BufferedOutputStream.flushBuffer(BufferedOutputStream.java:81)
	at java.base/java.io.BufferedOutputStream.write(BufferedOutputStream.java:127)
	at java.base/java.io.DataOutputStream.write(DataOutputStream.java:107)
	at org.apache.spark.sql.catalyst.expressions.UnsafeRow.writeToStream(UnsafeRow.java:519)
	at org.apache.spark.sql.execution.UnsafeRowSerializerInstance$$anon$1.writeValue(UnsafeRowSerializer.scala:69)
	at org.apache.spark.storage.DiskBlockObjectWriter.write(DiskBlockObjectWriter.scala:312)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:171)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:104)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:54)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:829)


25/01/16 14:11:05 WARN TaskSetManager: Lost task 11.3 in stage 144.0 (TID 570) (172.16.30.120 executor 1): TaskKilled (Stage cancelled: Job aborted due to stage failure: Task 1 in stage 144.0 failed 4 times, most recent failure: Lost task 1.3 in stage 144.0 (TID 568) (172.16.30.120 executor 1): java.io.IOException: No space left on device
	at java.base/java.io.FileOutputStream.writeBytes(Native Method)
	at java.base/java.io.FileOutputStream.write(FileOutputStream.java:354)
	at org.apache.spark.storage.TimeTrackingOutputStream.write(TimeTrackingOutputStream.java:59)
	at org.apache.spark.io.MutableCheckedOutputStream.write(MutableCheckedOutputStream.scala:43)
	at java.base/java.io.BufferedOutputStream.flushBuffer(BufferedOutputStream.java:81)
	at java.base/java.io.BufferedOutputStream.write(BufferedOutputStream.java:127)
	at net.jpountz.lz4.LZ4BlockOutputStream.flushBufferedData(LZ4BlockOutputStream.java:225)
	at net.jpountz.lz4.LZ4BlockOutputStream.write(LZ4BlockOutputStream.java:178)
	

In [11]:
def main_show_QAP():
    """
    Draw the Question-Answer-Paragraph graph for the 'hotpot_train_v1.1' dataset and print
    the elapsed time.

    The node budget is forwarded to show_question_answer_paragraph_graph as max_no_nodes.
    """
    dataset_name = 'hotpot_train_v1.1'
    # Single place to tune the node budget; previously this local was unused
    # and the call below repeated the literal.
    max_no_nodes = 250

    start_time = time.time()

    show_question_answer_paragraph_graph(DATASETS_FOLDER, dataset_name,
                                         max_no_nodes=max_no_nodes,
                                         verbose_console=True)

    print(f"Execution time: {time.time() - start_time} s")

main_show_QAP()

                                                                                

Total no questions: 90447
No vertices: 252
No edges: 320


Execution time: 9.404969453811646 s
