Code for topic modelling applied to the Question Graph (a graph where we add a Topic node for each topic and connect every question node to its topic node) and to the Question-Paragraph Graph (the given topics are associated with the nodes of the QP graph, to see how distributed and balanced they are).
This code was tested locally.

In [None]:
import time
import typing
from typing import Union

import numpy as np
import pyspark.sql.functions as F
from pyspark.ml.clustering import LDA
from pyspark.ml.feature import CountVectorizer, IDF, Tokenizer
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType

# Build (or reuse, via getOrCreate) the Spark session shared by every function below.
# Resources are fixed here: 2 executors with 4 cores and 12G each, a 6G driver,
# and the master at spark://master:7077.
spark = (SparkSession.builder
    .appName("Simple Graphs")
    .config('spark.executor.instances','2')
    .config('spark.executor.memory','12G')
    .config('spark.executor.cores','4')
    .config('spark.driver.memory','6G')
    .config('spark.dynamicAllocation.enabled','false') # is this really required??
    .master('spark://master:7077')
    # Requests the GraphFrames package for this session (needed by the graphframes import below).
    .config("spark.jars.packages", "graphframes:graphframes:0.8.4-spark3.5-s_2.12")
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .getOrCreate()
)
from graphframes import GraphFrame

# Folder that receives the generated HTML visualizations and text reports.
OUTPUT_FILE_FOLDER = 'results'

# Display names of the two graphs; they are used to build the output file names.
TOPIC_MODELING_QUESTION_GRAPH_NAME = 'TPQ Graph'
TOPIC_MODELING_QUESTION_PARAGRAPH_GRAPH_NAME = 'TPQP Graph'

# TODO: Adjust edge thickness to look better?
# EDGE_IS_SUPPORTING_THICKNESS = 2
# EDGE_IS_NOT_SUPPORTING_THICKNESS = 2

# Node sizes handed to the visualization for question and topic nodes.
GRAPH_QUESTION_NODE_SIZE = 3
GRAPH_TOPIC_NODE_SIZE = 8

# Node size handed to the visualization for paragraph nodes.
GRAPH_PARAGRAPH_NODE_SIZE = 5

def get_topic_index(topic_distribution: np.ndarray) -> int:
    """Return the position of the largest entry of *topic_distribution*.

    Used as the body of the Spark UDF that reduces an LDA topic distribution
    to a single topic index. When several entries share the maximum, the
    first one wins (``np.argmax`` semantics).
    """
    dominant_position = np.argmax(topic_distribution)
    return int(dominant_position)

# Topic Modelling with Questions Graph

In [None]:
def get_topic_index(topic_distribution: np.ndarray) -> int:
    """Pick the dominant topic: the index of the maximum of *topic_distribution*.

    The result is converted to a plain Python ``int`` so it can be returned
    from a Spark UDF declared with ``IntegerType``.
    """
    best_index = np.argmax(topic_distribution)
    return int(best_index.item())


def get_topic_modeling_question_graph(dataset_folder: str, dataset_name: str, dataset_extension: str = '.json',
                                      lda_no_topics: int = 10, lda_max_no_iterations: int = 10,
                                      max_no_vertices: Union[int, None] = None,
                                      verbose_console: bool = True, verbose_file: bool = False,
                                      output_file: Union[None, typing.TextIO] = None) -> GraphFrame:
    """Build a graph that links every question to its dominant LDA topic.

    The dataset is read as multi-line JSON; the ``question`` column is
    tokenized, converted to TF-IDF features and passed to LDA. One vertex
    ``t<k>`` is created per topic and one vertex ``q<i>`` per question, and
    each question gets a single edge to the topic with the largest entry in
    its topic distribution.

    Args:
        dataset_folder: Folder holding the dataset; ``""`` means no folder prefix.
        dataset_name: Dataset file name without its extension.
        dataset_extension: File extension, including the leading dot.
        lda_no_topics: Number of LDA topics; also the number of topic vertices.
        lda_max_no_iterations: Maximum number of LDA iterations.
        max_no_vertices: Maximum number of *question* vertices (topic vertices
            are not counted). ``None`` keeps every question; values below
            zero are treated as zero.
        verbose_console: Print a sample of the topic assignment and the graph
            statistics to the console.
        verbose_file: Print the graph statistics to ``output_file``.
        output_file: Target of the file report. ``print`` falls back to
            standard output when this is ``None``.

    Returns:
        A GraphFrame with vertex columns ``id, answer, level, type, question,
        topic`` and edge columns ``src, dst`` (question id -> topic id).
    """
    dataset_path = ("" if dataset_folder == "" else dataset_folder + "/") + dataset_name + dataset_extension

    # TODO: Consider saving the LDA results in a file?
    dataset_df = spark.read.json(dataset_path, multiLine=True)
    dataset_df = dataset_df.withColumn("question_text", F.col("question").cast("string"))

    # Tokenization
    tokenizer = Tokenizer(inputCol="question_text", outputCol="words")
    tokenized_df = tokenizer.transform(dataset_df)

    # Per-question term counts, stored in raw_features
    vectorizer = CountVectorizer(inputCol="words", outputCol="raw_features")
    vectorized_df = vectorizer.fit(tokenized_df).transform(tokenized_df)

    # Re-weight the raw counts with inverse document frequency
    idf = IDF(inputCol="raw_features", outputCol="features")
    features_df = idf.fit(vectorized_df).transform(vectorized_df)

    # Only the question metadata and the LDA input are needed from here on
    features_df = features_df.drop('context', 'question', 'supporting_facts', 'words', 'raw_features')

    lda = LDA(k=lda_no_topics, maxIter=lda_max_no_iterations, featuresCol="features")
    topic_distribution_df = lda.fit(features_df).transform(features_df)

    select_topic_for_question_function = F.udf(get_topic_index, IntegerType())
    question_topic_index_df = topic_distribution_df.withColumn(
        'topic_index', select_topic_for_question_function(topic_distribution_df['topicDistribution']))
    question_topic_index_df = question_topic_index_df.drop('features', 'topicDistribution')

    # The sample table is console output, so it honours the console flag.
    if verbose_console:
        question_topic_index_df.show()

    # TODO: Attempt to generate the graph directly from df instead of traversing the df since this
    #  should offer a speed up...
    vertices_scheme = ['id', 'answer', 'level', 'type', 'question', 'topic']
    # src and dst are always string ids built below; an explicit schema keeps
    # createDataFrame working when no question (and therefore no edge) is selected.
    edges_scheme = 'src string, dst string'

    # One vertex per topic; topic_ids maps the numeric topic index to its vertex id.
    topic_ids = {index: 't' + str(index) for index in range(lda_no_topics)}
    vertices = [(topic_id, '', '', 'topic', '', index) for index, topic_id in topic_ids.items()]
    edges = []

    # Let Spark apply the cap so that only the needed rows reach the driver.
    if max_no_vertices is None:
        question_rows = question_topic_index_df.collect()
    else:
        question_rows = question_topic_index_df.limit(max(max_no_vertices, 0)).collect()

    for current_index, row in enumerate(question_rows):
        node_id = 'q' + str(current_index)
        node_topic = row['topic_index']
        vertices.append(
            (node_id, row['answer'], row['level'], row['type'], row['question_text'], node_topic))
        edges.append((node_id, topic_ids[node_topic]))

    vertices_df = spark.createDataFrame(vertices, vertices_scheme)
    edges_df = spark.createDataFrame(edges, edges_scheme)

    graph = GraphFrame(vertices_df, edges_df)

    # The counts trigger Spark jobs, so compute them only when a report is requested.
    no_questions, no_nodes, no_edges = None, None, None
    if verbose_console or verbose_file:
        no_questions = question_topic_index_df.count()
        no_nodes = graph.vertices.count()
        no_edges = graph.edges.count()

    if verbose_console:
        print(f"Total no questions: {no_questions}")
        print(f"No vertices: {no_nodes}")
        print(f"No edges: {no_edges}")

    if verbose_file:
        print(f"Total no questions: {no_questions}", file=output_file)
        print(f"No vertices: {no_nodes}", file=output_file)
        print(f"No edges: {no_edges}", file=output_file)

    return graph


def show_topic_modeling_question_graph(dataset_folder: str, dataset_name: str, dataset_extension: str = '.json',
                                       lda_no_topics: int = 10, lda_max_no_iterations: int = 10,
                                       max_no_vertices: Union[int, None] = None,
                                       verbose_console: bool = True, verbose_file: bool = False):
    """Render the topic/question graph as an interactive HTML page.

    Builds the graph with ``get_topic_modeling_question_graph``, copies its
    vertices and edges into a networkx graph, converts that to a pyvis
    ``Network``, writes the HTML into ``OUTPUT_FILE_FOLDER`` and displays it
    inline. Topic nodes are circles, question nodes are squares, and nodes
    are grouped by their topic index.

    Args:
        dataset_folder: Folder holding the dataset.
        dataset_name: Dataset file name without its extension.
        dataset_extension: File extension, including the leading dot.
        lda_no_topics: Number of LDA topics.
        lda_max_no_iterations: Maximum number of LDA iterations.
        max_no_vertices: Cap forwarded to the graph builder.
        verbose_console: Forwarded to the graph builder.
        verbose_file: Forwarded to the graph builder together with
            ``output_file=None``.
    """
    import os

    graph = get_topic_modeling_question_graph(dataset_folder, dataset_name, dataset_extension,
                                              lda_no_topics=lda_no_topics, lda_max_no_iterations=lda_max_no_iterations,
                                              max_no_vertices=max_no_vertices,
                                              verbose_console=verbose_console, verbose_file=verbose_file,
                                              output_file=None)
    nx_graph = nx.empty_graph()
    for row in graph.vertices.collect():
        node_id = row['id']

        if row['type'] == 'topic':
            size = GRAPH_TOPIC_NODE_SIZE
            # Topic ids look like 't<index>'; drop the prefix for the label.
            title = f"Topic {node_id[1:]}"
            shape = 'circle'
        else:
            answer, level, question_type, question = row['answer'], row['level'], \
                                                     row['type'], row['question']
            title = f'Question:{question}\nAnswer:{answer}\nLevel:{level}\nType:{question_type}'
            size = GRAPH_QUESTION_NODE_SIZE
            shape = 'square'

        nx_graph.add_node(node_id, size=size, title=title, group=row['topic'], shape=shape)

    for edge in graph.edges.collect():
        nx_graph.add_edge(edge['src'], edge['dst'])

    nt = Network('900px', '1700px', select_menu=True, cdn_resources='remote')
    nt.from_nx(nx_graph)

    #  TODO: Make this also work with the physics and UI menu
    html = nt.generate_html()

    # Create the output folder on demand: without it open() raises
    # FileNotFoundError after the whole pipeline has already run.
    os.makedirs(OUTPUT_FILE_FOLDER, exist_ok=True)
    with open(OUTPUT_FILE_FOLDER + "/" + TOPIC_MODELING_QUESTION_GRAPH_NAME + " Visualization.html", mode='w',
              encoding='utf-8') as html_file:
        html_file.write(html)
    display(HTML(html))

In [None]:
def main():
    """Render the topic/question graph for 'hotpot_dev_distractor_v1' and print how long it took."""
    started_at = time.time()

    # 20 topics, 10 LDA iterations, at most 500 question vertices.
    show_topic_modeling_question_graph('datasets', 'hotpot_dev_distractor_v1',
                                       lda_no_topics=20, lda_max_no_iterations=10,
                                       max_no_vertices=500)

    elapsed_seconds = time.time() - started_at
    print(f"Execution time: {elapsed_seconds} s")


main()

# Topic Modelling with Questions and Paragraphs

In [None]:
def get_topic_modeling_QP_graph(dataset_folder: str, dataset_name: str, dataset_extension: str = '.json',
                                lda_no_topics: int = 10, lda_max_no_iterations: int = 10,
                                max_no_vertices: Union[int, None] = None,
                                verbose_console: bool = True, verbose_file: bool = False,
                                output_file: Union[None, typing.TextIO] = None) -> GraphFrame:
    """Build the question/paragraph graph and tag every node with an LDA topic.

    Questions get ids ``q<i>``; paragraphs get ids ``p<j>`` keyed by their
    title, so a title that appears in several questions is a single vertex.
    Question and paragraph texts are tokenized, converted to TF-IDF features
    and passed to LDA; each node is assigned the topic with the largest entry
    in its topic distribution. Every paragraph is connected to each question
    whose context contains it.

    Each node contributes exactly one document to the LDA corpus. Appending a
    shared paragraph once per question would make the join on ``id`` below
    return k*k rows for a paragraph used by k questions, which duplicates
    vertices and edges.

    Args:
        dataset_folder: Folder holding the dataset.
        dataset_name: Dataset file name without its extension.
        dataset_extension: File extension, including the leading dot.
        lda_no_topics: Number of LDA topics.
        lda_max_no_iterations: Maximum number of LDA iterations.
        max_no_vertices: Stop reading questions once this many
            question/paragraph rows have been gathered. The check runs after
            a whole question has been added, so the final count may exceed
            the cap. ``None`` reads everything.
        verbose_console: Print a sample of the joined table and the graph
            statistics to the console.
        verbose_file: Print the graph statistics to ``output_file``.
        output_file: Target of the file report. ``print`` falls back to
            standard output when this is ``None``.

    Returns:
        A GraphFrame with vertex columns ``id, question, answer, level, type,
        title, topic`` and edge columns ``src, is_supporting, dst``
        (paragraph id -> question id).
    """
    # TODO: Consider saving the LDA results in a file?
    # TODO: Is this better to use for RaaS than normal reading??
    # NOTE(review): read_dataset is defined outside this section; it is used here as a
    # sized sequence of dicts with 'question', 'answer', 'level', 'type', 'context' and
    # 'supporting_facts' entries - confirm against its definition.
    dataset = read_dataset(dataset_folder, dataset_name, dataset_extension)

    # TODO: Better way to represent this? Having 2 dfs has problems with CountVectorizer...
    # Strictly the information LDA needs: one (id, text) row per distinct node.
    lda_data = []
    lda_data_scheme = ['id', 'text']

    # Paragraph texts are only needed for LDA, so these rows carry the text of questions only.
    # NOTE(review): 'is_supporting' holds a bool on paragraph rows and '' on question rows;
    # the visualization compares it with the string 'true', which relies on Spark's schema
    # inference turning the column into strings - confirm for the Spark version in use.
    question_paragraph = []
    question_paragraph_scheme = ['id', 'question', 'answer', 'level', 'type', 'title', 'is_supporting',
                                 'question_id']
    paragraph_ids = dict()
    for index, question in enumerate(dataset):
        question_id = 'q' + str(index)

        # Titles of the paragraphs that hold a supporting fact for this question
        supporting_paragraphs = {supporting_fact[0] for supporting_fact in question['supporting_facts']}
        for (title, paragraph) in question['context']:
            is_supporting_paragraph = title in supporting_paragraphs

            if title not in paragraph_ids:
                # First time this paragraph is seen: give it an id and add it to the corpus once.
                paragraph_ids[title] = 'p' + str(len(paragraph_ids))
                lda_data.append((paragraph_ids[title], ''.join(paragraph)))
            paragraph_id = paragraph_ids[title]

            # One row per (paragraph, question) pair; each of them becomes an edge.
            question_paragraph.append(
                (paragraph_id, '', '', '', '', title, is_supporting_paragraph, question_id))

        # Add info relevant to question
        question_text = question['question']
        question_answer, question_level, question_type = question['answer'], question['level'], question['type']
        question_paragraph.append(
            (question_id, question_text, question_answer, question_level, question_type, '', '', ''))

        lda_data.append((question_id, question_text))

        if max_no_vertices is not None and len(question_paragraph) >= max_no_vertices:
            break

    # Strictly info necessary for LDA
    lda_question_paragraph_df = spark.createDataFrame(lda_data, lda_data_scheme)

    # The rest of the info
    question_paragraph_df = spark.createDataFrame(question_paragraph, question_paragraph_scheme)

    # TODO: Consider creating 2 dfs, one for paragraphs and another for questions, and then adding
    #  Topic Modeling. This should be more efficient, but is also harder to implement.

    # Tokenization
    tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
    tokenized_paragraph_df = tokenizer.transform(lda_question_paragraph_df).drop('text')

    # Per-document term counts, stored in raw_features
    vectorizer = CountVectorizer(inputCol="tokens", outputCol="raw_features")
    vectorized_model = vectorizer.fit(tokenized_paragraph_df)
    vectorized_df = vectorized_model.transform(tokenized_paragraph_df).drop('tokens')

    idf = IDF(inputCol="raw_features", outputCol="features")
    idf_model = idf.fit(vectorized_df)
    features_df = idf_model.transform(vectorized_df).drop('raw_features')

    lda = LDA(k=lda_no_topics, maxIter=lda_max_no_iterations, featuresCol="features")
    lda_model = lda.fit(features_df)
    topic_distribution_df = lda_model.transform(features_df).drop('features')

    select_topic_for_question_function = F.udf(get_topic_index, IntegerType())
    topic_association_df = topic_distribution_df.withColumn(
        'topic', select_topic_for_question_function(topic_distribution_df['topicDistribution'])
    ).drop('topicDistribution')

    # topic_association_df has one row per id, so this keeps one row per
    # (paragraph, question) pair plus one row per question.
    qp_topics_df = question_paragraph_df.join(topic_association_df, 'id')

    # The sample table is console output, so it honours the console flag.
    if verbose_console:
        qp_topics_df.show(truncate=False)

    # A shared paragraph still has one row per question it belongs to; collapse them so
    # that every vertex id appears once.
    vertices_df = qp_topics_df.drop('question_id', 'is_supporting').dropDuplicates(['id'])
    # Only paragraph rows carry a question_id, so only they become edges.
    edges_df = qp_topics_df.drop('question', 'answer', 'level', 'type', 'title', 'topic'). \
        filter(F.col('id').startswith('p')). \
        withColumnRenamed('question_id', 'dst'). \
        withColumnRenamed('id', 'src')

    graph = GraphFrame(vertices_df, edges_df)

    # The counts trigger Spark jobs, so compute them only when a report is requested.
    no_nodes, no_edges = None, None
    if verbose_console or verbose_file:
        no_nodes = graph.vertices.count()
        no_edges = graph.edges.count()

    if verbose_console:
        print(f"Total no questions: {len(dataset)}")
        print(f"No vertices: {no_nodes}")
        print(f"No edges: {no_edges}")

    if verbose_file:
        print(f"Total no questions: {len(dataset)}", file=output_file)
        print(f"No vertices: {no_nodes}", file=output_file)
        print(f"No edges: {no_edges}", file=output_file)

    return graph


def show_topic_modeling_QP_graph(dataset_folder: str, dataset_name: str, dataset_extension: str = '.json',
                                 lda_no_topics: int = 10, lda_max_no_iterations: int = 10,
                                 max_no_vertices: Union[int, None] = None,
                                 verbose_console: bool = True, verbose_file: bool = False):
    """Render the topic-tagged question/paragraph graph as an interactive HTML page.

    Builds the graph with ``get_topic_modeling_QP_graph``, copies it into a
    networkx graph, converts that to a pyvis ``Network``, writes the HTML
    into ``OUTPUT_FILE_FOLDER`` and displays it inline. Paragraph nodes are
    circles, question nodes are squares, nodes are grouped by topic, and
    edges whose ``is_supporting`` value equals ``'true'`` get the tooltip
    "Is supporting".

    Args:
        dataset_folder: Folder holding the dataset.
        dataset_name: Dataset file name without its extension.
        dataset_extension: File extension, including the leading dot.
        lda_no_topics: Number of LDA topics.
        lda_max_no_iterations: Maximum number of LDA iterations.
        max_no_vertices: Cap forwarded to the graph builder.
        verbose_console: Forwarded to the graph builder.
        verbose_file: Forwarded to the graph builder together with
            ``output_file=None``.
    """
    import os

    graph = get_topic_modeling_QP_graph(dataset_folder, dataset_name, dataset_extension,
                                        lda_no_topics=lda_no_topics, lda_max_no_iterations=lda_max_no_iterations,
                                        max_no_vertices=max_no_vertices,
                                        verbose_console=verbose_console, verbose_file=verbose_file,
                                        output_file=None)
    nx_graph = nx.empty_graph()
    for row in graph.vertices.collect():
        node_id = row['id']
        # The first character of the id tells the node kind: 'p' paragraph, 'q' question.
        node_type = node_id[:1]
        title, shape, size = None, None, None

        if node_type == 'p':
            size = GRAPH_PARAGRAPH_NODE_SIZE
            title = f"{row['title']}"
            shape = 'circle'
        elif node_type == 'q':
            answer, level, question_type, question = row['answer'], row['level'], \
                                                     row['type'], row['question']
            title = f'Question:{question}\nAnswer:{answer}\nLevel:{level}\nType:{question_type}'
            size = GRAPH_QUESTION_NODE_SIZE
            shape = 'square'

        nx_graph.add_node(node_id, size=size, title=title, group=row['topic'], shape=shape)

    for edge in graph.edges.collect():
        # TODO: Adjust edge thickness for supporting / non-supporting edges
        if edge['is_supporting'] == 'true':
            nx_graph.add_edge(edge['src'], edge['dst'], title='Is supporting')
        else:
            nx_graph.add_edge(edge['src'], edge['dst'])

    nt = Network('900px', '1700px', select_menu=True, cdn_resources='remote')
    nt.from_nx(nx_graph)

    #  TODO: Make this also work with the physics and UI menu
    html = nt.generate_html()

    # Create the output folder on demand: without it open() raises
    # FileNotFoundError after the whole pipeline has already run.
    os.makedirs(OUTPUT_FILE_FOLDER, exist_ok=True)
    with open(OUTPUT_FILE_FOLDER + "/" + TOPIC_MODELING_QUESTION_PARAGRAPH_GRAPH_NAME + " Visualization.html", mode='w',
              encoding='utf-8') as html_file:
        html_file.write(html)
    display(HTML(html))


def show_topic_modeling_QP_graph_df(dataset_folder: str, dataset_name: str, dataset_extension: str = '.json',
                                    lda_no_topics: int = 10, lda_max_no_iterations: int = 10,
                                    max_no_vertices: Union[int, None] = None,
                                    verbose_console: bool = True, verbose_file: bool = False):
    """Build the question/paragraph topic graph and report on it (work in progress).

    When ``verbose_file`` is set, a report file named
    ``<dataset_name> TPQP Graph.txt`` is opened in ``OUTPUT_FILE_FOLDER`` and
    handed to ``get_topic_modeling_QP_graph`` so the graph statistics are
    written to it. The file is always closed before returning, including when
    building the graph fails.

    Args:
        dataset_folder: Folder holding the dataset.
        dataset_name: Dataset file name without its extension.
        dataset_extension: File extension, including the leading dot.
        lda_no_topics: Number of LDA topics.
        lda_max_no_iterations: Maximum number of LDA iterations.
        max_no_vertices: Cap forwarded to the graph builder.
        verbose_console: Report to the console.
        verbose_file: Report to the text file described above.

    Returns:
        None. With both verbose flags off the function returns immediately,
        because it produces nothing but reports.
    """
    # Do nothing, since we don't return anything
    if not verbose_console and not verbose_file:
        return

    output_file = None
    if verbose_file:
        import os

        # The report folder may not exist yet on a fresh checkout.
        os.makedirs(OUTPUT_FILE_FOLDER, exist_ok=True)
        output_file = open(
            OUTPUT_FILE_FOLDER + "/" + dataset_name + ' ' + TOPIC_MODELING_QUESTION_PARAGRAPH_GRAPH_NAME + ".txt", 'w',
            encoding="utf-8")

    try:
        # Pass the opened file on, so the file report does not fall back to standard output.
        get_topic_modeling_QP_graph(dataset_folder, dataset_name, dataset_extension,
                                    lda_no_topics=lda_no_topics, lda_max_no_iterations=lda_max_no_iterations,
                                    max_no_vertices=max_no_vertices,
                                    verbose_console=verbose_console, verbose_file=verbose_file,
                                    output_file=output_file)

        if verbose_console:
            print("Finding 2 paragraphs that are ")
        # TODO: the matching file report is not implemented yet.
    finally:
        if output_file is not None:
            output_file.close()


In [None]:
def main():
    """Render the question/paragraph topic graph for 'hotpot_dev_distractor_v1' and print how long it took."""
    started_at = time.time()

    # 20 topics, 10 LDA iterations, reading stops once 25 question/paragraph rows are gathered.
    # Call show_topic_modeling_question_graph(..., max_no_vertices=500) here instead
    # to render the question-only graph.
    show_topic_modeling_QP_graph('datasets', 'hotpot_dev_distractor_v1',
                                 lda_no_topics=20, lda_max_no_iterations=10,
                                 max_no_vertices=25)

    elapsed_seconds = time.time() - started_at
    print(f"Execution time: {elapsed_seconds} s")


main()