Create NER Graph.COde for saving and loading from an already processed graph is also included for better efficiency.

## Load spark and utils

In [1]:
import sparknlp
from pyspark.ml.clustering import KMeans
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import ArrayType, FloatType
import nltk
from nltk.corpus import stopwords
from sparknlp.base import DocumentAssembler, Pipeline
from sparknlp.annotator import (
    Tokenizer,
    WordEmbeddingsModel,
    NerDLModel,
    NerConverter, BertEmbeddings
)

import time
import typing
from typing import Union

import pyspark.sql.functions as F

from pyvis.network import Network
import networkx as nx
from IPython.display import display, HTML

import os

# nltk.download('stopwords')
StopWords = stopwords.words("english")
no_groups = 10

spark = (SparkSession.builder
    .appName("HotpotQA Clustering")
    .config('spark.executor.instances','2')
    .config('spark.executor.memory','12G')
    .config("spark.driver.memory", "4G")
    .config('spark.executor.cores','4')
    .config('spark.dynamicAllocation.enabled','false') # musai??
    .master('spark://master:7077')
    .config("spark.jars.packages", "graphframes:graphframes:0.8.4-spark3.5-s_2.12,com.johnsnowlabs.nlp:spark-nlp_2.12:5.5.2")
    .getOrCreate()
        )

  # .config("spark.jars.packages", "graphframes:graphframes:0.8.4-spark3.5-s_2.12")

from graphframes import GraphFrame


def load_graph(full_graph_folder: str,
                                 full_graph_name: str):
    vertices_path = full_graph_folder + '/' + full_graph_name + ' Vertices'
    edges_path = full_graph_folder + '/' + full_graph_name + ' Edges'

    vertices = spark.read.parquet(vertices_path)
    edges = spark.read.parquet(edges_path)

    return GraphFrame(vertices, edges)
    

# # Load the HotpotQA JSON Data
# dataset_name='hotpot_dev_distractor_v1'
# dataset_extension='.json'
# dataset_path='/home/ubuntu/data/' + dataset_name + dataset_extension
# df = spark.read.json(dataset_path, multiLine=True).withColumn("text_q",
#                                                               col("question")
#                                                               )

:: loading settings :: url = jar:file:/usr/local/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/ubuntu/.ivy2/cache
The jars for the packages stored in: /home/ubuntu/.ivy2/jars
graphframes#graphframes added as a dependency
com.johnsnowlabs.nlp#spark-nlp_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-ac919849-b387-4a67-b342-88bac9a3bbeb;1.0
	confs: [default]
	found graphframes#graphframes;0.8.4-spark3.5-s_2.12 in spark-packages
	found org.slf4j#slf4j-api;1.7.16 in central
	found com.johnsnowlabs.nlp#spark-nlp_2.12;5.5.2 in central
	found com.typesafe#config;1.4.2 in central
	found org.rocksdb#rocksdbjni;6.29.5 in central
	found com.amazonaws#aws-java-sdk-s3;1.12.500 in central
	found com.amazonaws#aws-java-sdk-kms;1.12.500 in central
	found com.amazonaws#aws-java-sdk-core;1.12.500 in central
	found commons-logging#commons-logging;1.1.3 in central
	found commons-codec#commons-codec;1.15 in central
	found org.apache.httpcomponents#httpclient;4.5.13 in central
	found org.apache.httpcomponents#httpcore;4.4.13

## Create pipeline

In [None]:
import numpy as np
from pyspark.sql.types import StructType, StructField, StringType, ArrayType, DoubleType
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import udf
from pyspark.ml.linalg import Vectors, VectorUDT

documentAssembler = DocumentAssembler() \
    .setInputCol("text_q") \
    .setOutputCol("document")

# Tokenize
tokenizer = Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("tokens")

# Get the embeddings using glove_100d
embeddings = (WordEmbeddingsModel.pretrained('glove_100d', 'en')
              .setInputCols(["document", 'tokens'])) \
              .setOutputCol("embeddings")
# embeddings = BertEmbeddings.pretrained('bert_base_uncased', 'en') \
#     .setInputCols(["document", 'token']) \
#     .setOutputCol("embeddings")

public_ner = NerDLModel.pretrained('ner_dl', 'en') \
    .setInputCols(["document", "tokens", "embeddings"]) \
    .setOutputCol("ner")

ner_converter = NerConverter() \
    .setInputCols(["document", "tokens", "ner"]) \
    .setOutputCol("entities")

# Define and fit the pipeline
nlpPipeline = Pipeline(stages=[documentAssembler,
                               tokenizer,
                               embeddings,
                               public_ner,
                               ner_converter
                               ])
pipelineModel = nlpPipeline.fit(df)
ner_distribution_df = pipelineModel.transform(df)
# result_with_entities = ner_distribution_df.select("text_q", "entities.result").rdd.map(
#     lambda row: (row['text_q'], row['result'])
# ).collect()
# questions = [entry[0] for entry in result_with_entities]


########################################################################################################################
from pyspark.sql.functions import col, expr
ner_distribution_df.printSchema()
# Extract relevant columns
tokens_and_embeddings_df = ner_distribution_df.select(
    "question",
    "context",
    "level",
    "type",
    'supporting_facts',
    "entities.result",    # List of entities
    col("entities.begin").alias("entities_begin"),   # Entity begin indices
    col("entities.end").alias("entities_end"),       # Entity end indices
)

## Info about extracted info using pipeline

In [3]:
import pyspark.sql.functions as F
print(tokens_and_embeddings_df.columns)

['question', 'context', 'level', 'type', 'supporting_facts', 'result', 'entities_begin', 'entities_end']


## Get supporting titles(for keeping only supporting paragraphs)

In [10]:
# tokens_and_embeddings_df.select('supporting_facts').show(5,truncate=False)

# For testing/limiting
max_no_questions=1000

df1=tokens_and_embeddings_df.limit(max_no_questions).withColumn('row_id',F.monotonically_increasing_id()).withColumnRenamed('result','q_entities')

paragraphs_df = df1.select(
    'question',
    'q_entities',
    'level',
    'type',
    'row_id',
    F.posexplode(F.col('context')).alias('paragraph_id','title-paragraph')
)

paragraphs_df = paragraphs_df.withColumn('paragraph', F.col('title-paragraph').getItem(1)).withColumn('title',F.col('title-paragraph').getItem(0)).drop('title-paragraph')

df2=df1.select('supporting_facts','row_id')
df2=df2.withColumn('supporting_pairs',F.explode(F.col('supporting_facts'))).drop('supporting_facts')
supporting_titles_df=df2.withColumn('supporting_title',F.col('supporting_pairs').getItem(0)).drop('supporting_pairs').dropDuplicates(['supporting_title','row_id'])

# Drop the paragraphs that are not suportting
support_paragraphs=paragraphs_df.join(supporting_titles_df,(supporting_titles_df.supporting_title==paragraphs_df.title)&(supporting_titles_df.row_id==paragraphs_df.row_id),'inner').select(paragraphs_df['*'])

print(support_paragraphs.columns)

['question', 'q_entities', 'level', 'type', 'row_id', 'paragraph_id', 'paragraph', 'title']


## 1.Sentence Level Links

In [6]:
# .withColumnRenamed('position','paragraph_id')

def get_sentence_entities_df():
    df3=support_paragraphs.drop('q_entities','level','type').withColumn('paragraph_json',F.from_json(F.col('paragraph'), ArrayType(StringType()))).drop('paragraph')
    # df3=df3.withColumn('sentence',F.explode(F.col('paragraph_json')))
    # df3.select('sentence','row_id').show(5,truncate=False)
    
    df3 = df3.select(
        'question',
        'row_id',
        'paragraph_id',
         F.posexplode(F.col('paragraph_json')).alias('sentence_id','sentence')
    )
    # paragraphs_df = paragraphs_df.withColumn('paragraph', F.col('title-paragraph').getItem(1)).withColumn('title',F.col('title-paragraph').getItem(0)).drop('title-paragraph')
    # df3.select('row_id','paragraph_id','sentence_id','sentence').show(truncate=False)
    
    sentence_entities_df=df3.withColumnRenamed('sentence','text_q')
    sentence_entities_df=pipelineModel.transform(sentence_entities_df).withColumnRenamed('text_q','sentence').drop('sentence')

    sentence_entities_df=sentence_entities_df.select(
        F.col('row_id'),
        F.col('paragraph_id'),
        F.col('sentence_id'),
        F.col('entities.result').alias('s_entities')
    ).dropDuplicates(['row_id','paragraph_id','sentence_id'])
    
    # sentence_entities_df.show(truncate=False

    return sentence_entities_df


def get_split_sentence_entities_df():
    sentence_entities_df=get_sentence_entities_df()
    
    split_sentence_entities_df=sentence_entities_df.select(
        F.col('row_id'),
        F.col('paragraph_id'),
        F.col('sentence_id'),
        F.posexplode(F.col('s_entities')).alias('entity_pos','entity'),
    ).withColumn('id',F.concat(F.col("row_id"),F.lit("-"), F.col("paragraph_id"),F.lit("-"),F.col('sentence_id'),F.lit('-'),F.col('entity_pos'))).dropDuplicates(['id'])\
    .withColumn('entity_type',F.lit('s'))

    return split_sentence_entities_df


def get_sentence_links_df():
    split_sentence_entities_df=get_split_sentence_entities_df()
    
    sentence_links_df = (
        split_sentence_entities_df.alias("df1")
        .join(
            split_sentence_entities_df.alias("df2"),
            (F.col("df1.row_id") == F.col("df2.row_id")) &  
            (F.col("df1.paragraph_id") == F.col("df2.paragraph_id")) & 
            (F.col("df1.sentence_id") == F.col("df2.sentence_id")) & 
            (F.col("df1.entity") != F.col("df2.entity")),
            "inner"
        )
        .select(
            F.col("df1.entity").alias("entity1"),
            F.col("df2.entity").alias("entity2"),
            F.col('df1.id').alias('src'),
            F.col('df2.id').alias('dst')
        )
    )

    return sentence_links_df


get_sentence_entities_df().show(truncate=False)
get_split_sentence_entities_df().show(truncate=False)
get_sentence_links_df().show(truncate=False)


# print(df3.columns)
# df3=df3.select('paragraph','row_id').withColumn('paragraph_json',F.from_json(F.col('paragraph'), ArrayType(StringType()))).drop('paragraph')
# df3=df3.withColumn('sentence',F.explode(F.col('paragraph_json')))
# df3.select('sentence','row_id').show(5,truncate=False)

# Drop the paragraphs that are not suportting
# df3=paragraphs_df.join(supporting_titles_df,(supporting_titles_df.supporting_title==paragraphs_df.title)&(supporting_titles_df.row_id==paragraphs_df.row_id),'inner').select(paragraphs_df['*'])

                                                                                

+------+------------+-----------+--------------------------------------------------------------------------------------------------------------+
|row_id|paragraph_id|sentence_id|s_entities                                                                                                    |
+------+------------+-----------+--------------------------------------------------------------------------------------------------------------+
|0     |1           |0          |[Scott Derrickson, American]                                                                                  |
|0     |1           |1          |[Los Angeles, California]                                                                                     |
|0     |1           |2          |[Emily Rose, Deliver Us, Marvel Cinematic Universe, Doctor Strange]                                           |
|0     |4           |0          |[Edward Davis Wood Jr, American]                                                                 

                                                                                

+------+------------+-----------+----------+-------------------------+-------+-----------+
|row_id|paragraph_id|sentence_id|entity_pos|entity                   |id     |entity_type|
+------+------------+-----------+----------+-------------------------+-------+-----------+
|0     |1           |0          |0         |Scott Derrickson         |0-1-0-0|s          |
|0     |1           |0          |1         |American                 |0-1-0-1|s          |
|0     |1           |1          |0         |Los Angeles              |0-1-1-0|s          |
|0     |1           |1          |1         |California               |0-1-1-1|s          |
|0     |1           |2          |0         |Emily Rose               |0-1-2-0|s          |
|0     |1           |2          |1         |Deliver Us               |0-1-2-1|s          |
|0     |1           |2          |2         |Marvel Cinematic Universe|0-1-2-2|s          |
|0     |1           |2          |3         |Doctor Strange           |0-1-2-3|s          |

[Stage 36:>                                                         (0 + 1) / 1]

+-------------------------+-------------------------+-------+-------+
|entity1                  |entity2                  |src    |dst    |
+-------------------------+-------------------------+-------+-------+
|Scott Derrickson         |American                 |0-1-0-0|0-1-0-1|
|American                 |Scott Derrickson         |0-1-0-1|0-1-0-0|
|Los Angeles              |California               |0-1-1-0|0-1-1-1|
|California               |Los Angeles              |0-1-1-1|0-1-1-0|
|Emily Rose               |Deliver Us               |0-1-2-0|0-1-2-1|
|Emily Rose               |Marvel Cinematic Universe|0-1-2-0|0-1-2-2|
|Emily Rose               |Doctor Strange           |0-1-2-0|0-1-2-3|
|Deliver Us               |Emily Rose               |0-1-2-1|0-1-2-0|
|Deliver Us               |Marvel Cinematic Universe|0-1-2-1|0-1-2-2|
|Deliver Us               |Doctor Strange           |0-1-2-1|0-1-2-3|
|Marvel Cinematic Universe|Emily Rose               |0-1-2-2|0-1-2-0|
|Marvel Cinematic Un

                                                                                

## 2.Context Level Links

In [7]:
def get_context_level_links_df():
    split_sentence_entities_df=get_split_sentence_entities_df()

    context_level_links_df=split_sentence_entities_df.alias('df1').join(
        split_sentence_entities_df.alias('df2'),
        (F.col('df1.row_id')==F.col('df2.row_id')) &
        (F.col('df1.paragraph_id')!=F.col('df2.paragraph_id')) &
        (F.col('df1.entity')==F.col('df2.entity')),
        'inner'
    ).select(
        F.col("df1.entity").alias("entity1"),
        F.col("df2.entity").alias("entity2"),
        F.col('df1.id').alias('src'),
        F.col('df2.id').alias('dst'),
    )

    return context_level_links_df
    

get_context_level_links_df().show(truncate=False)

                                                                                

+----------------+----------------+-------+-------+
|entity1         |entity2         |src    |dst    |
+----------------+----------------+-------+-------+
|American        |American        |0-1-0-1|0-4-0-1|
|American        |American        |0-4-0-1|0-1-0-1|
|American        |American        |1-1-0-1|1-6-0-1|
|American        |American        |1-6-0-1|1-1-0-1|
|Applegate       |Applegate       |2-2-0-2|2-8-0-3|
|Applegate       |Applegate       |2-8-0-3|2-2-0-2|
|Istanbul        |Istanbul        |3-5-0-7|3-6-0-6|
|Istanbul        |Istanbul        |3-6-0-6|3-5-0-7|
|Turkey          |Turkey          |3-5-0-8|3-6-0-7|
|Turkey          |Turkey          |3-6-0-7|3-5-0-8|
|Turkish         |Turkish         |3-5-0-1|3-6-0-1|
|Turkish         |Turkish         |3-6-0-1|3-5-0-1|
|Adriana Trigiani|Adriana Trigiani|4-3-0-0|4-9-0-2|
|Adriana Trigiani|Adriana Trigiani|4-9-0-2|4-3-0-0|
|American        |American        |4-3-0-2|4-9-0-1|
|American        |American        |4-9-0-1|4-3-0-2|
+-----------

## 3.Paragraph Level Links

In [8]:
# .withColumnRenamed('position','paragraph_id')

def get_title_entities_df():
    df3=support_paragraphs.drop('q_entities','level','type')
    # df3=df3.withColumn('sentence',F.explode(F.col('paragraph_json')))
    # df3.select('sentence','row_id').show(5,truncate=False)
    
    df3 = support_paragraphs.select(
        'question',
        'row_id',
        'paragraph_id',
        'title'
    )
    # paragraphs_df = paragraphs_df.withColumn('paragraph', F.col('title-paragraph').getItem(1)).withColumn('title',F.col('title-paragraph').getItem(0)).drop('title-paragraph')
    # df3.select('row_id','paragraph_id','sentence_id','sentence').show(truncate=False)
    
    title_entities_df=df3.withColumnRenamed('title','text_q')
    title_entities_df=pipelineModel.transform(title_entities_df).withColumnRenamed('text_q','title')
    
    title_entities_df=title_entities_df.select(
        F.col('title'),
        F.col('row_id'),
        F.col('paragraph_id'),
        F.col('entities.result').alias('t_entities')
    ).dropDuplicates(['row_id','paragraph_id']).withColumn('sentence_id',F.lit(-1))

    return title_entities_df


def get_split_title_entities_df():
    title_entities_df=get_title_entities_df()

    split_title_entities_df=title_entities_df.select(
        F.col('row_id'),
        F.col('paragraph_id'),
        F.col('sentence_id'),
        F.posexplode(F.col('t_entities')).alias('entity_pos','entity')
    ).withColumn('id',F.concat(F.col("row_id"),F.lit("-"), F.col("paragraph_id"),F.lit("-"),F.col('sentence_id'),F.lit('-'),F.col('entity_pos'))).dropDuplicates(['id'])\
    .withColumn('entity_type',F.lit('t'))

    return split_title_entities_df
    

def get_paragraph_level_links_df():
    split_title_entities_df=get_split_title_entities_df()

    split_sentence_entities_df=get_split_sentence_entities_df()    

    paragraph_level_links_df=split_title_entities_df.alias('df1').join(
        split_sentence_entities_df.alias('df2'),
        (F.col('df1.row_id')==F.col('df2.row_id')) &
        (F.col('df1.paragraph_id')==F.col('df2.paragraph_id')) &
        (F.col('df1.entity')==F.col('df2.entity')),
        'inner'
    ).select(
        F.col("df1.entity").alias("entity1"),
        F.col("df2.entity").alias("entity2"),
        F.col('df1.id').alias('src'),
        F.col('df2.id').alias('dst'),
    )
    
    return paragraph_level_links_df


get_title_entities_df().show(truncate=False)
get_split_title_entities_df().show(truncate=False)
get_paragraph_level_links_df().show(truncate=False)

                                                                                

+-------------------------+------+------------+-----------------------+-----------+
|title                    |row_id|paragraph_id|t_entities             |sentence_id|
+-------------------------+------+------------+-----------------------+-----------+
|Scott Derrickson         |0     |1           |[Scott Derrickson]     |-1         |
|Ed Wood                  |0     |4           |[Ed Wood]              |-1         |
|Shirley Temple           |1     |1           |[Shirley Temple]       |-1         |
|Kiss and Tell (1945 film)|1     |6           |[]                     |-1         |
|The Hork-Bajir Chronicles|2     |2           |[Hork-Bajir Chronicles]|-1         |
|Animorphs                |2     |8           |[]                     |-1         |
|Laleli Mosque            |3     |5           |[Laleli Mosque]        |-1         |
|Esma Sultan Mansion      |3     |6           |[Esma Sultan Mansion]  |-1         |
|Adriana Trigiani         |4     |3           |[Adriana Trigiani]     |-1   

                                                                                

+------+------------+-----------+----------+---------------------+--------+-----------+
|row_id|paragraph_id|sentence_id|entity_pos|entity               |id      |entity_type|
+------+------------+-----------+----------+---------------------+--------+-----------+
|0     |1           |-1         |0         |Scott Derrickson     |0-1--1-0|t          |
|0     |4           |-1         |0         |Ed Wood              |0-4--1-0|t          |
|1     |1           |-1         |0         |Shirley Temple       |1-1--1-0|t          |
|2     |2           |-1         |0         |Hork-Bajir Chronicles|2-2--1-0|t          |
|3     |5           |-1         |0         |Laleli Mosque        |3-5--1-0|t          |
|3     |6           |-1         |0         |Esma Sultan Mansion  |3-6--1-0|t          |
|4     |3           |-1         |0         |Adriana Trigiani     |4-3--1-0|t          |
|4     |9           |-1         |0         |Big Stone Gap        |4-9--1-0|t          |
+------+------------+-----------

[Stage 60:>                                                         (0 + 1) / 1]

+---------------------+---------------------+--------+-------+
|entity1              |entity2              |src     |dst    |
+---------------------+---------------------+--------+-------+
|Scott Derrickson     |Scott Derrickson     |0-1--1-0|0-1-0-0|
|Hork-Bajir Chronicles|Hork-Bajir Chronicles|2-2--1-0|2-2-0-0|
|Laleli Mosque        |Laleli Mosque        |3-5--1-0|3-5-0-0|
|Esma Sultan Mansion  |Esma Sultan Mansion  |3-6--1-0|3-6-0-0|
|Adriana Trigiani     |Adriana Trigiani     |4-3--1-0|4-3-0-0|
|Big Stone Gap        |Big Stone Gap        |4-9--1-0|4-9-0-0|
+---------------------+---------------------+--------+-------+



                                                                                

## 4.Title Level Links

In [11]:
# Connect title entities that appear in other titles,not just in this context
# This adds a 'sparse' connection between contexts

def get_title_level_links_df():
    split_title_entities_df=get_split_title_entities_df()

    title_level_links_df=split_title_entities_df.alias('df1').join(
        split_title_entities_df.alias('df2'),
        (F.col('df1.id')!=F.col('df2.id')) &
        (F.col('df1.entity')==F.col('df2.entity')),
        'inner'
    ).select(
        F.col("df1.entity").alias("entity1"),
        F.col("df2.entity").alias("entity2"),
        F.col('df1.id').alias('src'),
        F.col('df2.id').alias('dst'),
    )

    return title_level_links_df


get_title_level_links_df().show(truncate=False)

[Stage 74:>                                                         (0 + 1) / 1]

+----------------------+----------------------+----------+----------+
|entity1               |entity2               |src       |dst       |
+----------------------+----------------------+----------+----------+
|New South Wales       |New South Wales       |125-2--1-0|435-8--1-1|
|New South Wales       |New South Wales       |125-2--1-0|174-7--1-1|
|California            |California            |128-8--1-1|838-2--1-1|
|Avenue                |Avenue                |13-3--1-0 |785-1--1-0|
|Numb                  |Numb                  |133-7--1-0|509-4--1-0|
|Rihanna               |Rihanna               |133-7--1-1|509-4--1-1|
|Texas Tech Red Raiders|Texas Tech Red Raiders|136-3--1-0|662-1--1-0|
|Stapleton Cotton      |Stapleton Cotton      |137-5--1-0|934-6--1-0|
|Viscount Combermere   |Viscount Combermere   |137-5--1-1|934-6--1-1|
|Manchester United F.C |Manchester United F.C |14-5--1-0 |158-0--1-0|
|Kingdom               |Kingdom               |147-8--1-0|316-0--1-0|
|Kansas             

                                                                                

## 5.Question Level Links

In [14]:
# Add links between the entities of each question to the entities found in the support paragraphs
# This connects a question to the respective 'relevant' entities in the paragraphs

def get_question_entities_df():
    question_entities_df=support_paragraphs.drop('paragraph_id','paragraph','title')

    return question_entities_df

def get_split_question_entities_df():
    question_entities_df=get_question_entities_df()

    split_question_entities_df=question_entities_df.select(
        F.col('row_id'),
        F.posexplode(F.col('q_entities')).alias('entity_pos','entity'),
    ).withColumn('entity_type',F.lit('q'))\
    .withColumn('paragraph_id',F.lit(-1))\
    .withColumn('sentence_id',F.lit(-1))\
    .withColumn('id',F.concat(F.col("row_id"),F.lit("-"), F.col("paragraph_id"),F.lit("-"),F.col('sentence_id'),F.lit('-'),F.col('entity_pos'))).dropDuplicates(['id'])

    return split_question_entities_df
    

def get_question_level_links_df():
    split_sentence_entities_df=get_split_sentence_entities_df()
    split_question_entities_df=get_split_question_entities_df()

    question_level_links_df=split_sentence_entities_df.alias('df1').join(
        split_question_entities_df.alias('df2'),
        (F.col('df1.row_id')==F.col('df2.row_id')) &
        (F.col('df1.entity')==F.col('df2.entity')),
        'inner'
    ).select(
        F.col("df1.entity").alias("entity1"),
        F.col("df2.entity").alias("entity2"),
        F.col('df1.id').alias('src'),
        F.col('df2.id').alias('dst'),
    )

    return question_level_links_df



get_question_entities_df().show(truncate=False)
get_split_question_entities_df().show(truncate=False)
get_question_level_links_df().show(truncate=False)

                                                                                

+-------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------+-----+----------+------+
|question                                                                                                                                               |q_entities                          |level|type      |row_id|
+-------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------+-----+----------+------+
|Were Scott Derrickson and Ed Wood of the same nationality?                                                                                             |[Scott Derrickson, Ed Wood]         |hard |comparison|0     |
|Were Scott Derrickson and Ed Wood of the same nationality?                                                                                 

                                                                                

+------+----------+---------------------------+-----------+------------+-----------+-----------+
|row_id|entity_pos|entity                     |entity_type|paragraph_id|sentence_id|id         |
+------+----------+---------------------------+-----------+------------+-----------+-----------+
|0     |0         |Scott Derrickson           |q          |-1          |-1         |0--1--1-0  |
|0     |1         |Ed Wood                    |q          |-1          |-1         |0--1--1-1  |
|1     |0         |Corliss Archer             |q          |-1          |-1         |1--1--1-0  |
|1     |1         |Kiss                       |q          |-1          |-1         |1--1--1-1  |
|10    |0         |Lawrence                   |q          |-1          |-1         |10--1--1-0 |
|10    |1         |Kansas                     |q          |-1          |-1         |10--1--1-1 |
|10    |2         |Kansas City                |q          |-1          |-1         |10--1--1-2 |
|100   |0         |Indians    

                                                                                

+---------------------------+---------------------------+---------+-----------+
|entity1                    |entity2                    |src      |dst        |
+---------------------------+---------------------------+---------+-----------+
|Scott Derrickson           |Scott Derrickson           |0-1-0-0  |0--1--1-0  |
|Corliss Archer             |Corliss Archer             |1-6-0-3  |1--1--1-0  |
|Kiss                       |Kiss                       |1-6-0-0  |1--1--1-1  |
|Lawrence                   |Lawrence                   |10-9-1-3 |10--1--1-0 |
|Lawrence                   |Lawrence                   |10-9-1-0 |10--1--1-0 |
|Kansas                     |Kansas                     |10-9-1-1 |10--1--1-1 |
|Kansas                     |Kansas                     |10-9-0-4 |10--1--1-1 |
|Kansas                     |Kansas                     |10-9-0-2 |10--1--1-1 |
|Kansas City                |Kansas City                |10-9-2-3 |10--1--1-2 |
|Kansas City                |Kansas City

## Creating NER Graph(putting all the code together for better efficiency)

In [21]:
# After creating a dependency tree,we order the portions of the code accordingly
GRAPH_SAVE_FOLDER='/home/ubuntu/graphs/'
NER_GRAPH_NAME='NER Graph'


def save_NER_graph(graph_name:str,max_no_questions:int):
    ##########################
    # Create df only with supporting paragraphs
    
    df1=tokens_and_embeddings_df 
    
    if max_no_questions is not None:
        df1=df1.limit(max_no_questions)
    
    df1=df1.withColumn('row_id',F.monotonically_increasing_id()).withColumnRenamed('result','q_entities')
    
    paragraphs_df = df1.select(
        'question',
        'q_entities',
        'level',
        'type',
        'row_id',
        F.posexplode(F.col('context')).alias('paragraph_id','title-paragraph')
    )
    
    paragraphs_df = paragraphs_df.withColumn('paragraph', F.col('title-paragraph').getItem(1)).withColumn('title',F.col('title-paragraph').getItem(0)).drop('title-paragraph')
    
    df2=df1.select('supporting_facts','row_id')
    df2=df2.withColumn('supporting_pairs',F.explode(F.col('supporting_facts'))).drop('supporting_facts')
    supporting_titles_df=df2.withColumn('supporting_title',F.col('supporting_pairs').getItem(0)).drop('supporting_pairs').dropDuplicates(['supporting_title','row_id'])
    
    # Drop the paragraphs that are not suportting
    support_paragraphs=paragraphs_df.join(supporting_titles_df,(supporting_titles_df.supporting_title==paragraphs_df.title)&(supporting_titles_df.row_id==paragraphs_df.row_id),'inner').select(paragraphs_df['*'])

    ##########################
    # Get Entities for a sentence as a list

    df3=support_paragraphs.drop('q_entities','level','type').withColumn('paragraph_json',F.from_json(F.col('paragraph'), ArrayType(StringType()))).drop('paragraph')
    
    df3 = df3.select(
        'question',
        'row_id',
        'paragraph_id',
         F.posexplode(F.col('paragraph_json')).alias('sentence_id','sentence')
    )
    
    sentence_entities_df=df3.withColumnRenamed('sentence','text_q')
    sentence_entities_df=pipelineModel.transform(sentence_entities_df).withColumnRenamed('text_q','sentence').drop('sentence')

    sentence_entities_df=sentence_entities_df.select(
        F.col('row_id'),
        F.col('paragraph_id'),
        F.col('sentence_id'),
        F.col('entities.result').alias('s_entities')
    ).dropDuplicates(['row_id','paragraph_id','sentence_id'])


    ##########################
    # Get Entities with ids from where they were taken from sentences in paragraphs
    split_sentence_entities_df=sentence_entities_df.select(
        F.col('row_id'),
        F.col('paragraph_id'),
        F.col('sentence_id'),
        F.posexplode(F.col('s_entities')).alias('entity_pos','entity'),
    ).withColumn('id',F.concat(F.col("row_id"),F.lit("-"), F.col("paragraph_id"),F.lit("-"),F.col('sentence_id'),F.lit('-'),F.col('entity_pos'))).dropDuplicates(['id'])\
    .withColumn('entity_type',F.lit('s'))


    ##########################
    # Get Entities with ids from where they were taken from titles of paragraphs as a list
    df3 = support_paragraphs.select(
        'question',
        'row_id',
        'paragraph_id',
        'title'
    )

    title_entities_df=df3.withColumnRenamed('title','text_q')
    title_entities_df=pipelineModel.transform(title_entities_df).withColumnRenamed('text_q','title')
    
    title_entities_df=title_entities_df.select(
        F.col('title'),
        F.col('row_id'),
        F.col('paragraph_id'),
        F.col('entities.result').alias('t_entities')
    ).dropDuplicates(['row_id','paragraph_id']).withColumn('sentence_id',F.lit(-1))


    ##########################
    # Get Entities with ids from where they were taken from titles of paragraphs

    split_title_entities_df=title_entities_df.select(
        F.col('row_id'),
        F.col('paragraph_id'),
        F.col('sentence_id'),
        F.posexplode(F.col('t_entities')).alias('entity_pos','entity')
    ).withColumn('id',F.concat(F.col("row_id"),F.lit("-"), F.col("paragraph_id"),F.lit("-"),F.col('sentence_id'),F.lit('-'),F.col('entity_pos'))).dropDuplicates(['id'])\
    .withColumn('entity_type',F.lit('t'))

    
    ##########################
    # Get Sentence Level Links
    sentence_level_links_df = (
        split_sentence_entities_df.alias("df1")
        .join(
            split_sentence_entities_df.alias("df2"),
            (F.col("df1.row_id") == F.col("df2.row_id")) &  
            (F.col("df1.paragraph_id") == F.col("df2.paragraph_id")) & 
            (F.col("df1.sentence_id") == F.col("df2.sentence_id")) & 
            (F.col("df1.entity") != F.col("df2.entity")),  # Avoid duplicate and self-pairs
            "inner"
        )
        .select(
            F.col("df1.entity").alias("entity1"),
            F.col("df2.entity").alias("entity2"),
            F.col('df1.id').alias('src'),
            F.col('df2.id').alias('dst')
        )
    )

    ##########################
    # Get Context Level Links
    context_level_links_df=split_sentence_entities_df.alias('df1').join(
        split_sentence_entities_df.alias('df2'),
        (F.col('df1.row_id')==F.col('df2.row_id')) &
        (F.col('df1.paragraph_id')!=F.col('df2.paragraph_id')) &
        (F.col('df1.entity')==F.col('df2.entity')),
        'inner'
    ).select(
        F.col("df1.entity").alias("entity1"),
        # F.col("df2.entity").alias("entity2"),
        F.lit('').alias('entity2'),
        F.col('df1.id').alias('src'),
        F.col('df2.id').alias('dst'),
    )

    ##########################
    # Get Paragraph Level Links
    paragraph_level_links_df=split_title_entities_df.alias('df1').join(
        split_sentence_entities_df.alias('df2'),
        (F.col('df1.row_id')==F.col('df2.row_id')) &
        (F.col('df1.paragraph_id')==F.col('df2.paragraph_id')) &
        (F.col('df1.entity')==F.col('df2.entity')) &
        (F.col('df1.sentence_id')!=F.col('df2.sentence_id')),
        'inner'
    ).select(
        F.col("df1.entity").alias("entity1"),
        F.lit('').alias('entity2'),
        F.col('df1.id').alias('src'),
        F.col('df2.id').alias('dst'),
    )

    ##########################
    # Get Title Level Links

    title_level_links_df=split_title_entities_df.alias('df1').join(
        split_title_entities_df.alias('df2'),
        (F.col('df1.id')!=F.col('df2.id')) &
        (F.col('df1.entity')==F.col('df2.entity')),
        'inner'
    ).select(
        F.col("df1.entity").alias("entity1"),
        F.col("df2.entity").alias("entity2"),
        F.col('df1.id').alias('src'),
        F.col('df2.id').alias('dst'),
    )

    ##########################
    # Get Question Level Links

    question_entities_df=support_paragraphs.drop('paragraph_id','paragraph','title')

    split_question_entities_df=question_entities_df.select(
        F.col('row_id'),
        F.posexplode(F.col('q_entities')).alias('entity_pos','entity'),
    ).withColumn('entity_type',F.lit('q'))\
    .withColumn('paragraph_id',F.lit(-1))\
    .withColumn('sentence_id',F.lit(-1))\
    .withColumn('id',F.concat(F.col("row_id"),F.lit("-"), F.col("paragraph_id"),F.lit("-"),F.col('sentence_id'),F.lit('-'),F.col('entity_pos'))).dropDuplicates(['id'])
    
    question_level_links_df=split_sentence_entities_df.alias('df1').join(
        split_question_entities_df.alias('df2'),
        (F.col('df1.row_id')==F.col('df2.row_id')) &
        (F.col('df1.entity')==F.col('df2.entity')),
        'inner'
    ).select(
        F.col("df1.entity").alias("entity1"),
        F.col("df2.entity").alias("entity2"),
        F.col('df1.id').alias('src'),
        F.col('df2.id').alias('dst'),
    )

    # Create and save graph
    
    vertices=split_sentence_entities_df.union(split_title_entities_df).union(split_question_entities_df)
    edges=sentence_level_links_df.union(context_level_links_df).union(paragraph_level_links_df).union(title_level_links_df).union(question_level_links_df)

    vertices_path = GRAPH_SAVE_FOLDER +NER_GRAPH_NAME + graph_name + ' Vertices'
    edges_path = GRAPH_SAVE_FOLDER +NER_GRAPH_NAME + graph_name + ' Edges'

    vertices.write.parquet(vertices_path)
    edges.write.parquet(edges_path)

    print(f'{graph_name} with max {max_no_questions} questions')
    print(f'No nodes: {vertices.count()}')
    print(f'No edges: {edges.count()}')

In [23]:
import time

start_time=time.time()
# save_NER_graph(max_no_questions=None,graph_name=' f3')

execution_time=time.time()-start_time
print("Execution time ",execution_time)

# f3->Graph for dev with Title and Question entities
# f3 with max None questions
                                                                                
# No nodes: 171020                                                                       
# No edges: 638296

                                                                                

 f3 with max None questions


                                                                                

No nodes: 171020


                                                                                

No edges: 638296
Execution time  383.3619194030762


## Load NER Graph and save interesting subgraphs

In [2]:
import codecs

GRAPH_SAVE_FOLDER='/home/ubuntu/graphs/'
NER_GRAPH_NAME='NER Graph'

GRAPH_NODE_SIZE = 5

PAGE_RANK_NODE_SCALING_SIZE = 5
AS_LENGTH_EDGE = 500

OUTPUT_FILE_FOLDER='results/Louvain/NER Subgraphs dev'

def create_txt_for_subgraph(subgraph, component_id: int):
    no_vertices = subgraph.vertices.count()
    no_edges = subgraph.edges.count()
    subgraph_name = f"Component {component_id} - {no_vertices}.txt"

    directory = OUTPUT_FILE_FOLDER  + '/'
    # Create folders if they don't exist
    if not os.path.exists(directory):
        os.makedirs(directory)
    file_path = directory + subgraph_name

    start_index = 1
    answer_index = dict()

    with codecs.open(file_path, 'w', "utf-8") as file:
        file.write(f'p {no_vertices} {no_edges}\n')

        for row in subgraph.vertices.collect():
            node_id = row['id']

            if node_id not in answer_index:
                answer_index[node_id] = start_index
                start_index += 1

            entity,entity_type =  row['entity'],row['entity_type']
            entity_type=('Sentence' if entity_type=='s' else ('Title' if entity_type=='t' else 'Question'))
        
            node_index = answer_index[node_id]

            file.write(f'v {node_index} {entity} {entity_type}\n')

        for row in subgraph.edges.collect():
            node_id1 = row['src']
            node_id2 = row['dst']

            file.write(f'e {answer_index[node_id1]} {answer_index[node_id2]} 1\n')

def create_html_for_subgraph(subgraph, component_id: int):
    no_vertices = subgraph.vertices.count()

    nx_graph = nx.empty_graph()
    for row in subgraph.vertices.collect():
        node_id = row['id']

        entity,entity_type =  row['entity'],row['entity_type']
        entity_type=('Sentence' if entity_type=='s' else ('Title' if entity_type=='t' else 'Question'))

        page_rank = row['pagerank']
        group = row['label']

        label = f'Entity:{entity}\nType:{entity_type}\nPage Rank:{page_rank:.2f}\nLabel:{group}'
        shape = 'square'

        node_size = int(round(PAGE_RANK_NODE_SCALING_SIZE * page_rank))

        nx_graph.add_node(node_id, size=node_size, title=label, label=label, shape=shape, group=group)

    for edge in subgraph.edges.collect():
        node1 = edge['src']
        node2 = edge['dst']
        weight = edge['weight']

        nx_graph.add_edge(node1, node2, label=weight)

    nt = Network('900px', '1700px', select_menu=True, cdn_resources='remote')
    nt.from_nx(nx_graph)

    for edge in subgraph.edges.collect():
        node1 = edge['src']
        node2 = edge['dst']
        nx_graph.add_edge(node1, node2, length=AS_LENGTH_EDGE)

    nt = Network('900px', '1700px', select_menu=True, cdn_resources='remote')
    nt.from_nx(nx_graph)

    # nt.enable_physics(True)

    nt.set_options("""
                    {
                      "interaction": {
                        "hover": true,          
                        "navigationButtons": true,
                        "keyboard": true        
                      },
                      "manipulation": {
                        "enabled": true         
                      },
                      "physics": {
                        "enabled": true,
                        "repulsion": {
                          "nodeDistance": 450,
                          "springLength": 500
                        }
                      },
                      "layout": {
                        "hierarchical": {
                          "enabled": false
                        }
                      }
                    }
                    """
                   )

    html_name = f"Component {component_id} - {no_vertices} nodes.html"
    html_directory = OUTPUT_FILE_FOLDER + '/'
    # Create folders if they don't exist
    if not os.path.exists(html_directory):
        os.makedirs(html_directory)

    html_path = html_directory + html_name

    html = nt.generate_html()
    with open(html_path, mode='w',
              encoding='utf-8') as html_file:
        html_file.write(html)
    # display(HTML(html))


def show_ner_subgraphs(subgraphs_min_no_nodes: int = 4, subgraphs_max_no_nodes=20,
                       subgraphs_min_no_edges: int = 4,
                         page_rank_beta: float = 0.15, page_rank_max_no_iterations: int = 10,
                         lpa_max_no_iterations: int = 10,
                         verbose_console: bool = True,
                         graph=None):
    df_connected_components = graph.connectedComponents(algorithm="graphx")

    # df_connected_components.show()

    df_connected_components_grouped = df_connected_components.groupBy("component")

    df_connected_components_sizes = df_connected_components_grouped.agg(F.count("*").alias("size"))

    df_connected_components_max_min = df_connected_components_sizes.agg(F.max(F.col('size')).alias('max_no_nodes'),
                                                                        F.min(F.col('size')).alias(
                                                                            'min_no_nodes')).first()
    max_size_connected_component, min_size_connected_component = df_connected_components_max_min[0], \
                                                                 df_connected_components_max_min[1]

    if verbose_console:
        print(f"Max no nodes in a component: {max_size_connected_component}")
        print(f"Min no nodes in a component: {min_size_connected_component}")

    df_connected_components = df_connected_components.select(['id', 'component'])

    # df_max_components = df_connected_components_sizes.filter(F.col("size") == max_size_connected_component).select(
    #     "component").collect()

    for component in df_connected_components_sizes.toLocalIterator():
        component_id, component_size = component['component'], component['size']

        if component_size < subgraphs_min_no_nodes or component_size > subgraphs_max_no_nodes:
            continue

        # print(f"Nodes for the component {component_id}")

        subgraph_vertices = graph.vertices.join(df_connected_components,
                                                on='id').filter(
            F.col('component') == component_id).dropDuplicates(['id'])

        # TODO:Attempt more efficient way to get the nodes and edges...
        # subgraph_vertices.show(truncate=False)
        # subgraph = graph.filterVertices(F.col('id')).dropIsolatedVertices()

        vertices_ids_src = subgraph_vertices.select('id').withColumnRenamed('id', 'src')
        vertices_ids_dst = subgraph_vertices.select('id').withColumnRenamed('id', 'dst')

        subgraph_edges = graph.edges.join(vertices_ids_src, on='src', how='inner').join(vertices_ids_dst, on='dst',
                                                                                        how='inner')

        no_edges = subgraph_edges.count()
        if no_edges < subgraphs_min_no_edges:
            continue

        subgraph = GraphFrame(subgraph_vertices, subgraph_edges)
        labels_df = subgraph.labelPropagation(maxIter=lpa_max_no_iterations).select(['id', 'label'])

        subgraph_vertices = subgraph_vertices.join(labels_df, on='id')
        subgraph = GraphFrame(subgraph_vertices, subgraph_edges)

        subgraph = subgraph.pageRank(resetProbability=page_rank_beta, maxIter=page_rank_max_no_iterations)

        create_html_for_subgraph(subgraph, component_id)
        create_txt_for_subgraph(subgraph,component_id)

In [5]:
def main():
    start_time = time.time()
    
    # dataset_folder=DATASETS_FOLDER
    # dataset_name='hotpot_train_v1.1'
    
    full_graph_folder='/home/ubuntu/graphs/'

    # Full graph obtained from dev dataset
    full_graph_name='NER Graph f3'

    graph=load_graph(full_graph_folder,full_graph_name)
    
    show_ner_subgraphs(subgraphs_min_no_nodes=4, subgraphs_max_no_nodes=100,
                       subgraphs_min_no_edges=4,
                         page_rank_beta=0.15, page_rank_max_no_iterations=50,
                         lpa_max_no_iterations=35,
                         verbose_console=True,
                         graph=graph)
    
    print(f"Execution time: {time.time() - start_time} s")

# main()

In [3]:
def main2():
    start_time = time.time()
    
    # dataset_folder=DATASETS_FOLDER
    # dataset_name='hotpot_train_v1.1'
    
    full_graph_folder='/home/ubuntu/graphs/'

    # Full graph obtained from dev dataset
    full_graph_name='NER Graph f3'

    graph=load_graph(full_graph_folder,full_graph_name)
    
    show_ner_subgraphs(subgraphs_min_no_nodes=100, subgraphs_max_no_nodes=1000,
                       subgraphs_min_no_edges=4,
                         page_rank_beta=0.15, page_rank_max_no_iterations=1,
                         lpa_max_no_iterations=1,
                         verbose_console=True,
                         graph=graph)
    
    print(f"Execution time: {time.time() - start_time} s")

main2()

                                                                                

Max no nodes in a component: 14399
Min no nodes in a component: 1


ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/home/ubuntu/.local/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/home/ubuntu/.local/lib/python3.10/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

                                                                                