# Checking GraphFrames functionality

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Spark settings for this notebook's session: a fixed executor footprint
# (2 executors, 4G memory and 4 cores each) plus the GraphFrames package.
_session_conf = {
    'spark.executor.instances': '2',
    'spark.executor.memory': '4G',
    'spark.executor.cores': '4',
    'spark.dynamicAllocation.enabled': 'false',  # is this really required??
    "spark.jars.packages": "graphframes:graphframes:0.8.4-spark3.5-s_2.12",
}

# Point the builder at the standalone master, apply each setting, then create
# the session (or reuse one that already exists in this kernel).
_builder = SparkSession.builder.appName("Simple Graphs").master('spark://master:7077')
for _conf_key, _conf_value in _session_conf.items():
    _builder = _builder.config(_conf_key, _conf_value)
spark = _builder.getOrCreate()

# NOTE(review): this import is deliberately kept after the session is created;
# presumably the graphframes Python module only becomes importable once the
# package requested through spark.jars.packages has been resolved -- confirm.
from graphframes import GraphFrame


:: loading settings :: url = jar:file:/usr/local/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/ubuntu/.ivy2/cache
The jars for the packages stored in: /home/ubuntu/.ivy2/jars
graphframes#graphframes added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-062bc678-34e4-4886-a3ef-b3bfd625525c;1.0
	confs: [default]
	found graphframes#graphframes;0.8.4-spark3.5-s_2.12 in spark-packages
	found org.slf4j#slf4j-api;1.7.16 in central
:: resolution report :: resolve 120ms :: artifacts dl 4ms
	:: modules in use:
	graphframes#graphframes;0.8.4-spark3.5-s_2.12 from spark-packages in [default]
	org.slf4j#slf4j-api;1.7.16 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   2   |   0   |   0   |   0   ||   2   |   0   |
	---------------------------------

In [3]:
def main():
    """Build a small three-vertex graph, print it, then print two filtered views.

    Relies on the notebook-level ``spark`` session and the ``GraphFrame`` class.
    Returns nothing; every result is printed with ``DataFrame.show()``.
    """
    person_rows = [("1", "Alice"), ("2", "Bob"), ("3", "Charlie")]
    people = spark.createDataFrame(person_rows, schema=["id", "name"])

    relation_rows = [("1", "2", "friend"), ("2", "3", "follow")]
    relations = spark.createDataFrame(
        relation_rows, schema=["src", "dst", "relationship"]
    )

    social_graph = GraphFrame(people, relations)

    # Print the complete vertex table first, then the complete edge table.
    for table in (social_graph.vertices, social_graph.edges):
        table.show()

    # Filtering vertices and edges
    # NOTE(review): ``id`` is created from string values but is compared with
    # the numeric literal 1, so this relies on Spark's implicit type coercion
    # -- confirm that a numeric comparison is what is intended.
    id_above_one = social_graph.vertices.filter("id > 1")
    friend_links = social_graph.edges.filter("relationship == 'friend'")

    for table in (id_above_one, friend_links):
        table.show()

main()

                                                                                

+---+-------+
| id|   name|
+---+-------+
|  1|  Alice|
|  2|    Bob|
|  3|Charlie|
+---+-------+



                                                                                

+---+---+------------+
|src|dst|relationship|
+---+---+------------+
|  1|  2|      friend|
|  2|  3|      follow|
+---+---+------------+

+---+-------+
| id|   name|
+---+-------+
|  2|    Bob|
|  3|Charlie|
+---+-------+

+---+---+------------+
|src|dst|relationship|
+---+---+------------+
|  1|  2|      friend|
+---+---+------------+



# Uploading a dataset to HDFS and reading it back

In [4]:
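# Shell commands used to copy the datasets into HDFS. They are kept here as
# comments for reference only; this notebook does not execute them.
#
# Earlier attempts (presumably superseded by the final commands below):
# hdfs dfs -put /home/ubuntu/jupyter/datasets/hotpot_dev_distractor_v1.json /home/ubuntu/dev_dataset/
# hdfs dfs -put /home/ubuntu/jupyter/datasets/hotpot_dev_distractor_v1.json /home/ubuntu/hotpot_dev_distractor_v1
# hdfs dfs -rm -R /home/ubuntu/hotpot_dev_distractor_v1

# Final commands:
# hdfs dfs -put /home/ubuntu/jupyter/datasets/hotpot_dev_distractor_v1.json /home/ubuntu/data/hotpot_dev_distractor_v1.json
# hdfs dfs -put /home/ubuntu/jupyter/datasets/hotpot_train_v1.1.json /home/ubuntu/data/hotpot_train_v1.1.json
# NOTE(review): `hdfs dfs -cp` copies between HDFS paths; if the goal was to
# fetch the data back to the local filesystem, `hdfs dfs -get` may have been
# intended -- confirm.
# hdfs dfs -cp /home/ubuntu/data/hotpot_train_v1.1/ /home/ubuntu/jupyter/datasets/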

In [3]:
from pyspark.sql import SparkSession

# Same session settings as the rest of the notebook: fixed executor sizing
# and the GraphFrames package coordinates.
_session_conf = {
    'spark.executor.instances': '2',
    'spark.executor.memory': '4G',
    'spark.executor.cores': '4',
    'spark.dynamicAllocation.enabled': 'false',  # is this really required??
    "spark.jars.packages": "graphframes:graphframes:0.8.4-spark3.5-s_2.12",
}

# Configure the builder for the standalone master and create the session
# (or reuse the one that already exists in this kernel).
_builder = SparkSession.builder.appName("Simple Graphs").master('spark://master:7077')
for _conf_key, _conf_value in _session_conf.items():
    _builder = _builder.config(_conf_key, _conf_value)
spark = _builder.getOrCreate()

In [5]:
# File name of the dev-set JSON file stored under /home/ubuntu/data/.
dataset_name = 'hotpot_dev_distractor_v1.json'
dataset_path = f'/home/ubuntu/data/{dataset_name}'

# The multiLine reader option is enabled so that a JSON record is allowed to
# span several lines instead of being limited to one record per line.
dataset = spark.read.option("multiLine", True).json(dataset_path)

# Preview only the first five rows.
dataset.limit(5).show()

[Stage 3:>                                                          (0 + 1) / 1]

+--------------------+--------------------+--------------------+-----+--------------------+--------------------+----------+
|                 _id|              answer|             context|level|            question|    supporting_facts|      type|
+--------------------+--------------------+--------------------+-----+--------------------+--------------------+----------+
|5a8b57f25542995d1...|                 yes|[[Ed Wood (film),...| hard|Were Scott Derric...|[[Scott Derrickso...|comparison|
|5a8c7595554299585...|   Chief of Protocol|[[Meet Corliss Ar...| hard|What government p...|[[Kiss and Tell (...|    bridge|
|5a85ea09554299477...|           Animorphs|[[Andre Norton Aw...| hard|What science fant...|[[The Hork-Bajir ...|    bridge|
|5adbf0a255429947f...|                  no|[[Esma Sultan (da...| hard|Are the Laleli Mo...|[[Laleli Mosque, ...|comparison|
|5a8e3ea95542995a2...|Greenwich Village...|[[Just Another Ro...| hard|The director of t...|[[Big Stone Gap (...|    bridge|
+-------

                                                                                

# Saving the dataset in a better format (Parquet)

In [1]:
from pyspark.sql import SparkSession

# Create (or reuse) the notebook's Spark session: standalone master, fixed
# executor sizing, and the GraphFrames package.
spark = (SparkSession.builder
    .appName("Simple Graphs")
    .config('spark.executor.instances','2')
    .config('spark.executor.memory','4G')
    .config('spark.executor.cores','4')
    .config('spark.dynamicAllocation.enabled','false') # is this really required??
    .master('spark://master:7077')
    .config("spark.jars.packages", "graphframes:graphframes:0.8.4-spark3.5-s_2.12")
    .getOrCreate()
)

dataset_name='hotpot_dev_distractor_v1'
dataset_extension='.json'
# Extension of the converted copy.
parquet_extension='.parquet'

# Input: the multi-line JSON file.
dataset_path='/home/ubuntu/data/'+dataset_name+dataset_extension
# Output: same base name with a '.parquet' suffix. The cell that reads the
# converted data back loads '/home/ubuntu/data/hotpot_dev_distractor_v1.parquet',
# so the copy must be written to exactly that path (previously it was written
# without the suffix and the read-back path did not match).
new_dataset_path='/home/ubuntu/data/'+dataset_name+parquet_extension

# multiLine=True allows a JSON record to span several lines.
dataset=spark.read.json(dataset_path, multiLine=True)

# mode="overwrite" replaces any existing output at the target path, which
# keeps this cell safe to re-run.
dataset.write.parquet(new_dataset_path, mode="overwrite")

:: loading settings :: url = jar:file:/usr/local/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/ubuntu/.ivy2/cache
The jars for the packages stored in: /home/ubuntu/.ivy2/jars
graphframes#graphframes added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-655b53b4-6e3b-489d-ba06-8124154629d6;1.0
	confs: [default]
	found graphframes#graphframes;0.8.4-spark3.5-s_2.12 in spark-packages
	found org.slf4j#slf4j-api;1.7.16 in central
:: resolution report :: resolve 111ms :: artifacts dl 4ms
	:: modules in use:
	graphframes#graphframes;0.8.4-spark3.5-s_2.12 from spark-packages in [default]
	org.slf4j#slf4j-api;1.7.16 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   2   |   0   |   0   |   0   ||   2   |   0   |
	---------------------------------

In [8]:
from pyspark.sql import SparkSession

# Same session settings as the rest of the notebook: fixed executor sizing
# and the GraphFrames package coordinates.
_session_conf = {
    'spark.executor.instances': '2',
    'spark.executor.memory': '4G',
    'spark.executor.cores': '4',
    'spark.dynamicAllocation.enabled': 'false',  # is this really required??
    "spark.jars.packages": "graphframes:graphframes:0.8.4-spark3.5-s_2.12",
}

# Configure the builder for the standalone master and create the session
# (or reuse the one that already exists in this kernel).
_builder = SparkSession.builder.appName("Simple Graphs").master('spark://master:7077')
for _conf_key, _conf_value in _session_conf.items():
    _builder = _builder.config(_conf_key, _conf_value)
spark = _builder.getOrCreate()

# Location of the Parquet copy of the dev set.
dataset_name = 'hotpot_dev_distractor_v1.parquet'
dataset_path = f'/home/ubuntu/data/{dataset_name}'

# Load the Parquet data and preview only the first five rows.
dataset = spark.read.parquet(dataset_path)
dataset.limit(5).show()



+--------------------+--------------------+--------------------+-----+--------------------+--------------------+----------+
|                 _id|              answer|             context|level|            question|    supporting_facts|      type|
+--------------------+--------------------+--------------------+-----+--------------------+--------------------+----------+
|5a8b57f25542995d1...|                 yes|[[Ed Wood (film),...| hard|Were Scott Derric...|[[Scott Derrickso...|comparison|
|5a8c7595554299585...|   Chief of Protocol|[[Meet Corliss Ar...| hard|What government p...|[[Kiss and Tell (...|    bridge|
|5a85ea09554299477...|           Animorphs|[[Andre Norton Aw...| hard|What science fant...|[[The Hork-Bajir ...|    bridge|
|5adbf0a255429947f...|                  no|[[Esma Sultan (da...| hard|Are the Laleli Mo...|[[Laleli Mosque, ...|comparison|
|5a8e3ea95542995a2...|Greenwich Village...|[[Just Another Ro...| hard|The director of t...|[[Big Stone Gap (...|    bridge|
+-------

                                                                                