In [1]:
import findspark
findspark.init()
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql.functions import col, size
import pyspark.sql.functions as fn

In [2]:
# Start Spark in local mode with a 64 GB driver heap.
# Local mode runs on a single node, but it uses every core on it (48 on this machine).
conf = SparkConf().setAppName("test").set('spark.driver.memory', '64g')
#.setMaster("yarn") # this is used when we run on hadoop, ignore for now

# getOrCreate keeps this cell re-runnable: constructing a second SparkContext in
# the same kernel raises an error, whereas getOrCreate returns the running one.
# Note that `conf` is only applied when a new context is actually created.
sc = SparkContext.getOrCreate(conf=conf)
sqlContext = SQLContext(sc)

print("Spark Version: ", sc.version)
print("defaultParallelism: ", sc.defaultParallelism)
# Running jobs can be inspected at this URL. So far it has only been reachable
# through VNC; SSH tunnelling may also work (untested).
print("Spark WebURL: ", sc.uiWebUrl)

Spark Version:  2.4.4
defaultParallelism:  48
Spark WebURLL  http://c251-132.wrangler.tacc.utexas.edu:4041


In [3]:
# Show every (key, value) pair of the current Spark configuration.
# getConf() is the public accessor and returns a copy of the context's SparkConf,
# so the private `_conf` attribute does not need to be touched.
sc.getConf().getAll()

[('spark.app.id', 'local-1572129035065'),
 ('spark.driver.port', '33885'),
 ('spark.driver.host', 'c251-132.wrangler.tacc.utexas.edu'),
 ('spark.driver.memory', '64g'),
 ('spark.app.name', 'test'),
 ('spark.rdd.compress', 'True'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.master', 'local[*]'),
 ('spark.executor.id', 'driver'),
 ('spark.submit.deployMode', 'client'),
 ('spark.local.dir', '/data/06271/cju256/temp'),
 ('spark.ui.showConsoleProgress', 'true')]

In [4]:
import json
from pyspark.sql.types import StructType

# Read the flattened transaction records from JSON (a CSV reader can replace
# this once that export exists).

# The schema file holds the DataFrame schema serialised as JSON; read.text yields
# one row per line and only the first row is used.
schema_path = "/data/06271/cju256/flat.schema"
schema_json = sqlContext.read.text(schema_path).first()[0]
schema = StructType.fromJson(json.loads(schema_json))

# Alternative input, currently not used: '/data/06271/cju256/ut_venmo_2018_flat.json'
flat_path = '/data/06271/cju256/one_mil_flat.json'
flat_df = sqlContext.read.json(flat_path, schema=schema)

In [5]:
# Read the smaller sample file. No schema is passed here, so Spark infers the
# column types from the data itself.
ten_k_path = '/data/06271/cju256/ten_k_flat.json'
tenk_df = sqlContext.read.format('json').load(ten_k_path)

In [5]:
# Print the column names and types of flat_df; these come from the schema that
# was supplied when the frame was read.
flat_df.printSchema()

root
 |-- _id: string (nullable = true)
 |-- actor_about: string (nullable = true)
 |-- actor_cancelled: boolean (nullable = true)
 |-- actor_date_created: string (nullable = true)
 |-- actor_email: string (nullable = true)
 |-- actor_external_id: string (nullable = true)
 |-- actor_firstname: string (nullable = true)
 |-- actor_friends: string (nullable = true)
 |-- actor_id: long (nullable = true)
 |-- actor_is_business: boolean (nullable = true)
 |-- actor_lastname: string (nullable = true)
 |-- actor_name: string (nullable = true)
 |-- actor_num_friends: long (nullable = true)
 |-- actor_phone: string (nullable = true)
 |-- actor_picture: string (nullable = true)
 |-- actor_username: string (nullable = true)
 |-- comments_count: long (nullable = true)
 |-- created_time: string (nullable = true)
 |-- likes_count: long (nullable = true)
 |-- mentions_count: long (nullable = true)
 |-- message: string (nullable = true)
 |-- payment_id: long (nullable = true)
 |-- permalink: string (nu

In [6]:
# User attributes carried by the actor side of each record. Every
# `actor_<field>` column is selected under the bare `<field>` name.
actor_fields = [
    "id",
    "external_id",
    "username",
    "about",
    "cancelled",
    "date_created",
    "email",
    "firstname",
    "lastname",
    "name",
    "friends",
    "is_business",
    "num_friends",
    "phone",
    "picture",
]
actor_cols = ["actor_{0} as {0}".format(field) for field in actor_fields]

actors = flat_df.selectExpr(*actor_cols)

In [7]:
# User attributes carried by the target side of each record. Every
# `target_<field>` column is selected under the bare `<field>` name, giving the
# same output names and order as the actor selection it is later unioned with.
target_fields = [
    "id",
    "external_id",
    "username",
    "about",
    "cancelled",
    "date_created",
    "email",
    "firstname",
    "lastname",
    "name",
    "friends",
    "is_business",
    "num_friends",
    "phone",
    "picture",
]
target_cols = ["target_{0} as {0}".format(field) for field in target_fields]

targets = flat_df.selectExpr(*target_cols)

In [8]:
# Stack the actor and target views into one user table and keep one row per id.
# unionByName pairs columns by name, so the result stays correct even if the two
# hand-written column lists ever drift out of order (plain union pairs columns
# purely by position).
# NOTE(review): dropDuplicates keeps an unspecified row for each id, and any
# rows with a null id would survive as a single null-id row - confirm whether
# those should be filtered out before export.
all_users = actors.unionByName(targets).dropDuplicates(["id"])

In [56]:
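#all_users.count()  # disabled. NOTE(review): the count shown below is larger than two users per record could give for a roughly one-million-record input, so it presumably comes from an earlier run on a bigger dataset - confirm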

23133264

In [9]:
# Export the user table as JSON. coalesce(1) reduces the data to one partition
# so that a single part file is written.
# mode('overwrite') replaces the output of a previous run; with the default save
# mode this cell fails as soon as the target path already exists, which breaks
# re-running the notebook.
all_users.coalesce(1).write.format('json').mode('overwrite').save('/data/06271/cju256/nodes_subset')

In [10]:
# Each record becomes one edge: the actor and target ids are renamed to
# src and dst, and the remaining columns are carried over unchanged.
edge_endpoint_exprs = ["actor_id as src", "target_id as dst"]
edge_attribute_cols = [
    "type",
    "message",
    "likes_count",
    "comments_count",
    "mentions_count",
    "created_time",
    "unix_time",
    "updated_time",
    "permalink",
    "payment_id",
    "_id",
]
edge_cols = edge_endpoint_exprs + edge_attribute_cols

edges = flat_df.selectExpr(*edge_cols)

In [11]:
# Export the edge table as JSON. coalesce(1) reduces the data to one partition
# so that a single part file is written.
# mode('overwrite') replaces the output of a previous run; with the default save
# mode this cell fails as soon as the target path already exists, which breaks
# re-running the notebook.
edges.coalesce(1).write.format('json').mode('overwrite').save('/data/06271/cju256/edges_subset')