In [1]:
import findspark
findspark.init()
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql.functions import col, size

In [2]:
# Start spark in local mode using 100gb of memory
# local mode only runs on a single node, but it will utilize all cores (We have 48!)
conf = SparkConf().setAppName("test").set('spark.driver.memory','8g')
#.setMaster("yarn") # this is used when we run on hadoop, ignore for now

sc = SparkContext(conf = conf)
sqlContext = SQLContext(sc)

print("Spark Version: ", sc.version)
print("defaultParallelism: ", sc.defaultParallelism)
print("Spark WebURLL ", sc.uiWebUrl) # you can view running jobs here, but I am only able to connect to it via VNC rn, maybe SSH tunneling will fix this? idk

Spark Version:  2.4.4
defaultParallelism:  48
Spark WebURLL  http://c251-117.wrangler.tacc.utexas.edu:4040


In [3]:
sc._conf.getAll() # See all the current Spark configuration settings

[('spark.app.id', 'local-1570762679085'),
 ('spark.app.name', 'test'),
 ('spark.driver.port', '32866'),
 ('spark.rdd.compress', 'True'),
 ('spark.driver.memory', '8g'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.master', 'local[*]'),
 ('spark.executor.id', 'driver'),
 ('spark.submit.deployMode', 'client'),
 ('spark.driver.host', 'c251-117.wrangler.tacc.utexas.edu'),
 ('spark.ui.showConsoleProgress', 'true')]

In [4]:
# load data from the json file (we can also do the csv when we have it)
clean_path = '/data/06271/cju256/ut_venmo_2018_clean.json'
clean_DF = sqlContext.read.json(clean_path)

In [5]:
# simply counts the number of rows in the data, takes about 20min
clean_DF.count()

342281006

In [6]:
# get the schema of the json
clean_DF.printSchema()

root
 |-- _id: struct (nullable = true)
 |    |-- $oid: string (nullable = true)
 |-- action_links: struct (nullable = true)
 |    |-- iphone_app_store_id: string (nullable = true)
 |    |-- iphone_app_store_link_text: string (nullable = true)
 |-- actor: struct (nullable = true)
 |    |-- about: string (nullable = true)
 |    |-- cancelled: boolean (nullable = true)
 |    |-- date_created: string (nullable = true)
 |    |-- email: string (nullable = true)
 |    |-- external_id: string (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- friends: string (nullable = true)
 |    |-- id: string (nullable = true)
 |    |-- is_business: boolean (nullable = true)
 |    |-- lastname: string (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- num_friends: long (nullable = true)
 |    |-- phone: string (nullable = true)
 |    |-- picture: string (nullable = true)
 |    |-- username: string (nullable = true)
 |-- audience: string (nullable = true)
 |-- comme

In [8]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

# this defines a function to run on every row of the data. I am doing this to look for transactions with length > 1 (split bills)
row_size = udf(lambda row: len(row.__fields__), IntegerType())

clean_DF.where(row_size(col("transactions").getItem(0)) > 1).count()

0

In [9]:
clean_DF.where(size(col("comments")) > 0).count()

9871759

In [10]:
clean_DF.where(col("likes").getItem("count") > 0).count()

42092037

In [11]:
clean_DF.where(size(col("mentions")) > 0).count()

533953

In [12]:
clean_DF.groupBy("type").count().show()

+-------+---------+
|   type|    count|
+-------+---------+
| charge| 56688775|
|payment|285592231|
+-------+---------+

