# Now that you’ve found the answers to the questions above, design two of your own questions to answer. 

For the first question, I'm going to be looking for mentions of users in comments, to build a network of connected users, which I will then graph

## Dependencies
Note: this requires GraphFrames to compute PageRank. To use it, add the flag `--packages graphframes:graphframes:0.7.0-spark2.4-s_2.11` when launching pyspark.

In [1]:
import re
import pandas as pd
from graphframes import *
from pyspark.sql import SQLContext
from pyspark.sql.functions import udf, col, desc, explode, lower
from pyspark.sql.types import StructType, StructField, FloatType, LongType, StringType, BooleanType, ArrayType

# SQL entry point built on the SparkContext `sc` supplied by the pyspark shell.
sqlContext = SQLContext(sc)

# Field names of the comment records (presumably the full schema -- TODO
# confirm). Kept for reference; only three of them are selected below.
columns = [
    "distinguished", "downs", "created_utc", "controversiality",
    "edited", "gilded", "author_flair_css_class", "id",
    "author", "retrieved_on", "score_hidden", "subreddit_id",
    "score", "name", "author_flair_text", "link_id",
    "archived", "ups", "parent_id", "subreddit",
    "body",
]

# Read the sampled comments and keep only the columns the mention network
# needs: who wrote the comment, its text, and where it was posted.
df = sqlContext.read.json("hdfs://orion11:15001/sampled_v3/*").select(
    "author", "body", "subreddit"
)


In [2]:
# A function to find the user mentions in a given comment
def find_mention(val):
    """Return the usernames mentioned as ``/u/<name>`` in a comment body.

    Matching is case-insensitive and every returned name is lowercased, so
    that ``/u/FooBar`` yields ``"foobar"`` instead of being skipped or cut
    short at the first uppercase character.

    Args:
        val: The comment text, or None for a missing body.

    Returns:
        A list of lowercased usernames in order of appearance (duplicates
        are kept), or None when ``val`` is None or contains no mention.
    """
    # A null body would make re.findall raise TypeError inside the UDF.
    if val is None:
        return None

    found = re.findall(r"/u/([a-z0-9_-]+)", val, flags=re.IGNORECASE)

    # An empty list collapses to None so callers can filter with isNotNull().
    return [name.lower() for name in found] or None

# Spark UDF wrapper around find_mention; it yields an array of strings
# (or null when find_mention returns None).
mention_udf = udf(f=find_mention, returnType=ArrayType(StringType()))

In [3]:
# One pipeline over the comments:
#   1. extract the mentions from each body,
#   2. drop comments where no mention was found (the UDF returned null),
#   3. lowercase the author so names compare consistently later on.
df = (
    df.withColumn("mention", mention_udf("body"))
    .filter(col("mention").isNotNull())
    .withColumn("author", lower(col("author")))
)
df.show()

+--------------------+--------------------+--------------------+--------------------+
|              author|                body|           subreddit|             mention|
+--------------------+--------------------+--------------------+--------------------+
|  alabamasteamroller|/u/alabamasteamro...|          Silverbugs|[alabamasteamroller]|
|       automoderator|Hello /u/azmolrj,...|              videos|           [azmolrj]|
|coloureduncommonsbot|Here's what that ...|RocketLeagueExchange|           [krustek]|
|              ruleiv|Hi /u/sd449.
Your...|     gentlemanboners|             [sd449]|
|coloureduncommonsbot|Here's what that ...|RocketLeagueExchange|           [krustek]|
|     imagesofnetwork|[Original post](h...|         ImagesOfUSA|   [nebraskateacher]|
|              hwsbot|* Username: seans...|        hardwareswap|      [10769, 10769]|
|          turbochewy|&gt;before it tri...|      NintendoSwitch|        [eugenian64]|
|        marco_santos|You should try as...|           

In [4]:
# Now start to create the edges and vertices for a GraphFrame.

# Edges: one row per (author, mentioned user) pair. explode() turns each
# element of the "mention" array into its own row.
e = df.select(col("author").alias("src"), explode("mention").alias("dst"))
# Note: should we remove self mentions?

# Vertices: every user that appears on either end of an edge, exactly once.
# Selecting only "author" gave one row per comment (duplicate ids) and left
# out users who are mentioned but never wrote a mentioning comment; GraphFrames
# expects a vertex DataFrame with a unique "id" covering all edge endpoints.
v = (
    e.select(col("src").alias("id"))
    .union(e.select(col("dst").alias("id")))
    .distinct()
)

# Finally create the graph
g = GraphFrame(v, e)

In [None]:
# Run PageRank on the mention graph for a fixed 7 iterations with a reset
# probability of 0.15. Note: This takes FOREVER!
results = g.pageRank(maxIter=7, resetProbability=0.15)

# Render the vertices of the result in the notebook.
display(results.vertices)

In [None]:
# Save the PageRank output; since it takes forever I don't want to have to run
# it multiple times. The vertices and edges of `results` (not of the input
# graph `g`) are written, so the computed ranks are what gets persisted.
# Note: write.parquet uses the default save mode, so it fails if the target
# path already exists.
results.vertices.write.parquet("hdfs://orion11:15001/gf/vertices")
results.edges.write.parquet("hdfs://orion11:15001/gf/edges")

# Load the vertices and edges back.
sameV = sqlContext.read.parquet("hdfs://orion11:15001/gf/vertices")
sameE = sqlContext.read.parquet("hdfs://orion11:15001/gf/edges")

In [None]:
# Print the first rows of the PageRank result's vertex DataFrame.
results.vertices.show()