# <u>Deloitte Case Study</u>

In [1]:
# Loading of the necessary packages and libraries
import findspark
findspark.init()

from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession

import pandas as pd

# Start (or reuse) the Spark session through the builder, the supported entry
# point. Re-running this cell returns the existing session instead of wrapping
# the context in a second SparkSession object.
spark = SparkSession.builder.getOrCreate()
# Keep the SparkContext handle available under its usual name.
sc = spark.sparkContext

In [2]:
# Reading the data with Spark and limiting it to the first 10M rows.
# An explicit schema replaces inferSchema: schema inference makes Spark read
# through the input before the row limit is applied, which is wasted work
# when only a sample is needed. Column names/types match the inferred ones.
ROW_LIMIT = 10000000  # set to None to process the whole data set

twitterDF = spark.read \
                 .schema("_c0 INT, _c1 INT") \
                 .csv("twitter_rv.net", sep=r'\t')
if ROW_LIMIT is not None:
    twitterDF = twitterDF.limit(ROW_LIMIT)

In [3]:
# Displaying the first rows to check that the data was loaded correctly
twitterDF.show(5)

+---+---+
|_c0|_c1|
+---+---+
| 12| 13|
| 12| 14|
| 12| 15|
| 12| 16|
| 12| 17|
+---+---+
only showing top 5 rows



In [4]:
# Changing the name of the columns.
# withColumnRenamed is a no-op when the source column is absent, so this cell
# can be re-run safely (selectExpr on "_c0"/"_c1" fails on a second run,
# because twitterDF has already been overwritten with the renamed frame).
twitterDF = twitterDF \
    .withColumnRenamed("_c0", "UserID") \
    .withColumnRenamed("_c1", "FollowerID")

In [5]:
# Checking that the columns were renamed correctly in the schema
twitterDF.printSchema()

root
 |-- UserID: integer (nullable = true)
 |-- FollowerID: integer (nullable = true)



In [6]:
#!pip install graphframes

In [7]:
# Importing necessary libraries
from functools import reduce
from pyspark.sql.functions import when, count, col, countDistinct, desc, first, lit, array

In [8]:
# Building the vertex set: every ID that appears on either side of a row
users = twitterDF.select("UserID").dropDuplicates()
followers = twitterDF.select("FollowerID").dropDuplicates()
# union matches columns by position, so the result keeps the name "UserID";
# the final de-duplication removes IDs present in both inputs
vertices = users.union(followers).dropDuplicates()

In [9]:
# Building the edges: one [UserID, FollowerID] array per input row,
# kept in a single column named "Edge"
edges = twitterDF.select(array("UserID", "FollowerID").alias("Edge"))
# Mark the edges for caching; left as the last expression so the cell
# still displays the DataFrame description
edges.cache()

DataFrame[Edge: array<int>]

In [10]:
# Exporting the data for further analysis
#twitterDF.toPandas().to_csv('twitter_10M.csv',index=False)

In [11]:
# Finding the accounts that follow the most other accounts.
# Going by the column names assigned above, each row pairs a UserID with one
# of its followers (FollowerID). Counting rows per FollowerID therefore gives
# the number of accounts that ID follows -- not how many followers it has --
# so the count column is labelled "Following".
twitterDF.cache()
(twitterDF
    .select(col('FollowerID').alias('ID'))
    .groupBy('ID')
    .agg(count('ID').alias('Following'))
    .orderBy(desc('Following'))
    .show(3))

+--------+---------+
|      ID|Followers|
+--------+---------+
|10316422|     2583|
|14206126|      940|
|12750862|      914|
+--------+---------+
only showing top 3 rows



In [20]:
# Finding the most-followed accounts.
# Going by the column names assigned above, each row pairs a UserID with one
# of its followers, so counting rows per UserID gives that account's number
# of followers.
# twitterDF is already marked for caching by the preceding cell, so the
# repeated cache() call is not needed here.
(twitterDF
    .select(col('UserID').alias('ID'))
    .groupBy('ID')
    .agg(count('ID').alias('Followers'))
    .orderBy(desc('Followers'))
    .show(3))

+-----+---------+
|   ID|Followers|
+-----+---------+
|   20|  1213787|
|   13|  1031830|
|10350|  1003728|
+-----+---------+
only showing top 3 rows



In [12]:
# Listing the column names of twitterDF
twitterDF.columns

['UserID', 'FollowerID']

In [13]:
# Displaying the first five rows of the edges DataFrame
edges.show(5)

+--------+
|    Edge|
+--------+
|[12, 13]|
|[12, 14]|
|[12, 15]|
|[12, 16]|
|[12, 17]|
+--------+
only showing top 5 rows



In [14]:
#connections = twitterDF.rdd.map(lambda row : (row[0],row[1]))
#connections.lookup(12)

In [15]:
#from collections import defaultdict
#G = defaultdict(list)
#for s in edges:
#    G[s[0]].append(s[1])

In [16]:
# Creation of a function to iterate over the values
#def DFS(G,v,seen=None,path=None):
#    if seen is None: seen = []
#    if path is None: path = [v]

#    seen.append(v)
#    paths = []
#    for t in G.lookup(v):
#        if t not in seen:
#            t_path = path + [t]
#            paths.append(tuple(t_path))
#            paths.extend(DFS(G, t, seen[:], t_path))
#    return paths

In [17]:
#all_paths = DFS(connections, 20)
#print(all_paths)

In [18]:
# Longest cycles in the graph
#def dfs(graph, start, end):
#    fringe = [(start, [])]
#    while fringe:
#        state, path = fringe.pop()
#        if path and state == end:
#            yield path
#            continue
#        for next_state in graph[state]:
#            if next_state in path:
#                continue
#            fringe.append((next_state, path+[next_state]))
            
#cycles = [[node]+path  for node in graph for path in dfs(graph, node, node)]