In [1]:
from pyspark.sql.types import StringType, IntegerType, StructType, StructField
from pyspark.sql import SparkSession
from pyspark.sql import functions as f

In [2]:
spark = SparkSession.builder.appName("DF-Popular-Superhero").config("spark.sql.crossJoin.enabled","true").getOrCreate()

# Load and Verify Data

Marvel-Graph --> Super hero ID followed by their super hero friends ID. One superhero can have friends spanning multiple lines.

Marvel-Names --> Super hero details

In [3]:
# Create schema when reading u.data
schema = StructType([StructField('id', IntegerType(), True),
                     StructField('name', StringType(), True)])

In [4]:
heroNames = spark.read.csv('resources/Marvel-Names.txt',sep = ' ', schema = schema)
heroFriends = spark.read.text('resources/Marvel-Graph.txt')

In [5]:
heroNames.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)



In [6]:
heroFriends.printSchema()

root
 |-- value: string (nullable = true)



In [7]:
heroNames.head(3)

[Row(id=1, name='24-HOUR MAN/EMMANUEL'),
 Row(id=2, name='3-D MAN/CHARLES CHAN'),
 Row(id=3, name='4-D MAN/MERCURIO')]

In [8]:
heroFriends.head(3)

[Row(value='5988 748 1722 3752 4655 5743 1872 3413 5527 6368 6085 4319 4728 1636 2397 3364 4001 1614 1819 1585 732 2660 3952 2507 3891 2070 2239 2602 612 1352 5447 4548 1596 5488 1605 5517 11 479 2554 2043 17 865 4292 6312 473 534 1479 6375 4456 '),
 Row(value='5989 4080 4264 4446 3779 2430 2297 6169 3530 3272 4282 6432 2548 4140 185 105 3878 2429 1334 4595 2767 3956 3877 4776 4946 3407 128 269 5775 5121 481 5516 4758 4053 1044 1602 3889 1535 6038 533 3986 '),
 Row(value='5982 217 595 1194 3308 2940 1815 794 1503 5197 859 5096 6039 2664 651 2244 528 284 1449 1097 1172 1092 108 3405 5204 387 4607 4545 3705 4930 1805 4712 4404 247 4754 4427 1845 536 5795 5978 533 3984 6056 ')]

In [9]:
heroNames.columns

['id', 'name']

In [10]:
heroFriends.columns

['value']

In [11]:
heroNames.describe().show()

+-------+------------------+--------+
|summary|                id|    name|
+-------+------------------+--------+
|  count|             19428|   19427|
|   mean|            9714.5|Infinity|
| stddev|5608.5248506180305|     NaN|
|    min|                 1| 2001 10|
|    max|             19428| �GAMORA|
+-------+------------------+--------+



In [12]:
heroFriends.describe().show()

+-------+--------------------+
|summary|               value|
+-------+--------------------+
|  count|                6589|
|   mean|              2998.0|
| stddev|   1734.968203358974|
|    min|1 1999 6471 6463 ...|
|    max|999 1628 2960 377...|
+-------+--------------------+



# Split Marvel-Graph data to find number of connections for each super hero

In [13]:
connections = heroFriends.withColumn("id",f.split(f.col("value")," ")[0])\
                         .withColumn("connections",f.size(f.split(f.col("value")," ")) -1) \
                         .groupBy("id").agg(f.sum("connections").alias("connections"))
connections.show(3)

+----+-----------+
|  id|connections|
+----+-----------+
| 691|          7|
|1159|         12|
|3959|        143|
+----+-----------+
only showing top 3 rows



## Find most popular superhero

In [14]:
mostPopular = connections.sort(f.col("connections").desc()).first()
print(mostPopular)

Row(id='859', connections=1937)


In [15]:
mostPopularName = heroNames.filter(f.col("id")==mostPopular[0]).select("name").first()
print(mostPopularName)

Row(name='CAPTAIN AMERICA')


In [16]:
print(mostPopularName[0] +" is the most popular superhero with "+ str(mostPopular[1]) + " co-appearances")

CAPTAIN AMERICA is the most popular superhero with 1937 co-appearances


## Find most obscure superhero

### With window function

In [17]:
from pyspark.sql.window import Window
w=Window().orderBy("connections")
mostObscure = connections.withColumn("leastCount", f.first("connections").over(w)).filter("connections=leastCount").drop("leastCount")
mostObscure.show()

+----+-----------+
|  id|connections|
+----+-----------+
| 467|          1|
| 577|          1|
|3490|          1|
|3489|          1|
|2139|          1|
|1089|          1|
|1841|          1|
|4517|          1|
|5028|          1|
| 835|          1|
|1408|          1|
|4784|          1|
|4945|          1|
|4602|          1|
|6411|          1|
|3014|          1|
|3298|          1|
|2911|          1|
|2117|          1|
+----+-----------+



In [18]:
print("Below is the list of most popular obscure superhero with minimum co-appearances")

Below is the list of most popular obscure superhero with minimum co-appearances


In [19]:
mostObscure.join(heroNames,mostObscure.id==heroNames.id).select(mostObscure.id,heroNames.name,mostObscure.connections).show()

+----+--------------------+-----------+
|  id|                name|connections|
+----+--------------------+-----------+
| 467|        BERSERKER II|          1|
| 577|              BLARE/|          1|
|3490|MARVEL BOY II/MARTIN|          1|
|3489|MARVEL BOY/MARTIN BU|          1|
|2139|      GIURESCU, RADU|          1|
|1089|       CLUMSY FOULUP|          1|
|1841|              FENRIS|          1|
|4517|              RANDAK|          1|
|5028|           SHARKSKIN|          1|
| 835|     CALLAHAN, DANNY|          1|
|1408|         DEATHCHARGE|          1|
|4784|                RUNE|          1|
|4945|         SEA LEOPARD|          1|
|4602|         RED WOLF II|          1|
|6411|              ZANTOR|          1|
|3014|JOHNSON, LYNDON BAIN|          1|
|3298|          LUNATIK II|          1|
|2911|                KULL|          1|
|2117|GERVASE, LADY ALYSSA|          1|
+----+--------------------+-----------+



### Without window function

In [20]:
minConnectCount = connections.agg(f.min("connections")).first()[0]

In [21]:
minConnections = connections.filter(f.col("connections")==minConnectCount)

In [22]:
minConnectionsWithNames = minConnections.join(heroNames,"id")

In [23]:
print("The following characters have only "+ str(minConnectCount) + " co-appearances")

The following characters have only 1 co-appearances


In [24]:
minConnectionsWithNames.select('name').show()

+--------------------+
|                name|
+--------------------+
|        BERSERKER II|
|              BLARE/|
|MARVEL BOY II/MARTIN|
|MARVEL BOY/MARTIN BU|
|      GIURESCU, RADU|
|       CLUMSY FOULUP|
|              FENRIS|
|              RANDAK|
|           SHARKSKIN|
|     CALLAHAN, DANNY|
|         DEATHCHARGE|
|                RUNE|
|         SEA LEOPARD|
|         RED WOLF II|
|              ZANTOR|
|JOHNSON, LYNDON BAIN|
|          LUNATIK II|
|                KULL|
|GERVASE, LADY ALYSSA|
+--------------------+



In [25]:
spark.stop()