In [1]:
import findspark

# Run findspark before the pyspark imports in the next cell.
# NOTE(review): findspark.init() is expected to locate the local Spark install
# and make `pyspark` importable — confirm SPARK_HOME is set on this machine.
findspark.init()

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
from pyspark.sql.types import StructType, StructField, IntegerType, LongType, StringType

# Popular Movies 

In [None]:
# Start (or reuse) the Spark session for the popular-movies section.
spark = SparkSession.builder.appName("Popular Movies").getOrCreate()

# Explicit schema for the ratings file: four nullable columns, declared up
# front so the reader does not have to infer types.
schema = (
    StructType()
    .add("userID", IntegerType(), True)
    .add("movieID", IntegerType(), True)
    .add("rating", IntegerType(), True)
    .add("timestamp", LongType(), True)
)

In [None]:
# Load the tab-separated u.data file, applying the explicit `schema`.
moviesDF = spark.read.csv(
    "./resources/datasets/ml-100k/u.data",
    schema=schema,
    sep="\t",
)

In [None]:
# Count rows per movieID and rank the IDs from the highest count to the lowest.
topMoviesID = (
    moviesDF
    .groupBy("movieID")
    .count()
    .sort(func.col("count").desc())
)

In [None]:
# Display the head of the ranking (show() prints the first 20 rows by default).
topMoviesID.show()

In [None]:
# Shut down this section's Spark session; the superhero section further down
# obtains its own session with getOrCreate().
spark.stop()

# Movie IDs to Movie Names

In [None]:
import codecs

In [None]:
def loadMovieNames(path="./resources/datasets/ml-100k/u.item"):
    """Build a movie-ID -> movie-name lookup from a pipe-delimited item file.

    Each non-blank line is split on '|'; the first field is parsed as the
    integer ID and the second field is stored as the name.

    Args:
        path: Location of the item file. Defaults to the ml-100k ``u.item``
            path used by this notebook.

    Returns:
        dict mapping each int movie ID to its name string.

    Raises:
        ValueError: If the first field of a non-blank line is not an integer.
        IndexError: If a non-blank line has an integer first field but no
            second field.
    """
    moviesNames = {}

    # Built-in open() splits lines only on \n, \r and \r\n. codecs.open()
    # iterates with str.splitlines(), which also breaks on characters such as
    # U+0085 that ISO-8859-1 can produce, cutting a record in half.
    # ISO-8859-1 decodes every byte, so no `errors=` handler is needed.
    with open(path, "r", encoding="ISO-8859-1") as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines (e.g. an empty line at end of file)
            fields = line.split('|')
            moviesNames[int(fields[0])] = fields[1]
    return moviesNames

# Superhero dataset

In [3]:
# Start (or reuse) a Spark session for the superhero section.
spark = SparkSession.builder.appName("MostPopularSuperhero").getOrCreate()

In [4]:
# Schema for the names file: a nullable integer id and a nullable string name.
# Note: this rebinds the `schema` name used earlier for the movie ratings.
schema = (
    StructType()
    .add("id", IntegerType(), True)
    .add("name", StringType(), True)
)

In [None]:
# Load the space-separated names file, applying the explicit `schema`.
names = spark.read.csv(
    "./resources/datasets/Marvel+Names.txt",
    schema=schema,
    sep=" ",
)

In [None]:
# Read the graph file as raw text: one row per line, held in a single string
# column named "value" (the column the next cell splits).
lines = spark.read.text("./resources/datasets/Marvel+Graph.txt")

In [None]:
# Tokenise each trimmed line on single spaces once and reuse the expression.
tokens = func.split(func.trim(func.col("value")), " ")

# First token is the id; every remaining token counts as one connection.
# An id may appear on several lines, so the per-line counts are summed per id.
connections = (
    lines
    .withColumn("id", tokens[0])
    .withColumn("connections", func.size(tokens) - 1)
    .groupBy("id")
    .agg(func.sum("connections").alias("connections"))
)

In [None]:
# Take the single row with the largest connection total
# (field 0 is the id, field 1 the summed count).
mostPopular = (
    connections
    .orderBy(func.desc("connections"))
    .first()
)

In [None]:
# Look up the name row whose id matches the winning id from `mostPopular`.
mostPopularName = (
    names
    .where(func.col("id") == mostPopular[0])
    .select("name")
    .first()
)

In [None]:
# Report the result: field 0 of the name row is the hero's name, field 1 of
# `mostPopular` is the summed connection count.
print(mostPopularName[0] + " is the most popular superhero with " + str(mostPopular[1]) + " co-appearances.")