In [None]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, round

# Create (or reuse) the SparkSession for this notebook.
# The PostgreSQL JDBC driver jar is put on spark.jars so the JDBC reads further down can find it.
# master "local[*]" runs Spark locally with as many worker threads as the machine has logical cores.
spark = (
    SparkSession.builder
    .appName("Riot_Api2")
    .config("spark.jars", "postgresql-42.2.25.jre7.jar")
    .master("local[*]")
    .getOrCreate()
)

# Load from JSON

In [2]:
import pyspark.sql.functions as F
# Read the match data from the JSON backup file.
# multiLine lets Spark parse JSON records that span several lines.
matches_from_json = (
    spark.read
    .format("json")
    .options(inferSchema="true", multiLine="true")
    .load("backups/matchlist.json")
)

In [None]:
# The inferred schema holds a lot of unneeded, heavily nested information;
# print it to find the fields worth extracting in the cells below.
#print("loaded " + str(matches_from_json.count()) + " matches")
matches_from_json.printSchema()

In [None]:
# Pull out the wanted columns; metadata.participants is still a nested array at this point.
participants_nested = matches_from_json.select(col("metadata.participants"), col("metadata.matchId"))
# Spread the array into one column per slot (participants[0] .. participants[9]) so that
# every matchId stays on a single row. This indexes the array; it is not an explode().
# NOTE(review): assumes every match has exactly 10 participants - indexes past the end
# of a shorter array would come back as NULL; confirm against the data.
participants_unique = participants_nested.select(
    "matchId",
    *[participants_nested.participants[slot] for slot in range(10)],
)
participants_unique.printSchema()

In [None]:
# Build one row per (match, player) with the player's detail fields as top-level columns.
player_details = (
    matches_from_json
    # the per-player detail structs (an array) together with the match id
    .select(col("info.participants"), col("metadata.matchId"))
    # array flattening: one row per array element; explode() names its output column "col"
    .select(col("matchId"), F.explode(col("participants")))
    # struct flattening: promote every field of the struct to its own column
    .select(col("matchId"), col("col.*"))
    # finally drop the data we do not want
    .drop("challenges", "perks")
)
# The DataFrame is now ready to be used in our database.
#print("A total of  " + str(player_details.count()) + " player details is available")

player_details.printSchema()

In [None]:
# Match-level table: the match id plus every top-level field of `info`,
# minus the nested participants and teams columns.
matches = (
    matches_from_json
    .select(col("metadata.matchId"), col("info.*"))
    .drop("participants", "teams")
)
matches.printSchema()

In [None]:
# Number of rows loaded from the JSON backup.
matches_from_json.count()

# Load from DB

In [0]:
# Load the matches table from PostgreSQL over JDBC.
# Connection settings are read from environment variables so that credentials do not
# have to live in the notebook; the fallbacks are the previous local-development values.
#   RIOT_DB_URL       jdbc:postgresql://<host>:<port>/<database>
#   RIOT_DB_USER      database user
#   RIOT_DB_PASSWORD  database password
# NOTE: this rebinds `matches`, replacing the JSON-derived DataFrame of the same name.
matches = spark.read.format("jdbc").options(
    url=os.environ.get("RIOT_DB_URL", "jdbc:postgresql://localhost:5432/RIOT2"),
    dbtable="matches",
    user=os.environ.get("RIOT_DB_USER", "postgres"),
    password=os.environ.get("RIOT_DB_PASSWORD", "1234"),
    driver="org.postgresql.Driver",
).load()
matches.printSchema()

In [None]:
# Number of matches per gameMode value.
matches.groupBy("gameMode").count().show()

In [None]:
# Keep only ARAM and CLASSIC matches; this filters out tutorial games and every other mode.
matches.createOrReplaceTempView("matches")

# No trailing ';' inside the statement: older Spark SQL parsers reject it.
matches = spark.sql("SELECT * FROM matches WHERE gamemode IN ('ARAM', 'CLASSIC')")
matches.groupBy("gameMode").count().show()

In [None]:
# Peek at a single row to inspect the raw values.
matches.take(1)

In [48]:
# Make the timestamps readable. The stored values are treated as epoch milliseconds
# (NOTE(review): confirm against the API), so each one is divided by 1000, rounded to
# whole seconds and cast to a Spark timestamp.
# `round` here is pyspark.sql.functions.round (imported at the top), not the Python builtin.
for ts_column in ("gameStartTimestamp", "gameCreation", "gameEndTimestamp"):
    # Re-run safety: a column that is already a timestamp was converted by an earlier
    # execution of this cell; dividing it by 1000 a second time would corrupt it.
    if matches.select(ts_column).dtypes[0][1] == "timestamp":
        continue
    matches = matches.withColumn(ts_column, round(col(ts_column) / 1000).cast("timestamp"))


In [None]:
# Eyeball start and end times in chronological order (prints up to 600 rows).
(
    matches
    .select('gameStartTimestamp', 'gameEndTimestamp')
    .orderBy('gameStartTimestamp', ascending=True)
    .show(600)
)

In [64]:
from pyspark.sql.functions import concat, lit, floor, lpad
# Translate gameDuration (seconds) into a readable game time "m:ss".
# The seconds part is left-padded to two digits so that 125 s renders as "2:05" rather
# than "2:5". concat() keeps NULL durations NULL.
# NOTE(review): assumes gameDuration is an integer number of seconds - confirm for older
# matches, where the API may have reported a different unit.
matches2 = matches.withColumn(
    'gameTime',
    concat(
        floor(col('gameDuration') / 60),
        lit(':'),
        lpad((col('gameDuration') % 60).cast('string'), 2, '0'),
    ),
)

In [None]:
# Display the derived gameTime column.
matches2.select('gameTime').show()