In [1]:
from bs4 import BeautifulSoup
import json
import pandas as pd

In [2]:
def get_and_convert_pbp(pageProps):
    
    pbp = pageProps['playByPlay']['actions']

    df = pd.read_json(json.dumps(pbp),orient='records')
    df['game_id'] = pageProps['playByPlay']['gameId']

    return df

In [3]:
def save_pbp(df,filepath):
    # filename is the gameid
    
    filename = df['game_id'].iloc[0]+'.csv'
    # filepath = ''

    df.to_csv(filepath+filename,index=False)
        
    return 

In [4]:
import requests
class Play_by_play:
    # https://www.nba.com/game/<matchup, home team>-<game_id>/box-score

    def __init__(self):
        self.root = "https://www.nba.com/game/" #SACvsPOR-xxxxxxxxxx/box-score"
        self.tail = "/box-score" 

    def build_url(self,matchup, game_id):

        return self.root + matchup + '-' + game_id + self.tail


    def get_source(self, matchup, game_id):
        url = self.build_url(matchup,game_id)
        response = requests.get(url)

        return response.content
    
    def get_pages(self, df):
        for row in df.iterrows():
            yield self.get_source(row[1]['match_up'], row[1]['game_id'])

In [None]:
from sqlalchemy import create_engine

def make_engine(user, pswd, db):

    return create_engine("mariadb+mariadbconnector://"\
                        +user+":"\
                        +pswd+"@127.0.0.1:3306/"+db)

In [5]:
import pandas as pd
def extract():
    command = "select distinct game_id, replace(replace(match_up,'.',''),' ','') "
    command += "as match_up from box_scores where match_up regexp 'vs' and game_id not in "
    command += "(select distinct game_id from play_by_plays) order by game_id limit 10;"

    engine = make_engine(environ.get('USER'),environ.get('PSWD'),'nba')

    df = pd.read_sql(command,engine)

    engine.dispose()

    pbp = Play_by_play()

    for page in pbp.get_pages(df):
        save_pbp(extract_application(page))

In [None]:
from os.path import expanduser, join as osjoin

In [7]:
from pyspark.sql.types import ArrayType,StructField,StringType,StructType,IntegerType,FloatType

In [8]:
def extract_cols(pbp):
    
    pbp = pbp.withColumn('action_id',pbp.actions.getItem('actionId'))

    db = pbp.withColumn('player_id', pbp.actions.getItem('personId')).\
        withColumn('team_id', pbp.actions.getItem('teamId')).\
        withColumn('period', pbp.actions.getItem('period')).\
        withColumn('clock', pbp.actions.getItem('clock')).\
        withColumn('x_loc', pbp.actions.getItem('xLegacy')).\
        withColumn('y_loc', pbp.actions.getItem('yLegacy')).\
        withColumn('shot_distance', pbp.actions.getItem('shotDistance')).\
        withColumn('shot_result', pbp.actions.getItem('shotResult')).\
        withColumn('field_goal', pbp.actions.getItem('isFieldGoal')).\
        withColumn('home_score', pbp.actions.getItem('scoreHome').cast('integer')).\
        withColumn('away_score', pbp.actions.getItem('scoreAway').cast('integer')).\
        withColumn('total_points', pbp.actions.getItem('pointsTotal')).\
        withColumn('location', pbp.actions.getItem('location')).\
        withColumn('description', pbp.actions.getItem('description')).\
        withColumn('action_type', pbp.actions.getItem('actionType')).\
        withColumn('sub_type', pbp.actions.getItem('subType'))

    return db

In [9]:
def extract_schema():
    schema = StructType([StructField('actions',ArrayType(
        StructType([StructField('actionId',IntegerType()),StructField('actionNumber',IntegerType()),
                    StructField('actionType',StringType()),StructField('clock',StringType()),
                    StructField('description',StringType()),StructField('isFieldGoal',IntegerType()),
                    StructField('location',StringType()),StructField('period',IntegerType()),
                    StructField('personId',IntegerType()),StructField('playerName',StringType()),
                    StructField('playerNameI',StringType()),StructField('pointsTotal',IntegerType()),
                    StructField('scoreAway',StringType()),StructField('scoreHome',StringType()),
                    StructField('shotDistance',IntegerType()),StructField('shotResult',StringType()),
                    StructField('subType',StringType()),StructField('teamId',IntegerType()),
                    StructField('teamTricode',StringType()),StructField('videoAvailable',IntegerType()),
                    StructField('xLegacy',IntegerType()),StructField('yLegacy',IntegerType())]
                )
            )
        ),
    StructField('gameId',StringType(),False),
    StructField('source',StringType() ),
    StructField('videoAvailable',IntegerType())])

    return schema

In [10]:
def transform():
    from pyspark.sql import SparkSession
    from pyspark.sql.functions import explode, regexp_extract

    app_name = 'pbp'
    directory = expanduser(osjoin('~/spark_apps',app_name))
    input_dir = osjoin(directory,'games')
    checkpoint_dir = osjoin(directory,'checkpoints')
    output_dir = osjoin(directory,'to_db')

    spark = SparkSession.builder.appName(app_name).getOrCreate()

    df = spark.readStream.option('cleanSource','delete').schema(extract_schema()).\
        json(input_dir)
    # print(df.printSchema())
    df = df.drop('source','videoAvailable')

    table_order = ['game_id', 'action_id', 'player_id', 'team_id', 'period', 'minute',
        'seconds', 'x_loc', 'y_loc', 'shot_distance', 'shot_result',
        'field_goal', 'home_score', 'away_score', 'total_points', 'location',
        'description', 'action_type', 'sub_type']

    pbp = df.withColumn('actions',explode(df.actions)).withColumnRenamed('gameId','game_id')

    db = extract_cols(pbp)

    db = db.withColumn('minute',regexp_extract(db.clock,r'PT(\d+)M(\d+\.\d+)S',1).cast('integer'))
    db = db.withColumn('seconds',regexp_extract(db.clock,r'PT(\d+)M(\d+\.\d+)S',2).cast('float'))


    db = db[table_order]
    # print(db.printSchema())

    db.writeStream.format("parquet").option("path", output_dir).\
        option("checkpointLocation", checkpoint_dir).start()
    
    return

In [13]:
transform()

22/07/03 14:17:59 WARN Utils: Your hostname, rpi3 resolves to a loopback address: 127.0.1.1; using 172.25.14.38 instead (on interface wlan0)
22/07/03 14:17:59 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/07/03 14:18:04 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/07/03 14:18:59 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


In [11]:
def load_schema():
    schema = StructType([StructField('game_id',StringType()),StructField('action_id',IntegerType()),
                    StructField('player_id',IntegerType()),StructField('team_id',IntegerType()),
                    StructField('period',IntegerType()),StructField('minute',IntegerType()),
                    StructField('seconds',FloatType()),StructField('x_loc',IntegerType()),
                    StructField('y_loc',IntegerType()),StructField('shot_distance',IntegerType()),
                    StructField('shot_result',StringType()),StructField('field_goal',IntegerType()),
                    StructField('home_score',IntegerType()),StructField('away_score',IntegerType()),
                    StructField('total_points',IntegerType()),StructField('location',StringType()),
                    StructField('description',StringType()),StructField('action_type',StringType()),
                    StructField('sub_type',StringType())]
                )

    return schema

In [12]:
from os import environ

In [18]:
def write_to_db(df,epoch):
    df.persist()

    special_events = df.select('*').filter(df.team_id == 0)
    pbp = df.select('*').filter(df.team_id != 0)

    special_events.write.format('jdbc').option('url','jdbc:mysql://localhost:3306/nba').\
    option('user',environ.get('USER')).option('password',environ.get('PSWD')).\
    option("dbtable", "play_by_play_stoppages").mode('append').save()


    pbp.write.format('jdbc').option('url','jdbc:mysql://localhost:3306/nba').\
    option('user',environ.get('USER')).option('password',environ.get('PSWD')).\
    option("dbtable", "play_by_plays").mode('append').save()

    
    df.unpersist()    

In [19]:
def load():
    from pyspark.sql import SparkSession

    SparkSession.builder.config('spark.driver.extraClassPath',environ.get('SPARK_JDBC')).getOrCreate()
    spark = SparkSession.builder.appName('pbp').getOrCreate()

    df = spark.readStream.schema(load_schema()).parquet('to_db',mergeSchema=True)

    df.writeStream.outputMode('append').foreachBatch(write_to_db).start()   

    

In [20]:
load()

22/07/03 14:59:26 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-3d93207a-ed2e-44bb-8cbb-8c5469ed1ecb. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
22/07/03 14:59:26 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


                                                                                

## PBP schema:
game_id := char(10)

clock := --> minute := smallint, seconds := float 

period := smallint unsigned

teamId := int FK references teams

personId := int FK references players

xLegacy := int

yLegacy := int

shotDistance := int

shotResult := text

isFieldGoal := smallint

scoreHome := int 

scoreAway := int 

pointsTotal := int 

location := char

description := text

actionType := text

subType := text

actionId := int unsigned NOT NULL (indexes events within game's play by play)

#### create two tabels: one with player data the other with "special events" (where teamId == 0). Can join the two by union/df stacking in application

In [157]:
def extract_application(html):
        
    soup = BeautifulSoup(html)
    app_script = soup.find('script',{'type':"application/json"})

    return json.loads(app_script.decode_contents())

In [None]:
def save_pbp(data):
    # filename is the gameid
    
    data = data['props']['pageProps']['playByPlay']

    filename = 'games/'+data['gameId']+'.json'

    with open(filename,'w') as f:
        json.dump(data,f)
        
    return 