In [1]:
from pyspark.sql import SparkSession
import  pyspark.sql.functions as F

In [2]:
from os.path import expanduser, join
from jupyter_dash import JupyterDash
from dash import html, dcc
from dash.dependencies import Input, Output
import plotly.express as px

In [3]:
# Start (or reuse) a Spark session that runs in local mode with the "local[*]" master.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/05/15 22:13:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Location of the play-by-play output under the current user's home directory.
path = join(expanduser('~'), 'spark_apps', 'pbp_outputs')

In [5]:
# Load the parquet data found at `path` into a Spark DataFrame.
df = spark.read.format('parquet').load(path)

23/05/15 22:14:03 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


In [6]:
# Number of rows in the DataFrame loaded from parquet.
df.count()

                                                                                

24606

In [7]:
# One row per play-by-play action: explode the playByPlay.actions array
# (the exploded column is named `col`) and promote its struct fields to
# top-level columns.
pbp = (
    df
    .select(F.explode(F.col('playByPlay.actions')))
    .select('col.*')
)

In [8]:
# Echo the DataFrame to show the column names and types of the exploded actions.
pbp

DataFrame[actionId: bigint, actionNumber: bigint, actionType: string, clock: string, description: string, isFieldGoal: bigint, location: string, period: bigint, personId: bigint, playerName: string, playerNameI: string, pointsTotal: bigint, scoreAway: string, scoreHome: string, shotDistance: bigint, shotResult: string, subType: string, teamId: bigint, teamTricode: string, videoAvailable: bigint, xLegacy: bigint, yLegacy: bigint]

In [9]:
# Number of actions per shotResult value, most frequent first.
(
    pbp
    .groupBy('shotResult')
    .count()
    .orderBy(F.desc('count'))
    .show()
)



+----------+-------+
|shotResult|  count|
+----------+-------+
|          |7589541|
|    Missed|2168863|
|      Made|1788746|
+----------+-------+



                                                                                

In [10]:
# Keep only the actions that carry a shot result (non-empty shotResult).
shots = pbp.where(F.col('shotResult') != '')

In [11]:
# Actions with a shot result whose playerNameI is 'J. Harden',
# then collected to the driver as a pandas DataFrame.
harden_shots = pbp.where(
    (F.col('shotResult') != '') & (F.col('playerNameI') == 'J. Harden')
)
pdf = harden_shots.toPandas()


                                                                                

In [12]:
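# TODO (unfinished idea, kept for reference): shots = shots.withColumn('shot')
#   NOTE: withColumn needs a column expression as its second argument.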

In [13]:
# Shot counts per player, result and sub-type, collected into pandas for plotting.
to_plot = (
    shots
    .groupBy('personId', 'playerNameI', 'shotResult', 'subType')
    .count()
    .toPandas()
)

                                                                                

In [14]:
# Normalise the sub-type labels to lower case.
to_plot['subType'] = to_plot.subType.str.lower()

In [15]:
# Broad shot categories looked for (as substrings) in the sub-type labels.
types = [
    'jump shot',
    'dunk',
    'layup',
    'hook',
    'tip',
    'bank',
    'fadeaway',
]

In [16]:
# Bucket each detailed subType into a broad shot_type by substring match.
# Types are applied in list order, so when a subType mentions several of them
# the one that comes latest in `types` wins. Rows matching none keep ''.
to_plot['shot_type'] = ''
for shot_type in types:
    # regex=False: the labels are plain text, not patterns.
    # na=False: a missing subType counts as "no match" instead of putting NaN in the mask.
    matches = to_plot['subType'].str.contains(shot_type, regex=False, na=False)
    # Assign through .loc on the frame itself. An in-place mask on the
    # to_plot['shot_type'] selection is chained assignment, which does not
    # reliably write back to to_plot (and never does under pandas Copy-on-Write).
    to_plot.loc[matches, 'shot_type'] = shot_type

In [17]:
# Qualitative Set1 colour sequence from plotly express; indexed below to colour shot outcomes.
colours = px.colors.qualitative.Set1

In [18]:
# Fixed colour per shotResult value.
# NOTE(review): '(?)' appears to be the label plotly express gives treemap
# parents whose children have mixed colour values - confirm against the px docs.
colour_map = {
    'Missed': colours[0],
    'Made': colours[1],
    '(?)': 'lightgrey',
}

In [19]:
# Echo pbp again as a reminder of the available columns.
pbp

DataFrame[actionId: bigint, actionNumber: bigint, actionType: string, clock: string, description: string, isFieldGoal: bigint, location: string, period: bigint, personId: bigint, playerName: string, playerNameI: string, pointsTotal: bigint, scoreAway: string, scoreHome: string, shotDistance: bigint, shotResult: string, subType: string, teamId: bigint, teamTricode: string, videoAvailable: bigint, xLegacy: bigint, yLegacy: bigint]

In [20]:
# Distinct (personId, playerNameI) pairs with a non-empty name,
# sorted by name and collected into a pandas DataFrame.
names = (
    pbp
    .select('personId', 'playerNameI')
    .where(F.col('playerNameI') != '')
    .distinct()
    .orderBy('playerNameI')
    .toPandas()
)

                                                                                

In [22]:
def options(names):
    """Build dropdown option dicts from a frame of players.

    Args:
        names: pandas DataFrame with `playerNameI` and `personId` columns.

    Returns:
        A list with one ``{'label': playerNameI, 'value': personId}`` dict
        per row, in row order.
    """
    return [
        {'label': row.playerNameI, 'value': row.personId}
        for row in names.itertuples(index=False)
    ]

In [23]:
def drpdwn(i):
    """Create the single-select player dropdown.

    Options come from the module-level `names` frame, and the initial
    selection is the personId of the row labelled 0 in that frame.

    Args:
        i: component id to give the dropdown.

    Returns:
        The configured ``dcc.Dropdown`` component.
    """
    return dcc.Dropdown(
        id=i,
        options=options(names),
        value=names.loc[0, 'personId'],
        multi=False,
    )

In [24]:
# Rows of the aggregated shot counts whose playerNameI is 'J. Harden'.
plt = to_plot.loc[to_plot['playerNameI'] == 'J. Harden']

In [25]:
app = JupyterDash(__name__)

# Page layout: the player dropdown above a treemap of shot outcomes.
# The initial figure is built with the same hierarchy, size and colours that
# update_graph uses, so the view keeps its shape when the callback first fires.
app.layout = html.Div([
    drpdwn('names'),
    dcc.Graph(
        id='graph-content',
        figure=px.treemap(
            plt,
            path=['shot_type', 'subType', 'shotResult'],
            color='shotResult',
            values='count',
            width=1500,
            height=800,
            color_discrete_map=colour_map,
        ),
    ),
])


@app.callback(
    Output('graph-content', 'figure'),
    Input('names', 'value')
)
def update_graph(value):
    """Redraw the treemap for the player chosen in the dropdown.

    Args:
        value: personId currently selected in the 'names' dropdown, or
            None when the selection has been cleared.

    Returns:
        A treemap figure of that player's shot counts nested by shot_type,
        subType and shotResult, or ``dash.no_update`` when nothing is
        selected so the previous figure stays on screen.
    """
    if value is None:
        # A cleared dropdown sends None, which matches no personId; keep the
        # current figure instead of trying to plot an empty frame.
        from dash import no_update
        return no_update

    player_shots = to_plot[to_plot['personId'] == value]
    return px.treemap(
        player_shots,
        path=['shot_type', 'subType', 'shotResult'],
        color='shotResult',
        values='count',
        width=1500,
        height=800,
        color_discrete_map=colour_map,
    )


# Start the Dash server in debug mode when this code runs as the main module.
if __name__ == "__main__":
    app.run_server(debug=True)

Dash is running on http://127.0.0.1:8050/

Dash app running on http://127.0.0.1:8050/


In [26]:
# Which personIds share the abbreviated name 'T. Young' among the shots?
(
    shots
    .where(F.col('playerNameI') == 'T. Young')
    .select('personId')
    .distinct()
    .show()
)



+--------+
|personId|
+--------+
| 1629027|
|  201152|
|    1937|
+--------+



                                                                                

In [29]:
# List the column names of pbp in order.
pbp.columns

['actionId',
 'actionNumber',
 'actionType',
 'clock',
 'description',
 'isFieldGoal',
 'location',
 'period',
 'personId',
 'playerName',
 'playerNameI',
 'pointsTotal',
 'scoreAway',
 'scoreHome',
 'shotDistance',
 'shotResult',
 'subType',
 'teamId',
 'teamTricode',
 'videoAvailable',
 'xLegacy',
 'yLegacy']

In [36]:
# Preview the columns from position 6 up to (not including) the last four,
# i.e. `location` through `teamId`.
pbp.select(pbp.columns[6:-4]).show()

+--------+------+--------+----------+-----------+-----------+---------+---------+------------+----------+------------------+----------+
|location|period|personId|playerName|playerNameI|pointsTotal|scoreAway|scoreHome|shotDistance|shotResult|           subType|    teamId|
+--------+------+--------+----------+-----------+-----------+---------+---------+------------+----------+------------------+----------+
|        |     1|       0|          |           |          0|        0|        0|           0|          |             start|         0|
|       h|     1|     717|   Sabonis| A. Sabonis|          0|        0|        0|           0|          |                  |1610612757|
|       h|     1|     739|   Wallace| R. Wallace|          0|        0|        0|           0|          |         Lost Ball|1610612757|
|       v|     1|     361|  Robinson|C. Robinson|          0|        0|        0|           0|          |                  |1610612756|
|       v|     1|     686|   McDyess| A. McDyess