In [596]:
import os 
import plotly as py
import plotly.graph_objs as go
import pandas as pd
import numpy as np
import math 
import re
import plotly.express as px


Reading the Data

In [597]:
# os.chdir("C:\\Users\\abhin\\OneDrive\\Documents\\Technical Task")
data = pd.read_csv("match_data.csv")
data.describe()


Unnamed: 0,Time (s),Pitch_x,Pitch_y,Speed (m/s),Unnamed: 5,Unnamed: 6
count,985274.0,985274.0,985274.0,985274.0,1.0,1.0
mean,3326.746144,-7.420383,-13.59104,1.114658,0.031203,0.176644
std,1946.57303,22.886295,25.264996,1.510692,,
min,0.0,-86.957029,-51.186749,0.0,0.031203,0.176644
25%,1591.4,-20.181407,-38.526146,0.044,0.031203,0.176644
50%,3421.05,-11.946156,-17.893181,0.646,0.031203,0.176644
75%,5053.4,10.406046,7.338383,1.6035,0.031203,0.176644
max,6597.0,122.893069,158.13262,41.76231,0.031203,0.176644


For Millisecond Accuracy

In [598]:
data['Time (s)'] = round(data['Time (s)'], 1)
data['Time (s)'].describe()

count    985274.000000
mean       3326.746144
std        1946.573030
min           0.000000
25%        1591.400000
50%        3421.050000
75%        5053.400000
max        6597.000000
Name: Time (s), dtype: float64

Latest Time of Tracking

In [599]:
max(data['Time (s)'])/60 

109.95

Filtering the data representing players from the "ball" attributes.

In [600]:

unique_vals = data['participation_id'].unique()
id_list = [id for id in unique_vals if not bool(re.search("ball", id))]

print("Number of Players represented is: " + str(len(id_list)))

i = 0
players_list = ["Player" + str(i + 1) for i, id in enumerate(id_list)]

player_id_dict = dict(zip(id_list, players_list))

data['player_id'] =  data['participation_id'].map(player_id_dict)


Number of Players represented is: 16


In [601]:

unique_vals = data['participation_id'].unique()

id_list = [id for id in unique_vals if not bool(re.search("ball", id))]

print("Number of Players represented is: " + str(len(id_list)))

i = 0
players_list = ["Player" + str(i + 1) for i, id in enumerate(id_list)]

Number of Players represented is: 16


Mapping Participant ID to Player tag

In [602]:


player_data = {}

for player in players_list:
    player_data[str(player)] = data[data['player_id'] == player]

ball_data = data[data['participation_id'].str.contains('ball')]





Extracting Earliest and Latest tracking info for each Player

In [605]:
earliest_tracking = pd.DataFrame(columns = ['Player', 'Earliest Time (s)', 'Latest Time(s)', 'Starting_Position_X', 'Starting_Position_Y'])
for player in players_list:
    earliest_time = player_data[player]['Time (s)'].min()
    latest_time = player_data[player]['Time (s)'].max()
    starting_x = player_data[player][player_data[player]['Time (s)'] == earliest_time]['Pitch_x'].values[0]
    starting_y = player_data[player][player_data[player]['Time (s)'] == earliest_time]['Pitch_y'].values[0]
    earliest_tracking = pd.concat([earliest_tracking, pd.DataFrame({'Player': [player], 'Earliest Time (s)': [earliest_time], 'Latest Time(s)': [latest_time], 'Starting_Position_X': [starting_x], 'Starting_Position_Y': [starting_y]})], ignore_index=True)

earliest_tracking = earliest_tracking.sort_values(by = 'Earliest Time (s)')


The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.



Function to filter player data beyond the pitch bounds

In [606]:
def filter_out_of_bounds(player):
    player = player[player['Pitch_x'] >= -52.5]
    player = player[player['Pitch_x'] <= 52.5]
    player = player[player['Pitch_y'] >= -34]
    player = player[player['Pitch_y'] <= 34]
    return player



In [607]:
for player in players_list:
    player_data[player] = filter_out_of_bounds(player_data[player])


Only retaining rows which represent proper runs with continuity

In [609]:
def Assign_Sprints(player):
    player_temp = player
    current_sprint = [1] * len(player)
    index = 1
    
    for i in range(1, len(player_temp)):
        if player_temp.iloc[i]['Time (s)'] - player_temp.iloc[i-1]['Time (s)'] >= 0.2:
            index += 1
        current_sprint[i] = index
        
        
    player_temp['Sprint'] = current_sprint
    player_temp['Sprint'] = player_temp['Sprint'].astype('object')
        
    return player_temp


In [610]:
for player in players_list:
    player_data[player] = Assign_Sprints(player_data[player])

Aggregate Players Based on Sprint Data

Function to Extract Sprints which have atleast N consecutive observations (0.1 * N seconds)

In [611]:
def extract_sprint_data(player, N):
    sprints_desired = player['Sprint'].value_counts()[lambda x: x >= N]
    return(player[player['Sprint'].isin(sprints_desired.index)])

   
        

In [612]:
obs = {}
for player in players_list:
    obs[player] = len(player_data[player])
obs = dict(sorted(obs.items(), key=lambda item: item[1], reverse = True))
print(obs)

{'Player5': 56852, 'Player11': 54297, 'Player7': 53712, 'Player8': 52935, 'Player10': 52782, 'Player13': 52577, 'Player16': 46238, 'Player12': 38359, 'Player6': 36319, 'Player9': 35075, 'Player14': 26380, 'Player15': 21048, 'Player2': 16632, 'Player1': 12634, 'Player4': 8330, 'Player3': 6692}


Total Time Running on Pitch: Only counting those actions which involve a continuous run of atleast 1 minute (600 ms)

In [614]:
time_leaderboard = {}
for player in players_list:
    
    time_leaderboard[player] = len(extract_sprint_data(player_data[player], 600))/600

time_leaderboard = dict(sorted(time_leaderboard.items(), key=lambda item: item[1], reverse = True))

time_l_plot = px.bar(x = list(time_leaderboard.keys()), y = list(time_leaderboard.values()), labels = {'x': 'Player', 'y': 'Time (s)'}, title = 'Time Leaderboard')

time_l_plot.show()


Total Distance Covered by Each Player: Only those runs where it indicates of atleast a minute continously are counted.

In [615]:
def Total_Distance_LeaderBoard(player):
    run = extract_sprint_data(player_data[player], 600)
    dist = 0
    for sprint in run['Sprint'].unique():
        sprint_data = run[run['Sprint'] == sprint]
        for i in range(len(sprint_data)-1):
            dist += math.sqrt((sprint_data.iloc[i+1]["Pitch_x"] - sprint_data.iloc[i]["Pitch_x"])**2 + (sprint_data.iloc[i+1]["Pitch_y"] - sprint_data.iloc[i]["Pitch_y"])**2)
    
    return dist
    


                    

In [624]:
distance_leaderboard = {}
for player in players_list:
    distance_leaderboard[player] = Total_Distance_LeaderBoard(player)


distance_leaderboard = dict(sorted(distance_leaderboard.items(), key=lambda item: item[1], reverse = True))

dist_l_plot = px.bar(x = list(distance_leaderboard.keys()), y = list(distance_leaderboard.values()), labels = {'x': 'Player', 'y': 'Distance (m)'}, title = 'Distance Leaderboard')

dist_l_plot.show()

Counting Number of Seconds a Player's instantaneous speed is above Zone 5:

In [617]:
def zone5_count(player):
    run = extract_sprint_data(player_data[player], 10)
    dist_zone5 = 0.0
    for sprint in run['Sprint'].unique():
        sprint_data = run[run['Sprint'] == sprint]
        for i in range(len(sprint_data)-1):
            dist_ms = math.sqrt((sprint_data.iloc[i+1]["Pitch_x"] - sprint_data.iloc[i]["Pitch_x"])**2 + (sprint_data.iloc[i+1]["Pitch_y"] - sprint_data.iloc[i]["Pitch_y"])**2)*10 
            if(dist_ms>= 19.8/3.6 and dist_ms <= 25.1/3.6):
              dist_zone5 += dist_ms
              
    return dist_zone5


In [625]:
distance_zone5_leaderboard = {}

for player in players_list:
    distance_zone5_leaderboard[player] = zone5_count(player)


distance_zone5_leaderboard = dict(sorted(distance_zone5_leaderboard.items(), key=lambda item: item[1], reverse = True))
dist_5_lplot = px.bar(x = list(distance_zone5_leaderboard.keys()), y = list(distance_zone5_leaderboard.values()), labels = {'x': 'Player', 'y': 'Distance (m)'}, title = 'Distance Zone 5 Leaderboard')

dist_5_lplot.show()


In [334]:
print(distance_zone5_leaderboard)

{'Player1': 0.0, 'Player2': 0.0, 'Player3': 0.0, 'Player4': 0.0, 'Player5': 5.682304001567916, 'Player6': 0.0, 'Player7': 12.027518530698101, 'Player8': 6.549492794120983, 'Player9': 129.20382962541373, 'Player10': 0.0, 'Player11': 0.0, 'Player12': 0.0, 'Player13': 0.0, 'Player14': 0.0, 'Player15': 43.37443368899486, 'Player16': 54.81168831775435}


Top Speed Calculation based on Distance covered each second (10 ms windows)

Euclidean calculator for a segment:

In [619]:
def Euclidean_Dist(seg):
    dist = 0
    for i in range(len(seg)-1):
        dist += math.sqrt((seg.iloc[i+1]["Pitch_x"] - seg.iloc[i]["Pitch_x"])**2 + (seg.iloc[i+1]["Pitch_y"] - seg.iloc[i]["Pitch_y"])**2)
        return dist

In [620]:
def Top_Speed_Leaderboard(player):
    run = extract_sprint_data(player_data[player], 600)
    
    speeds = []
    for sprint in run['Sprint'].unique():
        sprint_data = run[run['Sprint'] == sprint]
        for i in range(len(sprint_data)-10):
            speeds.append(Euclidean_Dist(sprint_data.iloc[i:i+10]))
            #     speeds.append(math.sqrt((sprint_data.iloc[j]["Pitch_x"] - sprint_data.iloc[j+1]["Pitch_x"])**2 + (sprint_data.iloc[j]["Pitch_y"] - sprint_data.iloc[j+1]["Pitch_y"])**2))
                
    
    return speeds

In [626]:
max_speed_leaderboard = {}

for player in players_list:
    
    max_speed_leaderboard[player] = max(Top_Speed_Leaderboard(player))




max_speed_leaderboard = dict(sorted(max_speed_leaderboard.items(), key=lambda item: item[1], reverse = True))
max_speed_plot = px.bar(x = list(max_speed_leaderboard.keys()), y = list(max_speed_leaderboard.values()), labels = {'x': 'Player', 'y': 'Speed (m/s)'}, title = 'Max Speed Leaderboard')

max_speed_plot.show()

In [349]:
run = extract_sprint_data(player_data['Player12'], 600)
    
speeds = []
for sprint in run['Sprint'].unique():
        sprint_data = run[run['Sprint'] == sprint]
        for i in range(len(sprint_data)-1):
            
              speeds.append((math.sqrt((sprint_data.iloc[i+1]["Pitch_x"] - sprint_data.iloc[i]["Pitch_x"])**2 + (sprint_data.iloc[i+1]["Pitch_y"] - sprint_data.iloc[i]["Pitch_y"])**2)*10))
dfspd = pd.DataFrame(speeds, columns=['Speed'])
dfspd.index.name = 'Index'

In [65]:
def aggregate_data(player, units):
    if(units == 'seconds'):
        player_agg = player[::10]
        max_speed = player['Speed (m/s)'].rolling(window=10, min_periods=1).max()
        player_agg['Max Speed'] = max_speed[::10]
        return player_agg
    

Insights with the Ball Data

In [393]:
PLayer_1_stats = player_data['Player1'].merge(ball_data, on = 'Time (s)', how = 'inner')

PLayer_1_stats.head()

Unnamed: 0,participation_id_x,Time (s),Pitch_x_x,Pitch_y_x,Speed (m/s)_x,player_id_x,participation_id_y,Pitch_x_y,Pitch_y_y,Speed (m/s)_y,player_id_y
0,455426a2-5604-4c3a-8b89-ed0519f8d77a,73.7,-17.497522,-40.416083,0.008,Player1,ball,-1.442517,6.965262,2.799068,
1,455426a2-5604-4c3a-8b89-ed0519f8d77a,73.8,-17.500247,-40.431203,0.023,Player1,ball,-1.715508,7.027099,2.799068,
2,455426a2-5604-4c3a-8b89-ed0519f8d77a,73.9,-17.500247,-40.431203,0.021,Player1,ball,-1.988498,7.088937,2.799068,
3,455426a2-5604-4c3a-8b89-ed0519f8d77a,74.0,-17.500247,-40.431203,0.028,Player1,ball,-2.261489,7.150775,2.799068,
4,455426a2-5604-4c3a-8b89-ed0519f8d77a,74.1,-17.500247,-40.431203,0.069,Player1,ball,-2.53448,7.212613,2.799068,


In [542]:
ball_agg = aggregate_data(ball_data, 'seconds')
ball_agg_within = filter_out_of_bounds(ball_agg)

ball_agg['Within_Pitch'] = ball_agg.index.isin(ball_agg_within.index)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [544]:
ball_agg.value_counts('Within_Pitch')

Within_Pitch
True     5837
False     687
Name: count, dtype: int64

In [628]:


fig_ball = px.scatter(ball_agg, x="Pitch_x", y="Pitch_y", animation_frame="Time (s)", range_x=[-60, 60], range_y=[-40, 40], color= "Within_Pitch", title="Ball Movement")

fig_ball.add_shape(type="line", x0=-52.5, y0=-34, x1=-52.5, y1=34, line=dict(color="black", width=1))  # Left boundary
fig_ball.add_shape(type="line", x0=52.5, y0=-34, x1=52.5, y1=34, line=dict(color="black", width=1))    # Right boundary
fig_ball.add_shape(type="line", x0=-52.5, y0=-34, x1=52.5, y1=-34, line=dict(color="black", width=1))  # Bottom boundary
fig_ball.add_shape(type="line", x0=-52.5, y0=34, x1=52.5, y1=34, line=dict(color="black", width=1))    # Top boundary
fig_ball.update_layout(showlegend=True)
fig_ball.add_shape(type="circle", x0=-0.5, y0=-0.5, x1=0.5, y1=0.5, line=dict(color="red"), fillcolor="rgba(255, 0, 0, 0.1)")
fig_ball.show()

Identifying ball stagnation and filtering out noise

In [549]:

time_rest = min([x for x in range(1, len(ball_agg)-60) if Euclidean_Dist(ball_agg.iloc[x:x+60]) == 0])


In [None]:
ball_agg.iloc[time_rest:time_rest+30]

In [551]:
ball_agg = ball_agg[ball_agg['Time (s)'] <= ball_agg.iloc[time_rest]['Time (s)']]
ball_data = ball_data[ball_data['Time (s)'] <= ball_agg.iloc[time_rest]['Time (s)']]


Analysing Set Pieces

In [560]:
set_pieces = ball_agg[(ball_agg['Speed (m/s)'] < 0.1) & (ball_agg['Within_Pitch'] == True)]

In [None]:
set_pieces

In [546]:
len(ball_agg[ball_agg['Within_Pitch'] == False])/ len(ball_agg)

0.10530349478847333

In [569]:
def Assign_SetPiece(ball):
    
    current_set_piece = [1] * len(ball)
    index = 1
    for i in range(1, len(ball)):
        if ball.iloc[i]['Time (s)'] - ball.iloc[i-1]['Time (s)'] >= 2:
            index += 1
        current_set_piece[i] = index
        
        
    ball['SetPiece_Id'] = current_set_piece
    ball['SetPiece_Id'] = ball['SetPiece_Id'].astype('object')
        
    return ball

In [571]:
set_pieces = Assign_SetPiece(set_pieces)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [629]:


fig_setpieces = px.scatter(set_pieces, x="Pitch_x", y="Pitch_y", animation_frame="Time (s)", 
                 range_x=[-60, 60], range_y=[-40, 40], color="SetPiece_Id", labels = {"SetPiece_Id": "Set Piece ID"},
                 title="Set Pieces by ID")

fig_setpieces.add_shape(type="line", x0=-52.5, y0=-34, x1=-52.5, y1=34, line=dict(color="black", width=1))  
fig_setpieces.add_shape(type="line", x0=52.5, y0=-34, x1=52.5, y1=34, line=dict(color="black", width=1))    
fig_setpieces.add_shape(type="line", x0=-52.5, y0=-34, x1=52.5, y1=-34, line=dict(color="black", width=1)) 
fig_setpieces.add_shape(type="line", x0=-52.5, y0=34, x1=52.5, y1=34, line=dict(color="black", width=1))    


fig_setpieces.add_shape(type="circle", x0=-0.5, y0=-0.5, x1=0.5, y1=0.5, line=dict(color="red"), fillcolor="rgba(255, 0, 0, 0.1)")
fig_setpieces.show()

In [684]:
import csv

def dict_csv(csv_file, data):
 with open(csv_file, 'w', newline='') as file:
    
    writer = csv.DictWriter(file, fieldnames = [['player', 'Value']])
    
    # Write the header
    writer.writeheader()
    
    # Write the data
    writer.writerows(data)







In [677]:
time_leaderboard

{'Player5': 93.72,
 'Player11': 88.08333333333333,
 'Player8': 87.18166666666667,
 'Player10': 83.58833333333334,
 'Player13': 81.25666666666666,
 'Player7': 79.78,
 'Player12': 62.955,
 'Player6': 56.09,
 'Player16': 53.12833333333333,
 'Player14': 42.958333333333336,
 'Player9': 33.53333333333333,
 'Player15': 29.32,
 'Player2': 24.855,
 'Player1': 20.97833333333333,
 'Player4': 10.083333333333334,
 'Player3': 9.851666666666667}

In [687]:
pd.DataFrame(time_leaderboard.items(), columns = ['Player', 'Time (s)']).to_csv('time_leaderboard.csv', index = False)

pd.DataFrame(distance_leaderboard.items(), columns = ['Player', 'Distance (m)']).to_csv('distance_leaderboard.csv', index = False)

pd.DataFrame(distance_zone5_leaderboard.items(), columns = ['Player', 'Distance (m)']).to_csv('distance_zone5_leaderboard.csv', index = False)  

pd.DataFrame(max_speed_leaderboard.items(), columns = ['Player', 'Speed (m/s)']).to_csv('max_speed_leaderboard.csv', index = False)

pd.DataFrame(distance_zone5_leaderboard.items(), columns = ['Player', 'Distance (m)']).to_csv('distance_zone5_leaderboard.csv', index = False)

In [690]:
player_data['Player1'].columns

Index(['participation_id', 'Time (s)', 'Pitch_x', 'Pitch_y', 'Speed (m/s)',
       'Unnamed: 5', 'Unnamed: 6', 'player_id', 'Sprint'],
      dtype='object')

In [695]:

def player_movement_plotter(player):
    
    player_agg = aggregate_data(player_data[player], 'seconds')
    
    fig_player = px.scatter(player_agg, x="Pitch_x", y="Pitch_y", animation_frame="Time (s)", 
                 range_x=[-60, 60], range_y=[-40, 40],
                 title="Player Motion Tracker")
    fig_player.add_shape(type="line", x0=-52.5, y0=-34, x1=-52.5, y1=34, line=dict(color="black", width=1))  # Left boundary
    fig_player.add_shape(type="line", x0=52.5, y0=-34, x1=52.5, y1=34, line=dict(color="black", width=1))    # Right boundary
    fig_player.add_shape(type="line", x0=-52.5, y0=-34, x1=52.5, y1=-34, line=dict(color="black", width=1))  # Bottom boundary
    fig_player.add_shape(type="line", x0=-52.5, y0=34, x1=52.5, y1=34, line=dict(color="black", width=1))    # Top boundary
    fig_player.show()
    


In [697]:
player_movement_plotter('Player4')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Visualising it we are able to pick the types of setpieces.

In [623]:
import streamlit as st


In [631]:
st.plotly_chart(time_l_plot)

st.plotly_chart(dist_l_plot)

st.plotly_chart(dist_5_lplot)

st.plotly_chart(max_speed_plot)

st.plotly_chart(fig_ball)

st.plotly_chart(fig_setpieces)


DeltaGenerator()