In [None]:
import pandas as pd
import numpy as np
# `plt` must be the pyplot interface: the top-level `matplotlib` package does not
# expose plot()/bar()/show(), so `import matplotlib as plt` gave a misleading alias.
import matplotlib.pyplot as plt
from statsbombpy import sb

# Let wide/long tables render without truncation while exploring the data.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 150)

# Statsbombpy package

In [None]:
# Load competition data
competitions = sb.competitions()
print("Available competitions:")
print(competitions)


In [None]:
competitions.sort_values(by=['competition_name', 'season_name'], ascending=False).head(10)

In [None]:
# Get information about a specific competition
competition_id = 43  # 43 represents the FIFA World Cup
target_season_name = '2018'  # the edition we want, as spelled in the season_name column

# competition_id is a regular column, not the index label, so `.loc[43]` would return
# whichever row happens to carry label 43. Filter on the columns instead; the season
# filter is needed because one competition_id can appear once per season.
competition_rows = competitions[
    (competitions['competition_id'] == competition_id)
    & (competitions['season_name'] == target_season_name)
]
competition_info = competition_rows.iloc[0]  # IndexError here means no such competition/season
print(f"\nInformation about competition {competition_info['competition_name']}:")
print(f"Country: {competition_info['country_name']}")
print(f"Gender: {competition_info['competition_gender']}")
print(f"Season: {competition_info['season_name']}")


In [None]:
def transformSeason(seasonRow):
    '''Return the part of a season label that precedes its first "/".

    seasonRow is expected to be a single value taken from competitions['season_name'].
    A label without any "/" is returned unchanged.
    '''
    # partition() splits on the first separator only; with no separator the
    # head is the whole string, which matches the "no slash" case.
    head, _separator, _tail = seasonRow.partition('/')
    return head

testS = competitions['season_name'].iloc[1]
testS
transformSeason(testS)

In [None]:
# I want to find the number of unique seasons of data available for England -- how would I do that?
# Only the last expression of a cell is echoed, so the .shape on the next line is
# evaluated but not displayed; the nunique() result is what the cell shows.
competitions[competitions.country_name == 'England'].shape
competitions[competitions.country_name == 'England'].season_id.nunique()


# I want to find all seasons between 1989 and 2000 -- how would I do that?
# Adds a 'startOfSeason' column to `competitions` in place. transformSeason returns
# text, so the new column holds strings and has to be compared against strings.
competitions['startOfSeason'] = competitions['season_name'].map(transformSeason)
# competitions['startOfSeason'] = competitions['season_name'].map(lambda x: transformSeason(x)) #### THIS IS SAME AS ABOVE


In [None]:
# competitions[(competitions['startOfSeason'] > '1989') & (competitions['startOfSeason'] < '2000')]

In [None]:
competitions[(competitions['startOfSeason'] > '2016')]

In [None]:

# Load match data for the selected competition and season
competition_id = 11
season_id = 4
matches = sb.matches(competition_id, season_id)

# Resolve the competition name before formatting: a replacement field that spans
# several lines inside a single-quoted f-string is a SyntaxError before Python 3.12.
competition_name = competitions.loc[competitions.competition_id == competition_id,
                                    'competition_name'].iloc[0]
print(f"\nNumber of matches in the {competition_name} {season_id}: {len(matches)}")


In [None]:
matches.head()

In [None]:
matches['abs_goal_diff'] = (matches['home_score'] - matches['away_score']).abs()

In [None]:
matches.sort_values(by='abs_goal_diff', ascending=False)

In [None]:
# # how many total goals at home did each team score?
# matches.groupby(['home_team'])['home_score'].sum()
# # matches.groupby(['away_team'])['away_score'].sum()

In [None]:
# Get information about a specific match
match_id = 15973
match_info = matches.loc[matches['match_id'] == match_id].iloc[0]
print(f"\nInformation about match {match_id}:")
print(f"Home team: {match_info['home_team']}")
print(f"Away team: {match_info['away_team']}")
print(f"Match date: {match_info['match_date']}")
print(f"Stadium: {match_info['stadium']}")

In [None]:
matches.head()

In [None]:
# 21/4/2024

# Today, we want to analyze a larger cross-section of team data... first we need to build it up

# Loop thru the set of matches, find all matches where Barcelona played
# Grab the events for each of those matches, process and filter for just the shots

In [None]:
listOfBarcaMatches = matches[(matches.home_team == 'Barcelona') | (matches.away_team == 'Barcelona')].match_id.values
listOfShotDFs = []
for matchIDValue in listOfBarcaMatches:
    events = sb.events(matchIDValue)
    data = getCleanedProcessedShotData(events)
    listOfShotDFs.append(data)
    

In [None]:
def getCleanedProcessedShotData(eventData):
    '''This function takes in event data and outputs just the cleaned type==shot data'''
    event_type = 'Shot'
    eventTypeData = eventData[eventData['type'] == event_type]
    return eventTypeData.dropna(axis=1)

In [None]:
fullShotDF = pd.concat(listOfShotDFs, axis=0)

In [None]:
barcaShotDF = fullShotDF[fullShotDF.possession_team == 'Barcelona']

In [None]:
barcaShotDF.head()

In [None]:
barcaShotDF.shot_outcome.value_counts()

In [None]:
singlePlayerShots = barcaShotDF[barcaShotDF.player == 'Gerard Piqué Bernabéu']

In [None]:
def calcGoalsScoredRatio(shotOutcomeSeries):
    '''Share of shots that ended as 'Goal', rounded to 3 decimals.

    shotOutcomeSeries: a pd.Series of shot outcomes, e.g. eventsData.shot_outcome.
    Missing outcomes are not counted as attempts (Series.count() skips them).
    '''
    goalCount = shotOutcomeSeries.eq('Goal').sum()
    attemptCount = shotOutcomeSeries.count()
    ratio = goalCount / attemptCount
    return ratio.round(3)

In [None]:
def calcOutcomeRatio(dataframe, outcomeValue='Goal'):
    '''Share of rows whose shot_outcome equals outcomeValue, rounded to 3 decimals.

    dataframe: a DataFrame with a 'shot_outcome' column.
    outcomeValue: the outcome label to measure (defaults to 'Goal').
    Rows with a missing shot_outcome are left out of the denominator.
    '''
    outcomes = dataframe.shot_outcome
    matchingCount = outcomes.eq(outcomeValue).sum()
    ratio = matchingCount / outcomes.count()
    return ratio.round(3)

In [None]:
print(calcGoalsScoredRatio(singlePlayerShots.shot_outcome))
calcOutcomeRatio(singlePlayerShots)

In [None]:
barcaShotDF.groupby(['player'])['shot_outcome'].count()

In [None]:
### Which shot_technique has the most success in goals scored ratio
# display(pd.DataFrame(barcaShotDF.groupby(['shot_technique']).apply(lambda x: calcOutcomeRatio(x)).sort_values(ascending=False)).head(10))


# ### Which player hits Off the T the most?
# display(pd.DataFrame(barcaShotDF.groupby(['player']).apply(lambda x: calcOutcomeRatio(x, outcomeValue = 'Off T')).sort_values(ascending=False)).head(15))


### Which player has the highest goals scored ratio per body part?
display(pd.DataFrame(barcaShotDF.groupby(['player','shot_body_part']).apply(lambda x: calcOutcomeRatio(x)).sort_values(ascending=False)))

In [None]:
#### EXERCISES FOR 21/4/2024

## First, generate data for each team in the 2018/2019 season and calculate their goals scored ratio
## Also, generate a top-3 player by goals scored ratio per team 
# (filter to only include players who have shot at least the average number of shot_attempts for the team for the season)
# i.e. you need to take the total number of shot attempts per player per match, then take the AVERAGE across all of those for the season

## Which team scores the most in the first 15 minutes of play?
## BONUS: how does this change throughout the season?
### BONUS BONUS: Can you plot this across time?

## What is the highest goal scoring ratio technique for each team? What is that value?

## For all games where Real Sociedad played, who were top goal scorers for them? What was the outcome of the games?



In [None]:
# Load event data for the selected match
events = sb.events(match_id)
print(f"\nNumber of events in match {match_id}: {len(events)}")


In [None]:
events.head()

In [None]:
events[events.type == 'Carry'].dropna(axis=1).head()

In [None]:
# Analyze event types
event_types = events['type'].value_counts()
print(f"\nEvent types and their counts:")
print(event_types)


In [None]:
# Get information about a specific event type
event_type = 'Shot'
shots = events[events['type'] == event_type]
print(f"\nNumber of {event_type} events: {len(shots)}")
# print(f"Columns available for {event_type} events:")
print(shots.columns)

cleanShots = shots.dropna(axis=1)
print(f"Columns available for {event_type} events:")
print(cleanShots.columns)

In [None]:
cleanShots

In [None]:
cleanShots.shot_outcome.unique()

In [None]:
pd.DataFrame(cleanShots.groupby(['possession_team', 'play_pattern']).apply(lambda x: (x.shot_outcome == 'Goal').sum()))

In [None]:
pd.DataFrame(cleanShots.groupby(['possession_team', 'play_pattern']).apply(lambda x: (x.shot_outcome == 'Goal').sum())).unstack(level=0)

In [None]:
# What play pattern is Barcelona most likely to score from?



# Which positions are most likely to score/have a shot on goal?

# Where are these shots made from? --> Bit later question....

In [None]:
def pullShotDataForMatchId(matchIdValue):
    '''Load the events of one match through sb.events and return its shot rows.

    Columns that have a missing value in any of the shot rows are dropped.
    '''
    matchEvents = sb.events(matchIdValue)
    isShot = matchEvents['type'] == 'Shot'
    # axis=1: drop incomplete columns rather than incomplete rows
    return matchEvents[isShot].dropna(axis=1)

In [None]:
## Collect the shot events of every match in `matches` that Barcelona played, home or away
barcelonaMatchIds = matches.loc[matches.away_team.eq('Barcelona') | matches.home_team.eq('Barcelona'), 'match_id']
rawListDF = []
for currBarcaMatchID in barcelonaMatchIds:
    # one shot DataFrame per match; they are concatenated in a later cell
    curr_df = pullShotDataForMatchId(currBarcaMatchID)
    rawListDF += [curr_df]
    


In [None]:
fullShotDF_Barca = pd.concat(rawListDF)

In [None]:
fullShotDF_Barca.match_id.nunique()

In [None]:
pd.DataFrame(fullShotDF_Barca[fullShotDF_Barca.possession_team == 'Barcelona'].groupby(['play_pattern']).apply(lambda x: (x.shot_outcome == 'Goal').sum()))

In [None]:
# Analyze shot outcomes
shot_outcomes = shots['shot_outcome'].value_counts()
print(f"\nShot outcomes and their counts:")
print(shot_outcomes)


In [None]:
# Load lineup data for the selected match
lineup = sb.lineups(match_id)
# `lineup` is looked up by team name (one table per team), so len(lineup) would
# count teams, not players. Add up the rows of every team's table instead.
total_players = sum(len(team_table) for team_table in lineup.values())
print(f"\nNumber of players in the lineup for match {match_id}: {total_players}")


In [None]:
# Get information about a specific team
team_name = 'England'  # Select the team
# `lineup` only has entries for the teams that played match_id; asking for any other
# team raises KeyError, so fall back to the first team listed for this match.
if team_name not in lineup:
    team_name = next(iter(lineup))
team_lineup = lineup[team_name]  # Get the DataFrame for the selected team
print(f"\nNumber of players in the lineup for {team_name}: {len(team_lineup)}")

# Get information about a specific player
player_index = 0  # Select the index of the player within the team
# Keep the team table and the single-player row under different names so that
# `player_info` always means "one player" for the cells that follow.
player_info = team_lineup.iloc[player_index]
print(f"\nInformation about player {player_info['player_name']}:")
print(f"Player ID: {player_info['player_id']}")
print(f"Jersey number: {player_info['jersey_number']}")
print(f"Country: {player_info['country']}")
print("Positions:")
for position in player_info['positions']:
    print(f"- {position['position']}")

In [None]:
# Analyze events for a specific player
player_id = player_info['player_id']
player_events = events[events['player'] == player_info['player_name']]
print(f"\nNumber of events for player {player_info['player_name']}: {len(player_events)}")
player_event_types = player_events['type'].value_counts()
print(f"\nEvent types and their counts for player {player_info['player_name']}:")
print(player_event_types)

# Matplotlib

#### Matplotlib is a plotting library for Python that allows you to create a wide range of static, animated, and interactive visualizations.

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Basic Line Plot
print("\nBasic Line Plot")
x = [1, 2, 3, 4, 5]
y = [2, 4, 6, 8, 10]

plt.plot(x, y)
plt.xlabel('X-axis')
plt.ylabel('Y-axis')
plt.title('Basic Line Plot')
plt.show()

In [None]:
# Scatter Plot
print("\nScatter Plot")
x = [1, 2, 3, 4, 5]
y = [2, 4, 6, 8, 10]

plt.scatter(x, y)
plt.xlabel('X-axis')
plt.ylabel('Y-axis')
plt.title('Scatter Plot')
plt.show()


In [None]:
# Bar Plot
print("\nBar Plot")
x = ['A', 'B', 'C', 'D', 'E']
y = [10, 7, 5, 8, 12]

plt.bar(x, y)
plt.xlabel('Categories')
plt.ylabel('Values')
plt.title('Bar Plot')
plt.show()

In [None]:
# Plotting events data
print("\nPlotting events data")
player_events = events[events['player'] == player_info['player_name']]
player_event_types = player_events['type'].value_counts()

plt.figure(figsize=(10, 6))
plt.bar(player_event_types.index, player_event_types.values)
plt.xlabel('Event Types')
plt.ylabel('Count')
plt.title(f"Event Types for Player {player_info['player_name']}")
plt.xticks(rotation=45)
plt.show()

In [None]:
# Filtering events using loc
print("\nFiltering events using loc")
shots = events.loc[events['type'] == 'Shot']
print(f"Number of shot events: {len(shots)}")

# Accessing specific columns using iloc
print("\nAccessing specific columns using iloc")
shot_locations = shots.iloc[:, shots.columns.get_loc('location')]
print(f"Shot locations:\n{shot_locations.head()}")

# Grouping events by player and counting
print("\nGrouping events by player and counting")
player_event_counts = events.groupby('player')['type'].count()
print(f"Player event counts:\n{player_event_counts.head()}")


In [None]:
# Plotting player event counts
print("\nPlotting player event counts")
fig, ax = plt.subplots(figsize=(10, 6))
player_event_counts.plot(kind='bar', ax=ax)
ax.set_xlabel('Player')
ax.set_ylabel('Event Count')
ax.set_title('Player Event Counts')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()


In [None]:
# Grouping shots by player and outcome
print("\nGrouping shots by player and outcome")
shot_outcomes = shots.groupby(['player', 'shot_outcome']).size().unstack()
print(f"Shot outcomes by player:\n{shot_outcomes.head()}")

In [None]:

# Plotting shot outcomes by player
print("\nPlotting shot outcomes by player")
fig, ax = plt.subplots(figsize=(10, 6))
shot_outcomes.plot(kind='bar', stacked=True, ax=ax)
ax.set_xlabel('Player')
ax.set_ylabel('Shot Count')
ax.set_title('Shot Outcomes by Player')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Grouping passes by player and recipient
print("\nGrouping passes by player and recipient")
pass_combinations = events[events['type'] == 'Pass'].groupby(['player', 'pass_recipient']).size()
print(f"Pass combinations:\n{pass_combinations.head()}")



In [None]:
# Collapse the (player, recipient) pass counts into one labelled total per pairing
print("\nAggregating pass combinations")
pass_pairs = pass_combinations.reset_index()
pass_pairs['combination'] = pass_pairs['player'] + ' - ' + pass_pairs['pass_recipient']
# the counts column is selected by the label 0, as in the original lookup
pair_totals = pass_pairs.groupby('combination')[0].sum()
pass_combinations_agg = pair_totals.sort_values(ascending=False)
print(f"Aggregated pass combinations:\n{pass_combinations_agg.head()}")

In [None]:
# Plotting top pass combinations
print("\nPlotting top pass combinations")
top_n = 10
fig, ax = plt.subplots(figsize=(10, 6))
pass_combinations_agg.head(top_n).plot(kind='bar', ax=ax)
ax.set_xlabel('Player - Recipient')
ax.set_ylabel('Pass Count')
ax.set_title(f'Top {top_n} Pass Combinations')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

## Matplotlib - Subplots

In [None]:
# Extracting relevant data
print("Extracting relevant data")
shots = events.loc[events['type'] == 'Shot']
goals = shots.loc[shots['shot_outcome'] == 'Goal']
passes = events.loc[events['type'] == 'Pass']

In [None]:
# Creating a 2x2 subplot layout
fig, axs = plt.subplots(2, 2, figsize=(16, 8))

# Subplot 1: Shot and Goal Distribution by Player
print("\nPlotting shot and goal distribution by player")
player_shots = shots.groupby('player')['id'].count()
player_goals = goals.groupby('player')['id'].count()

axs[0, 0].bar(player_shots.index, player_shots.values, alpha=0.7, label='Shots')
axs[0, 0].bar(player_goals.index, player_goals.values, alpha=0.7, label='Goals')

axs[0, 0].set_xlabel('Player')
axs[0, 0].set_ylabel('Count')
axs[0, 0].set_title('Shot and Goal Distribution by Player')
axs[0, 0].legend()
plt.setp(axs[0, 0].get_xticklabels(), rotation=45, ha='right')

# Subplot 2: Pass Distribution by Player
print("\nPlotting pass distribution by player")
player_passes = passes.groupby('player')['id'].count()

axs[0, 1].bar(player_passes.index, player_passes.values, alpha=0.7)

axs[0, 1].set_xlabel('Player')
axs[0, 1].set_ylabel('Pass Count')
axs[0, 1].set_title('Pass Distribution by Player')
plt.setp(axs[0, 1].get_xticklabels(), rotation=45, ha='right')

# Subplot 3: Pass Accuracy by Player
print("\nPlotting pass accuracy by player")
# NOTE(review): StatsBomb records an outcome only for passes that were NOT completed,
# so a missing pass_outcome is taken to mean a completed pass -- confirm against the
# data spec. Accuracy is therefore the share of missing outcomes; the previous
# `(x == 'Incomplete').mean()` plotted the failure rate under an "accuracy" label.
player_pass_accuracy = passes.groupby('player')['pass_outcome'].apply(lambda x: x.isna().mean())

axs[1, 0].bar(player_pass_accuracy.index, player_pass_accuracy.values, alpha=0.7)

axs[1, 0].set_xlabel('Player')
axs[1, 0].set_ylabel('Pass Accuracy')
axs[1, 0].set_title('Pass Accuracy by Player')
plt.setp(axs[1, 0].get_xticklabels(), rotation=45, ha='right')

# Subplot 4: Any additional plot of your choice
print("\nPlotting additional plot (placeholder)")
# CHALLENGE for students: Try to think of what else we could add to this subplot

plt.tight_layout()
plt.show()

# MPL Soccer 

In [None]:
from mplsoccer import Pitch, VerticalPitch


In [None]:
# Basic pitch plot
print("Basic pitch plot")
pitch = Pitch(pitch_type='statsbomb')
fig, ax = pitch.draw(figsize=(8, 6))
plt.show()

In [None]:
# Plotting shot locations on the pitch
print("\nPlotting shot locations on the pitch")
pitch = Pitch(pitch_type='statsbomb')
fig, ax = pitch.draw(figsize=(8, 6))

shots = events.loc[events['type'] == 'Shot']
shot_locations = shots['location'].tolist()
x, y = zip(*shot_locations)

pitch.scatter(x, y, alpha=0.7, s=50, color='red', ax=ax)
ax.set_title('Shot Locations')
plt.show()

In [None]:
print("\nPlotting goal locations on the pitch")
pitch = Pitch(pitch_type='statsbomb')
fig, ax = pitch.draw(figsize=(8, 6))

goals = shots[shots['shot_outcome'] == 'Goal']
goal_locations = goals['location'].tolist()
# A match without goals gives an empty list, and `x, y = zip(*[])` raises ValueError
# (nothing to unpack). In that case just show the empty pitch.
if goal_locations:
    x, y = zip(*goal_locations)
    pitch.scatter(x, y, alpha=0.7, s=50, color='green', ax=ax)
ax.set_title('Goal Locations')
plt.show()

In [None]:
print("\nPlotting pass start locations on the pitch")
pitch = Pitch(pitch_type='statsbomb')
fig, ax = pitch.draw(figsize=(8, 6))

passes = events.loc[events['type'] == 'Pass']
pass_start_locations = passes['location'].tolist()
x, y = zip(*pass_start_locations)

pitch.scatter(x, y, alpha=0.7, s=50, color='blue', ax=ax)
ax.set_title('Pass Start Locations')
plt.show()

In [None]:
print("\nPlotting pass map with arrows")
pitch = Pitch(pitch_type='statsbomb')
fig, ax = pitch.draw(figsize=(10, 7))

passes = events.loc[events['type'] == 'Pass']
# Drop incomplete rows once, on both columns together. Calling dropna() on each
# column separately could remove different rows and pair the start of one pass
# with the end of another.
complete_passes = passes.dropna(subset=['location', 'pass_end_location'])
pass_starts = complete_passes['location'].tolist()
pass_ends = complete_passes['pass_end_location'].tolist()

start_x, start_y = zip(*pass_starts)
end_x, end_y = zip(*pass_ends)

pitch.arrows(start_x, start_y, end_x, end_y, width=2, headwidth=3, headlength=3, color='black', ax=ax, alpha=0.5)
ax.set_title('Pass Map with Arrows')
plt.show()
