# Finding Messi Goals

This notebook explores the StatsBomb open data to extract data for all of Messi's goals.

This will be used to schedule the tweets.

In [1]:
from geopy.geocoders import Nominatim
from timezonefinder import TimezoneFinder
import pytz
import pandas as pd
from statsbombpy import sb
from datetime import datetime

In [2]:
# Load the StatsBomb competitions listing. Later cells treat it as a DataFrame
# and read its season_name, country_name, competition_id and season_id columns.
competitions = sb.competitions()



In [3]:
# First year of each season as an integer: "2004/2005" -> 2004, "2023" -> 2023.
# Used further down to select the seasons that fall inside a team's date range.
competitions['season_start'] = competitions.season_name.str.split("/").str[0].astype(int)

In [4]:
# Teams to scan, keyed by the team name matched against home_team / away_team.
# Each value is (country_name used to select competitions, [first season, last season]);
# only the part of a season label before "/" is used when filtering.
MESSI_TEAMS = {
    "Barcelona": (
        "Spain",
        ["2004/2005", "2020/2021"],
    ),
    "Paris Saint-Germain": (
        "France",
        ["2021/2022", "2022/2023"],
    ),
    "Inter Miami": (
        "United States of America",
        ["2023", "2026"],
    ),
    "Argentina": (
        "International",
        ["2004", "2026"],
    ),
}

# player_id value compared against the events data to keep only Messi's shots.
MESSI_PLAYER_ID = 5503

In [5]:
# One record per goal, in the column order used when the DataFrame is built later.
messi_goals = []

# NOTE(review): competitions are selected by country_name, so a competition filed
# under a different country_name than the team's entry is never scanned — confirm
# this covers every competition of interest.
for team, (country, season_range) in MESSI_TEAMS.items():
    # Only the year before "/" matters: "2004/2005" -> 2004.
    first_year, last_year = (int(label.split("/")[0]) for label in season_range)

    in_country = competitions.country_name == country
    in_seasons = competitions.season_start.between(first_year, last_year, inclusive="both")
    comps = competitions[in_country & in_seasons].sort_values("season_start")

    for _, c in comps.iterrows():
        season_matches = sb.matches(competition_id=c.competition_id, season_id=c.season_id)
        team_plays = (season_matches.home_team == team) | (season_matches.away_team == team)
        all_matches = season_matches.loc[team_plays].sort_values("match_date").reset_index(drop=True)

        for _, m in all_matches.iterrows():
            events = sb.events(match_id=m.match_id)
            # Keep only shots by Messi whose outcome is a goal.
            scored_by_messi = (events.shot_outcome == "Goal") & (events.player_id == MESSI_PLAYER_ID)
            match_events = events.loc[scored_by_messi]

            # Extending with an empty selection is a no-op, so no emptiness check is needed.
            messi_goals.extend(
                [c.competition_id, c.competition_name, c.country_name, c.season_id, c.season_name,
                 m.match_id, m.match_date, m.kick_off, m.stadium,
                 g.id, g.period, g.minute, g.second, g.timestamp]
                for _, g in match_events.iterrows()
            )



In [6]:
# Number of goal records collected by the extraction loop.
len(messi_goals)

507

In [7]:
# Column labels, in the same order as the values appended for each goal record.
goal_columns = [
    'competition_id', 'competition_name', 'competition_country_name',
    'season_id', 'season_name',
    'match_id', 'match_date', 'kick_off', 'stadium',
    'id', 'period', 'minute', 'second', 'timestamp',
]
df_messi_goals = pd.DataFrame.from_records(messi_goals, columns=goal_columns)

In [8]:
# Write the goals table to disk without the index; the next section reloads this file.
df_messi_goals.to_csv("data/messi_goals.csv",index=False)

### Getting the kick-off time in UTC

In [15]:
# Reload the saved goals from disk, then show the shape and the first rows.
df_messi_goals = pd.read_csv("data/messi_goals.csv")
print(df_messi_goals.shape)
df_messi_goals.head()

(507, 14)


Unnamed: 0,competition_id,competition_name,competition_country_name,season_id,season_name,match_id,match_date,kick_off,stadium,id,period,minute,second,timestamp
0,11,La Liga,Spain,37,2004/2005,68316,2005-05-01,19:00:00.000,Spotify Camp Nou,33475e03-b72c-436b-b32e-20cfaacffa74,2,90,14,00:45:14.961
1,11,La Liga,Spain,38,2005/2006,68354,2005-11-27,21:00:00.000,Spotify Camp Nou,e4aa0844-8e66-48fd-ab1a-06eeb71367b6,2,51,16,00:06:16.941
2,11,La Liga,Spain,38,2005/2006,68342,2006-01-15,19:00:00.000,Spotify Camp Nou,c024ea4d-691a-4068-809d-4472ad78db7d,2,50,45,00:05:45.673
3,11,La Liga,Spain,38,2005/2006,68324,2006-01-22,19:00:00.000,Spotify Camp Nou,788c4fdd-fa27-48d5-b499-3cf251b2d9fe,2,81,49,00:36:49.631
4,11,La Liga,Spain,38,2005/2006,68325,2006-01-29,19:00:00.000,Estadi Mallorca Son Moix,23fa10c0-c180-4c23-b2a0-67691501da4b,2,75,27,00:30:27.493


In [16]:
geolocator = Nominatim(user_agent="match_timezone_locator")
tf = TimezoneFinder()

# Cache of place name -> timezone name (None when the lookup failed), so each
# distinct name is sent to the geocoding service at most once.
found_timezones = {}

def find_time_zone(v):
    """Return the timezone name (e.g. 'Europe/Madrid') for a place name.

    Parameters
    ----------
    v : str
        Free-text place name passed to the geocoder (a stadium or a country).

    Returns
    -------
    str or None
        The timezone name at the geocoded coordinates, or None when the
        geocoder finds no match or no timezone is found for the coordinates.

    Both successful and failed lookups are cached in ``found_timezones``;
    previously a failed geocode was retried on every call, which sent the same
    unresolvable name to the service once per row.
    """
    if v in found_timezones:
        return found_timezones[v]

    # Some names carry stray whitespace (one stadium appears with a leading tab);
    # clean only the query so the cache key stays the value the caller passed in.
    query = v.strip() if isinstance(v, str) else v
    location = geolocator.geocode(query)

    tz = tf.timezone_at(lng=location.longitude, lat=location.latitude) if location else None
    found_timezones[v] = tz
    return tz

find_time_zone(df_messi_goals.iloc[0].stadium)

'Europe/Madrid'

In [17]:
# Look up a timezone for each goal's stadium; names that cannot be resolved give None.
df_messi_goals['timezone'] = df_messi_goals.stadium.map(find_time_zone)

In [18]:
# Distribution of resolved timezones; dropna=False keeps the unresolved rows visible.
df_messi_goals.timezone.value_counts(dropna=False)

timezone
Europe/Madrid       429
None                 41
Europe/Paris         22
Asia/Qatar            9
Atlantic/Canary       4
America/New_York      1
Europe/Moscow         1
Name: count, dtype: int64

In [19]:
# Stadiums for which no timezone was found, with the number of affected goals.
df_messi_goals[df_messi_goals.timezone.isna()].stadium.value_counts()

stadium
Estadio de Mestalla                                          11
Estadio Vicente Calderón                                      8
Estadio Municipal de Ipurúa                                   8
Power Horse Stadium – Estadio de los Juegos Mediterráneos     6
Estadio Municipal El Molinón                                  4
Estadio Municipal José Zorrilla                               3
\tEstádio Cívitas Metropolitano                               1
Name: count, dtype: int64

In [20]:
# Competition countries of the unresolved rows; the next cell uses these names as the fallback lookup.
df_messi_goals[df_messi_goals.timezone.isna()].competition_country_name.value_counts()

competition_country_name
Spain    41
Name: count, dtype: int64

In [21]:
# Fallback for stadiums that could not be geocoded: look up the timezone of the
# competition's country name instead.
missing_tz = df_messi_goals.timezone.isna()
country_timezones = df_messi_goals.loc[missing_tz, 'competition_country_name'].map(find_time_zone)
df_messi_goals.loc[missing_tz, 'timezone'] = country_timezones

In [23]:
# Every goal must have a timezone before kick-off times are converted.
assert not df_messi_goals.timezone.isna().any()

In [22]:
# Preview the first rows, now including the timezone column.
df_messi_goals.head(2)

Unnamed: 0,competition_id,competition_name,competition_country_name,season_id,season_name,match_id,match_date,kick_off,stadium,id,period,minute,second,timestamp,timezone
0,11,La Liga,Spain,37,2004/2005,68316,2005-05-01,19:00:00.000,Spotify Camp Nou,33475e03-b72c-436b-b32e-20cfaacffa74,2,90,14,00:45:14.961,Europe/Madrid
1,11,La Liga,Spain,38,2005/2006,68354,2005-11-27,21:00:00.000,Spotify Camp Nou,e4aa0844-8e66-48fd-ab1a-06eeb71367b6,2,51,16,00:06:16.941,Europe/Madrid


In [24]:

def local_to_utc(record):
    """Convert a row's local kick-off time to a timezone-aware UTC datetime.

    ``record`` must expose ``match_date`` ("%Y-%m-%d"), ``kick_off``
    ("%H:%M:%S.%f") and ``timezone`` (a timezone name accepted by
    ``pytz.timezone``). The date and time are parsed together as a naive
    datetime, localized to the given timezone, then converted to UTC.
    """
    naive_kick_off = datetime.strptime(
        f"{record.match_date} {record.kick_off}", "%Y-%m-%d %H:%M:%S.%f"
    )
    venue_tz = pytz.timezone(record.timezone)
    return venue_tz.localize(naive_kick_off).astimezone(pytz.utc)

local_to_utc(df_messi_goals.iloc[0])

datetime.datetime(2005, 5, 1, 17, 0, tzinfo=<UTC>)

In [25]:
# Apply local_to_utc to every row to get the kick-off time in UTC.
df_messi_goals['datetime'] = df_messi_goals.apply(local_to_utc,axis=1)

In [26]:
# Preview the first rows, now including the UTC kick-off column.
df_messi_goals.head()

Unnamed: 0,competition_id,competition_name,competition_country_name,season_id,season_name,match_id,match_date,kick_off,stadium,id,period,minute,second,timestamp,timezone,datetime
0,11,La Liga,Spain,37,2004/2005,68316,2005-05-01,19:00:00.000,Spotify Camp Nou,33475e03-b72c-436b-b32e-20cfaacffa74,2,90,14,00:45:14.961,Europe/Madrid,2005-05-01 17:00:00+00:00
1,11,La Liga,Spain,38,2005/2006,68354,2005-11-27,21:00:00.000,Spotify Camp Nou,e4aa0844-8e66-48fd-ab1a-06eeb71367b6,2,51,16,00:06:16.941,Europe/Madrid,2005-11-27 20:00:00+00:00
2,11,La Liga,Spain,38,2005/2006,68342,2006-01-15,19:00:00.000,Spotify Camp Nou,c024ea4d-691a-4068-809d-4472ad78db7d,2,50,45,00:05:45.673,Europe/Madrid,2006-01-15 18:00:00+00:00
3,11,La Liga,Spain,38,2005/2006,68324,2006-01-22,19:00:00.000,Spotify Camp Nou,788c4fdd-fa27-48d5-b499-3cf251b2d9fe,2,81,49,00:36:49.631,Europe/Madrid,2006-01-22 18:00:00+00:00
4,11,La Liga,Spain,38,2005/2006,68325,2006-01-29,19:00:00.000,Estadi Mallorca Son Moix,23fa10c0-c180-4c23-b2a0-67691501da4b,2,75,27,00:30:27.493,Europe/Madrid,2006-01-29 18:00:00+00:00


In [34]:
# Inspect the timezone attached to the first converted kick-off value.
df_messi_goals.datetime.iloc[0].tz

<UTC>

In [None]:
# Extra minutes added to a goal's time depending on its period: a goal in
# period p is shifted by sum(WAITING_TIME_BY_PERIOD[:p]) minutes.
# Presumably each entry is the break before that period (none before the first
# half, 15 at half-time, 5 before extra time, 1 between extra-time halves,
# 2 before a penalty shoot-out) — TODO confirm these durations.
WAITING_TIME_BY_PERIOD = [0,15,5,1,2]

[0, 15, 5]

In [41]:
# Show the cumulative offset in minutes for periods 1 to 5.
pd.Series([1,2,3,4,5],index=[1,2,3,4,5]).map(lambda x: sum(WAITING_TIME_BY_PERIOD[:x]))

1     0
2    15
3    20
4    21
5    23
dtype: int64

In [48]:
# Goal time in UTC = kick-off + match clock (minute, second) + the cumulative
# break minutes that precede the goal's period.
break_minutes = df_messi_goals.period.map(lambda p: sum(WAITING_TIME_BY_PERIOD[:p]))
df_messi_goals['goal_datetime'] = (
    df_messi_goals.datetime
    + pd.to_timedelta(df_messi_goals.minute, unit="m")
    + pd.to_timedelta(df_messi_goals.second, unit="s")
    + pd.to_timedelta(break_minutes, unit="m")
)

In [49]:
# Spot-check five random rows, including the new goal_datetime column.
df_messi_goals.sample(5)

Unnamed: 0,competition_id,competition_name,competition_country_name,season_id,season_name,match_id,match_date,kick_off,stadium,id,period,minute,second,timestamp,timezone,datetime,goal_datetime
107,11,La Liga,Spain,22,2010/2011,69296,2011-01-29,20:00:00.000,Estadio José Rico Pérez,a576b0ed-1e6c-4f48-91da-7547e91363e9,2,86,45,00:41:45.230,Europe/Madrid,2011-01-29 19:00:00+00:00,2011-01-29 20:41:45+00:00
63,11,La Liga,Spain,21,2009/2010,69219,2010-01-10,21:00:00.000,Estadio Heliodoro Rodríguez Lopéz,03ee1b27-967e-4157-b5bc-3be868000453,1,35,16,00:35:16.006,Atlantic/Canary,2010-01-10 21:00:00+00:00,2010-01-10 21:35:16+00:00
419,11,La Liga,Spain,42,2019/2020,303473,2019-10-06,21:00:00.000,Spotify Camp Nou,1d72f439-2ca0-4593-9138-8603ea42692f,2,77,22,00:32:22.433,Europe/Madrid,2019-10-06 19:00:00+00:00,2019-10-06 20:32:22+00:00
212,11,La Liga,Spain,24,2012/2013,267502,2013-04-27,18:00:00.000,Estadio San Mamés,bf64c0b2-02bc-4713-8064-9462b06ef50d,2,66,27,00:21:27.032,Europe/Madrid,2013-04-27 16:00:00+00:00,2013-04-27 17:21:27+00:00
302,11,La Liga,Spain,27,2015/2016,266653,2016-03-03,22:00:00.000,Estadio de Vallecas,d02078ff-a25b-4ac2-bbdc-55cae3d80785,1,22,58,00:22:58.083,Europe/Madrid,2016-03-03 21:00:00+00:00,2016-03-03 21:22:58+00:00


In [50]:
# Save the table with the timezone, datetime and goal_datetime columns added, without the index.
df_messi_goals.to_csv("data/messi_goals_with_goal_datetime.csv",index=False)