## Exploring the data

In [1]:
import pandas as pd

In [3]:
# Common prefix of the three dataset URLs, defined once so a repository or
# branch change is a single edit. The URL points at the `master` branch ref,
# so the downloaded contents can change between runs.
DATA_BASE_URL = "https://raw.githubusercontent.com/martj42/international_results/refs/heads/master"

results_csv_path = f"{DATA_BASE_URL}/results.csv"
goalscorers_csv_path = f"{DATA_BASE_URL}/goalscorers.csv"
shootouts_csv_path = f"{DATA_BASE_URL}/shootouts.csv"

# All dataset URLs collected in one list.
csv_paths = [results_csv_path, goalscorers_csv_path, shootouts_csv_path]

In [4]:
# Every file is read with the same options: the `date` column is parsed
# into datetimes at load time.
read_options = {"parse_dates": ["date"]}

results_df = pd.read_csv(results_csv_path, **read_options)
goalscorers_df = pd.read_csv(goalscorers_csv_path, **read_options)
shootouts_df = pd.read_csv(shootouts_csv_path, **read_options)

In [5]:
# Preview five random rows; the fixed seed keeps the preview identical on
# every Restart & Run All (for the same input data).
results_df.sample(5, random_state=42)

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
38918,2015-08-31,India,Nepal,0,0,Friendly,Pune,India,False
27560,2003-10-04,Lesotho,Eswatini,5,2,Friendly,Maseru,Lesotho,False
18649,1992-12-09,Oman,United Arab Emirates,0,1,Gulf Cup,Doha,Qatar,True
7566,1969-01-30,DR Congo,Cameroon,2,1,Friendly,Kinshasa,DR Congo,False
110,1897-03-06,Northern Ireland,Wales,4,3,British Home Championship,Belfast,Ireland,False


In [6]:
# Show the column labels of the goalscorers table.
goalscorers_df.columns

Index(['date', 'home_team', 'away_team', 'team', 'scorer', 'minute',
       'own_goal', 'penalty'],
      dtype='object')

In [7]:
# Show the column labels of the shootouts table.
shootouts_df.columns

Index(['date', 'home_team', 'away_team', 'winner', 'first_shooter'], dtype='object')

In [8]:
# Show the column labels of the results table.
results_df.columns

Index(['date', 'home_team', 'away_team', 'home_score', 'away_score',
       'tournament', 'city', 'country', 'neutral'],
      dtype='object')

In [10]:
# Distinct values of the `tournament` column, as a plain Python list.
tournaments = results_df['tournament'].unique().tolist()

# Keep only the tournaments whose name contains the substring 'FIFA'.
FIFA_TOURNAMENTS = list(filter(lambda name: 'FIFA' in name, tournaments))

print(FIFA_TOURNAMENTS)


['FIFA World Cup', 'FIFA World Cup qualification', 'FIFA 75th Anniversary Cup', 'FIFA Series']


In [11]:
# Rows of the results table whose tournament is exactly 'FIFA World Cup'.
is_world_cup = results_df['tournament'].eq('FIFA World Cup')
world_cup_matches = results_df.loc[is_world_cup]

# Distinct calendar years in which those matches were played.
world_cup_years = world_cup_matches['date'].dt.year
unique_world_cups = world_cup_years.unique()
print(unique_world_cups)

[1930 1934 1938 1950 1954 1958 1962 1966 1970 1974 1978 1982 1986 1990
 1994 1998 2002 2006 2010 2014 2018 2022]


In [12]:
# Count the World Cup matches in each calendar year, ordered by year.
(
    world_cup_matches['date']
    .dt.year
    .value_counts()
    .sort_index()
)

date
1930    18
1934    17
1938    18
1950    22
1954    26
1958    35
1962    32
1966    32
1970    32
1974    38
1978    38
1982    52
1986    52
1990    52
1994    52
1998    64
2002    64
2006    64
2010    64
2014    64
2018    64
2022    64
Name: count, dtype: int64

In [15]:
# Show the goalscorers column labels again (same inspection as an earlier cell).
goalscorers_df.columns

Index(['date', 'home_team', 'away_team', 'team', 'scorer', 'minute',
       'own_goal', 'penalty'],
      dtype='object')

In [37]:
# Attach goal records to World Cup matches. A match is identified by its
# date plus both team names; the inner join keeps only matches that have at
# least one matching goal record.
match_keys = ['date', 'home_team', 'away_team']
goal_scorers_with_matches = pd.merge(
    world_cup_matches,
    goalscorers_df,
    how='inner',
    on=match_keys,
)

goal_scorers_with_matches.shape

(2720, 14)

In [54]:
# Goal records whose `scorer` is exactly 'Miroslav Klose'.
scored_by_klose = goal_scorers_with_matches['scorer'].eq('Miroslav Klose')
goals_by_player = goal_scorers_with_matches.loc[scored_by_klose]

# Shape of the array of distinct dates on which those goals were recorded.
goals_by_player['date'].unique().shape

(11,)