## Exploring the data

In [2]:
import pandas as pd

In [3]:
# Raw CSV locations on the master branch of martj42/international_results.
# All three files share one URL prefix, so only the file name varies.
csv_paths = [
    "https://raw.githubusercontent.com/martj42/international_results/refs/heads/master/" + csv_name
    for csv_name in ("results.csv", "goalscorers.csv", "shootouts.csv")
]

# Individual names for each location, in the same order as csv_paths.
results_csv_path, goalscorers_csv_path, shootouts_csv_path = csv_paths

In [4]:
# Download the three tables in one pass (results, then goalscorers, then
# shootouts), parsing each file's 'date' column as datetimes.
results_df, goalscorers_df, shootouts_df = (
    pd.read_csv(csv_path, parse_dates=['date'])
    for csv_path in (results_csv_path, goalscorers_csv_path, shootouts_csv_path)
)

In [5]:
# Peek at five random rows. A fixed random_state makes the same rows come
# back on every Restart & Run All, so the displayed output stays reproducible.
results_df.sample(5, random_state=42)

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
618,1921-04-28,Guernsey,Jersey,0,1,Muratti Vase,Guernsey,Guernsey,False
37615,2014-05-17,Mauritania,Equatorial Guinea,1,0,African Cup of Nations qualification,Nouakchott,Mauritania,False
29924,2006-05-25,Australia,Greece,1,0,Friendly,Melbourne,Australia,False
39212,2015-11-07,Hong Kong,Myanmar,5,0,Friendly,Kowloon,Hong Kong,False
39635,2016-05-29,Turkey,Montenegro,1,0,Friendly,Antalya,Turkey,False


In [8]:
# Show the column labels of the goalscorers table.
goalscorers_df.columns

Index(['date', 'home_team', 'away_team', 'team', 'scorer', 'minute',
       'own_goal', 'penalty'],
      dtype='object')

In [13]:
# Show the column labels of the shootouts table.
shootouts_df.columns

Index(['date', 'home_team', 'away_team', 'winner', 'first_shooter'], dtype='object')

In [6]:
# Show the column labels of the match results table.
results_df.columns

Index(['date', 'home_team', 'away_team', 'home_score', 'away_score',
       'tournament', 'city', 'country', 'neutral'],
      dtype='object')

In [11]:
# Every distinct tournament name present in the results table.
tournaments = results_df['tournament'].unique().tolist()

# Keep only the tournaments whose name contains 'FIFA'.
FIFA_TOURNAMENTS = list(filter(lambda name: 'FIFA' in name, tournaments))

print(FIFA_TOURNAMENTS)


['FIFA World Cup', 'FIFA World Cup qualification', 'FIFA 75th Anniversary Cup', 'FIFA Series']


In [7]:
# Rows whose tournament label is exactly 'FIFA World Cup'.
world_cup_matches = results_df.loc[results_df['tournament'].eq('FIFA World Cup')]

# Distinct calendar years in which those matches were played.
unique_world_cups = (
    world_cup_matches['date']
    .dt.year
    .unique()
)
print(unique_world_cups)

[1930 1934 1938 1950 1954 1958 1962 1966 1970 1974 1978 1982 1986 1990
 1994 1998 2002 2006 2010 2014 2018 2022]


In [9]:
# Count the World Cup matches played in each year, listed in ascending year order.
(
    world_cup_matches['date']
    .dt.year
    .value_counts()
    .sort_index()
)

date
1930    18
1934    17
1938    18
1950    22
1954    26
1958    35
1962    32
1966    32
1970    32
1974    38
1978    38
1982    52
1986    52
1990    52
1994    52
1998    64
2002    64
2006    64
2010    64
2014    64
2018    64
2022    64
Name: count, dtype: int64

In [12]:
# Show the column labels of the goalscorers table (same inspection as the
# earlier goalscorers_df.columns cell).
goalscorers_df.columns

Index(['date', 'home_team', 'away_team', 'team', 'scorer', 'minute',
       'own_goal', 'penalty'],
      dtype='object')

In [10]:
# Join each World Cup match to its goal rows. A match is identified by its
# date plus the home and away team; the inner join (pandas' default) keeps
# only key combinations present in both tables.
goal_scorers_with_matches = pd.merge(
    world_cup_matches,
    goalscorers_df,
    how='inner',
    on=['date', 'home_team', 'away_team'],
)

goal_scorers_with_matches.shape

(2720, 14)

In [14]:
# Rows of the merged table whose scorer is Miroslav Klose.
goals_by_player = goal_scorers_with_matches.loc[
    goal_scorers_with_matches['scorer'].eq('Miroslav Klose')
]

# Shape of the array of distinct dates among those rows.
goals_by_player['date'].unique().shape

(11,)

In [15]:
# Display every distinct tournament name in the results table.
results_df['tournament'].unique()

array(['Friendly', 'British Home Championship', 'Évence Coppée Trophy',
       'Muratti Vase', 'Copa Lipton', 'Copa Newton',
       'Copa Premio Honor Argentino', 'Copa Premio Honor Uruguayo',
       'Far Eastern Championship Games', 'Copa Roca', 'Copa América',
       'Peace Cup', 'Open International Championship',
       'Copa Chevallier Boutell', 'Olympic Games', 'Nordic Championship',
       'Central European International Cup', 'Baltic Cup', 'Balkan Cup',
       'Central American and Caribbean Games', 'FIFA World Cup',
       'Copa Rio Branco', 'FIFA World Cup qualification',
       'Bolivarian Games', 'CCCF Championship', 'NAFC Championship',
       'Copa Oswaldo Cruz', 'Asian Games', 'Pan American Championship',
       'Copa del Pacífico', "Copa Bernardo O'Higgins",
       'AFC Asian Cup qualification', 'Atlantic Cup', 'AFC Asian Cup',
       'African Cup of Nations', 'Copa Paz del Chaco',
       'Merdeka Tournament', 'UEFA Euro qualification',
       'Southeast Asian Peninsular