# Libraries

In [89]:
import pandas as pd
from getpass import getuser
from collections import defaultdict


# Load and inspect dataset

In [90]:
# Get the current user's name
user = getuser()

# Path to the dataset
data_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\fifa.csv'

# Read the dataset with a different encoding
df = pd.read_csv(data_path, encoding='ISO-8859-1')




# Extract relevant columns

In [91]:
# First, let's extract the year from the 'tournament_id' column and create a new column 'year'
df['year'] = df['tournament_id'].str.extract(r'WC-(\d{4})').astype(int)

# Now, filter the data for tournaments after 1986
filtered_df = df[df['year'] > 1986]

# Extract relevant columns for goal events and match results
goals_df = filtered_df[['tournament_name', 'group_name','match_name', 'match_id', 'player_team_name','match_date', 'minute_regulation', 
               'team_id', 'own_goal']]
# Convert 'match_date' to datetime format
goals_df['match_date'] = pd.to_datetime(goals_df['match_date'], format='%m/%d/%Y')

# Sort the dataset by 'match_date' in ascending order (oldest first) and 'minute_regulation'
goals_df = goals_df.sort_values(by=['match_date', 'minute_regulation'], ascending=[True, True])

# Display the first few rows to confirm the sorting
goals_df.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  goals_df['match_date'] = pd.to_datetime(goals_df['match_date'], format='%m/%d/%Y')


Unnamed: 0,tournament_name,group_name,match_name,match_id,player_team_name,match_date,minute_regulation,team_id,own_goal
1328,1990 FIFA World Cup,Group B,Argentina v Cameroon,M-1990-01,Cameroon,1990-06-08,67,T-11,0
1329,1990 FIFA World Cup,Group B,Soviet Union v Romania,M-1990-02,Romania,1990-06-09,41,T-59,0
1331,1990 FIFA World Cup,Group D,United Arab Emirates v Colombia,M-1990-03,Colombia,1990-06-09,50,T-15,0
1330,1990 FIFA World Cup,Group B,Soviet Union v Romania,M-1990-02,Romania,1990-06-09,55,T-59,0
1333,1990 FIFA World Cup,Group A,Italy v Austria,M-1990-04,Italy,1990-06-09,78,T-39,0


# Calculate dynamically match outcomes after each goal

In [92]:
# Find the last match date for each tournament and group
last_dates = goals_df.groupby(['tournament_name', 'group_name'])['match_date'].max().reset_index()

# Exclude matches from the last match day
goals_before_last_day = goals_df.merge(last_dates, on=['tournament_name', 'group_name', 'match_date'], how='outer', indicator=True)
goals_before_last_day = goals_before_last_day[goals_before_last_day['_merge'] == 'left_only'].drop(columns=['_merge'])


Unnamed: 0,tournament_name,group_name,match_name,match_id,player_team_name,match_date,minute_regulation,team_id,own_goal
0,1990 FIFA World Cup,Group A,Italy v Austria,M-1990-04,Italy,1990-06-09,78,T-39,0
1,1990 FIFA World Cup,Group A,United States v Czechoslovakia,M-1990-05,Czechoslovakia,1990-06-10,26,T-20,0
2,1990 FIFA World Cup,Group A,United States v Czechoslovakia,M-1990-05,Czechoslovakia,1990-06-10,40,T-20,0
3,1990 FIFA World Cup,Group A,United States v Czechoslovakia,M-1990-05,Czechoslovakia,1990-06-10,50,T-20,0
4,1990 FIFA World Cup,Group A,United States v Czechoslovakia,M-1990-05,United States,1990-06-10,60,T-80,0
...,...,...,...,...,...,...,...,...,...
1381,2022 FIFA World Cup,not applicable,France v Morocco,M-2022-62,France,2022-12-14,5,T-28,0
1382,2022 FIFA World Cup,not applicable,France v Morocco,M-2022-62,France,2022-12-14,79,T-28,0
1383,2022 FIFA World Cup,not applicable,Croatia v Morocco,M-2022-63,Croatia,2022-12-17,7,T-17,0
1384,2022 FIFA World Cup,not applicable,Croatia v Morocco,M-2022-63,Morocco,2022-12-17,9,T-45,0
