In [225]:
import numpy as np
import pandas as pd

In [226]:
def convert_to_snake_case(text):
    text = text.strip()
    return ''.join(['_' + s.lower() if (s.isupper() and i > 0 and text[i - 1] != ' ' and not text[i - 1].isupper()) \
                    else '_' if s == ' ' \
                    else s.lower() for i, s in enumerate(text)])

In [227]:
world_cup_players = pd.read_csv("../../data/input_data/world_cup_players.csv",
                            usecols=['MatchID', 'Team Initials', 'Shirt Number', 'Player Name','Event'])

world_cup_matches = pd.read_csv("../../data/input_data/world_cup_matches.csv",
                            usecols=['MatchID', 'Year', 'Datetime', 'Stadium', 'City', 'Home Team Name','Home Team Goals', 'Away Team Goals', 'Away Team Name',
                                    'Attendance', 'Home Team Initials', 'Away Team Initials'])

world_cup_countries = pd.read_csv("../../data/input_data/fifa_countries_2006.csv",
                            usecols=['Position', 'Team'])

world_cup_players.columns = [convert_to_snake_case(col) for col in world_cup_players.columns]
world_cup_matches.columns = [convert_to_snake_case(col) for col in world_cup_matches.columns]
world_cup_countries.columns = [convert_to_snake_case(col) for col in world_cup_countries.columns]

In [228]:
world_cup_countries.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   position  32 non-null     int64 
 1   team      32 non-null     object
dtypes: int64(1), object(1)
memory usage: 644.0+ bytes


In [229]:
world_cup_players.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37784 entries, 0 to 37783
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   match_id       37784 non-null  int64 
 1   team_initials  37784 non-null  object
 2   shirt_number   37784 non-null  int64 
 3   player_name    37784 non-null  object
 4   event          9069 non-null   object
dtypes: int64(2), object(3)
memory usage: 1.4+ MB


In [230]:
world_cup_matches.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4572 entries, 0 to 4571
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   year                852 non-null    float64
 1   datetime            852 non-null    object 
 2   stadium             852 non-null    object 
 3   city                852 non-null    object 
 4   home_team_name      852 non-null    object 
 5   home_team_goals     852 non-null    float64
 6   away_team_goals     852 non-null    float64
 7   away_team_name      852 non-null    object 
 8   attendance          850 non-null    float64
 9   match_id            852 non-null    float64
 10  home_team_initials  852 non-null    object 
 11  away_team_initials  852 non-null    object 
dtypes: float64(5), object(7)
memory usage: 428.8+ KB


In [231]:
# lower all string columns
def lower_all_object_columns_of_df(df):
    string_columns = df.select_dtypes(include='object').columns
    df[string_columns] = df[string_columns].apply(lambda x: x.str.lower())

lower_all_object_columns_of_df(world_cup_players)
lower_all_object_columns_of_df(world_cup_matches)
lower_all_object_columns_of_df(world_cup_countries)

In [232]:
world_cup_players['player_name'] = world_cup_players['player_name']
world_cup_players['team _initials'] = world_cup_players['team_initials']
world_cup_players['red_cards'] = world_cup_players['event'].str.count('r')
world_cup_players['yellow_cards'] = world_cup_players['event'].str.count(r'(?<!rs)y')
world_cup_players['goals'] = world_cup_players['event'].str.count('g')
world_cup_players.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37784 entries, 0 to 37783
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   match_id        37784 non-null  int64  
 1   team_initials   37784 non-null  object 
 2   shirt_number    37784 non-null  int64  
 3   player_name     37784 non-null  object 
 4   event           9069 non-null   object 
 5   team _initials  37784 non-null  object 
 6   red_cards       9069 non-null   float64
 7   yellow_cards    9069 non-null   float64
 8   goals           9069 non-null   float64
dtypes: float64(3), int64(2), object(4)
memory usage: 2.6+ MB


In [213]:
merged_matches_players = pd.merge(world_cup_matches, world_cup_players, on='match_id')
filtered_df = merged_matches_players[merged_matches_players['year'] == 2006]
filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2943 entries, 28218 to 31160
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Year                2943 non-null   float64
 1   Datetime            2943 non-null   object 
 2   Stadium             2943 non-null   object 
 3   City                2943 non-null   object 
 4   Home Team Name      2943 non-null   object 
 5   Home Team Goals     2943 non-null   float64
 6   Away Team Goals     2943 non-null   float64
 7   Away Team Name      2943 non-null   object 
 8   Attendance          2943 non-null   float64
 9   MatchID             2943 non-null   float64
 10  Home Team Initials  2943 non-null   object 
 11  Away Team Initials  2943 non-null   object 
 12  Team Initials       2943 non-null   object 
 13  Shirt Number        2943 non-null   int64  
 14  Player Name         2943 non-null   object 
 15  Event               2943 non-null   object 
 16  red_ca

In [211]:
# merge TODO
df = pd.merge(filtered_df, world_cup_players, on='match_id')

<class 'pandas.core.frame.DataFrame'>
Index: 2943 entries, 28218 to 31160
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   year                2943 non-null   float64
 1   datetime            2943 non-null   object 
 2   stadium             2943 non-null   object 
 3   city                2943 non-null   object 
 4   home_team_name      2943 non-null   object 
 5   home_team_goals     2943 non-null   float64
 6   away_team_goals     2943 non-null   float64
 7   away_team_name      2943 non-null   object 
 8   attendance          2943 non-null   float64
 9   match_id            2943 non-null   float64
 10  home_team_initials  2943 non-null   object 
 11  away_team_initials  2943 non-null   object 
 12  team_initials       2943 non-null   object 
 13  shirt_number        2943 non-null   int64  
 14  player_name         2943 non-null   object 
 15  event               2943 non-null   object 
 16  red_ca