In [4]:
import numpy as np
import pandas as pd

In [36]:
def convert_to_snake_case(text):
    """Convert a column label such as ``'Team Initials'`` or ``'MatchID'`` to snake_case.

    Surrounding whitespace is stripped, every space becomes an underscore and
    every character is lower-cased.  An underscore is inserted before an
    upper-case character only when it follows a character that is neither a
    space nor another upper-case character, so runs of capitals (``'ID'``)
    stay together.

    Parameters
    ----------
    text : str
        The label to convert.

    Returns
    -------
    str
        The snake_case version of ``text``.
    """
    stripped = text.strip()
    pieces = []
    for index, char in enumerate(stripped):
        if char == ' ':
            pieces.append('_')
            continue
        previous = stripped[index - 1] if index > 0 else ''
        # A capital starts a new word unless it opens the string, follows a
        # space (already turned into '_') or continues a run of capitals.
        starts_new_word = (
            char.isupper()
            and index > 0
            and previous != ' '
            and not previous.isupper()
        )
        prefix = '_' if starts_new_word else ''
        pieces.append(prefix + char.lower())
    return ''.join(pieces)

In [37]:
# Read only the columns needed from each input CSV.
world_cup_players = pd.read_csv(
    "../../data/input_data/world_cup_players.csv",
    usecols=['MatchID', 'Team Initials', 'Shirt Number', 'Player Name', 'Event'],
)

world_cup_matches = pd.read_csv(
    "../../data/input_data/world_cup_matches.csv",
    usecols=[
        'MatchID', 'Year', 'Datetime', 'Stadium', 'City',
        'Home Team Name', 'Home Team Goals', 'Away Team Goals', 'Away Team Name',
        'Attendance', 'Home Team Initials', 'Away Team Initials',
    ],
)

world_cup_countries = pd.read_csv(
    "../../data/input_data/fifa_countries_2006.csv",
    usecols=['Position', 'Team'],
)

# Normalise every column label to snake_case so the frames share one naming scheme
# (e.g. 'MatchID' -> 'match_id', which is the merge key used later).
world_cup_players.columns = world_cup_players.columns.map(convert_to_snake_case)
world_cup_matches.columns = world_cup_matches.columns.map(convert_to_snake_case)
world_cup_countries.columns = world_cup_countries.columns.map(convert_to_snake_case)

In [38]:
# Inspect column names, dtypes and non-null counts of the countries frame.
world_cup_countries.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   position  32 non-null     int64 
 1   team      32 non-null     object
dtypes: int64(1), object(1)
memory usage: 644.0+ bytes


In [39]:
# Inspect column names, dtypes and non-null counts of the players frame.
world_cup_players.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37784 entries, 0 to 37783
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   match_id       37784 non-null  int64 
 1   team_initials  37784 non-null  object
 2   shirt_number   37784 non-null  int64 
 3   player_name    37784 non-null  object
 4   event          9069 non-null   object
dtypes: int64(2), object(3)
memory usage: 1.4+ MB


In [40]:
# Inspect column names, dtypes and non-null counts of the matches frame.
world_cup_matches.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 852 entries, 0 to 851
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   year                852 non-null    int64  
 1   datetime            852 non-null    object 
 2   stadium             852 non-null    object 
 3   city                852 non-null    object 
 4   home_team_name      852 non-null    object 
 5   home_team_goals     852 non-null    int64  
 6   away_team_goals     852 non-null    int64  
 7   away_team_name      852 non-null    object 
 8   attendance          850 non-null    float64
 9   match_id            852 non-null    int64  
 10  home_team_initials  852 non-null    object 
 11  away_team_initials  852 non-null    object 
dtypes: float64(1), int64(4), object(7)
memory usage: 80.0+ KB


In [41]:
# lower all string columns
def lower_all_object_columns_of_df(df):
    """Lower-case every string value in the text columns of ``df``, in place.

    Columns with ``object`` dtype or pandas' ``string`` dtype are processed.
    Only values that are actual ``str`` instances are changed; anything else
    stored in such a column (NaN, None, numbers, ...) is left untouched.
    ``Series.str.lower()`` would instead replace non-string values in a
    mixed object column with NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. It is mutated; nothing is returned.
    """
    def _lower_if_str(value):
        # Leave missing values and non-text objects exactly as they are.
        return value.lower() if isinstance(value, str) else value

    string_columns = df.select_dtypes(include=['object', 'string']).columns
    if len(string_columns) == 0:
        return
    df[string_columns] = df[string_columns].apply(
        lambda column: column.map(_lower_if_str)
    )

# Lower-case the text columns of all three input frames (each frame is modified in place).
for frame in (world_cup_players, world_cup_matches, world_cup_countries):
    lower_all_object_columns_of_df(frame)

In [42]:
# Attach every player row to its match via match_id, then keep only the rows of the 2006 tournament.
merged_matches_players = world_cup_matches.merge(world_cup_players, on='match_id')
merged_matches_players_2006 = merged_matches_players.loc[merged_matches_players['year'].eq(2006)]
merged_matches_players_2006.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2943 entries, 28218 to 31160
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   year                2943 non-null   int64  
 1   datetime            2943 non-null   object 
 2   stadium             2943 non-null   object 
 3   city                2943 non-null   object 
 4   home_team_name      2943 non-null   object 
 5   home_team_goals     2943 non-null   int64  
 6   away_team_goals     2943 non-null   int64  
 7   away_team_name      2943 non-null   object 
 8   attendance          2943 non-null   float64
 9   match_id            2943 non-null   int64  
 10  home_team_initials  2943 non-null   object 
 11  away_team_initials  2943 non-null   object 
 12  team_initials       2943 non-null   object 
 13  shirt_number        2943 non-null   int64  
 14  player_name         2943 non-null   object 
 15  event               1023 non-null   object 
dtypes: flo

In [43]:
# List the distinct 2006 player names that contain the '�' placeholder character.
name_has_bad_char = merged_matches_players_2006['player_name'].str.contains('�')
players_with_special_char = merged_matches_players_2006.loc[name_has_bad_char]
players_with_special_char['player_name'].unique()

array(['uma�a m.', 'bola�os c.', 'nu�ez v.', 'c�ceres', 'acu�a', 'ca�iza',
       'nu�ez', 'caba�as', 'alvb�ge', 'k�llstr�m', 'allb�ck',
       'jo�o ricardo', 'andr� macanga', 'akw�', 'sim�o', 'z� kalanga',
       'loc�', 'lam�', 'fl�vio', 'm�rio', 'zuberb�hler', 'l�cio', 'kak�',
       'z� roberto', 'luis�o', 'ca�izares'], dtype=object)

In [44]:
# Clean the player-related columns of the 2006 frame and derive per-row
# card and goal counts from the `event` string.

# Garbled player name -> repaired name.  Values are kept lower-case so they
# stay consistent with the rest of the (already lower-cased) text columns.
PLAYER_NAME_FIXES = {
    'uma�a m.': 'umaña m.',
    'bola�os c.': 'bolaños c.',
    'nu�ez v.': 'nuñez v.',
    'c�ceres': 'cáceres',
    'acu�a': 'acuña',
    'ca�iza': 'cañiza',
    'nu�ez': 'nuñez',
    'caba�as': 'cabañas',
    'alvb�ge': 'alvbåge',
    'k�llstr�m': 'källström',
    'allb�ck': 'allbäck',
    'jo�o ricardo': 'joão ricardo',
    'andr� macanga': 'andré macanga',
    'akw�': 'akwá',
    'sim�o': 'simão',
    'z� kalanga': 'zé kalanga',
    'loc�': 'locó',
    'lam�': 'lamá',
    'fl�vio': 'flávio',
    'm�rio': 'mário',
    'zuberb�hler': 'zuberbühler',
    'l�cio': 'lúcio',
    'kak�': 'kaká',
    'z� roberto': 'zé roberto',
    'luis�o': 'luisão',
    'ca�izares': 'cañizares',
}

# `team_initials` comes from the players file; rename it so it cannot be
# confused with home_team_initials / away_team_initials.
merged_matches_players_2006 = merged_matches_players_2006.rename(columns={'team_initials': 'player_team_initials'})

# Rows without any event become an empty string so the counts below yield 0.
merged_matches_players_2006['event'] = merged_matches_players_2006['event'].fillna('')

# Every 'r' in the event string counts as a red card (this includes 'rsy' codes).
merged_matches_players_2006['red_cards'] = merged_matches_players_2006['event'].str.count('r')
# A 'y' directly preceded by 'rs' is skipped by the look-behind, so 'rsy' is not
# counted as a yellow card.  Presumably 'rsy' marks a red card after a second
# yellow -- TODO confirm against the dataset description.
merged_matches_players_2006['yellow_cards'] = merged_matches_players_2006['event'].str.count(r'(?<!rs)y')
# NOTE(review): only 'g' codes are counted as goals; confirm whether the dataset
# uses other event codes (e.g. for penalties) that should be included as well.
merged_matches_players_2006['goals'] = merged_matches_players_2006['event'].str.count('g')

merged_matches_players_2006['player_name'] = merged_matches_players_2006['player_name'].replace(PLAYER_NAME_FIXES)
merged_matches_players_2006.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2943 entries, 28218 to 31160
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   year                  2943 non-null   int64  
 1   datetime              2943 non-null   object 
 2   stadium               2943 non-null   object 
 3   city                  2943 non-null   object 
 4   home_team_name        2943 non-null   object 
 5   home_team_goals       2943 non-null   int64  
 6   away_team_goals       2943 non-null   int64  
 7   away_team_name        2943 non-null   object 
 8   attendance            2943 non-null   float64
 9   match_id              2943 non-null   int64  
 10  home_team_initials    2943 non-null   object 
 11  away_team_initials    2943 non-null   object 
 12  player_team_initials  2943 non-null   object 
 13  shirt_number          2943 non-null   int64  
 14  player_name           2943 non-null   object 
 15  event                

In [45]:
# List the distinct team names appearing in rows where the home or away team
# name contains the '�' placeholder character.
team_name_columns = ['home_team_name', 'away_team_name']
row_has_bad_char = (
    merged_matches_players_2006[team_name_columns]
    # DataFrame.apply passes one column at a time here (default axis=0).
    .apply(lambda column: column.astype(str).str.contains('�'))
    .any(axis=1)
)
matches_with_special_char = merged_matches_players_2006[row_has_bad_char]
pd.Series(matches_with_special_char[team_name_columns].values.ravel()).unique()

array(['argentina', "c�te d'ivoire", 'netherlands',
       'rn">serbia and montenegro'], dtype=object)

In [46]:
# Clean the team-name columns of the 2006 frame so they match the spelling
# used in world_cup_countries['team'] (the later join key).
team_name_columns = ['home_team_name', 'away_team_name']

# Exact-match renames, applied to the team-name columns only so that no
# unrelated column (city, stadium, player_name, ...) can be touched.
TEAM_NAME_FIXES = {
    "c�te d'ivoire": 'ivory coast',
    'korea republic': 'south korea',
    'ir iran': 'iran',
    'usa': 'united states',
}

cleaned_team_names = (
    merged_matches_players_2006[team_name_columns]
    # Strip the stray 'rn">' markup fragment found in front of some names.
    .replace('rn">', '', regex=True)
    .replace(TEAM_NAME_FIXES)
)
merged_matches_players_2006[team_name_columns] = cleaned_team_names

In [48]:
def _player_rows_for_side(side):
    """Join the country table on the `side` ('home' or 'away') team name and
    keep only the rows whose player belongs to that side's team."""
    joined = merged_matches_players_2006.merge(
        world_cup_countries,
        left_on=f'{side}_team_name',
        right_on='team',
        how='inner',
    )
    plays_for_side = joined[f'{side}_team_initials'] == joined['player_team_initials']
    return joined[plays_for_side]


home_matches = _player_rows_for_side('home')
away_matches = _player_rows_for_side('away')

# create final dataframe for fact table
merged_df = pd.concat([home_matches, away_matches])
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2943 entries, 0 to 2942
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   year                  2943 non-null   int64  
 1   datetime              2943 non-null   object 
 2   stadium               2943 non-null   object 
 3   city                  2943 non-null   object 
 4   home_team_name        2943 non-null   object 
 5   home_team_goals       2943 non-null   int64  
 6   away_team_goals       2943 non-null   int64  
 7   away_team_name        2943 non-null   object 
 8   attendance            2943 non-null   float64
 9   match_id              2943 non-null   int64  
 10  home_team_initials    2943 non-null   object 
 11  away_team_initials    2943 non-null   object 
 12  player_team_initials  2943 non-null   object 
 13  shirt_number          2943 non-null   int64  
 14  player_name           2943 non-null   object 
 15  event                 2943

In [None]:
# TODO: steps for creating the tables
# give countries indices = teamdim index, give players indices = playmatchdim index -> can be created
#
# select unique match dims, give match_id new values starting from 1 -> matchdim
# get unique locations, create location indices -> locationdim
# get unique players, create player indices
# open a pyexasol connection to the database and add the tables