In [37]:
import pandas as pd
import json
import os

# Raise both display caps to 500 so long and wide frames are not truncated.
for _display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_display_option, 500)

In [38]:
# Locate the 'extracted_data/' folder that lives one level above the
# directory the notebook is running from.
current_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
extracting_dir = os.path.join(parent_dir, 'extracted_data/')

#### Area table parsing

In [39]:
# Load the raw areas payload. The encoding is pinned so decoding does not
# depend on the platform's locale default
# (assumes the extractor wrote UTF-8 -- TODO confirm).
file_name = os.path.join(extracting_dir, 'area_data.json')

with open(file_name, 'r', encoding='utf-8') as f:
    data = json.load(f)['areas']

# Each entry of the 'areas' list becomes one DataFrame row.
area_df = pd.DataFrame(data)
area_df.head()

Unnamed: 0,id,name,countryCode,flag,parentAreaId,parentArea
0,2000,Afghanistan,AFG,,2014.0,Asia
1,2001,Africa,AFR,,2267.0,World
2,2002,Albania,ALB,,2077.0,Europe
3,2004,Algeria,ALG,,2001.0,Africa
4,2005,American Samoa,ASM,,2175.0,Oceania


#### Competitions table

In [40]:
# Load the raw competitions payload. The encoding is pinned so accented
# names (e.g. "Série A") do not depend on the platform's locale default
# (assumes the extractor wrote UTF-8 -- TODO confirm).
file_name = os.path.join(extracting_dir, 'comp_data.json')

with open(file_name, 'r', encoding='utf-8') as f:
    data = json.load(f)['competitions']

# Un-flattened view: one column per top-level key, minus 'currentSeason'.
comp_df = pd.DataFrame(data).drop(columns=['currentSeason'])

# Flattened view: json_normalize expands nested dicts into dotted columns
# such as 'area.id'. Selecting only the relevant columns also leaves out
# every 'currentSeason*' column, so no separate prefix filter is needed.
rel_cols = ['id', 'name', 'code', 'type', 'area.id', 'area.name']
comp_df_filtered = pd.json_normalize(data)[rel_cols]

comp_df_filtered.head(5)

Unnamed: 0,id,name,code,type,area.id,area.name
0,2013,Campeonato Brasileiro Série A,BSA,LEAGUE,2032,Brazil
1,2016,Championship,ELC,LEAGUE,2072,England
2,2021,Premier League,PL,LEAGUE,2072,England
3,2001,UEFA Champions League,CL,CUP,2077,Europe
4,2018,European Championship,EC,CUP,2077,Europe


#### Note: to extract values from a sub-dictionary nested inside a dictionary column, use `apply` with a lambda function that picks out the specific keys of the sub-dictionary.

Example: `comp_df['area_id'] = comp_df['area'].apply(lambda x: x['id'])`

#### Teams table

In [41]:
# Load the raw teams payload. Team names contain non-ASCII characters
# (e.g. "Köln", "München"), so the encoding is pinned instead of relying on
# the platform's locale default
# (assumes the extractor wrote UTF-8 -- TODO confirm).
file_name = os.path.join(extracting_dir, 'teams_data.json')

with open(file_name, 'r', encoding='utf-8') as f:
    data = json.load(f)['teams']

# Keep only the identifying columns of each team.
rel_cols = ['id', 'name', 'shortName', 'tla']
teams_df = pd.DataFrame(data)[rel_cols]

# Bare last expression -> rich table display instead of plain-text print().
teams_df.head(5)

   id                 name   shortName  tla
0   1           1. FC Köln  1. FC Köln  KOE
1   2  TSG 1899 Hoffenheim  Hoffenheim  TSG
2   3  Bayer 04 Leverkusen  Leverkusen  B04
3   4    Borussia Dortmund    Dortmund  BVB
4   5    FC Bayern München      Bayern  FCB


#### Finally, parsing the Matches table

In [42]:
# Load the raw matches payload for the extracted date window. The encoding
# is pinned so decoding does not depend on the platform's locale default
# (assumes the extractor wrote UTF-8 -- TODO confirm).
file_name = os.path.join(extracting_dir, 'match_data2024-09-20to2024-09-29.json')

with open(file_name, 'r', encoding='utf-8') as f:
    data = json.load(f)['matches']

# max_level=2 flattens deep enough to expose 'score.fullTime.home' / '.away'.
# The match outcome is kept as the raw 'score.winner' code; no team-name
# 'winner' column is derived here because it is not part of rel_cols and
# would be dropped by the selection anyway.
rel_cols = ['id', 'stage', 'area.id', 'area.name', 'competition.id',
            'homeTeam.id', 'awayTeam.id', 'score.winner',
            'score.fullTime.home', 'score.fullTime.away']
matches_df = pd.json_normalize(data, max_level=2)[rel_cols]

matches_df.head(2)

Unnamed: 0,id,stage,area.id,area.name,competition.id,homeTeam.id,awayTeam.id,score.winner,score.fullTime.home,score.fullTime.away
0,503084,REGULAR_SEASON,2114,Italy,2019,104,445,AWAY_TEAM,0,2
1,499038,REGULAR_SEASON,2163,Netherlands,2003,684,682,AWAY_TEAM,1,2


#### Some of the records above contain a sub-schema (nested sub-dictionaries). These need to be "flattened" before we load the data into a MySQL database. This process is called "normalization".

#### Final data frames

In [43]:
# Preview the first five rows of the flattened matches table.
matches_df.head(n=5)

Unnamed: 0,id,stage,area.id,area.name,competition.id,homeTeam.id,awayTeam.id,score.winner,score.fullTime.home,score.fullTime.away
0,503084,REGULAR_SEASON,2114,Italy,2019,104,445,AWAY_TEAM,0,2
1,499038,REGULAR_SEASON,2163,Netherlands,2003,684,682,AWAY_TEAM,1,2
2,502403,REGULAR_SEASON,2088,Germany,2002,16,15,AWAY_TEAM,2,3
3,503086,REGULAR_SEASON,2114,Italy,2019,450,586,AWAY_TEAM,2,3
4,497988,REGULAR_SEASON,2081,France,2015,522,527,HOME_TEAM,8,0


In [45]:
# Preview the first five rows of the flattened competitions table.
comp_df_filtered.head(n=5)

Unnamed: 0,id,name,code,type,area.id,area.name
0,2013,Campeonato Brasileiro Série A,BSA,LEAGUE,2032,Brazil
1,2016,Championship,ELC,LEAGUE,2072,England
2,2021,Premier League,PL,LEAGUE,2072,England
3,2001,UEFA Champions League,CL,CUP,2077,Europe
4,2018,European Championship,EC,CUP,2077,Europe


In [46]:
# Preview the first five rows of the teams table.
teams_df.head(n=5)

Unnamed: 0,id,name,shortName,tla
0,1,1. FC Köln,1. FC Köln,KOE
1,2,TSG 1899 Hoffenheim,Hoffenheim,TSG
2,3,Bayer 04 Leverkusen,Leverkusen,B04
3,4,Borussia Dortmund,Dortmund,BVB
4,5,FC Bayern München,Bayern,FCB


In [47]:
# Preview the first five rows of the areas table.
area_df.head(n=5)

Unnamed: 0,id,name,countryCode,flag,parentAreaId,parentArea
0,2000,Afghanistan,AFG,,2014.0,Asia
1,2001,Africa,AFR,,2267.0,World
2,2002,Albania,ALB,,2077.0,Europe
3,2004,Algeria,ALG,,2001.0,Africa
4,2005,American Samoa,ASM,,2175.0,Oceania
