# Web-Scraping

In [None]:
### extract all soccer data

from dotenv import load_dotenv
load_dotenv(override=True)
import os
from os.path import join
import sys
import time

from web_scraping import get_soccer_leagues_ids, get_soccer_league_season_standings, get_soccer_teams_url, get_player_season_stats

# Scrape season standings and player statistics for every league and season
# in the year range below and store each result as a CSV file under
# $SportsTables/soccer.
current_season = 2022
soccer_league_ids = get_soccer_leagues_ids()

# Leagues that are deliberately left out of the scrape.
skipped_leagues = {"Fußball-Bundesliga", "Big 5 European Leagues Combined"}

# Resolve the output directory once, up front: a missing "SportsTables"
# environment variable is a configuration error and should stop the run
# immediately instead of being reported as "not possible" for every year
# after a request has already been made.
soccer_output_dir = join(os.environ["SportsTables"], "soccer")

for year in range(1982, 2000):
    print(year)
    try:
        for soccer_league_id in soccer_league_ids:
            league_name = soccer_league_id["league_name"]
            if league_name in skipped_leagues:
                continue
            file_prefix = league_name.replace(' ', '-')

            # get season standings
            df = get_soccer_league_season_standings(year, current_season, soccer_league_id["league_id"], soccer_league_id["league_stats_name"])
            df.to_csv(join(soccer_output_dir, f"{file_prefix}_season_standings_{year}.csv"), index=False)

            # get player stats
            df = get_player_season_stats(year, current_season, soccer_league_id["league_id"], soccer_league_id["league_stats_name"])
            df.to_csv(join(soccer_output_dir, f"{file_prefix}_player_stats_{year}.csv"), index=False)

            # pause between leagues (presumably to avoid hammering the
            # scraped website -- TODO confirm)
            time.sleep(3)
    except Exception as exc:
        # Best effort: the first failure abandons the remaining leagues of
        # this year and the loop moves on to the next year.
        # `Exception` (instead of a bare `except`) keeps Ctrl-C / kernel
        # interrupts working, and the cause is printed so that failures can
        # be diagnosed.
        print("not possible")
        print(f"  -> {type(exc).__name__}: {exc}")
    

# Extract column headers from different tables

In [None]:
import json
from glob import glob
from dotenv import load_dotenv
load_dotenv(override=True)

import os
from os.path import join
import sys
import pandas as pd

# Collect, per table kind, the union of all column headers found in the
# scraped CSV files and write it to metadata.json as
# {table_kind: {column_name: null, ...}}.
metadata = {}

tables = ["player_stats", "season_standings"]

for table in tables:
    column_names = set()
    for found_table in glob(join(os.environ["SportsTables"], "soccer", f"*{table}_*.csv")):
        # nrows=0 parses only the header row; the data rows are not needed
        column_names.update(pd.read_csv(found_table, nrows=0).columns)

    # sort so that the key order in metadata.json is reproducible across runs
    column_names = sorted(column_names)
    metadata[table] = dict.fromkeys(column_names)

# never overwrite an existing metadata.json
if os.path.isfile("metadata.json"):
    print("metadata.json exists already!!! ")
else:
    with open("metadata.json", "w") as f:
        json.dump(metadata, f)

# Extract valid header file

In [None]:
import json
from glob import glob
import os 
os.environ["SPORTS_DB"] = "/ext/daten-wi/slangenecker/SportsTables"
from os.path import join
import pandas as pd

def get_sportsDB_soccer_type_mappings():
    """Load the type mappings of the big-5-leagues player table.

    Reads ``$SPORTS_DB/Soccer/Soccer-Relations.json`` and returns the first
    element of the ``"columns"`` list stored under the key
    ``"soccer-big5Leagues-Players-2021-2022.csv"``.

    Returns:
        The first ``"columns"`` entry (presumably a dict that maps a column
        name to its semantic type -- verify against the JSON file).

    Raises:
        KeyError: if ``SPORTS_DB`` is not set or an expected JSON key is
            missing.
        OSError: if the JSON file cannot be opened.
    """
    relations_path = join(os.environ["SPORTS_DB"], "Soccer", "Soccer-Relations.json")
    # JSON is UTF-8 by specification; do not rely on the locale's default
    # encoding when reading it.
    with open(relations_path, "r", encoding="utf-8") as f:
        relations = json.load(f)
    return relations["soccer-big5Leagues-Players-2021-2022.csv"]["columns"][0]


def get_all_sportsDB_soccer_tables(only_file_names:bool=False):
    """List the CSV tables in ``$SPORTS_DB/Soccer/soccerPlayerScraping``.

    Args:
        only_file_names: if true, return bare file names (e.g. ``"x.csv"``)
            instead of the full paths produced by ``glob``.

    Returns:
        A list of paths, or a list of file names when ``only_file_names`` is
        set. The order is whatever ``glob`` yields.
    """
    result = glob(join(os.environ["SPORTS_DB"], "Soccer", "soccerPlayerScraping", "*.csv"))
    if only_file_names:
        # basename instead of split("/") so that the separator of the
        # current platform is honoured
        return [os.path.basename(table) for table in result]
    return result

def load_sportsDB_soccer_table(table_name:str, only_headers:bool=False):
    """Load one CSV table from ``$SPORTS_DB/Soccer/soccerPlayerScraping``.

    Args:
        table_name: file name of the table inside that directory.
        only_headers: if true, return only the column labels.

    Returns:
        The table as a ``pandas.DataFrame``, or its ``columns`` index when
        ``only_headers`` is set.
    """
    table_path = join(os.environ["SPORTS_DB"], "Soccer", "soccerPlayerScraping", table_name)
    if only_headers:
        # nrows=0 parses just the header row, so the data rows are never
        # loaded when only the column labels are requested
        return pd.read_csv(table_path, nrows=0).columns
    return pd.read_csv(table_path)

In [None]:
# For every scraped table, map each column position ("column_<i>") to the
# semantic type of its header and store the result as JSON next to the
# tables. Headers without a known semantic type are printed and left out.
col_name_to_sem_type = get_sportsDB_soccer_type_mappings()

results = {}
for table in get_all_sportsDB_soccer_tables(True):
    print(table)
    # key under which this table is stored: the file name up to ".csv"
    table_key = table.split(".csv")[0]
    results[table_key] = {}
    col_names = load_sportsDB_soccer_table(table, True)
    for col_num, col_name in enumerate(col_names):
        try:
            sem_type = col_name_to_sem_type[col_name]
        except KeyError:
            # Only an unknown header is expected here; report it and move
            # on. Anything else (including Ctrl-C) is no longer swallowed.
            print(col_name)
            continue
        results[table_key][f"column_{col_num}"] = {"semanticType": sem_type}

with open(join(os.environ["SPORTS_DB"], "Soccer", "sportsDB_type_sportsDB.json"), "w") as f:
    json.dump(results, f)