## SoccerCovid MLS USA

In [1]:
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Season identifiers that get appended to the worldfootball.net base URLs.
dates_range = [
    '2019',
    '2019-playoffs',
    '2020',
    '2020-mls-is-back-tournament',
    '2020-playoffs',
    '2021',
    '2021-playoffs',
]
# Season scraped by this run; pick another entry by changing the index (0-6).
year_range = dates_range[6]

In [3]:
# Stage 1: download the match list for the selected season and load the
# fixtures table into a DataFrame.
base_url = 'https://www.worldfootball.net/all_matches/usa-major-league-soccer-'

# A timeout keeps the notebook from hanging forever on a stalled connection.
page = requests.get(base_url + year_range, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page.
page.raise_for_status()
soup = BeautifulSoup(page.content, "html.parser")
table_div = soup.find_all("div", {"class": "data"})
# The table is taken from the third <div class="data"> on the page.
table_tag = table_div[2].find("table")
# pandas >= 2.1 deprecates passing literal HTML to read_html; a file-like
# wrapper works on both old and new versions.
schedule_df = pd.read_html(StringIO(str(table_tag)))[0]

print('Stage 1 Complete')

# Stage 2: name the columns, collect each match's report URL and split the
# score text into one numeric column per team.

# Clean up df columns
schedule_df.columns = ['date', 'time', 'teamA', 'd1', 'teamB', 'score', 'd2']
del schedule_df['d1']
del schedule_df['d2']

# Process dates: rows without a date inherit the closest date above them.
# Assigning the result back (instead of fillna(method=..., inplace=True) on a
# column selection) avoids the deprecated API and works under copy-on-write.
schedule_df['date'] = schedule_df['date'].ffill()

# Get individual match details url
rows = table_tag.find_all('tr')
match_url = []

# The first <tr> is skipped; every remaining row contributes exactly one entry
# so the list stays aligned with schedule_df.
for row in rows[1:]:
    cells = row.find_all('td')
    # The report link is looked up in the sixth cell; rows with fewer cells
    # (or without a link) get a missing value instead of raising IndexError.
    link = cells[5].find('a') if len(cells) > 5 else None
    if link:
        match_url.append(link['href'])
    else:
        match_url.append(np.nan)
schedule_df['match_url'] = match_url

# Split score for individual teams.
# Only the text before the first '(' is kept; indexing the split lists (rather
# than expand=True into a fixed number of columns) does not fail when no score,
# or a score with several parentheses, appears in the table.
final_score = schedule_df['score'].str.split('(', n=1).str[0]
goals = final_score.str.split(':', n=1)

# Anything that is not a number (e.g. an unplayed match) becomes NaN.
schedule_df['teamA_score'] = pd.to_numeric(goals.str[0], errors='coerce')
schedule_df['teamB_score'] = pd.to_numeric(goals.str[1], errors='coerce')

# Clean excess columns
del schedule_df['score']

print('Stage 2 Complete')

# Get winner details
def get_winner_details(record):
    """Return the winning team and its score for one match record.

    Parameters
    ----------
    record : mapping
        Must provide the keys 'teamA', 'teamB', 'teamA_score' and
        'teamB_score' (a DataFrame row or a plain dict both work).

    Returns
    -------
    tuple
        ``(team_name, team_score)`` of the side with the strictly higher
        score, or ``(nan, nan)`` when the scores are equal or when either
        score is NaN (every comparison with NaN is False).
    """
    score_a = record['teamA_score']
    score_b = record['teamB_score']
    if score_a > score_b:
        return record['teamA'], score_a
    if score_b > score_a:
        return record['teamB'], score_b
    # np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
    return np.nan, np.nan
    
# Stage 3: make sure both score columns are numeric, then derive the winner.
for score_column in ('teamA_score', 'teamB_score'):
    schedule_df[score_column] = pd.to_numeric(schedule_df[score_column])

# get_winner_details yields one (team, score) pair per row; rebuild those pairs
# as a two-column frame on the same index so they line up with schedule_df.
winner_pairs = schedule_df.apply(get_winner_details, axis=1).tolist()
winner_frame = pd.DataFrame(winner_pairs, index=schedule_df.index)
schedule_df[['winner_Team', 'winner_score']] = winner_frame
print('Stage 3 Complete')

def get_venue_details(url, timeout=30):
    """Fetch the breadcrumb text of the venue page linked from a match report.

    Parameters
    ----------
    url : str or missing value
        Site-relative path of the match report. Anything that is not a
        string (NaN, None) is treated as "no report available".
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up.

    Returns
    -------
    str or float
        Text of the venue page's ``<div class="breadcrumb">``, or NaN when
        the report URL is missing or any expected element is not found.
    """
    # Progress output: one line per processed match.
    print(url)
    # isinstance also covers None and float subclasses, unlike type(url) == float.
    if not isinstance(url, str):
        return np.nan

    site = 'https://www.worldfootball.net'
    page = requests.get(site + url, timeout=timeout)
    soup = BeautifulSoup(page.content, "html.parser")

    # The venue link is looked up in the fifth <td width="50%"> of the report.
    half_width_cells = soup.find_all("td", {"width": "50%"})
    if len(half_width_cells) <= 4:
        return np.nan
    venue_link = half_width_cells[4].find('a')
    if venue_link is None or not venue_link.get('href'):
        return np.nan

    page = requests.get(site + venue_link.get('href'), timeout=timeout)
    soup = BeautifulSoup(page.content, "html.parser")
    breadcrumb = soup.find("div", {"class": "breadcrumb"})
    if breadcrumb is None:
        return np.nan
    return breadcrumb.text

# Stage 4: look up every match's venue text and break it into separate columns.

# Process venue location details
schedule_df['venue_location'] = schedule_df['match_url'].apply(get_venue_details)
# regex=False: the patterns below are literal characters. A bare ')' is not a
# valid regular expression, and the default of `regex` differs across pandas
# versions.
schedule_df["venue_location"] = schedule_df["venue_location"].str.replace('\n', '', regex=False)

# Split on at most the first two commas and pad to exactly three columns, so
# the assignment does not fail when values contain more or fewer commas.
venue_parts = (
    schedule_df["venue_location"]
    .str.split(",", n=2, expand=True)
    .reindex(columns=range(3))
    .astype(object)
)
schedule_df[['venue', 'venue_city', 'venue_state']] = venue_parts

# The third part is split at its first '(': the text before it stays in
# venue_state, the text after it (minus ')') becomes venue_country.
state_parts = (
    schedule_df['venue_state']
    .str.split("(", n=1, expand=True)
    .reindex(columns=range(2))
    .astype(object)
)
schedule_df[['venue_state', 'venue_country']] = state_parts
schedule_df['venue_country'] = schedule_df['venue_country'].str.replace(')', '', regex=False)

print('Stage 4 Complete')

# Clean columns
del schedule_df['match_url']

# Stage 5: build a table with one row per team holding the team name and the
# site-relative link found in the same table cell.

# Get each teams home stadium location details
base_url = 'https://www.worldfootball.net/players/usa-major-league-soccer-'

# A timeout keeps the notebook from hanging forever on a stalled connection.
page = requests.get(base_url + year_range, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page.
page.raise_for_status()
soup = BeautifulSoup(page.content, "html.parser")
table_div = soup.find_all("div", {"class": "data"})
# The table is taken from the third <div class="data"> on the page.
table_tag = table_div[2].find("table")

# pandas >= 2.1 deprecates passing literal HTML to read_html; a file-like
# wrapper works on both old and new versions.
teams_df = pd.read_html(StringIO(str(table_tag)))[0]
# Keep only the second column, selected by position so the result does not
# depend on how pandas labelled the columns; copy() detaches it from the parsed
# frame before new columns are added.
teams_df = teams_df.iloc[:, [1]].copy()
teams_df.columns = ['team']

# One link per <tr>, read from the anchor inside the row's second cell; the
# list must stay the same length as teams_df for the assignment below.
team_urls = [
    tr.find_all('td')[1].a.get('href')
    for tr in table_tag.find_all('tr')
]
teams_df['url'] = team_urls

print('Stage 5 Complete')

def get_venue_location(url, timeout=30):
    """Return the title of the first link in a team page's highlighted table.

    Parameters
    ----------
    url : str
        Site-relative path of the team page.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up.

    Returns
    -------
    str or float
        The ``title`` attribute of the first ``<a>`` inside the first
        ``<table class="standard_tabelle yellow">`` (presumably the team's
        home stadium, judging by how the caller names the column - verify
        against the site). NaN when the table or the link is missing, so a
        single odd page does not abort the whole ``apply``.
    """
    page = requests.get('https://www.worldfootball.net' + url, timeout=timeout)
    soup = BeautifulSoup(page.content, "html.parser")
    # find() returns the first match or None, replacing find_all(...)[0].
    table = soup.find("table", {"class": "standard_tabelle yellow"})
    if table is None or table.a is None:
        return np.nan
    return table.a.get('title')

# Stage 6: resolve each team's location, attach the winner's location to the
# schedule and write the result to disk.
teams_df['venue_location'] = teams_df['url'].apply(get_venue_location)
teams_df.drop(columns=['url'], inplace=True)

# Left join on the winner's name keeps every match, including those without a
# winner; the duplicated join key from teams_df is dropped afterwards.
merged_df = pd.merge(
    schedule_df,
    teams_df,
    how='left',
    left_on='winner_Team',
    right_on='team',
)
schedule_df = merged_df.drop(columns=['team'])

# Both frames carry a 'venue_location' column, so the merge suffixes them:
# _x comes from the schedule (match venue), _y from the teams table.
column_names = {
    "venue_location_x": "venue_location",
    "venue_location_y": "winner_location",
}
schedule_df.rename(columns=column_names, inplace=True)

output_path = 'data/mls-' + year_range + '.csv'
schedule_df.to_csv(output_path)

print('Stage 6 Complete')

Stage 1 Complete
Stage 2 Complete
Stage 3 Complete
/report/major-league-soccer-2021-playoffs-1-runde-philadelphia-union-new-york-rb/
/report/major-league-soccer-2021-playoffs-1-runde-sporting-kansas-city-vancouver-whitecaps/
/report/major-league-soccer-2021-playoffs-1-runde-new-york-city-fc-atlanta-united-fc/
/report/major-league-soccer-2021-playoffs-1-runde-portland-timbers-minnesota-united-fc/
/report/major-league-soccer-2021-playoffs-1-runde-nashville-sc-orlando-city/
/report/major-league-soccer-2021-playoffs-1-runde-seattle-sounders-real-salt-lake/
nan
/report/major-league-soccer-2021-playoffs-conference-semifinals-colorado-rapids-portland-timbers/
/report/major-league-soccer-2021-playoffs-conference-semifinals-sporting-kansas-city-real-salt-lake/
/report/major-league-soccer-2021-playoffs-conference-semifinals-philadelphia-union-nashville-sc/
/report/major-league-soccer-2021-playoffs-conference-semifinals-new-england-revolution-new-york-city-fc/
nan
/report/major-league-soccer-2021

  schedule_df['venue_country'] = schedule_df['venue_country'].str.replace(')', '')


Stage 4 Complete
Stage 5 Complete
Stage 6 Complete
