In [3]:
from bs4 import BeautifulSoup
import requests
import re
import numpy as np
import pandas as pd

In [4]:
#data collectuion through webscraping
#get website URL and prettify
#this webscraping notebook was made during 11/26/2023 - 11/30/2023, as such the website version may have changed

url_link = "https://en.wikipedia.org/wiki/2022_FIFA_World_Cup"
result = requests.get(url_link).text

doc = BeautifulSoup(result, "html.parser")

In [5]:
#group stage data

def custom_selector(tag):
    #custom selector that gets table for group stage stuff
    return tag.name == "table" and tag.has_attr("class") and ("wikitable" in tag.get("class")) and (len(tag.get("class")) == 1)

groups = doc.find_all(custom_selector)   
    

In [6]:
#gets all the teams

teams = []

for group in groups:
    messyteams = group.find_all('a', title = re.compile("team"))
    teams += [[team.text for team in messyteams]]
    
#remove last element in list since it doesn't contain any new team and combine into one big list

teams = teams[:-1]
allTeams = [item for sublist in teams for item in sublist]


In [7]:
# gets all the stats associated with each group

stats = []

for group in groups:
    statistics = group.find_all('td', style = re.compile("^(font-weight)"), string = re.compile("[1234567890]"))
    stats += [[stat.text.strip() for stat in statistics]]


In [8]:
#aggregate stats by team within each group

statsByTeam = []

for group in stats:
    if len(group) == 32:
        curr = np.array_split(group, 4)
        statsByTeam.append(curr)
        
statsByTeam  

#append each team to its corresponding stats

counter = 0
dfList = []
groupNames = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']

for group in statsByTeam:
    g = []
    for team in group:
        team = np.append(team, [allTeams[counter], groupNames[counter // 4]])
        g.append(team)
        counter += 1
    dfList.append(g)

#combine into one final list

dfFinalList = [items for groups in dfList for items in groups]


In [9]:
#get the columns for the dataframe

cols = groups[0].find_all("th", width = "28")

finalColumns = []

for i in cols:
    finalColumns.append(i.text.strip())

finalColumns = finalColumns[1:] + ['Country', 'Group']


In [10]:
#dataframe containing all the group stage stats

groupStageDF = pd.DataFrame(dfFinalList, columns = finalColumns).set_index("Country")

#Basic Data cleaning

groupStageDF = groupStageDF.drop(['Pld'], axis = 1)

convert_dict = {'W': int,
                'D': int,
                'L': int,
                'GF': int,
                'GA': int,
                'Pts': int
                }

groupStageDF = groupStageDF.astype(convert_dict)
groupStageDF['GD'] = groupStageDF['GD'].apply(lambda x: int(x[1]) if x[0] == '+' else 0 if len(x) == 1 else -int(x[1]))
groupStageDF

Unnamed: 0_level_0,W,D,L,GF,GA,GD,Pts,Group
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Netherlands,2,1,0,5,1,4,7,A
Senegal,2,0,1,5,4,1,6,A
Ecuador,1,1,1,4,3,1,4,A
Qatar,0,0,3,1,7,-6,0,A
England,2,1,0,9,2,7,7,B
United States,1,2,0,2,1,1,5,B
Iran,1,0,2,4,7,-3,3,B
Wales,0,1,2,1,6,-5,1,B
Argentina,2,0,1,5,2,3,6,C
Poland,1,1,1,2,2,0,4,C


In [13]:
# write to CSV
groupStageDF.to_csv('groupstage.csv')

In [12]:
# use this to get the scores for all the matches in the word cup
# scores were collected in the order in which they were played in the tournament

groups = doc.find_all("div", {"itemtype": "http://schema.org/SportsEvent", "class": "footballbox"})

for i in groups:
    print(i.find("th", {"class": "fhome"}).text)
    print(i.find("th", {"class": "faway"}).text)
    print(i.find("th", {"class": "fscore"}).text)

groups

Qatar 
 Ecuador
0–2
Senegal 
 Netherlands
0–2
Qatar 
 Senegal
1–3
Netherlands 
 Ecuador
1–1
Ecuador 
 Senegal
1–2
Netherlands 
 Qatar
2–0
England 
 Iran
6–2
United States 
 Wales
1–1
Wales 
 Iran
0–2
England 
 United States
0–0
Wales 
 England
0–3
Iran 
 United States
0–1
Argentina 
 Saudi Arabia
1–2
Mexico 
 Poland
0–0
Poland 
 Saudi Arabia
2–0
Argentina 
 Mexico
2–0
Poland 
 Argentina
0–2
Saudi Arabia 
 Mexico
1–2
Denmark 
 Tunisia
0–0
France 
 Australia
4–1
Tunisia 
 Australia
0–1
France 
 Denmark
2–1
Australia 
 Denmark
1–0
Tunisia 
 France
1–0
Germany 
 Japan
1–2
Spain 
 Costa Rica
7–0
Japan 
 Costa Rica
0–1
Spain 
 Germany
1–1
Japan 
 Spain
2–1
Costa Rica 
 Germany
2–4
Morocco 
 Croatia
0–0
Belgium 
 Canada
1–0
Belgium 
 Morocco
0–2
Croatia 
 Canada
4–1
Croatia 
 Belgium
0–0
Canada 
 Morocco
1–2
Switzerland 
 Cameroon
1–0
Brazil 
 Serbia
2–0
Cameroon 
 Serbia
3–3
Brazil 
  Switzerland
1–0
Serbia 
  Switzerland
2–3
Cameroon 
 Brazil
1–0
Uruguay 
 South Korea
0–0
Portugal 
 Ghana
3

[<div class="footballbox" itemscope="" itemtype="http://schema.org/SportsEvent">
 <div class="fleft"><time><div class="fdate">20 November 2022<span style="display:none"> (<span class="bday dtstart published updated">2022-11-20</span>)</span></div><div class="ftime">19:00</div></time></div><table class="fevent"><tbody><tr itemprop="name">
 <th class="fhome" itemprop="homeTeam" itemscope="" itemtype="http://schema.org/SportsTeam"><span itemprop="name"><a href="/wiki/Qatar_national_football_team" title="Qatar national football team">Qatar</a><span class="flagicon"> <span class="mw-image-border" typeof="mw:File"><span><img alt="" class="mw-file-element" data-file-height="201" data-file-width="512" decoding="async" height="9" src="//upload.wikimedia.org/wikipedia/commons/thumb/6/65/Flag_of_Qatar.svg/23px-Flag_of_Qatar.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/6/65/Flag_of_Qatar.svg/35px-Flag_of_Qatar.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/6/65/Fla