# World Cup Stats

Use Pandas scraping to collect the 2019 FIFA World Cup Stats.

In [1]:
import pandas as pd

Use Pandas to scrape the following site, extract the "Tournament ranking" results table, clean the data, and export it to a CSV.

In [2]:
url = 'https://en.wikipedia.org/wiki/2019_FIFA_Women%27s_World_Cup'

In [3]:
# Use the Pandas `read_html` to parse the url
tables = pd.read_html(url)
tables

[   Coupe du Monde Féminine de la FIFA – France 2019  \
 0                                               NaN   
 1                                Tournament details   
 2                                      Host country   
 3                                             Dates   
 4                                             Teams   
 5                                          Venue(s)   
 6                                   Final positions   
 7                                         Champions   
 8                                        Runners-up   
 9                                       Third place   
 10                                     Fourth place   
 11                            Tournament statistics   
 12                                   Matches played   
 13                                     Goals scored   
 14                                       Attendance   
 15                                    Top scorer(s)   
 16                                   Best playe

In [18]:
# Find the correct table
for i in range(79):
    print(f"table: {i}")
    print(tables[i])
# tables[13]

table: 0
   Coupe du Monde Féminine de la FIFA – France 2019  \
0                                               NaN   
1                                Tournament details   
2                                      Host country   
3                                             Dates   
4                                             Teams   
5                                          Venue(s)   
6                                   Final positions   
7                                         Champions   
8                                        Runners-up   
9                                       Third place   
10                                     Fourth place   
11                            Tournament statistics   
12                                   Matches played   
13                                     Goals scored   
14                                       Attendance   
15                                    Top scorer(s)   
16                                   Best player(s)   
1

In [26]:
# Save the table to a DataFrame
stats_df = pd.DataFrame(tables[18])
stats_df

Unnamed: 0,Pos,Teamvte,Pld,W,D,L,GF,GA,GD,Pts,Qualification
0,1,Germany,3,3,0,0,6,0,+6,9,Advance to knockout stage
1,2,Spain,3,1,1,1,3,2,+1,4,Advance to knockout stage
2,3,China,3,1,1,1,1,1,0,4,Advance to knockout stage
3,4,South Africa,3,0,0,3,1,8,−7,0,


In [23]:
# Drop NA rows and reset the index
stats_df = stats_df.dropna().reset_index()
stats_df

Unnamed: 0,index,Pos,Teamvte,Pld,W,D,L,GF,GA,GD,Pts,Qualification
0,0,1,Germany,3,3,0,0,6,0,6,9,Advance to knockout stage
1,1,2,Spain,3,1,1,1,3,2,1,4,Advance to knockout stage
2,2,3,China,3,1,1,1,1,1,0,4,Advance to knockout stage


In [21]:
# Check the data types
stats_df.dtypes

index            int64
France          object
4–0             object
South Korea    float64
dtype: object

In [24]:
# Remove the "+" and replace the "−" with "-" from the "GD" column
stats_df["GD"] = stats_df["GD"].str.replace("+", "", regex=False)
stats_df["GD"] = stats_df["GD"].str.replace("−", "-", regex=False)
stats_df

Unnamed: 0,index,Pos,Teamvte,Pld,W,D,L,GF,GA,GD,Pts,Qualification
0,0,1,Germany,3,3,0,0,6,0,6,9,Advance to knockout stage
1,1,2,Spain,3,1,1,1,3,2,1,4,Advance to knockout stage
2,2,3,China,3,1,1,1,1,1,0,4,Advance to knockout stage


In [25]:
# Convert the following columns to dtype int
cols = ["Pos", "Pld", "W", "D", "L", "GF", "GA", "GD", "Pts"]

for col in cols:
    stats_df[col] = stats_df[col].astype('int')
stats_df

Unnamed: 0,index,Pos,Teamvte,Pld,W,D,L,GF,GA,GD,Pts,Qualification
0,0,1,Germany,3,3,0,0,6,0,6,9,Advance to knockout stage
1,1,2,Spain,3,1,1,1,3,2,1,4,Advance to knockout stage
2,2,3,China,3,1,1,1,1,1,0,4,Advance to knockout stage


In [None]:
# Export as a CSV without the index
stats_df.to_csv("fifa_stats.csv", index=False)