# World Cup Stats

Use pandas web scraping to collect the 2019 FIFA Women's World Cup stats.

In [1]:
import pandas as pd

Use Pandas to scrape the following site, extract the "Tournament ranking" results table, clean the data, and export it to a CSV.

In [2]:
# Source page to scrape: the Wikipedia article for the 2019 FIFA Women's World Cup
# (%27 is the URL-encoded apostrophe in "Women's").
url = "https://en.wikipedia.org/wiki/2019_FIFA_Women%27s_World_Cup"

In [3]:
# Use pandas `read_html` to parse the HTML tables at the URL into a list of DataFrames


[   Coupe du Monde Féminine de la FIFA – France 2019  \
 0                                               NaN   
 1                                Tournament details   
 2                                      Host country   
 3                                             Dates   
 4                                             Teams   
 5                                          Venue(s)   
 6                                   Final positions   
 7                                         Champions   
 8                                        Runners-up   
 9                                       Third place   
 10                                     Fourth place   
 11                            Tournament statistics   
 12                                   Matches played   
 13                                     Goals scored   
 14                                       Attendance   
 15                                    Top scorer(s)   
 16                                   Best playe

In [4]:
# Find the correct table


Unnamed: 0,Pos,Grp,Team,Pld,W,D,L,GF,GA,GD,Pts,Final result
0,1.0,F,United States,7.0,7.0,0.0,0.0,26.0,3.0,+23,21.0,Champions
1,2.0,E,Netherlands,7.0,6.0,0.0,1.0,11.0,5.0,+6,18.0,Runners-up
2,3.0,F,Sweden,7.0,5.0,0.0,2.0,12.0,6.0,+6,15.0,Third place
3,4.0,D,England,7.0,5.0,0.0,2.0,13.0,5.0,+8,15.0,Fourth place
4,,,,,,,,,,,,
5,5.0,B,Germany,5.0,4.0,0.0,1.0,10.0,2.0,+8,12.0,Eliminated in quarter-finals
6,6.0,A,France (H),5.0,4.0,0.0,1.0,10.0,4.0,+6,12.0,Eliminated in quarter-finals
7,7.0,C,Italy,5.0,3.0,0.0,2.0,9.0,4.0,+5,9.0,Eliminated in quarter-finals
8,8.0,A,Norway,5.0,2.0,1.0,2.0,7.0,7.0,0,7.0,Eliminated in quarter-finals
9,,,,,,,,,,,,


In [5]:
# Save the table to a DataFrame


In [6]:
# Drop NA rows and reset the index


Unnamed: 0,index,Pos,Grp,Team,Pld,W,D,L,GF,GA,GD,Pts,Final result
0,0,1.0,F,United States,7.0,7.0,0.0,0.0,26.0,3.0,+23,21.0,Champions
1,1,2.0,E,Netherlands,7.0,6.0,0.0,1.0,11.0,5.0,+6,18.0,Runners-up
2,2,3.0,F,Sweden,7.0,5.0,0.0,2.0,12.0,6.0,+6,15.0,Third place
3,3,4.0,D,England,7.0,5.0,0.0,2.0,13.0,5.0,+8,15.0,Fourth place
4,5,5.0,B,Germany,5.0,4.0,0.0,1.0,10.0,2.0,+8,12.0,Eliminated in quarter-finals
5,6,6.0,A,France (H),5.0,4.0,0.0,1.0,10.0,4.0,+6,12.0,Eliminated in quarter-finals
6,7,7.0,C,Italy,5.0,3.0,0.0,2.0,9.0,4.0,+5,9.0,Eliminated in quarter-finals
7,8,8.0,A,Norway,5.0,2.0,1.0,2.0,7.0,7.0,0,7.0,Eliminated in quarter-finals
8,10,9.0,C,Australia,4.0,2.0,1.0,1.0,9.0,6.0,+3,7.0,Eliminated in round of 16
9,11,10.0,C,Brazil,4.0,2.0,0.0,2.0,7.0,5.0,+2,6.0,Eliminated in round of 16


In [7]:
# Check the data types


index             int64
Pos             float64
Grp              object
Team             object
Pld             float64
W               float64
D               float64
L               float64
GF              float64
GA              float64
GD               object
Pts             float64
Final result     object
dtype: object

In [8]:
# In the "GD" column, remove the "+" signs and replace the Unicode minus sign "−" with the ASCII hyphen-minus "-"


Unnamed: 0,index,Pos,Grp,Team,Pld,W,D,L,GF,GA,GD,Pts,Final result
0,0,1.0,F,United States,7.0,7.0,0.0,0.0,26.0,3.0,23,21.0,Champions
1,1,2.0,E,Netherlands,7.0,6.0,0.0,1.0,11.0,5.0,6,18.0,Runners-up
2,2,3.0,F,Sweden,7.0,5.0,0.0,2.0,12.0,6.0,6,15.0,Third place
3,3,4.0,D,England,7.0,5.0,0.0,2.0,13.0,5.0,8,15.0,Fourth place
4,5,5.0,B,Germany,5.0,4.0,0.0,1.0,10.0,2.0,8,12.0,Eliminated in quarter-finals
5,6,6.0,A,France (H),5.0,4.0,0.0,1.0,10.0,4.0,6,12.0,Eliminated in quarter-finals
6,7,7.0,C,Italy,5.0,3.0,0.0,2.0,9.0,4.0,5,9.0,Eliminated in quarter-finals
7,8,8.0,A,Norway,5.0,2.0,1.0,2.0,7.0,7.0,0,7.0,Eliminated in quarter-finals
8,10,9.0,C,Australia,4.0,2.0,1.0,1.0,9.0,6.0,3,7.0,Eliminated in round of 16
9,11,10.0,C,Brazil,4.0,2.0,0.0,2.0,7.0,5.0,2,6.0,Eliminated in round of 16


In [9]:
# Convert the following columns to dtype int.
# These are the numeric ranking/statistics columns of the results table.
cols = [
    "Pos",
    "Pld",
    "W",
    "D",
    "L",
    "GF",
    "GA",
    "GD",
    "Pts",
]



Unnamed: 0,index,Pos,Grp,Team,Pld,W,D,L,GF,GA,GD,Pts,Final result
0,0,1,F,United States,7,7,0,0,26,3,23,21,Champions
1,1,2,E,Netherlands,7,6,0,1,11,5,6,18,Runners-up
2,2,3,F,Sweden,7,5,0,2,12,6,6,15,Third place
3,3,4,D,England,7,5,0,2,13,5,8,15,Fourth place
4,5,5,B,Germany,5,4,0,1,10,2,8,12,Eliminated in quarter-finals
5,6,6,A,France (H),5,4,0,1,10,4,6,12,Eliminated in quarter-finals
6,7,7,C,Italy,5,3,0,2,9,4,5,9,Eliminated in quarter-finals
7,8,8,A,Norway,5,2,1,2,7,7,0,7,Eliminated in quarter-finals
8,10,9,C,Australia,4,2,1,1,9,6,3,7,Eliminated in round of 16
9,11,10,C,Brazil,4,2,0,2,7,5,2,6,Eliminated in round of 16


In [10]:
# Export as a CSV without the index
