In [2]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import re
import json
import codecs

In [3]:
from scraperutils import *

In [4]:
# Results page scraped by this notebook.
url = "https://www.ussoccer.com/results-statistics"
# Read the body eagerly inside a context manager so the HTTP connection is
# closed right away (the original kept the response object open), and set a
# timeout so a stalled server cannot hang the cell forever.
# `html` is now the raw page bytes, which BeautifulSoup accepts directly.
with urlopen(url, timeout=30) as response:
    html = response.read()

In [5]:
# Name the parser explicitly: with no argument bs4 picks whichever parser
# happens to be installed (and emits a warning), so the parse tree could
# differ between machines. "html.parser" ships with Python itself.
soup = BeautifulSoup(html, "html.parser")

In [6]:
# Column names are the text of the <th> cells in the first <tr> of the page.
# Uses the snake_case bs4 API (find_all / get_text); the camelCase aliases
# findAll / getText are deprecated.
header_row = soup.find_all('tr', limit=1)[0]
column_headers = [th.get_text() for th in header_row.find_all('th')]

In [7]:
# Eyeball the scraped header names; they become the DataFrame columns later.
column_headers

['Date', 'Matchup', 'Result', 'Venue', 'Attendance', 'Goal Scorers']

In [8]:
# Every <tr> after the first one (which supplies the column headers) is kept
# as a data row. find_all replaces the deprecated findAll alias.
data_rows = soup.find_all('tr')[1:]

In [9]:
# Preview only the first couple of rows; displaying the whole list dumps the
# markup of every scraped <tr> into the notebook output.
data_rows[:2]

[<tr class="year-2019 team-U-20-WNT" itemscope="" itemtype="https://schema.org/SportsEvent">
 <td>
 <time datetime="2019-03-05T08:00:00Z" itemprop="startDate">March 5, 2019</time>
 </td>
 <td>
 <meta content="U-20 WNT vs Sweden U-19" itemprop="name"/>
 <a href="/us-under20-womens-national-team/tournaments/2019-womens-u19-la-manga-tournament/20190305-u20wnt-v-sweden-u19-la-manga" itemprop="url">
 <span itemprop="competitor" itemscope="" itemtype="https://schema.org/SportsTeam"><span itemprop="name">U-20 WNT</span></span>
 <text>vs</text>
 <span itemprop="competitor" itemscope="" itemtype="https://schema.org/SportsTeam"><span itemprop="name">Sweden U-19</span></span>
 </a>
 </td>
 <td>4-0 <strong>W</strong> </td>
 <td itemprop="location" itemscope="" itemtype="https://schema.org/Place"><span itemprop="address"><meta content="La Manga Club; Cartagena, Spain" itemprop="name"/>La Manga Club; Cartagena, Spain</span></td>
 <td></td>
 <td>Bell, Yates, Morris, Doms</td>
 </tr>,
 <tr class="year

In [10]:
# One list of cleaned cell strings per scraped row: the text of each <td> is
# passed through scraperutils.cleanString. Iterates the rows directly instead
# of indexing with range(len(...)), and uses the non-deprecated bs4 names
# (find_all / get_text).
player_data = [
    [cleanString(cell.get_text()) for cell in row.find_all('td')]
    for row in data_rows
]

In [11]:
# Assemble the scraped rows into a frame labelled with the scraped headers.
df = pd.DataFrame(
    data=player_data,
    columns=column_headers,
)


In [12]:
# Preview the first rows of the freshly built frame.
df.head()

Unnamed: 0,Date,Matchup,Result,Venue,Attendance,Goal Scorers
0,"March 5, 2019",U-20 WNT vs Sweden U-19,4-0 W,"La Manga Club; Cartagena, Spain",,"Bell, Yates, Morris, Doms"
1,"March 3, 2019",U-20 WNT vs France U-19,2-3 L,"La Manga Club; Cartagena, Spain",,"D'Aquila, Dyke"
2,"March 2, 2019",WNT vs England - SheBelieves Cup,2-2 D,"Nissan Stadium; Nashville, Tenn.",22125.0,"Rapinoe, Heath"
3,"March 1, 2019",U-20 WNT vs Germany U-19,2-3 L,"La Manga Club; Cartagena, Spain",,"Meza, Canniff"
4,"February 27, 2019",WNT vs Japan - SheBelieves Cup,2-2 D,"Talen Energy Stadium; Chester, Pa.",14555.0,"Rapinoe, Morgan"


In [13]:
# Inspect the Venue column before the extra cleanString pass in a later cell.
df["Venue"]

0                        La Manga Club; Cartagena, Spain
1                        La Manga Club; Cartagena, Spain
2                       Nissan Stadium; Nashville, Tenn.
3                        La Manga Club; Cartagena, Spain
4                     Talen Energy Stadium; Chester, Pa.
5      Bisham Abbey National Sports Centre; Bisham, E...
6      Bisham Abbey National Sports Centre; Bisham, E...
7                        Avaya Stadium; San Jose, Calif.
8                    State Farm Stadium; Glendale, Ariz.
9               Estadio José Rico Perez; Alicante, Spain
10                        Stade Océane; Le Havre, France
11           Premier Sports Campus; Lakewood Ranch, Fla.
12           Premier Sports Campus; Lakewood Ranch, Fla.
13           Premier Sports Campus; Lakewood Ranch, Fla.
14           Premier Sports Campus; Lakewood Ranch, Fla.
15                  IMG Academy Stadium; Bradenton, Fla.
16                  Estadio Charrúa; Montevideo, Uruguay
17                          Lum

In [14]:
# Date parsing is disabled, so "Date" stays a display string such as
# "March 5, 2019" all the way into the JSON export.
# NOTE(review): if this is re-enabled, DataFrame.to_json writes datetimes as
# epoch milliseconds by default; pass date_format="iso" in the export cell if
# ISO strings are wanted.
# df["Date"] = pd.to_datetime(df["Date"])

In [15]:
# Run every Matchup string through scraperutils.replaceSponsor.
# NOTE(review): presumably this strips or normalises sponsor names in the
# matchup text; the helper is not visible here, so confirm against scraperutils.
df['Matchup'] = df['Matchup'].apply(replaceSponsor)

In [16]:
# Check the Matchup strings after replaceSponsor has been applied.
df['Matchup']

0                                U-20 WNT vs Sweden U-19
1                                U-20 WNT vs France U-19
2                       WNT vs England - SheBelieves Cup
3                               U-20 WNT vs Germany U-19
4                         WNT vs Japan - SheBelieves Cup
5                                    U-18 WNT vs England
6                                    U-18 WNT vs England
7                                      MNT vs Costa Rica
8                                          MNT vs Panama
9                                           WNT vs Spain
10                                         WNT vs France
11                                    U-20 WNT vs France
12                                    U-17 MNT vs Brazil
13                                    U-17 MNT vs Turkey
14                                  U-17 MNT vs Portugal
15       U-20 MNT vs Mexico - Concacaf U-20 Championship
16             U-17 WNT vs Germany - FIFA U-17 World Cup
17                             

In [17]:
# Apply scraperutils.cleanString to the Venue column.
# NOTE(review): every cell's text already went through cleanString when
# player_data was built, so this second pass looks redundant unless the helper
# is not idempotent; confirm before removing.
df["Venue"] = df["Venue"].apply(cleanString)

In [18]:
# Derive the opponent from the matchup text via scraperutils.getOpponentTeam;
# in the saved output "U-20 WNT vs Sweden U-19" yields "Sweden U-19".
# Must run before the Matchup column is dropped.
df["OpponentTeam"] = df["Matchup"].apply(getOpponentTeam)

In [19]:
# Derive the US side from the matchup text via scraperutils.getUSTeam;
# in the saved output "U-20 WNT vs Sweden U-19" yields "U-20 WNT".
# Must run before the Matchup column is dropped.
df["USTeam"] = df["Matchup"].apply(getUSTeam)

In [20]:
# Derive the competition name via scraperutils.getCompetition; in the saved
# output "WNT vs England - SheBelieves Cup" yields "SheBelieves Cup", and
# matchups without a " - ..." suffix show an empty value.
# Must run before the Matchup column is dropped.
df["Competition"] = df["Matchup"].apply(getCompetition)

In [21]:
# Matchup has been split into OpponentTeam / USTeam / Competition, so drop it.
# Rebinding `df` (instead of inplace=True) with errors='ignore' keeps this cell
# re-runnable: executing it a second time no longer raises KeyError.
df = df.drop(columns='Matchup', errors='ignore')

In [22]:
# Re-inspect Venue after the cleanString pass.
df["Venue"]

0                        La Manga Club; Cartagena, Spain
1                        La Manga Club; Cartagena, Spain
2                       Nissan Stadium; Nashville, Tenn.
3                        La Manga Club; Cartagena, Spain
4                     Talen Energy Stadium; Chester, Pa.
5      Bisham Abbey National Sports Centre; Bisham, E...
6      Bisham Abbey National Sports Centre; Bisham, E...
7                        Avaya Stadium; San Jose, Calif.
8                    State Farm Stadium; Glendale, Ariz.
9               Estadio José Rico Perez; Alicante, Spain
10                        Stade Océane; Le Havre, France
11           Premier Sports Campus; Lakewood Ranch, Fla.
12           Premier Sports Campus; Lakewood Ranch, Fla.
13           Premier Sports Campus; Lakewood Ranch, Fla.
14           Premier Sports Campus; Lakewood Ranch, Fla.
15                  IMG Academy Stadium; Bradenton, Fla.
16                  Estadio Charrúa; Montevideo, Uruguay
17                          Lum

In [23]:
# Preview the frame now that Matchup has been replaced by
# OpponentTeam / USTeam / Competition.
df.head()

Unnamed: 0,Date,Result,Venue,Attendance,Goal Scorers,OpponentTeam,USTeam,Competition
0,"March 5, 2019",4-0 W,"La Manga Club; Cartagena, Spain",,"Bell, Yates, Morris, Doms",Sweden U-19,U-20 WNT,
1,"March 3, 2019",2-3 L,"La Manga Club; Cartagena, Spain",,"D'Aquila, Dyke",France U-19,U-20 WNT,
2,"March 2, 2019",2-2 D,"Nissan Stadium; Nashville, Tenn.",22125.0,"Rapinoe, Heath",England,WNT,SheBelieves Cup
3,"March 1, 2019",2-3 L,"La Manga Club; Cartagena, Spain",,"Meza, Canniff",Germany U-19,U-20 WNT,
4,"February 27, 2019",2-2 D,"Talen Energy Stadium; Chester, Pa.",14555.0,"Rapinoe, Morgan",Japan,WNT,SheBelieves Cup


In [27]:
# Convert each Result string via scraperutils.getScoreArray; in the saved
# output "4-0 W" becomes [4, 0].
# NOTE(review): this overwrites the column, so re-running the cell feeds the
# already-converted values back into getScoreArray; confirm the helper
# tolerates that, or restart the kernel before re-running.
df["Result"] = df["Result"].apply(getScoreArray)

In [31]:
# Confirm that Result now holds the converted score values.
df.head()

Unnamed: 0,Date,Result,Venue,Attendance,Goal Scorers,OpponentTeam,USTeam,Competition
0,"March 5, 2019","[4, 0]","La Manga Club; Cartagena, Spain",,"Bell, Yates, Morris, Doms",Sweden U-19,U-20 WNT,
1,"March 3, 2019","[2, 3]","La Manga Club; Cartagena, Spain",,"D'Aquila, Dyke",France U-19,U-20 WNT,
2,"March 2, 2019","[2, 2]","Nissan Stadium; Nashville, Tenn.",22125.0,"Rapinoe, Heath",England,WNT,SheBelieves Cup
3,"March 1, 2019","[2, 3]","La Manga Club; Cartagena, Spain",,"Meza, Canniff",Germany U-19,U-20 WNT,
4,"February 27, 2019","[2, 2]","Talen Energy Stadium; Chester, Pa.",14555.0,"Rapinoe, Morgan",Japan,WNT,SheBelieves Cup


In [32]:
# Serialise the frame as one JSON record per row, parse it into plain Python
# objects, then re-serialise as indented JSON text.
records = json.loads(df.to_json(orient='records'))
data = json.dumps(records, indent=2)

In [33]:
# Parse the pretty-printed JSON text back into Python objects (a list with one
# dict per row, given orient='records') so json.dump can write it out below.
d = json.loads(data)

In [34]:
# ensure_ascii=False writes accented venue names (e.g. "José", "Océane") as raw
# characters, so the file encoding is pinned to UTF-8; relying on the platform
# default can fail or produce a non-UTF-8 JSON file on some systems.
with open('results.json', 'w', encoding='utf-8') as outfile:
    json.dump(d, outfile, ensure_ascii=False)