# Web Scraping and time series

In [None]:
#import the necessary packages
import pandas as pd
import urllib.request

from bs4 import BeautifulSoup as bsoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select

### go to the Eurosport Paris Roubaix cycling results page and load it into a pandas dataframe

https://www.eurosport.com/cycling/paris-roubaix/2022/live-road-race-men_mtc1323920/live.shtml

Hint, pandas read_html() will fail to find the table even though it is on the page.  You need to find the table (class='standing-table') with a selenium driver.get() or BeautifulSoup, then pass that to pandas read_html().

In [53]:
# Reading in the web page

# Eurosport Paris Roabix cycling results
url = "https://www.eurosport.com/cycling/paris-roubaix/2022/live-road-race-men_mtc1323920/live.shtml"

# Setting options to not open the browser / make headless
options = webdriver.ChromeOptions()
options.add_argument("headless")

# Webdriver is in the hw file. But Driver Path is the location in which the chromedriver is located
DRIVER_PATH = 'chromedriver'
driver = webdriver.Chrome(executable_path=DRIVER_PATH, options=options)

# Placing the Website in the webdriver
driver.get(url)

In [54]:
# Reading in the Table
df = pd.read_html(driver.find_element(By.CLASS_NAME, 'standing-table').get_attribute('outerHTML'))[0]
df

# Note. Did not fix the names in Riders as was instructed by Dalton that this formating fix was not needed for the assignment.


Unnamed: 0.1,Unnamed: 0,Riders,Teams,Time
0,1,Dylan van BaarleINEOS Grenadiers,INEOS Grenadiers,5h 37' 00''
1,2,Wout van AertTeam Jumbo - Visma,Team Jumbo - Visma,+1' 47''
2,3,Stefan KüngGroupama - FDJ,Groupama - FDJ,+1' 47''
3,4,Tom DevriendtIntermarché - Wanty - Gobert Maté...,Intermarché - Wanty - Gobert Matériaux,+1' 47''
4,5,Matej MohoricBahrain Victorious,Bahrain Victorious,+1' 47''
...,...,...,...,...
164,165,Quentin JaureguiB&B Hotels - KTM,B&B Hotels - KTM,DNF
165,166,Arne MaritSport Vlaanderen - Baloise,Sport Vlaanderen - Baloise,DNF
166,167,Vito BraetSport Vlaanderen - Baloise,Sport Vlaanderen - Baloise,DNF
167,168,Aaron Van PouckeSport Vlaanderen - Baloise,Sport Vlaanderen - Baloise,DNF


In [55]:
# Option 2 for doing the same as above. 
url = "https://www.eurosport.com/cycling/paris-roubaix/2022/live-road-race-men_mtc1323920/live.shtml"

# Setting options to not open the browser
options = webdriver.ChromeOptions()
options.add_argument("headless")

# Placed the Driver in the HW File
DRIVER_PATH = 'chromedriver'
driver = webdriver.Chrome(executable_path=DRIVER_PATH, options=options)

# Labeling and getting the Website
driver.get(url)

# Passing the webpage to Beautiful Soup to parse through
soup = bsoup(driver.page_source, 'html.parser')
table = soup.find('table', {'class':'standing-table'})
# table

# Creating a blank dataframe to place the columns into
new_table = pd.DataFrame(columns=["Rank", "Name", "Team", "Time"])

# Finding The Standing table
rows = table.find_all('tr', {'class':'standing-table__row'})

# Looping through the different rows and placing the values into the coorect columns
for row in rows:
    new_table = new_table.append({"Rank":str(row.find('td', {'class':'standing-table__cell standing-table__cell--position'}).text),
                      "Name":str(row.find('a', {'class':'standing-table__player-link'}).text.strip()[:len(row.find('span', {'class':'team-name'}).text.strip())]),
                      "Team":str(row.find('span', {'class':'team-name'}).text),
                      "Time":str(row.find('td', {'class':'standing-table__cell standing-table__cell--main standing-table__cell--time active-column'}).text)}
                      , ignore_index=True)

new_table


Unnamed: 0,Rank,Name,Team,Time
0,1,Dylan van Baarle,INEOS Grenadiers,5h 37' 00''
1,2,Wout van AertTeam,Team Jumbo - Visma,+1' 47''
2,3,Stefan KüngGro,Groupama - FDJ,+1' 47''
3,4,Tom DevriendtIntermarché - Wanty - Gob,Intermarché - Wanty - Gobert Matériaux,+1' 47''
4,5,Matej MohoricBahra,Bahrain Victorious,+1' 47''
...,...,...,...,...
164,165,Quentin Jauregui,B&B Hotels - KTM,DNF
165,166,Arne MaritSport Vlaanderen,Sport Vlaanderen - Baloise,DNF
166,167,Vito BraetSport Vlaanderen,Sport Vlaanderen - Baloise,DNF
167,168,Aaron Van PouckeSport Vlaa,Sport Vlaanderen - Baloise,DNF


### For the next part of the homework you will be scraping data from the following URL

# US FDA web page
https://www.accessdata.fda.gov/scripts/cder/daf/

You are going to go to the FDA web page and search for Fentanyl.  It should return a list of all drugs that contain fentanyl.  The table that comes back is just a simple list of the drug names.  If you click on the drug name, it expands the row to show Drug Name, Link, NDA number, Dosage Form, Marketing Status, and Manufacturer.  You need to scrape this web page and print those values seperated by commas.

### POST forms
POST forms do not put the data in the URL, the data is submitted via javascript.  You have to pretend to fill out the form and submit it.  Grabbing the parameters
First we need to find out what parameters we’re going to hunt down. To do this, first make your way to the form, then get prepared.

1) In Chrome, more tools > Developer Tools 2) Click the Network tab 3) Fill the form out, and submit it 4) Scroll up to the top of the Network pane, select the segment of the URL you’re at (I’m at tempai.cfm) 5) Click it 6) Select Headers on the right 7) Scroll down until you see Form Data

#### 1)  Part 1, enter the search term fentanyl and submit 
#### 2) Pull the data from the website, check the status code to make sure it was successfull and create a BeautifulSoup object and parse the data.  Print the soup object below:
#### 3) import selenium webdriver and BeautifulSoup
#### 4) run the selenium page_source through beautiful soup using the html parser

Get your webdriver and get the page.  Submit the search term and click the search button.

In [56]:
# Importing Selenium Webdriver and beautiful Soup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
# Beautiful Soup imported above

# Chrome Driver is located in the locale file. But can be changed if necessary.
DRIVER_PATH = 'chromedriver'
driver = webdriver.Chrome(executable_path=DRIVER_PATH)

# Labeling and getting the Website
url = 'https://www.accessdata.fda.gov/scripts/cder/daf/'
driver.get(url)

# Another way to open the page
# url = 'https://www.accessdata.fda.gov/scripts/cder/daf/'
# request_url = urllib.request.urlopen(url)

# Quering for Fentanyl and submitting
driver.find_element(By.NAME, "searchterm").send_keys("fentanyl" + Keys.ENTER)

In [57]:

# Running the selenium pagg_source through beautiful soup. 
pageSource = driver.page_source
soup = bsoup(pageSource, 'html.parser')
soup



<html class="js flexbox canvas canvastext webgl no-touch geolocation postmessage websqldatabase indexeddb hashchange history draganddrop websockets rgba hsla multiplebgs backgroundsize borderimage borderradius boxshadow textshadow opacity cssanimations csscolumns cssgradients cssreflections csstransforms csstransforms3d csstransitions fontface generatedcontent video audio localstorage sessionstorage webworkers no-applicationcache svg inlinesvg smil svgclippaths" style="" xmlns="http://www.w3.org/1999/xhtml" xmlns:addthis="http://www.addthis.com/help/api-spec"><head>
<meta content="IE=EDGE, chrome=1" http-equiv="X-UA-Compatible"/>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<title>Drugs@FDA: FDA-Approved Drugs</title>
<meta content="Drugs@FDA: FDA-Approved Drugs" name="dc.title"/>
<meta content="" name="dc.type"/>
<meta content="" name="dc.description"/>
<meta content="" name="dc.language"/>
<meta content="" name="posted"/>
<meta content="" name="Keywords"/>
<me

### 5) Grab all the rows from the table using find_all.  Note that some of the data is in a 'tr' header and some is in 'li'.

Parse the data and print the results

Note the link on the page is a relative link.  You must join that link with 'https://www.accessdata.fda.gov' to make it a link you can click on.
Example:  https://www.accessdata.fda.gov/scripts/cder/daf/index.cfm?event=overview.process&ApplNo=022510

### The output should look like this (except print all rows)
Drug Name,Link,NDA number,Dosage Form,Marketing Status,Manufacturer

ABSTRAL , https://www.accessdata.fda.gov/scripts/cder/daf/index.cfm?event=overview.process&ApplNo=022510 , NDA   #022510 , TABLET;SUBLINGUAL , Discontinued , SENTYNL THERAPS INC

ACTIQ , https://www.accessdata.fda.gov/scripts/cder/daf/index.cfm?event=overview.process&ApplNo=020747 , NDA   #020747 , TROCHE/LOZENGE;TRANSMUCOSAL , Prescription , CEPHALON

DURAGESIC-100 , https://www.accessdata.fda.gov/scripts/cder/daf/index.cfm?event=overview.process&ApplNo=019813 , NDA   #019813 , FILM, EXTENDED RELEASE;TRANSDERMAL , Discontinued , JANSSEN PHARMS

DURAGESIC-12 , https://www.accessdata.fda.gov/scripts/cder/daf/index.cfm?event=overview.process&ApplNo=019813 , NDA   #019813 , FILM, EXTENDED RELEASE;TRANSDERMAL , Discontinued , JANSSEN PHARMS

DURAGESIC-25 , https://www.accessdata.fda.gov/scripts/cder/daf/index.cfm?event=overview.process&ApplNo=019813 , NDA   #019813 , FILM, EXTENDED RELEASE;TRANSDERMAL , Discontinued , JANSSEN PHARMS


In [58]:
df = pd.DataFrame(columns=["Drug Name", "Link", "NDA number", "Dosage Form", "Marketing Status", "Manufacturer"])

# Grabbing all the Results
results = soup.find_all('tr')

# Looping through the results
for result in results:
    
    # Grabbing the links from the webpage
    inner_link = result.find_all('a', href=True)
    for a in inner_link:
        if len(a['href']) > 11:
            row_link = str("https://www.accessdata.fda.gov" + a['href'])
    
    # Grabbing the table results
    inner = result.find_all('li')
    inner_text = inner[0].get_text()
    panda_row  = inner_text.split('|')

    # Placing results in to panda
    df = df.append({"Drug Name": str(panda_row[0])
                    , "Link": row_link
                    , "NDA number": str(panda_row[1])
                    , "Dosage Form": str(panda_row[2])
                    , "Marketing Status": str(panda_row[3])
                    , "Manufacturer": str(panda_row[4]).strip()}, ignore_index=True)

df                    

Unnamed: 0,Drug Name,Link,NDA number,Dosage Form,Marketing Status,Manufacturer
0,ABSTRAL (FENTANYL CITRATE),https://www.accessdata.fda.gov/scripts/cder/da...,NDA #022510,TABLET;SUBLINGUAL,Discontinued,SENTYNL THERAPS INC
1,ACTIQ (FENTANYL CITRATE),https://www.accessdata.fda.gov/scripts/cder/da...,NDA #020747,TROCHE/LOZENGE;TRANSMUCOSAL,Prescription,CEPHALON
2,DURAGESIC-100 (FENTANYL),https://www.accessdata.fda.gov/scripts/cder/da...,NDA #019813,"FILM, EXTENDED RELEASE;TRANSDERMAL",Discontinued,JANSSEN PHARMS
3,DURAGESIC-12 (FENTANYL),https://www.accessdata.fda.gov/scripts/cder/da...,NDA #019813,"FILM, EXTENDED RELEASE;TRANSDERMAL",Discontinued,JANSSEN PHARMS
4,DURAGESIC-25 (FENTANYL),https://www.accessdata.fda.gov/scripts/cder/da...,NDA #019813,"FILM, EXTENDED RELEASE;TRANSDERMAL",Discontinued,JANSSEN PHARMS
5,DURAGESIC-37 (FENTANYL),https://www.accessdata.fda.gov/scripts/cder/da...,NDA #019813,"FILM, EXTENDED RELEASE;TRANSDERMAL",Discontinued,JANSSEN PHARMS
6,DURAGESIC-50 (FENTANYL),https://www.accessdata.fda.gov/scripts/cder/da...,NDA #019813,"FILM, EXTENDED RELEASE;TRANSDERMAL",Discontinued,JANSSEN PHARMS
7,DURAGESIC-75 (FENTANYL),https://www.accessdata.fda.gov/scripts/cder/da...,NDA #019813,"FILM, EXTENDED RELEASE;TRANSDERMAL",Discontinued,JANSSEN PHARMS
8,FENTANYL (FENTANYL CITRATE),https://www.accessdata.fda.gov/scripts/cder/da...,NDA #020195,TROCHE/LOZENGE;ORAL,Discontinued,CEPHALON
9,FENTANYL CITRATE (FENTANYL CITRATE),https://www.accessdata.fda.gov/scripts/cder/da...,NDA #019115,INJECTABLE;INJECTION,Prescription,HOSPIRA


### 10 bonus points for following the link and getting the strength information from that page.  Print the strength information out along with everything you were getting above.

In [59]:
import pandas as pd

# Creating Blank list to keep the dosages in
Strength = []

# Turning column of links into a list to loop through
link_list = df['Link'].to_list()


# Looping through the links and grabbing the first strength of the dosages
for link in link_list:
    request_url = urllib.request.urlopen(link)
    soup = bsoup(request_url, 'html.parser')
    link_strength = soup.find('tr', {'class':'prodBoldText'}).find_all('td')  # Strength Value is the third td in. Finding all the td then next line taking the third
    Strength.append(str(link_strength[2].get_text()))  # Appending the first strength value to the list

df['Strength'] = Strength
df


Unnamed: 0,Drug Name,Link,NDA number,Dosage Form,Marketing Status,Manufacturer,Strength
0,ABSTRAL (FENTANYL CITRATE),https://www.accessdata.fda.gov/scripts/cder/da...,NDA #022510,TABLET;SUBLINGUAL,Discontinued,SENTYNL THERAPS INC,EQ 0.1MG BASE **Federal Register determination...
1,ACTIQ (FENTANYL CITRATE),https://www.accessdata.fda.gov/scripts/cder/da...,NDA #020747,TROCHE/LOZENGE;TRANSMUCOSAL,Prescription,CEPHALON,EQ 0.2MG BASE
2,DURAGESIC-100 (FENTANYL),https://www.accessdata.fda.gov/scripts/cder/da...,NDA #019813,"FILM, EXTENDED RELEASE;TRANSDERMAL",Discontinued,JANSSEN PHARMS,100MCG/HR
3,DURAGESIC-12 (FENTANYL),https://www.accessdata.fda.gov/scripts/cder/da...,NDA #019813,"FILM, EXTENDED RELEASE;TRANSDERMAL",Discontinued,JANSSEN PHARMS,100MCG/HR
4,DURAGESIC-25 (FENTANYL),https://www.accessdata.fda.gov/scripts/cder/da...,NDA #019813,"FILM, EXTENDED RELEASE;TRANSDERMAL",Discontinued,JANSSEN PHARMS,100MCG/HR
5,DURAGESIC-37 (FENTANYL),https://www.accessdata.fda.gov/scripts/cder/da...,NDA #019813,"FILM, EXTENDED RELEASE;TRANSDERMAL",Discontinued,JANSSEN PHARMS,100MCG/HR
6,DURAGESIC-50 (FENTANYL),https://www.accessdata.fda.gov/scripts/cder/da...,NDA #019813,"FILM, EXTENDED RELEASE;TRANSDERMAL",Discontinued,JANSSEN PHARMS,100MCG/HR
7,DURAGESIC-75 (FENTANYL),https://www.accessdata.fda.gov/scripts/cder/da...,NDA #019813,"FILM, EXTENDED RELEASE;TRANSDERMAL",Discontinued,JANSSEN PHARMS,100MCG/HR
8,FENTANYL (FENTANYL CITRATE),https://www.accessdata.fda.gov/scripts/cder/da...,NDA #020195,TROCHE/LOZENGE;ORAL,Discontinued,CEPHALON,100MCG
9,FENTANYL CITRATE (FENTANYL CITRATE),https://www.accessdata.fda.gov/scripts/cder/da...,NDA #019115,INJECTABLE;INJECTION,Prescription,HOSPIRA,EQ 0.05MG BASE/ML
