# Web Extraction for Data

In this notebook we use web extraction techniques to get information from the ClinicalTrials.gov website. The website itself allows us to download ``.csv`` files which contain most of the information we require for our analysis. Nevertheless, there are some columns we want that are not as easily available. 

There are 30,000 webpages that we want to scrape; below is code for automated web scraping that speeds up our data collection process. We use this code to scrape:

   * NCT Number (so that we can match it to the ``.csv`` files)
   * Desired enrollment 
   * Countries where the trial was conducted

**Note:** If you have the NCT Number for a clinical trial you want to extract information from, you can use this code to do so. 

## Import packages

In [8]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
# table_data.py
import urllib3

## Cancer data web scraping

In [11]:
# Trial listing read from cancer_data.csv; its 'NCT Number' column drives the scraping loop
cancer_data = pd.read_csv(filepath_or_buffer='cancer_data.csv')
# Connection pool used for the page requests made while scraping
http = urllib3.PoolManager()

In [12]:
# Scrape the record page of every cancer trial and collect, per trial, the
# values of the rows whose name mentions 'Enrollment', 'NCT' or
# 'Listed Location Countries'.

# NCT numbers taken from the cancer trial listing
list_NCT = list(cancer_data['NCT Number'])
URL = 'https://clinicaltrials.gov/ct2/show/record/'

# One list of cleaned values per trial
final_data = []

for NCT in list_NCT:

    # Fetch the record page for this trial
    req = http.request('GET', URL + str(NCT) + '/')

    # Parse the returned HTML
    soup = BeautifulSoup(req.data, 'html.parser')

    # Locate the table in the page that holds the record data
    table = soup.find('table', attrs={'class': 'ct-data_table tr-data_table tr-tableStyle'})

    # Collect all rows of that table
    rows = table.find_all('tr')

    # Build (name, value) pairs from each row's first header and data cell;
    # 'NA' stands in for a cell that is absent
    data = []
    for row in rows:
        name = 'NA' if row.th is None else row.th.text
        value = 'NA' if row.td is None else row.td.text
        data.append((name, value))

    # Data frame with every name/value pair found on the page
    df = pd.DataFrame(data, columns=['Name', 'Value'])

    # Keep only the enrollment, NCT number and country rows
    wanted = (
        df['Name'].str.contains('Enrollment')
        | df['Name'].str.contains('NCT')
        | df['Name'].str.contains('Listed Location Countries')
    )
    my_df = df[wanted].reset_index()

    # Strip trailing whitespace from the values
    my_df['Value'] = my_df['Value'].str.rstrip()

    # Remove non-breaking spaces from the entry at position 2. str.replace
    # returns a new string, so the result must be stored back; the original
    # call discarded it and left the value unchanged.
    # NOTE(review): position 2 is presumably the countries entry - confirm
    # against the column order used when the final data frame is built.
    my_df.loc[2, 'Value'] = my_df.loc[2, 'Value'].replace("\xa0", "")

    # Append this trial's values to the final list
    final_data.append(list(my_df['Value']))

## Save the files 

In [13]:
# Assemble the scraped values into a data frame, one row per entry of final_data
enrollment_countries_df = pd.DataFrame(
    data=final_data,
    columns=['Actual', 'Estimated', 'Countries', 'NCT Number'],
)

In [14]:
# Write the data frame to a csv file, leaving out the row index
enrollment_countries_df.to_csv(
    path_or_buf='enrollment_countries_cancer.csv',
    index=False,
)

In [15]:
# Read the saved cancer file back into a data frame
my_df = pd.read_csv(filepath_or_buffer='enrollment_countries_cancer.csv')

## Cardiovascular data web scraping


In [None]:
# Trial listing read from cardiovascular_data.csv; its 'NCT Number' column drives the scraping loop
cardiovascular_data = pd.read_csv(filepath_or_buffer='cardiovascular_data.csv')
# Connection pool used for the page requests made while scraping
http = urllib3.PoolManager()

In [None]:
# Scrape the record page of every cardiovascular trial and collect, per trial,
# the values of the rows whose name mentions 'Enrollment', 'NCT' or
# 'Listed Location Countries'.

# NCT numbers taken from the cardiovascular trial listing
list_NCT = list(cardiovascular_data['NCT Number'])
URL = 'https://clinicaltrials.gov/ct2/show/record/'

# One list of cleaned values per trial
final_data = []

for NCT in list_NCT:

    # Fetch the record page for this trial
    req = http.request('GET', URL + str(NCT) + '/')

    # Parse the returned HTML
    soup = BeautifulSoup(req.data, 'html.parser')

    # Locate the table in the page that holds the record data
    table = soup.find('table', attrs={'class': 'ct-data_table tr-data_table tr-tableStyle'})

    # Collect all rows of that table
    rows = table.find_all('tr')

    # Build (name, value) pairs from each row's first header and data cell;
    # 'NA' stands in for a cell that is absent
    data = []
    for row in rows:
        name = 'NA' if row.th is None else row.th.text
        value = 'NA' if row.td is None else row.td.text
        data.append((name, value))

    # Data frame with every name/value pair found on the page
    df = pd.DataFrame(data, columns=['Name', 'Value'])

    # Keep only the enrollment, NCT number and country rows
    wanted = (
        df['Name'].str.contains('Enrollment')
        | df['Name'].str.contains('NCT')
        | df['Name'].str.contains('Listed Location Countries')
    )
    my_df = df[wanted].reset_index()

    # Strip trailing whitespace from the values
    my_df['Value'] = my_df['Value'].str.rstrip()

    # Remove non-breaking spaces from the entry at position 2. str.replace
    # returns a new string, so the result must be stored back; the original
    # call discarded it and left the value unchanged.
    # NOTE(review): position 2 is presumably the countries entry - confirm
    # against the column order used when the final data frame is built.
    my_df.loc[2, 'Value'] = my_df.loc[2, 'Value'].replace("\xa0", "")

    # Append this trial's values to the final list
    final_data.append(list(my_df['Value']))

In [None]:
# Assemble the scraped values into a data frame, one row per entry of final_data
enrollment_countries_cardiovascular = pd.DataFrame(
    data=final_data,
    columns=['Actual', 'Estimated', 'Countries', 'NCT Number'],
)

In [None]:
# Write the data frame to a csv file, leaving out the row index
enrollment_countries_cardiovascular.to_csv(
    path_or_buf='enrollment_countries_cardiovascular.csv',
    index=False,
)

In [None]:
# Read the saved cardiovascular file back into a data frame
my_df = pd.read_csv(filepath_or_buffer='enrollment_countries_cardiovascular.csv')

## Respiratory diseases web scraping 

In [None]:
# Trial listing read from respiratory_data.csv; its 'NCT Number' column drives the scraping loop
respiratory_data = pd.read_csv(filepath_or_buffer='respiratory_data.csv')
# Connection pool used for the page requests made while scraping
http = urllib3.PoolManager()

In [None]:
# Scrape the record page of every respiratory trial and collect, per trial,
# the values of the rows whose name mentions 'Enrollment', 'NCT' or
# 'Listed Location Countries'.

# NCT numbers taken from the respiratory trial listing
list_NCT = list(respiratory_data['NCT Number'])
URL = 'https://clinicaltrials.gov/ct2/show/record/'

# One list of cleaned values per trial
final_data = []

for NCT in list_NCT:

    # Fetch the record page for this trial
    req = http.request('GET', URL + str(NCT) + '/')

    # Parse the returned HTML
    soup = BeautifulSoup(req.data, 'html.parser')

    # Locate the table in the page that holds the record data
    table = soup.find('table', attrs={'class': 'ct-data_table tr-data_table tr-tableStyle'})

    # Collect all rows of that table
    rows = table.find_all('tr')

    # Build (name, value) pairs from each row's first header and data cell;
    # 'NA' stands in for a cell that is absent
    data = []
    for row in rows:
        name = 'NA' if row.th is None else row.th.text
        value = 'NA' if row.td is None else row.td.text
        data.append((name, value))

    # Data frame with every name/value pair found on the page
    df = pd.DataFrame(data, columns=['Name', 'Value'])

    # Keep only the enrollment, NCT number and country rows
    wanted = (
        df['Name'].str.contains('Enrollment')
        | df['Name'].str.contains('NCT')
        | df['Name'].str.contains('Listed Location Countries')
    )
    my_df = df[wanted].reset_index()

    # Strip trailing whitespace from the values
    my_df['Value'] = my_df['Value'].str.rstrip()

    # Remove non-breaking spaces from the entry at position 2. str.replace
    # returns a new string, so the result must be stored back; the original
    # call discarded it and left the value unchanged.
    # NOTE(review): position 2 is presumably the countries entry - confirm
    # against the column order used when the final data frame is built.
    my_df.loc[2, 'Value'] = my_df.loc[2, 'Value'].replace("\xa0", "")

    # Append this trial's values to the final list
    final_data.append(list(my_df['Value']))

In [None]:
# Assemble the scraped values into a data frame, one row per entry of final_data
enrollment_countries_respiratory = pd.DataFrame(
    data=final_data,
    columns=['Actual', 'Estimated', 'Countries', 'NCT Number'],
)

In [None]:
# Write the data frame to a csv file, leaving out the row index
enrollment_countries_respiratory.to_csv(
    path_or_buf='enrollment_countries_respiratory.csv',
    index=False,
)

In [None]:
# Read the saved respiratory file back into a data frame
my_df = pd.read_csv(filepath_or_buffer='enrollment_countries_respiratory.csv')