In [1]:
import pandas as pd
from matplotlib import *
from IPython.core.display import display, HTML
import requests
from bs4 import BeautifulSoup

In [2]:
url_sample = 'https://clinicaltrials.gov/ct2/show/results/NCT01830933'

### Approach 1: Using Pandas - read_html()

In [3]:
# Approach 1 : Read the tabular data from URL using the inbuilt pandas function - read_html
url_data = pd.read_html(url_sample)

In [4]:
len(url_data)

17

In [5]:
url_data[6]

Unnamed: 0,0,1,2
0,,Usual Care,BreastCARE Intervention
1,STARTED,823,812
2,Completed Meet-up,675,603
3,COMPLETED,655,580
4,NOT COMPLETED,168,232


In [6]:
url_data[16]

Unnamed: 0,0,1
0,Responsible Party:,"University of California, San Francisco"
1,ClinicalTrials.gov Identifier:,NCT01830933 History of Changes
2,Obsolete Identifiers:,NCT01836250
3,Other Study ID Numbers:,150B-0158
4,First Submitted:,"April 10, 2013"
5,First Posted:,"April 12, 2013"
6,Results First Submitted:,"May 15, 2013"
7,Results First Posted:,"May 16, 2013"
8,Last Update Posted:,"August 19, 2014"


In [7]:
url_data[9]

Unnamed: 0,0,1,2,3
0,,Usual Care,BreastCARE Intervention,Total
1,Overall Participants Analyzed [Units: Particip...,655,580,1235
2,,,,
3,"Age, Customized [Units: Participants]",,,
4,<50 years,183,182,365
5,51-65 years,362,300,662
6,>65 years,110,98,208
7,,,,
8,Gender [Units: Participants],,,
9,Female,655,580,1235


In [8]:
# URL contain multiple tables and no names for each.
url_data

[    0                                                  1
 0 NaN  The safety and scientific validity of this stu...,
                                                    0
 0         ClinicalTrials.gov Identifier: NCT01830933
 1  Recruitment Status : Completed  First Posted :...,
                0                                                  1
 0    Study Type:                                     Interventional
 1  Study Design:  Allocation: Randomized; Intervention Model: Pa...
 2     Condition:                                      Breast Cancer
 3  Intervention:                                  Other: BreastCARE,
                                                    0
 0  Key information relevant to the recruitment pr...
 1                                   No text entered.,
                                                    0
 0  Significant events and approaches for the over...
 1                                   No text entered.,
                          0                     

Due to the lack of identifiers associated with the tables in url_data, this might not be the best approach for automated scrapping of multiple pages. Let's try another method using **requests** and **BeautifulSoup4** package.

### Approach 2: Using requests and BeautifulSoup

In [9]:
# Get the data from URL 
req = requests.get(url_sample)
# Parse it using lxml and formart it
soup = BeautifulSoup(req.text.encode('utf-8','ignore'),'lxml')
# Get a specific table
base_table = soup.findAll("table",{"class": "results_table"})[1]
# Count number of rows the that table
row_count = str(base_table).count('<tr>')
# Get all rows in a single object
base_table_rows = base_table.findAll('tr')

In [10]:
row_count

22

In [11]:
# Check the formatted HTMLb version of base_table
print(base_table.prettify())

<table cellpadding="0" cellspacing="0" class="results_table" style="margin-top:0.5ex;">
 <tr>
  <th class="brt bold_baseline_color" style="max-width:50%">
  </th>
  <th class="header3 brt bold_baseline_color" style="max-width:20%;text-align:center;">
   Usual Care
  </th>
  <th class="header3 brt bold_baseline_color" style="max-width:20%;text-align:center;">
   BreastCARE Intervention
  </th>
  <th class="header3 brt bold_baseline_color" style="max-width:20%;text-align:center;">
   Total
  </th>
 </tr>
 <tr>
  <th class="header3 brt bold_baseline_color" style="text-align:left;">
   Overall Participants Analyzed
   <br/>
   <span class="body3">
    [Units: Participants]
    <br/>
   </span>
  </th>
  <td class="header3 brt pale_baseline_color" style="text-align:center;">
   655
  </td>
  <td class="header3 brt pale_baseline_color" style="text-align:center;">
   580
  </td>
  <td class="header3 brt pale_baseline_color" style="text-align:center;">
   1235
  </td>
 </tr>
 <tr>
  <td class=

In [12]:
# Let's clean this data
for e in base_table.findAll('br'):
    e.extract()

In [13]:
# Re-check the formatted HTMLb version of base_table to ensure line breaks have been removed
print(base_table.prettify())

<table cellpadding="0" cellspacing="0" class="results_table" style="margin-top:0.5ex;">
 <tr>
  <th class="brt bold_baseline_color" style="max-width:50%">
  </th>
  <th class="header3 brt bold_baseline_color" style="max-width:20%;text-align:center;">
   Usual Care
  </th>
  <th class="header3 brt bold_baseline_color" style="max-width:20%;text-align:center;">
   BreastCARE Intervention
  </th>
  <th class="header3 brt bold_baseline_color" style="max-width:20%;text-align:center;">
   Total
  </th>
 </tr>
 <tr>
  <th class="header3 brt bold_baseline_color" style="text-align:left;">
   Overall Participants Analyzed
   <span class="body3">
    [Units: Participants]
   </span>
  </th>
  <td class="header3 brt pale_baseline_color" style="text-align:center;">
   655
  </td>
  <td class="header3 brt pale_baseline_color" style="text-align:center;">
   580
  </td>
  <td class="header3 brt pale_baseline_color" style="text-align:center;">
   1235
  </td>
 </tr>
 <tr>
  <td class="brt" colspan="1000

In [14]:
# Define header and accumulator
header="column1","column2","column3"
data = list()
data.append(header)

In [15]:
for i in range(0,row_count):
    row = base_table_rows[i]
    col_count_in_a_row = str(row).count('<td')
    col_count_thresh = 3
    if col_count_in_a_row > col_count_thresh:
        # Instead of using .text prefer .contents[0].strip to remve the noise from the table.
        data.append((row.findAll("td")[0].contents[0].strip(),
                    row.findAll("td")[1].contents[0].strip(),
                    row.findAll("td")[2].contents[0].strip(),
                    row.findAll("td")[3].contents[0].strip()))

In [16]:
data

[('column1', 'column2', 'column3'),
 ('<50 years', '183', '182', '365'),
 ('51-65 years', '362', '300', '662'),
 ('>65 years', '110', '98', '208'),
 ('Female', '655', '580', '1235'),
 ('Male', '0', '0', '0'),
 ('Non-Latina White', '229', '202', '431'),
 ('Latina', '144', '141', '285'),
 ('Black/ African American', '150', '125', '275'),
 ('Asian or Pacific Islander', '123', '105', '228'),
 ('Native American or Other', '9', '7', '16'),
 ('United States', '655', '580', '1235')]

In [17]:
pd.DataFrame(data)

Unnamed: 0,0,1,2,3
0,column1,column2,column3,
1,<50 years,183,182,365.0
2,51-65 years,362,300,662.0
3,>65 years,110,98,208.0
4,Female,655,580,1235.0
5,Male,0,0,0.0
6,Non-Latina White,229,202,431.0
7,Latina,144,141,285.0
8,Black/ African American,150,125,275.0
9,Asian or Pacific Islander,123,105,228.0
