In [1]:
# NOTEBOOK TO BE RUN EVERY MONDAY

# Import Libraries

In [2]:
################ WEB SCRAPING MODULES ############
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
import bs4
from fake_useragent import UserAgent
import requests
################ TIME MODLULES ###################
import time
import datetime
############## DATA MANIPULATION MODULES #########
import pandas as pd
import numpy as np
import re

# Define Functions

In [3]:
def get_links_and_text():
    link=[i.get('href') for i in soup.find_all('a')]
    text=[i.text for i in soup.find_all('a')]
    #############################################################
    INDEX_1=[i for i in range(len(text)) if '$' in text[i]]
    #############################################################
    link=[link[i] for i in INDEX_1]
    text=[text[i] for i in INDEX_1]
    return link,text

In [4]:
def create_features(z):
    amount=[]
    mag=[]
    for i,x in enumerate(z):
        x=re.sub('(?<=\d),(?=\d)', '',x) # replace comma b/w integers $2,800 --> $2800
        x=x.replace(". ", " ").replace(", "," ") # Remove all fullstops from string, except if it's between digits
        x=re.sub('[^A-Za-z0-9$.]+', ' ', x) # remove all special characters except $ and .
        a=x[x.index('$'):].split()[0][-1],x[x.index('$'):].split()[0][1:-1]
        amount.append(float(a[1]))
        if(a[0].lower()=='m'):
            mag.append('Millions')
        elif(a[0].lower()=='b'):
            mag.append('Billions')
        else:
            mag.append(np.nan)
    currency=['USD' for i in range(len(z))]
    return amount,mag,currency

# Scrape [FiercePharma](https://www.fiercepharma.com/) 

In [5]:
driver = webdriver.Chrome(ChromeDriverManager().install()) # open Chrome driver/window
y,z=[],[]
# Scrape links and headlines from first 5 pages of FiercePharma
# Multiple iterations(5) are done to eliminate any chances of error
for j in range(1,5):
    print('Iteration',j)
    print('############')
    for i in range(1,6):
        driver.get('https://www.fiercepharma.com/?page={}'.format(i)) # open web page on Chrome window
        source = driver.page_source
        soup=bs4.BeautifulSoup(source, 'html.parser')
        link,text=get_links_and_text()
        print('No. of links and text are: ',len(link),len(text))
        y.append(link)
        z.append(text)
        time.sleep(5)
        print('*'*50)

Iteration 1
############
No. of links and text are:  0 0
**************************************************
No. of links and text are:  0 0
**************************************************
No. of links and text are:  0 0
**************************************************
No. of links and text are:  0 0
**************************************************
No. of links and text are:  0 0
**************************************************
Iteration 2
############
No. of links and text are:  0 0
**************************************************
No. of links and text are:  1 1
**************************************************
No. of links and text are:  1 1
**************************************************
No. of links and text are:  0 0
**************************************************
No. of links and text are:  0 0
**************************************************
Iteration 3
############
No. of links and text are:  2 2
**************************************************
No. of links

In [6]:
# Normalize features
y=[j for i in y for j in i]
z=[j for i in z for j in i]
# Get Amount from headlines/title
amount,mag,currency=create_features(z)

In [7]:
# Parse each article in DataFrame and scrape Fund type, dates, investors and series type
investors=[]
series=[]
sector=[]
dates=[]
sector_list=[' Therapeutics',' Biopharma',' Pharma',' Biotherapeutics',' BioTherapeutics',' Inc',
             ' Pharmaceuticals',' Bioinformatics','Dx',' Biotics',' Biotechnology',' Diagnostics',' Ltd']
print('Number of articles to parse =',len(y))
for j,url in enumerate(y):
    print('Article',j)
    driver.get(url) # open web page on Chrome window
    source = driver.page_source
    soup=bs4.BeautifulSoup(source, 'html.parser')
    dates.append(soup.find('time').text)
    for script in soup(["script", "style"]):
        script.extract()    
    a = soup.get_text()
    a=" ".join(a.split())
    a=re.sub('[^A-Za-z0-9.]+', ' ', a)
    investors.append([i for i in a.split('.') if bool(re.search('investors ',i.lower()))==True])
    series.append([i.lower()[i.lower().find('series'):i.lower().find('series')+8] for i in a.split('.') if i.lower().find('series')!=-1])
    sector.append([a.split(i)[0].split()[-1].strip()+" "+i for i in sector_list if i in a])

Number of articles to parse = 10
Article 0
Article 1
Article 2
Article 3
Article 4
Article 5
Article 6
Article 7
Article 8
Article 9


In [8]:
# Normalize investors and series type features
series2=[]
def most_frequent(List): 
    return max(set(List), key = List.count) 

for i in range(len(investors)):
    if(sector[i]==[]):
        sector[i]=np.nan
    if(investors[i]==[]):
        investors[i]=np.nan    

for i in series:
    try:
        series2.append(most_frequent(i))
    except:
        series2.append(np.nan)

In [9]:
# Create DataFrame from scraped features
df=pd.DataFrame(zip(sector,dates,series2,investors,currency,mag,amount),
                columns=['Name of Fund','Year Funded','Fund Type','Fund Manager','Currency','Magnitude','Amount of Funding'])

In [10]:
# get year from date
df['Year Funded']=df['Year Funded'].apply(lambda x: x.split(',')[1][:5])
# Make values NaN in column "Fund Type" if not list allowed_vals
allowed_vals=['series b', 'series d', 'series a', 'series c','series e']
df['Fund Type'][~df['Fund Type'].isin(allowed_vals)] = np.nan
# Make values NaN in column "Magnitude" if not in list allowed_vals
allowed_vals=['Million','Billion','Millions','Billions']
df['Magnitude'][~df['Magnitude'].isin(allowed_vals)] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [11]:
# Make values NaN in column "Amount of Funding" if not a float
def is_integer(n):
    try:
        float(n)
    except ValueError:
        return False
    else:
        return float(n).is_integer()
for i in range(df.shape[0]):
    if(is_integer(df.iloc[i,-1])!=True):
        df.iloc[i,-1]=np.nan

## Save Output

In [12]:
from datetime import date
name='Funding Rounds FiercePharma-'+str(date.today())+'.xlsx'
df.to_excel(name,index=False)
df=pd.read_excel(name)
df=df.drop_duplicates()
df.to_excel(name,index=False)

In [13]:
df

Unnamed: 0,Name of Fund,Year Funded,Fund Type,Fund Manager,Currency,Magnitude,Amount of Funding
0,"['report Biopharma', 'Marketing Pharma']",2020,,,USD,Millions,109.0
3,"['Moderna Therapeutics', 'report Biopharma',...",2020,,,USD,Billions,
9,"['report Biopharma', 'Marketing Pharma']",2020,,,USD,Billions,


# Scrape [FierceBiotech](https://www.fiercebiotech.com/) 

In [14]:
y,z=[],[]
# Scrape links and headlines from first 5 pages of FierceBiotech
# Multiple iterations(5) are done to eliminate any chances of error
for j in range(1,5):
    print('Iteration',j)
    print('############')
    for i in range(1,6):
        driver.get('https://www.fiercebiotech.com/?page={}'.format(i)) # open web page on Chrome window
        source = driver.page_source
        soup=bs4.BeautifulSoup(source, 'html.parser')
        link,text=get_links_and_text()
        print('No. of links and text are: ',len(link),len(text))
        y.append(link)
        z.append(text)
        time.sleep(5)
        print('*'*50)

Iteration 1
############
No. of links and text are:  0 0
**************************************************
No. of links and text are:  2 2
**************************************************
No. of links and text are:  4 4
**************************************************
No. of links and text are:  3 3
**************************************************
No. of links and text are:  1 1
**************************************************
Iteration 2
############
No. of links and text are:  1 1
**************************************************
No. of links and text are:  2 2
**************************************************
No. of links and text are:  0 0
**************************************************
No. of links and text are:  0 0
**************************************************
No. of links and text are:  1 1
**************************************************
Iteration 3
############
No. of links and text are:  0 0
**************************************************
No. of links

In [15]:
# Normalize features
y=[j for i in y for j in i]
z=[j for i in z for j in i]
# Get Amount from headlines/title
amount,mag,currency=create_features(z)

In [16]:
# Parse each article in DataFrame and scrape Fund type, dates, investors and series type
investors=[]
series=[]
sector=[]
dates=[]
sector_list=[' Therapeutics',' Biopharma',' Pharma',' Biotherapeutics',' BioTherapeutics',' Inc',
             ' Pharmaceuticals',' Bioinformatics','Dx',' Biotics',' Biotechnology',' Diagnostics',' Ltd']
print('Number of articles to parse =',len(y))
for j,url in enumerate(y):
    print('Article',j)
    driver.get(url) # open web page on Chrome window
    source = driver.page_source
    soup=bs4.BeautifulSoup(source, 'html.parser')
    dates.append(soup.find('time').text)
    for script in soup(["script", "style"]):
        script.extract()    
    a = soup.get_text()
    a=" ".join(a.split())
    a=re.sub('[^A-Za-z0-9.]+', ' ', a)
    investors.append([i for i in a.split('.') if bool(re.search('investors ',i.lower()))==True])
    series.append([i.lower()[i.lower().find('series'):i.lower().find('series')+8] for i in a.split('.') if i.lower().find('series')!=-1])
    sector.append([a.split(i)[0].split()[-1].strip()+" "+i for i in sector_list if i in a])

Number of articles to parse = 21
Article 0
Article 1
Article 2
Article 3
Article 4
Article 5
Article 6
Article 7
Article 8
Article 9
Article 10
Article 11
Article 12
Article 13
Article 14
Article 15
Article 16
Article 17
Article 18
Article 19
Article 20


In [17]:
# Normalize investors and series type features
series2=[]
def most_frequent(List): 
    return max(set(List), key = List.count) 

for i in range(len(investors)):
    if(sector[i]==[]):
        sector[i]=np.nan
    if(investors[i]==[]):
        investors[i]=np.nan    

for i in series:
    try:
        series2.append(most_frequent(i))
    except:
        series2.append(np.nan)

In [18]:
# Create DataFrame from scraped features
df2=pd.DataFrame(zip(sector,dates,series2,investors,currency,mag,amount),
                columns=['Name of Fund','Year Funded','Fund Type','Fund Manager','Currency','Magnitude','Amount of Funding'])

In [19]:
# get year from date
df2['Year Funded']=df2['Year Funded'].apply(lambda x: x.split(',')[1][:5])
# Make values NaN in column "Fund Type" if not list allowed_vals
allowed_vals=['series b', 'series d', 'series a', 'series c','series e']
df2['Fund Type'][~df2['Fund Type'].isin(allowed_vals)] = np.nan
# Make values NaN in column "Magnitude" if not in list allowed_vals
allowed_vals=['Million','Billion','Millions','Billions']
df2['Magnitude'][~df2['Magnitude'].isin(allowed_vals)] = np.nan

for i in range(df2.shape[0]):
    if(is_integer(df2.iloc[i,-1])!=True):
        df2.iloc[i,-1]=np.nan

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


## Save Output

In [20]:
name='Funding Rounds FierceBiotech-'+str(date.today())+'.xlsx'
df2.to_excel(name,index=False)
df2=pd.read_excel(name)
df2=df2.drop_duplicates()
df2.to_excel(name,index=False)
driver.close()

In [21]:
df2

Unnamed: 0,Name of Fund,Year Funded,Fund Type,Fund Manager,Currency,Magnitude,Amount of Funding
0,['Course Biopharma'],2020,series b,['Example water Europe Top Menu Virtual Events...,USD,Millions,54.0
1,"['Vesigen Therapeutics', 'Course Biopharma',...",2020,series a,,USD,Millions,
2,"['Encoded Therapeutics', 'Course Biopharma',...",2020,series c,[' Drawn from a long list of marquee investors...,USD,Millions,135.0
3,['Course Biopharma'],2020,,,USD,Millions,485.0
4,"['Tizona Therapeutics', 'Course Biopharma']",2020,,,USD,Millions,300.0
5,['Course Biopharma'],2020,,[' Germany s CureVac attracted the support of ...,USD,Millions,640.0
6,"['Myonexus Therapeutics', 'Course Biopharma']",2020,series a,,USD,Millions,40.0
7,"['Jnana Therapeutics', 'Course Biopharma', '...",2020,,,USD,Millions,40.0
8,['Course Biopharma'],2020,series b,[' We are truly grateful to our investors and ...,USD,Millions,53.0
9,['Course Biopharma'],2020,,,USD,Millions,294.0
