In [1]:
# NOTEBOOK TO BE RUN EVERY DAY

# Import Libraries

In [2]:
################ WEB SCRAPING MODULES ############
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import bs4
from fake_useragent import UserAgent
import requests
################ TIME MODLULES ###################
import time
import datetime
############## DATA MANIPULATION MODULES #########
import pandas as pd
import numpy as np
import re

# Define Functions

In [3]:
def get_links_and_text():
    link=[i.get('href') for i in soup.find_all('a')]
    text=[i.text for i in soup.find_all('a')]
    #############################################################
    INDEX_1=[i for i in range(len(text)) if '$' in text[i]]
    #############################################################
    link=[link[i] for i in INDEX_1]
    text=[text[i] for i in INDEX_1]
    return link,text

In [4]:
def create_features(z):
    amount=[]
    mag=[]
    for i,x in enumerate(z):
        x=re.sub('(?<=\d),(?=\d)', '',x) # replace comma b/w integers $2,800 --> $2800
        x=x.replace(". ", " ").replace(", "," ") # Remove all fullstops from string, except if it's between digits
        x=re.sub('[^A-Za-z0-9$.]+', ' ', x) # remove all special characters except $ and .
        a=x[x.index('$'):].split()[1],x[x.index('$'):].split()[0][1:]
        mag.append(a[0])
        amount.append(a[1])
    currency=['USD' for i in range(len(z))]
    return amount,mag,currency

# Scrape [BioSpace News](https://www.biospace.com/news/)

In [5]:
y,z=[],[]
driver = webdriver.Chrome(ChromeDriverManager().install()) # open Chrome driver/window
# Scrape links and headlines from first 3 pages of BioSpace News
# Multiple iterations(5) are done to eliminate any chances of error
for j in range(1,5):
    print('Iteration',j)
    print('############')
    for i in range(1,4): 
        driver.get('https://www.biospace.com/news/{}/'.format(i)) # open web page on Chrome window
        source = driver.page_source
        soup=bs4.BeautifulSoup(source, 'html.parser')
        link,text=get_links_and_text()
        print('No. of links and text are: ',len(link),len(text))
        y.append(link)
        z.append(text)
        time.sleep(5)
        print('*'*50)

Iteration 1
############
No. of links and text are:  0 0
**************************************************
No. of links and text are:  0 0
**************************************************
No. of links and text are:  0 0
**************************************************
Iteration 2
############
No. of links and text are:  0 0
**************************************************
No. of links and text are:  1 1
**************************************************
No. of links and text are:  0 0
**************************************************
Iteration 3
############
No. of links and text are:  0 0
**************************************************
No. of links and text are:  1 1
**************************************************
No. of links and text are:  0 0
**************************************************
Iteration 4
############
No. of links and text are:  0 0
**************************************************
No. of links and text are:  1 1
**************************************

In [6]:
# Normalize features
y=[j for i in y for j in i]
y=['https://www.biospace.com'+i for i in y]
z=[j for i in z for j in i]
# Get Amount from headlines/title
amount,mag,currency=create_features(z)

In [7]:
# Parse each article in DataFrame and scrape Fund type, dates, investors and series type
investors=[]
series=[]
sector=[]
dates=[]
sector_list=[' Therapeutics',' Biopharma',' Pharma',' Biotherapeutics',' BioTherapeutics',' Inc',
             ' Pharmaceuticals',' Bioinformatics','Dx',' Biotics',' Biotechnology',' Diagnostics',' Ltd']
print('Number of articles to parse =',len(y))
for j,url in enumerate(y):
    print('Article',j)
    driver.get(url)
    source = driver.page_source
    soup=bs4.BeautifulSoup(source, 'html.parser')
    dates.append(soup.find('span',{'class':'palm-block-level'}).text.split(':')[1].strip())
    for script in soup(["script", "style"]):
        script.extract()    
    a = soup.get_text()
    a=" ".join(a.split())
    a=re.sub('[^A-Za-z0-9.]+', ' ', a)
    investors.append([i for i in a.split('.') if bool(re.search('investors ',i.lower()))==True])
    series.append([i.lower()[i.lower().find('series'):i.lower().find('series')+8] for i in a.split('.') if i.lower().find('series')!=-1])
    sector.append([a.split(i)[0].split()[-1].strip()+" "+i for i in sector_list if i in a])

Number of articles to parse = 3
Article 0
Article 1
Article 2


In [8]:
# Normalize investors and series type features
series2=[]
def most_frequent(List): 
    return max(set(List), key = List.count) 

for i in range(len(investors)):
    if(sector[i]==[]):
        sector[i]=np.nan
    if(investors[i]==[]):
        investors[i]=np.nan    

for i in series:
    try:
        series2.append(most_frequent(i))
    except:
        series2.append(np.nan)

In [9]:
# Create DataFrame from scraped features
df=pd.DataFrame(zip(sector,dates,series2,investors,currency,mag,amount),
                columns=['Name of Fund','Year Funded','Fund Type','Fund Manager','Currency','Magnitude','Amount of Funding'])

In [10]:
# get year from date
df['Year Funded']=df['Year Funded'].apply(lambda x: x.split(',')[1])
# Make values NaN in column "Fund Type" if not list allowed_vals
allowed_vals=['series b', 'series d', 'series a', 'series c','series e']
df['Fund Type'][~df['Fund Type'].isin(allowed_vals)] = np.nan
# Make values NaN in column "Magnitude" if not in list allowed_vals
allowed_vals=['Million','Billion','Millions','Billions']
df['Magnitude'][~df['Magnitude'].isin(allowed_vals)] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [11]:
# Make values NaN in column "Amount of Funding" if not a float
def is_integer(n):
    try:
        float(n)
    except ValueError:
        return False
    else:
        return float(n).is_integer()
for i in range(df.shape[0]):
    if(is_integer(df.iloc[i,-1])!=True):
        df.iloc[i,-1]=np.nan

# Save Output

In [16]:
from datetime import date
name='Funding Rounds BioSpace-'+str(date.today())+'.xlsx'
df.to_excel(name,index=False)
df=pd.read_excel(name)
df=df.drop_duplicates()
df.to_excel(name,index=False)
driver.close()

In [17]:
df

Unnamed: 0,Name of Fund,Year Funded,Fund Type,Fund Manager,Currency,Magnitude,Amount of Funding
0,['Sciences Inc'],2020,,[' NASDAQ ALPN a leading clinical stage immuno...,USD,Million,60
