# Inhibitor CSV File Generation by Web Scraping and Parsing KIDFamMap Data in Python

KIDFamMap is a database listing thousands of known kinase inhibitors. 

Kinases are listed on the web page at URL http://gemdock.life.nctu.edu.tw/kidfammap/browse.php and each can be clicked on individually to navigate to a page of information about that kinase. This page includes a link to another page, which contains a table of inhibitors for that kinase.

Here we use the scrapy package to create and run a spider to scrape the website's HTML code for all of the relevant information.
- First, the URLs for each kinase's web page are obtained from the main page.
- These URLs are then followed, taking us to each kinase's web page.
- From there we extract the URLs for the web pages containing each kinase's table of inhibitors.
- We then follow these URLs and extract:
    - all of the relevant information about each inhibitor
    - the UniProt accession ID (and the kinase name for comparison)

Once the data has been extracted from the website, it is cleaned and inserted into a Pandas data frame.

After that, the following steps take place:
- Using information in the data frame, we generate a column of URLs that our web app can use to display images of the inhibitors' chemical structures.
- Convert kinases and inhibitors to uppercase.
- Translate kinase names to match the "Entry name" (UniProt ID) in the kinase table.
- Remove duplicate rows.
- Remove unnecessary columns.
- Use this data frame to make two data frames: one listing the kinase-inhibitor pairs, and one listing each inhibitor alongside all of its information. These two data sets can then be linked via the inhibitor name in our relational database.
- Make primary key columns.
- Export as CSV files.

Import packages

In [None]:
import pandas as pd
import scrapy
from scrapy.crawler import CrawlerProcess

Make a spider

In [None]:
# Use the scrapy.Spider class to make your own spider

class InhibitorSpider(scrapy.Spider):
    """Crawl KIDFamMap and collect the raw inhibitor table for each kinase.

    The crawl makes three hops:

    1. ``parse``  - the browse page; every hyperlink on it is followed.
    2. ``parse2`` - a followed page; its "show_inhibitor" links are followed.
    3. ``parse3`` - an inhibitor-table page; its rows and identifying text
       are recorded.

    Nothing is yielded as a scrapy item. ``parse3`` appends to the
    module-level lists ``inhibs``, ``kins`` and ``uniprots``, so those three
    lists must already exist when the crawl is started.
    """

    name = "inhibitor_spider"

    def start_requests(self):
        """Yield the single seed request for the KIDFamMap browse page."""
        url = 'http://gemdock.life.nctu.edu.tw/kidfammap/browse.php'

        # The response for this request is handed to ``parse``.
        yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Follow every hyperlink found on the browse page.

        The XPath selects the ``href`` of every ``<a>`` element, so all
        links on the page are followed, not only kinase links. Pages that
        have no "show_inhibitor" anchor simply produce no further requests
        in ``parse2``.
        """
        links = response.xpath('//a/@href').extract()

        for link in links:
            yield response.follow(url=link, callback=self.parse2)

    def parse2(self, response):
        """Follow the link(s) to the inhibitor-table page.

        The XPath selects the ``href`` of every anchor whose class is
        exactly "show_inhibitor"; each one is requested and the response is
        passed to ``parse3``.
        """
        inhib_links = response.xpath('//a[@class="show_inhibitor"]/@href').extract()

        for ilink in inhib_links:
            yield response.follow(url=ilink, callback=self.parse3)

    def parse3(self, response):
        """Record the inhibitor rows and identifying text of one table page.

        Appends one entry per page to each module-level list:

        - ``inhibs``   : list of the table rows matched by the CSS selector
          ``div.result tbody > tr``, each as an HTML string.
        - ``kins``     : text nodes of ``td.tdleft5`` cells (a later cell of
          the notebook takes the kinase name from these).
        - ``uniprots`` : text of anchors with ``target="_blank"`` (a later
          cell of the notebook takes the UniProt ID from these).

        The three lists are appended together, so entry *i* of each list
        comes from the same page.
        """
        raw = response.css('div.result tbody > tr').extract()
        uniprot = response.xpath('//a[@target="_blank"]/text()').extract()
        kin = response.css('td.tdleft5::text').extract()

        # Store a copy of the extracted rows for this page.
        inhibs.append(list(raw))
        kins.append(kin)
        uniprots.append(uniprot)

Run the spider: crawl KIDFamMap for inhibitors

In [None]:
# Accumulators filled by InhibitorSpider.parse3 during the crawl:
#   inhibs   - one list of raw table-row HTML strings per inhibitor page
#   kins     - the text scraped for the kinase name, per inhibitor page
#   uniprots - the text scraped for the UniProt ID, per inhibitor page
# They have to be created before the spider runs.
inhibs, kins, uniprots = [], [], []

# Create the crawler, register the spider class and start the crawl.
process = CrawlerProcess()
process.crawl(InhibitorSpider)
process.start()

# N.B. the kernel needs to be cleared (restarted) before this cell is run again.

Clean up the data in "inhibs" and store in "inhibitors"

In [None]:
# Markup and whitespace fragments removed from every field, applied in
# exactly this order.
_FRAGMENTS_TO_STRIP = ("<tr>", "\r", "\t", "<td>", "\n")

inhibitors = []

for kinase_rows in inhibs:  # one list of row strings per inhibitor page
    for row_html in kinase_rows:  # one table row = one kinase-inhibitor record
        cleaned_row = []
        # Cutting on the closing tag yields one piece per table cell, plus
        # whatever follows the final "</td>".
        for piece in row_html.split("</td>"):
            for fragment in _FRAGMENTS_TO_STRIP:
                piece = piece.replace(fragment, "")
            cleaned_row.append(piece)
        inhibitors.append(cleaned_row)

Define column names, based on those on the KIDFamMap website

In [None]:
# Column labels for the scraped inhibitor rows, in the order the fields
# appear in each cleaned row.
headers = [
    "Index",
    "Kinase",
    "Inhibitor",
    "Partial_Img_URL",
    "Ki_nM",
    "IC50_nM",
    "Kd_nM",
    "EC50_nM",
    "POC",
    "Source",
    "Link",
    "To_Remove",  # dropped again before export
]

Save the inhibitors information in a Pandas data frame

In [None]:
# One data-frame row per cleaned record in "inhibitors", with the columns
# labelled by "headers".
inhibitors_df = pd.DataFrame(data=inhibitors, columns=headers)

Generate a dictionary of kinase names with their UniProt IDs

In [None]:
def _first_clean_token(extracted):
    """Return the first comma-separated token of a scraped text list.

    ``extracted`` is one entry of ``kins`` or ``uniprots`` (the list of
    strings scraped from one inhibitor page). It is converted to its
    ``str()`` representation, the list punctuation, escaped whitespace
    sequences, stray ``<td>`` tags, quotes and spaces are removed (in the
    order given below), and the text before the first comma is returned.
    """
    text = str(extracted)
    # The "\\r", "\\t" and "\\n" entries target the two-character escape
    # sequences that str() produces for control characters inside a list.
    for junk in ("[", "]", "\\r", "\\t", "<td>", "\\n", "(', ')", "'", " "):
        text = text.replace(junk, "")
    return text.split(",")[0]


# Entry i of both lists comes from the same inhibitor page.
kinases = [_first_clean_token(page_text) for page_text in kins]
uniprot_ids = [_first_clean_token(page_text) for page_text in uniprots]

In [None]:
# Map each scraped kinase name to its UniProt ID. The two lists are paired
# by position; if a kinase name occurs more than once, the last pairing wins.
uniprot_dict = dict(zip(kinases, uniprot_ids))

Translate the kinase names to UniProt IDs and store them in a new column

In [None]:
# Look up the UniProt ID for every row's kinase name; names that are not in
# "uniprot_dict" get NaN so those rows can be dropped later with dropna().
_missing = float('NaN')

UniProt_ID = pd.Series(
    [uniprot_dict.get(kinase, _missing) for kinase in inhibitors_df.Kinase],
    # Build the series on the frame's own index so that assign() lines the
    # values up row-for-row regardless of how the frame is indexed.
    index=inhibitors_df.index,
)
inhibitors_df = inhibitors_df.assign(UniProt_ID=UniProt_ID)

Using information in the data frame, generate a column of URLs for the inhibitors' chemical structure images. Our web app can subsequently use these to display images.

In [None]:
# Every structure image lives at <base>/<Source>/<Partial_Img_URL>.png
_IMG_BASE_URL = 'http://gemdock.life.nctu.edu.tw/kidfammap/data/png/'

# Walk the two columns in lockstep instead of looking "Source" up by a
# positional counter, which only works while the index labels are 0..n-1.
IMG_URL = pd.Series(
    [
        _IMG_BASE_URL + str(source) + "/" + str(partial) + ".png"
        for source, partial in zip(inhibitors_df.Source,
                                   inhibitors_df.Partial_Img_URL)
    ],
    index=inhibitors_df.index,
)
inhibitors_df = inhibitors_df.assign(IMG_URL=IMG_URL)

Make "kinase" and "inhibitor" entries uppercase.

In [None]:
# Upper-case both name columns. Each value goes through str() first, so a
# non-string entry (e.g. None from a short row) is converted rather than
# raising AttributeError; both columns are now treated the same way.
uppercase_kinase = pd.Series(
    [str(kinase).upper() for kinase in inhibitors_df.Kinase],
    index=inhibitors_df.index,
)
uppercase_inhib = pd.Series(
    [str(inhibitor).upper() for inhibitor in inhibitors_df.Inhibitor],
    index=inhibitors_df.index,
)

inhibitors_df = inhibitors_df.assign(Kinase=uppercase_kinase)
inhibitors_df = inhibitors_df.assign(Inhibitor=uppercase_inhib)

Make a temporary column combining the inhibitor and kinase names, to check for duplicates

In [None]:
# Build a temporary per-row key "<inhibitor>\t<kinase>" for duplicate
# detection. The tab separator keeps distinct pairs distinct: without it,
# ("AB", "C") and ("A", "BC") would both give "ABC". Tabs are stripped from
# every scraped field in the clean-up step, so one cannot appear inside
# either name.
unique = pd.Series(
    [
        str(inhibitor) + "\t" + str(kinase)
        for inhibitor, kinase in zip(inhibitors_df.Inhibitor,
                                     inhibitors_df.Kinase)
    ],
    index=inhibitors_df.index,
)
inhibitors_df = inhibitors_df.assign(UNIQUE=unique)

Drop any duplicate kinase-inhibitor pairs and reset the indices

In [None]:
# Keep the first row for each value of the temporary "UNIQUE" key, then
# renumber the rows from 0 (the old index is discarded).
inhibitors_df = (
    inhibitors_df
    .drop_duplicates(subset="UNIQUE")
    .reset_index(drop=True)
)

Drop any rows without a UniProt ID

In [None]:
# Discard rows whose kinase could not be matched to a UniProt ID, then
# renumber the rows from 0 (the old index is discarded).
inhibitors_df = (
    inhibitors_df
    .dropna(subset=["UniProt_ID"])
    .reset_index(drop=True)
)

Remove any columns not required for the web app

In [None]:
# Scraping leftovers and helper columns that the web app does not use.
_unneeded_columns = ["Index", "To_Remove", "Partial_Img_URL", "Link", "UNIQUE"]
inhibitors_df = inhibitors_df.drop(columns=_unneeded_columns)

Make data frame of kinase-inhibitor pairs

In [None]:
# The pair table keeps only the columns that identify a kinase-inhibitor
# relationship.
_pair_columns = ['Kinase', 'Inhibitor', 'UniProt_ID']
inhib_kin_df = inhibitors_df[_pair_columns]

Make data frame of inhibitors

In [None]:
# Reduce the frame to one row per inhibitor (first occurrence kept), drop
# the per-pair "Kinase" column and renumber the rows from 0.
inhibitors_df = (
    inhibitors_df
    .drop_duplicates(subset="Inhibitor")
    .drop(["Kinase"], axis=1)
    .reset_index(drop=True)
)

Make a column of primary keys

In [None]:
# Primary keys for the inhibitor table: "IN" followed by a 1-based,
# zero-padded 7-digit row number (IN0000001, IN0000002, ...).
prim_key = pd.Series(
    ["IN" + "{:07d}".format(number)
     for number in range(1, len(inhibitors_df) + 1)],
    # Share the frame's index so assign() matches keys to rows one-to-one.
    index=inhibitors_df.index,
)

inhibitors_df = inhibitors_df.assign(ID_IN=prim_key)

In [None]:
# Primary keys for the kinase-inhibitor pair table: "KI" followed by a
# 1-based, zero-padded 7-digit row number (KI0000001, KI0000002, ...).
prim_key = pd.Series(
    ["KI" + "{:07d}".format(number)
     for number in range(1, len(inhib_kin_df) + 1)],
    # Share the frame's index so assign() matches keys to rows one-to-one.
    index=inhib_kin_df.index,
)

inhib_kin_df = inhib_kin_df.assign(ID_KI=prim_key)

Write to CSV files

In [None]:
# Save the kinase-inhibitor pair table; index=False leaves the row index
# out of the file.
_pairs_csv_path = "inhib_kin.csv"
inhib_kin_df.to_csv(_pairs_csv_path, index=False)

In [None]:
# Save the per-inhibitor table; index=False leaves the row index out of
# the file.
_inhibitors_csv_path = "inhibitors.csv"
inhibitors_df.to_csv(_inhibitors_csv_path, index=False)