In [None]:
import os
import requests 
from bs4 import BeautifulSoup 
import re 
import pandas as pd

Create (if it does not already exist) the directory in which the scraped CSV files are saved, and make it the working directory

In [None]:
# Folder that receives the scraped output. It is also made the working
# directory, so relative file names used afterwards resolve inside it.
directory = '/Users/abha/Desktop/GitHub/Lending Club'
# Alternative, relative location:
#directory = './GitHub/Lending Club'
try:
    os.makedirs(directory)
except OSError:
    # makedirs fails with OSError when the path is already there; that is
    # acceptable only if the existing path is a directory. Any other
    # failure is propagated unchanged.
    already_a_directory = os.path.isdir(directory)
    if not already_a_directory:
        raise
os.chdir(directory)

Here, I focus only on Notes that were issued and sold prior to 12-31-2010. However, the analysis can easily be extended to the entire sample period, up to 8-31-2017.

In [392]:
def get_filing_links(url):
    '''Collect the links published in the ``filinghref`` elements of a page.

    The page at ``url`` is downloaded and parsed; the text of every
    ``filinghref`` tag is taken as a link, with '-index.htm' swapped for
    '.txt' so that each link points at the text version of the filing
    rather than its htm index page.

    Parameters
    ----------
    url : str
        Address of the listing page to download.

    Returns
    -------
    list of str
        One rewritten link per ``filinghref`` tag, in document order.
    '''
    listing = requests.get(url)
    parsed = BeautifulSoup(listing.text, 'html.parser')
    # Data is scraped from the txt files instead of the htm files for
    # Lending Club's SEC filings, hence the suffix substitution.
    return [
        node.get_text().replace('-index.htm', '.txt')
        for node in parsed.find_all('filinghref')
    ]

In [391]:
def get_sales_filing(url):
    '''Check whether the document at ``url`` is a sales-report filing.

    The page is downloaded and the text of its first ``<table>`` is searched
    for the phrase 'Prospectus Supplement (Sales Report)'.

    Parameters
    ----------
    url : str
        Address of the filing document to inspect.

    Returns
    -------
    int
        1 when the phrase occurs anywhere in the first table's text,
        0 otherwise (including when the page has no table at all).
    '''
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    first_table = soup.find('table')
    if first_table is None:
        # No table to inspect, so the page cannot be identified as a
        # sales report; report "not found" instead of raising IndexError.
        return 0
    # Use a membership test rather than ``str.find(...) > 0``: find() returns
    # 0 for a match at the very start of the text (and -1 for no match), so
    # the ``> 0`` comparison wrongly rejected a match at position 0.
    if 'Prospectus Supplement (Sales Report)' in first_table.get_text():
        print('Sales prospectus found')
        return 1
    return 0

In [390]:
def get_scraped_data(url):
    '''Scrape note terms, credit score ranges and loan request dates from a filing.

    The document at ``url`` is downloaded once and searched for three kinds
    of text; each kind yields its own list of single-record dictionaries.

    Parameters
    ----------
    url : str
        Address of the filing text document.

    Returns
    -------
    tuple of (list, list, list)
        * note records  - one dict per 'Series of Member Payment Dependent
          Notes' match, keyed by the note-table column names;
        * score records - one dict with key 'Credit score range' per
          'Credit Score Range:' match whose row text has the expected layout;
        * request records - one dict with keys 'Request date' and
          'File name' per 'was requested on ... by a borrower' match.
    '''
    note_columns = [
        'Series of Member Payment Dependent Notes',
        'Aggregate principal amount of Notes offered',
        'Aggregate principal amount of Notes sold',
        'Stated interest rate',
        'Service charge',
        'Sale and Original Issue Date',
        'Initial Maturity',
        'Final Maturity',
        'Amount of corresponding member loan funded by Lending Club',
    ]

    page = requests.get(url)
    parsed = BeautifulSoup(page.text, 'html.parser')

    # Note terms: values come from the second row of the enclosing table.
    note_records = []
    for match in parsed.find_all(text=re.compile('Series of Member Payment Dependent Notes')):
        second_row = match.findParent('table').find_all('tr')[1]
        # The first '\n\n'-separated chunk is discarded; the rest are the cell values.
        cells = [chunk.rstrip() for chunk in second_row.get_text().split('\n\n')[1:]]
        note_records.append(dict(zip(note_columns, cells)))

    # Credit score range: taken from the first row of the enclosing table,
    # but only when that row's text splits into exactly five lines.
    score_records = []
    for match in parsed.find_all(text=re.compile('Credit Score Range:')):
        first_row = match.findParent('table').find_all('tr')[0]
        row_lines = first_row.get_text().split('\n')
        if len(row_lines) != 5:
            continue
        score_records.append({'Credit score range': row_lines[2]})

    # Request date: words 6-8 of the second sentence on the second line of the
    # matched text; the file name is the url mapped back to its -index.htm form.
    request_records = []
    for match in parsed.find_all(text=re.compile(r'was requested on.*by a borrower')):
        sentence = match.split('\n')[1].split('. ')[1]
        words = sentence.split(' ')
        request_records.append({
            'Request date': ' '.join(words[6:9]),
            'File name': url.replace('.txt', '-index.htm'),
        })

    return (note_records, score_records, request_records)

In [None]:
# Query settings for the EDGAR company-browse listing.
cik = "0001409970"     # value of the CIK query parameter (Lending Club, per the notebook text)
sec_filing = "424B3"   # value of the 'type' query parameter
priorto = "20101231"   # value of the 'dateb' query parameter (only filings prior to 12-31-2010)
count = 100            # number of listing entries requested per page
num_pages = 15         # listing pages to walk; one CSV file is produced per page

for file_num in range(1, num_pages + 1):
    print('Opening CSV file %d for writing!' %file_num)
    file_name = 'Sales Report %d.csv' %(file_num)
    # Offset of the first listing entry on this page; derived from `count`
    # so the paging stays correct if the page size is changed.
    start = (file_num - 1) * count
    base_url = (
        "https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany"
        "&CIK={cik}&type={filing}&dateb={dateb}&owner=exclude"
        "&start={start}&count={count}&output=xml"
    ).format(cik=cik, filing=sec_filing, dateb=priorto, start=start, count=count)
    filing_links = get_filing_links(base_url)

    # Frames are collected per page and concatenated once. Starting from an
    # empty list on every page guarantees that data from a previous page can
    # never leak into this page's CSV file.
    frames = []
    for url in filing_links:
        if get_sales_filing(url) != 1:
            continue
        (data, data1, data2) = get_scraped_data(url)
        # Place the three record lists side by side, aligned on row position.
        filing_df = pd.concat(
            [pd.DataFrame(data), pd.DataFrame(data1), pd.DataFrame(data2)],
            axis=1,
        )
        print(filing_df.shape)
        print(url)
        frames.append(filing_df)

    if not frames:
        # No sales filing on this page: there is nothing to save.
        print('No sales filings found for CSV file %d; nothing written.' % file_num)
        continue

    df = pd.concat(frames)
    print(df.shape)
    # to_csv opens and closes the file itself, so no separate file handle is needed.
    df.to_csv(file_name, index=False)