## Get Statistics for 10K Reports

In [1]:
## Load packages
import re
import numpy as np
import glob, os
import pandas as pd
from pprint import pprint
import requests
import sys
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
from dateutil.rrule import rrule, DAILY
from tqdm import tqdm_notebook as tqdm
from __future__ import division
import random
pd.set_option('display.max_colwidth', -1) # For displaying all entries in full length in pandas

# Clean txt raw files
def cleanhtml(raw_html):
    cleanr = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\n|\t')
    cleantext = re.sub(cleanr, '', raw_html)
    return cleantext

### Get list of all SEC reports

In [3]:
# Retrieve reports' meta-data from SEC repository
## Prepare empty df to store report meta-data
df_ix = {
    "cik":[],
    "name":[],
    "form":[],
    "date":[],
    "link":[],
    "quarter":[]
}

## Parse across quarters for 2020 to get CIK, NAME, FORM, DATE, and LINK
for q in [1,2]:
    indexfile = pd.read_table("https://www.sec.gov/Archives/edgar/full-index/2020/QTR" + str(q) +"/master.idx")    
    indexfile = indexfile.ix[6:,0]
    
    for entry in tqdm(indexfile):
        
        df_ix["quarter"].append(q)
        
        items = entry.split("|")
        df_ix["cik"].append(items[0])
        df_ix["name"].append(items[1])
        df_ix["form"].append(items[2])
        df_ix["date"].append(items[3])
        df_ix["link"].append("https://www.sec.gov/Archives/" + items[4].replace(".txt","-index.htm"))
        
    print("Successfully parsed " + str(len(df_ix["cik"])) + " reports for 2020-Q" + str(q))
    
## Store report list for 2020
report_index = pd.DataFrame(df_ix)
report_index.to_csv(os.getcwd() + "/10x_report_list.csv")

  
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  from ipykernel import kernelapp as app
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


A Jupyter Widget


Successfully parsed 322905 reports for 2020-Q1


A Jupyter Widget


Successfully parsed 501609 reports for 2020-Q2


### Scrape only new reports

In [18]:
# Get stats of already scraped reports
old_report_sentences = pd.read_csv(os.getcwd().replace("/Scraping", "") + "/Processing/10x_report_sentences.csv", index_col=0)
# Keep only most recent reports
report_index = report_index[report_index["date"]>=max(old_report_sentences["date"])]
print("Got " + str(len(report_index["date"])) + " new reports since " + str(max(old_report_sentences["date"])))

Got 28763 new reports since 2020-05-21


### Find HTML link for 10-K and 10-Q reports

In [20]:
# Read report list
reports_10x = report_index[(report_index["form"] == "10-K") | (report_index["form"]=="10-Q")]

# Find report links
reports_10x["sic"] = "na"
reports_10x["report_link"] = "na"

#for i in tqdm(range(0,len(reports_10x))):
for x in tqdm(range(len(reports_10x.index))):
        
    try:
        r = requests.get(reports_10x["link"].iloc[x])
        soup = BeautifulSoup(r.content, "lxml") 

        reports_10x["report_link"].iloc[x] = (soup.find_all("div", {"id": "formDiv"})[1].
                               find('a', href=True).attrs["href"].replace("/ix?doc=",""))
        reports_10x["sic"].iloc[x] = str(soup.find("div", {"class": "companyInfo"}).find_all("a")[2].text)
    except:
        pass
       
# Append newly scraped report info remove potential duplicates
df = pd.read_csv(os.getcwd() + "/10x_report_links.csv", index_col=0).append(reports_10x)
df = df.drop_duplicates()

# Store extended file as CSV
df.to_csv(os.getcwd() + "/10x_report_links.csv")
print("Successfully retrieved " + str(len(reports_10x["report_link"])) + " report links and appended")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  if __name__ == '__main__':


A Jupyter Widget

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



Successfully retrieved 249 report links and appended


### Retrieve sentences for selected reports

In [21]:
# Parse through 10x html reports
# Define "corona" keywords 
corona_words = ["corona","covid"]

## Prepare DF for storage
df = {
    "cik":[],
    "sic":[],
    "date":[],
    "name":[],
    "form":[],
    "report_link":[],
    "report_word_count":[],
    "report_corona_count":[],
    "sentence_text":[]
}

## Open html reports and get stats
for x in tqdm(range(0,len(reports_10x))):
    
    ### Open reports
    r = reports_10x["report_link"].iloc[x]
    url = "https://www.sec.gov" + r
    u = requests.get(url)    
    soup = BeautifulSoup(u.content, "lxml") 
    text = soup.text.replace("\n","")
    text = text.lower()

    ### Corona count
    report_corona_count = sum([text.count(word) for word in corona_words])
    report_word_count = len(re.findall(r'\w+', text))

    if report_corona_count == 0:
        
        df["date"].append(reports_10x["date"].iloc[x])
        df["sic"].append(reports_10x["sic"].iloc[x])
        df["cik"].append(reports_10x["cik"].iloc[x])
        df["name"].append(reports_10x["name"].iloc[x])
        df["form"].append(reports_10x["form"].iloc[x])
        
        df["report_link"].append(r)
        df["report_corona_count"].append(report_corona_count)
        df["report_word_count"].append(report_word_count)
        df["sentence_text"].append("na")

    else:
    ### Get corona sentences
        corona_sentences = [sentence + '.' for sentence in text.split('.') if 'corona' in sentence or 'covid' in sentence]

        for sentence in corona_sentences:

            ## Get sentence info
            sentence_word_count = len(re.findall(r'\w+', sentence))

            df["date"].append(reports_10x["date"].iloc[x])
            df["sic"].append(reports_10x["sic"].iloc[x])
            df["cik"].append(reports_10x["cik"].iloc[x])
            df["name"].append(reports_10x["name"].iloc[x])
            df["form"].append(reports_10x["form"].iloc[x])
            
            df["report_link"].append(r)
            df["report_corona_count"].append(report_corona_count)
            df["report_word_count"].append(report_word_count)
            df["sentence_text"].append(sentence.encode("utf-8"))
    
# Turn DF into pandas
df = pd.DataFrame(df) 
# Append new DF to old DF
report_sentences = old_report_sentences.append(df)
# Drop potential duplicates
report_sentences = report_sentences.drop_duplicates()
# Store all sentences as csv
report_sentences.to_csv(os.getcwd().replace("/Scraping", "") + "/Processing/10x_report_sentences.csv")
print(str(len(df["sentence_text"])) + " new sentences retrieved and appended")

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


A Jupyter Widget




ValueError: Invalid file path or buffer object type: <class 'pandas.core.frame.DataFrame'>