In [19]:
import os
import sys
import urllib
import pandas as pd
import numpy as np
import re

In [5]:
# Download helper used by every EDGAR request in this notebook.
def DownloadSECFile(link, dfile):
    """Download ``link`` to the local path ``dfile``.

    The transfer is written to ``dfile + ".part"`` first and only renamed to
    ``dfile`` once it has completed, so an interrupted download never leaves
    a truncated file under the final name.

    Parameters
    ----------
    link : str
        URL of the file to fetch.
    dfile : str
        Destination path on disk.

    Returns
    -------
    bool
        True when the file was downloaded, False when the URL was malformed,
        the network/server request failed, or the destination could not be
        written.
    """
    # Import the submodules explicitly: a bare ``import urllib`` does not
    # guarantee that ``urllib.request`` has been loaded.
    import http.client
    import urllib.error
    import urllib.request

    # NOTE(review): SEC EDGAR's fair-access policy asks clients to declare a
    # User-Agent; confirm that the default urllib agent is accepted.
    partial = str(dfile) + ".part"
    try:
        urllib.request.urlretrieve(link, partial)
        os.replace(partial, dfile)
    except (urllib.error.URLError, http.client.HTTPException, OSError, ValueError):
        # URLError/HTTPError/HTTPException: network or server failure;
        # OSError: destination not writable; ValueError: malformed URL.
        try:
            os.remove(partial)
        except OSError:
            pass  # nothing was written, or it is already gone
        return False
    return True

In [13]:
from datetime import datetime

In [18]:
# `datetime.date` is an instance method, so it needs a datetime object to act on.
datetime.now().date()

In [None]:
def getMasterIndex(filing_year):
    """Download the quarterly EDGAR master indexes and store one table per year.

    For every requested year the quarterly ``master.gz`` files are fetched
    from https://www.sec.gov/Archives/edgar/full-index/ with
    ``DownloadSECFile``, the descriptive header is stripped, and the
    pipe-delimited records of all quarters are merged into one DataFrame.
    The yearly table is written to ``Master Indexes/<year>master.csv`` in the
    current working directory (the directory is created when missing).
    CSV replaces the ``.Rda`` files of the R implementation this was ported
    from.

    Parameters
    ----------
    filing_year : int or iterable of int
        A single filing year or several of them, e.g. ``2006`` or
        ``[2006, 2008]``.

    Returns
    -------
    pandas.DataFrame
        One row per requested quarter with the columns ``Filename``
        (``"<year>: quarter-<q>"``) and ``status`` (``"Download success"`` or
        ``"Server Error"``).

    Notes
    -----
    The stored table has the columns ``cik``, ``company_name``,
    ``form_type``, ``date_filed``, ``edgar_link`` and ``quarter``.  No file is
    written for a year in which every quarter failed, so a later call can
    retry the download.
    """
    import gzip
    import string
    from datetime import date

    index_dir = "Master Indexes"
    columns = ["cik", "company_name", "form_type", "date_filed", "edgar_link"]
    # Dashed rule that separates the descriptive header from the records.
    header_rule = "--------------------------------------------------------"
    # Map every ASCII punctuation character to a blank (R's [[:punct:]]).
    punct_to_space = str.maketrans(string.punctuation, " " * len(string.punctuation))

    # Accept a single year as well as any iterable of years.
    if isinstance(filing_year, (str, bytes)) or not hasattr(filing_year, "__iter__"):
        years = [filing_year]
    else:
        years = list(filing_year)

    os.makedirs(index_dir, exist_ok=True)
    today = date.today()
    status_records = []

    for year in years:
        year = int(year)

        # For the running year only the quarters started so far are requested.
        quarterloop = 4
        if year == today.year:
            quarterloop = (today.month + 2) // 3  # ceiling(month / 3)

        print("Downloading Master Indexes from SEC server for", year, "...")

        quarter_frames = []
        for quarter in range(1, quarterloop + 1):
            label = str(year) + ": quarter-" + str(quarter)
            dfile = index_dir + "/" + str(year) + "QTR" + str(quarter) + "master.gz"
            link = ("https://www.sec.gov/Archives/edgar/full-index/"
                    + str(year) + "/QTR" + str(quarter) + "/master.gz")

            if not DownloadSECFile(link, dfile):
                status_records.append({"Filename": label, "status": "Server Error"})
                continue

            try:
                # latin-1 decodes any byte sequence, so odd characters in
                # company names cannot abort the read.  Apostrophes are
                # dropped exactly as the R code did before parsing.
                with gzip.open(dfile, "rt", encoding="latin-1") as infile:
                    raw_data = [line.rstrip("\r\n").replace("'", "") for line in infile]
            except (OSError, EOFError):
                # The payload was not a readable gzip stream.
                status_records.append({"Filename": label, "status": "Server Error"})
                continue
            finally:
                # The compressed download is only a temporary artefact.
                if os.path.exists(dfile):
                    os.remove(dfile)

            # Records start on the line after the first dashed rule.
            header_end = next(
                (n for n, line in enumerate(raw_data) if header_rule in line), -1)

            rows = []
            for line in raw_data[header_end + 1:]:
                fields = line.split("|")
                if len(fields) < 5:
                    continue  # blank or malformed line
                if len(fields) > 5:
                    # A stray '|' inside the company name: the first field and
                    # the last three are fixed, the rest is the name.
                    fields = [fields[0], "|".join(fields[1:-3])] + fields[-3:]
                rows.append(fields)

            final_data = pd.DataFrame(rows, columns=columns)
            final_data["company_name"] = [
                name.translate(punct_to_space) for name in final_data["company_name"]]
            final_data["quarter"] = quarter
            quarter_frames.append(final_data)

            status_records.append({"Filename": label, "status": "Download success"})
            print("Master Index for quarter", quarter)

        if quarter_frames:
            year_master = pd.concat(quarter_frames, ignore_index=True)
            year_master.to_csv(index_dir + "/" + str(year) + "master.csv", index=False)

    return pd.DataFrame(status_records, columns=["Filename", "status"])

In [None]:
def getFilings(cik_no="ALL", form_type="ALL", filing_year=None, quarter=(1, 2, 3, 4),
               downl_permit="n"):
    """Download the EDGAR filings selected by CIK, form type, year and quarter.

    The yearly master index is read from ``Master Indexes/<year>master.csv``;
    when that file is missing ``getMasterIndex(year)`` is called first.  The
    matching filings are downloaded with ``DownloadSECFile`` into
    ``Edgar filings_full text/Form <form>/<cik>/`` as
    ``<cik>_<form>_<date filed>_<accession number>.txt``.  Files that already
    exist are not downloaded again.

    Parameters
    ----------
    cik_no : "ALL", int, str or iterable of these
        Company CIK number(s); ``"ALL"`` disables the CIK filter.
    form_type : "ALL", str or iterable of str
        Form type(s); ``"ALL"`` disables the form-type filter.
    filing_year : int or iterable of int
        Filing year(s).  Required.
    quarter : int or iterable of int
        Quarter(s) to include, by default all four.
    downl_permit : str
        ``"y"`` downloads without asking.  ``"n"`` prompts on stdin with the
        number of filings; any answer other than ``"y"`` cancels.

    Returns
    -------
    pandas.DataFrame or None
        One row per selected filing with the master-index columns (without
        ``edgar_link``) plus ``filing_year``, ``accession_number`` and
        ``status`` (``"Download success"`` / ``"Download Error"``).  ``None``
        when nothing matched or the download was not permitted.

    Raises
    ------
    ValueError
        If ``filing_year`` is not supplied.
    """
    def _as_list(value):
        # Strings and other scalars become one-element lists.
        if isinstance(value, (str, bytes)) or not hasattr(value, "__iter__"):
            return [value]
        return list(value)

    if filing_year is None:
        raise ValueError("filing_year is required: pass a year or a list of years.")

    years = [int(y) for y in _as_list(filing_year)]
    quarters = {int(q) for q in _as_list(quarter)}
    all_ciks = isinstance(cik_no, str) and cik_no == "ALL"
    all_forms = isinstance(form_type, str) and form_type == "ALL"
    # CIKs are compared as strings without leading zeros, which is how the
    # master index lists them.
    wanted_ciks = {str(c).strip().lstrip("0") for c in _as_list(cik_no)}
    wanted_forms = {str(f) for f in _as_list(form_type)}

    # Collect the matching rows of every requested year.
    yearly = []
    for year in years:
        filepath = "Master Indexes/" + str(year) + "master.csv"
        if not os.path.isfile(filepath):
            getMasterIndex(year)  # download master index
        if not os.path.isfile(filepath):
            print("Master index for", year, "is not available; year skipped.")
            continue

        # keep_default_na=False keeps names such as "NA" as plain text.
        year_master = pd.read_csv(filepath, dtype=str, keep_default_na=False)
        year_master["quarter"] = year_master["quarter"].astype(int)

        keep = year_master["quarter"].isin(quarters)
        if not all_ciks:
            keep &= year_master["cik"].str.lstrip("0").isin(wanted_ciks)
        if not all_forms:
            keep &= year_master["form_type"].isin(wanted_forms)

        selected = year_master.loc[keep].copy()
        if len(selected) > 0:
            selected["filing_year"] = year
            yearly.append(selected)

    if not yearly:
        print("No filing information found for given CIK(s) and Form Type "
              "in the mentioned year(s)/quarter(s).")
        return None

    index_df = (pd.concat(yearly, ignore_index=True)
                .sort_values(["cik", "filing_year"], kind="mergesort")
                .reset_index(drop=True))
    total_files = len(index_df)

    if str(downl_permit) == "n":
        downl_permit = input("Total number of filings to be downloaded = "
                             + str(total_files)
                             + ". Do you want to download (y/n)? ")
    if str(downl_permit) != "y":
        return None

    os.makedirs("Edgar filings_full text", exist_ok=True)
    print("Downloading fillings. Please wait...")

    # The accession number is the file name of the EDGAR link without ".txt".
    index_df["accession_number"] = (index_df["edgar_link"]
                                    .str.split("/").str[-1]
                                    .str.replace(".txt", "", regex=False))

    statuses = []
    for position, row in enumerate(index_df.itertuples(index=False), start=1):
        edgar_link = "https://www.sec.gov/Archives/" + row.edgar_link
        f_type = row.form_type.replace("/", "")  # '/' is not valid in a file name

        new_dir = "Edgar filings_full text/Form " + f_type + "/" + row.cik
        os.makedirs(new_dir, exist_ok=True)
        dest_filename = (new_dir + "/" + row.cik + "_" + f_type + "_" + row.date_filed
                         + "_" + row.accession_number + ".txt")

        # A filing that is already on disk counts as downloaded.
        if os.path.isfile(dest_filename) or DownloadSECFile(edgar_link, dest_filename):
            statuses.append("Download success")
        else:
            statuses.append("Download Error")

        # Single-line progress indicator.
        print("\r" + str(position) + " / " + str(total_files), end="", flush=True)
    print()

    index_df["status"] = statuses
    return index_df.drop(columns=["edgar_link"])

In [None]:
## getBusinDescr: download the annual-report filings whose "Item 1" section is extracted below.
# NOTE: `cik_no` and `filing_year` are the inputs of this routine and must be
# defined before this cell is run.
f_type = ["10-K", "10-K405", "10KSB", "10KSB40"]

# Arguments are passed positionally, in the order
# (CIKs, form types, filing years, quarters, download permission).
output = getFilings(cik_no, f_type, filing_year, [1, 2, 3, 4], "y")

print("Extracting 'Item 1' section...\n")


# Function for text cleaning
def CleanFiling2(text):
    """Return ``text`` without digits and double quotes, with whitespace collapsed.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    text = re.sub(r"[0-9]+", "", text)   # remove digits
    text = re.sub(r"\s{1,}", " ", text)  # collapse every whitespace run to one blank
    text = re.sub('"', "", text)         # remove double quotes
    return text


In [31]:
# Slicing is end-exclusive: positions 2 and 3 are returned.
a = list(range(1, 6))
a[2:4]

[3, 4]

In [None]:
def preprocess_filing_text(f_text):
    """Normalise the lines of a filing so that "Item" headings can be located.

    Every line is run through the same ordered list of substitutions
    (whitespace clean-up, "Items" -> "Item", roman/spelled item numbers ->
    digits, removal of ':' and '*', '-' -> blank, removal of the dot after
    1/2/3).  Lines that are empty or whitespace-only afterwards are dropped.

    Parameters
    ----------
    f_text : str or iterable of str
        The lines of the filing; a single string is treated as one line.

    Returns
    -------
    list of str
        The cleaned, non-empty lines in their original order.
    """
    lines = [f_text] if isinstance(f_text, str) else list(f_text)

    # (pattern, replacement, flags) -- the order matters: "Item III" has to
    # be rewritten before "Item II", and that before "Item I".
    # NOTE(review): "ONE"/"TWO"/"THREE" are matched case-insensitively
    # anywhere in a line, so they also rewrite word fragments (e.g. "money").
    # Kept as in the original patterns; confirm this is intended.
    substitutions = [
        (r"\n|\t|,", " ", 0),
        (r"\s{2,}|\/", " ", 0),
        (r"^\s{1,}", "", 0),
        (r"Items", "Item", re.I),
        (r"PART I", "", re.I),
        (r"Item III", "Item 3", re.I),
        (r"Item II", "Item 2", re.I),
        (r"Item I|Item l", "Item 1", re.I),
        (r":|\*", "", re.I),
        (r"-", " ", 0),
        (r"ONE", "1", re.I),
        (r"TWO", "2", re.I),
        (r"THREE", "3", re.I),
        (r"1\s{0,}\.", "1", 0),
        (r"2\s{0,}\.", "2", 0),
        (r"3\s{0,}\.", "3", 0),
    ]
    # Flags go through re.compile: the fourth positional argument of re.sub
    # is ``count``, not ``flags``.
    compiled = [(re.compile(pattern, flags), repl) for pattern, repl, flags in substitutions]

    cleaned = []
    for line in lines:
        for regex, repl in compiled:
            line = regex.sub(repl, line)
        if line.strip():  # drop lines that contain only whitespace
            cleaned.append(line)
    return cleaned

In [None]:
def _matching_lines(pattern, lines):
    """Return the indices of the lines matched by ``pattern`` (case-insensitive search)."""
    regex = re.compile(pattern, flags=re.I)
    return [n for n, line in enumerate(lines) if regex.search(line)]


def _first_document(lines):
    """Return the lines from the first <DOCUMENT> tag through the first </DOCUMENT> tag.

    The full text is returned when the tag pair cannot be found.
    """
    starts = _matching_lines(r"<DOCUMENT>", lines)
    stops = _matching_lines(r"</DOCUMENT>", lines)
    if not starts or not stops or stops[0] < starts[0]:
        return list(lines)
    return list(lines[starts[0]:stops[0] + 1])


def _extract_item1_sentences(f_text):
    """Return the sentences of the "Item 1" section found in the preprocessed lines.

    ``f_text`` is the list of lines produced by ``preprocess_filing_text``.
    An empty list is returned when no section boundaries are found.
    """
    f_text = list(f_text)

    # Combine lines with ITEM only in one line and attach "1 Business" from the next line.
    item_lnumbers = _matching_lines(
        r"^ITEM\s{0,}\d{0,}\s{0,}$|^ITEM\s{0,}1 and 2\s{0,}$", f_text)
    merged = list(f_text)
    for n in item_lnumbers:
        if n + 1 < len(f_text):
            merged[n + 1] = f_text[n] + " " + f_text[n + 1]
    f_text = merged

    # Get BUSINESS DESCRIPTION
    startline = _matching_lines(
        r"^Item\s{0,}1\s{0,}Business\s{0,}\.{0,1}\s{0,}$"
        r"|^Item\s{0,}1\s{0,}DESCRIPTION OF BUSINESS\s{0,}\.{0,1}\s{0,}$", f_text)
    endline = _matching_lines(
        r"^Item\s{0,}2\s{0,}Properties\s{0,}\.{0,1}\s{0,}$"
        r"|Item\s{0,}2\s{0,}DESCRIPTION OF PROPERTY\s{0,}\.{0,1}\s{0,}$"
        r"|^Item\s{0,}2\s{0,}REAL ESTATE\s{0,}\.{0,1}\s{0,}$", f_text)

    # Check for (ITEMS 1 and 2. BUSINESS AND PROPERTIES) and (ITEM 3. LEGAL PROCEEDINGS)
    if len(startline) == 0 and len(endline) == 0:
        startline = _matching_lines(
            r"^Item\s{0,}1 and 2\s{1,}Business AND PROPERTIES\s{0,}\.{0,1}\s{0,}$"
            r"|^Item\s{0,}1 and 2\s{1,}Business and Description of Property\s{0,}\.{0,1}\s{0,}$",
            f_text)
        endline = _matching_lines(
            r"^Item\s{0,}3\s{1,}LEGAL PROCEEDINGS\s{0,}\.{0,1}\s{0,}$"
            r"|^Item\s{0,}3\s{1,}LEGAL matters\s{0,}\.{0,1}\s{0,}$", f_text)

    if len(startline) == 0 or len(endline) == 0:
        return []

    # Pair the headings one-to-one when their counts agree; otherwise fall
    # back to the last occurrence of each.
    if len(startline) == len(endline):
        spans = list(zip(startline, endline))
    else:
        spans = [(startline[-1], endline[-1])]

    candidates = [re.sub(r"\s{2,}", " ", " ".join(f_text[first:last + 1]))
                  for first, last in spans]
    words_count = [len(re.findall(r"\S+", text)) for text in candidates]
    most_words = max(words_count)
    if most_words == 0:
        return []

    # The table of contents repeats the headings; the candidate with the
    # most words is taken as the actual section.
    sentences = []
    for text, count in zip(candidates, words_count):
        if count != most_words:
            continue
        # Drop company-suffix abbreviations so their dots do not split sentences.
        text = re.sub(r" co\.| inc\.| ltd\.| llc\.| comp\.", " ", text, flags=re.I)
        parts = text.split(". ")
        if parts and parts[-1] == "":
            parts.pop()  # the text ended with the separator
        sentences.extend(part if part.endswith(".") else part + "." for part in parts)
    return sentences


def _store_business_descriptions(filings):
    """Write the "Item 1" section of every downloaded filing to a text file.

    ``filings`` is the table returned by ``getFilings``.  Each description is
    stored, one sentence per line, as
    ``Business descriptions text/<cik>_<form>_<date filed>_<accession>.txt``.
    Filings whose text file is missing, or in which no section is found, are
    skipped.  Returns a copy of ``filings`` with ``date_filed`` converted to
    dates and the ``quarter``, ``filing_year`` and ``status`` columns removed.
    """
    new_dir = "Business descriptions text"
    os.makedirs(new_dir, exist_ok=True)

    for row in filings.itertuples(index=False):
        f_type = re.sub("/", "", row.form_type)
        cik = str(row.cik)
        date_filed = str(row.date_filed)[:10]  # also valid once converted to a date
        stem = "_".join([cik, f_type, date_filed, str(row.accession_number)])

        dest_filename = ("Edgar filings_full text/Form " + f_type + "/" + cik + "/"
                         + stem + ".txt")
        if not os.path.isfile(dest_filename):
            continue  # the filing was not downloaded

        with open(dest_filename, "r", errors="replace") as infile:
            filing_text = infile.readlines()

        ## Take data from first <DOCUMENT> to </DOCUMENT>
        filing_text = _first_document(filing_text)

        # TODO: filings in HTML/XBRL format are processed as raw text; the tag
        # stripping of the R implementation has not been ported yet.
        f_text = preprocess_filing_text(filing_text)

        product_descr2 = _extract_item1_sentences(f_text)
        if not product_descr2:
            continue  # nothing extracted; do not write an empty file

        filename2 = new_dir + "/" + stem + ".txt"
        with open(filename2, "w") as outfile:
            outfile.write("\n".join(product_descr2) + "\n")

    summary = filings.copy()
    summary["date_filed"] = pd.to_datetime(summary["date_filed"], format="%Y-%m-%d")
    return summary.drop(columns=["quarter", "filing_year", "status"], errors="ignore")


# `output` is the table returned by getFilings in the download cell; it must
# not be reset here, otherwise there is nothing to process.
if output is not None:
    output = _store_business_descriptions(output)
    print("Business descriptions are stored in 'Business descriptions text' directory.")

output