### Set working directory

In [24]:
import os
import re
import pandas as pd
import glob

In [25]:
# Show where the kernel started, then try to move into the project's data folder.
print(os.getcwd())
home = '/Users/yubinkim/Dropbox/Summer paper/'
try:
    os.chdir(home + 'data/')
    print("Directory changed")
except OSError:
    # os.chdir raises OSError (e.g. FileNotFoundError) when the folder is
    # missing or inaccessible; report it instead of aborting the cell.
    print("Can't change the current working directory")

# Confirm which directory is active after the attempt.
print(os.getcwd())

/Users/yubinkim/Dropbox/Summer paper
Directory changed
/Users/yubinkim/Dropbox/Summer paper/data


### Import data

In [30]:
# File names of two sample filings (presumably Valero's 2014 and Kellogg's 2019
# filings, judging by the variable names — confirm). They are relative names,
# so they resolve against the current working directory.
Valero2014 = '1035002_1_0001035002-14-000012.txt'
Kellogg2019 = '55067_1_0001628280-19-002643.txt'

In [32]:
# Read the whole sample filing into memory. The context manager closes the
# handle even if read() raises, which the manual open/close pair did not.
with open(Valero2014, "r") as file:
    text = file.read()
# print(text)

In [33]:
# First and last filing year to process (both are passed to process_header,
# which treats the range as inclusive).
start_year, end_year = 2014, 2019
# Project root (no trailing slash here) and the data folder derived from it.
home = '/Users/yubinkim/Dropbox/Summer paper'
filings = f'{home}/data/'

In [37]:
def process_header(start_year:int, end_year:int, filings:str, out):
    """Extract header fields from filing text files and save them as a CSV.

    For every year in ``start_year..end_year`` (inclusive) the function scans
    ``filings + str(year) + '/*.txt'``, reads each file line by line up to the
    ``</SEC-HEADER>`` tag, and keeps the first match of each header pattern.

    Parameters
    ----------
    start_year, end_year : int
        First and last year folder to process (both inclusive).
    filings : str
        Path prefix of the folder that contains one sub-folder per year;
        it is concatenated directly with the year, so it should end with a
        path separator.
    out : str or path-like
        Destination passed to ``DataFrame.to_csv``.

    Returns
    -------
    pandas.DataFrame
        One row per file in which a ``<SEC-DOCUMENT>`` name was found, with
        columns ``file, cik, report_date, file_date, name, sic, hlink``.
        Fields that were not found keep the sentinel value ``-99``.
    """
    edgar_vars = {
        "file": re.compile(r'<SEC-DOCUMENT>(.*\.txt)', re.IGNORECASE),
        "cik": re.compile(r'^\s*CENTRAL\s*INDEX\s*KEY:\s*(\d{10})', re.IGNORECASE),
        "report_date": re.compile(r'^\s*CONFORMED\s*PERIOD\s*OF\s*REPORT:\s*(\d{8})', re.IGNORECASE),
        "file_date": re.compile(r'^\s*FILED\s*AS\s*OF\s*DATE:\s*(\d{8})', re.IGNORECASE),
        "name": re.compile(r'^\s*COMPANY\s*CONFORMED\s*NAME:\s*(.+$)', re.IGNORECASE),
        "sic": re.compile(r'^\s*STANDARD\s*INDUSTRIAL\s*CLASSIFICATION:.*?(\d{4})', re.IGNORECASE),
        "hlink":re.compile(r'(.*?(([0]*(\d+))\-(\d{2})\-(\d{6})))', re.IGNORECASE)
    }

    regex_endheader = re.compile(r'</SEC-HEADER>', re.IGNORECASE)

    # "hlink" is not read from the header lines; it is derived from the
    # accession file name after the scan, so keep it out of the per-line loop.
    line_patterns = {k: v for k, v in edgar_vars.items() if k != "hlink"}

    # Collect one dict per filing and build the frame once at the end;
    # appending to a DataFrame row by row is quadratic.
    records = []

    for year in range(start_year, end_year+1):
        # all .txt files inside this year's folder
        path = filings + str(year) + '/*.txt'
        # glob order is unspecified; sort so the row order is reproducible
        for file in sorted(glob.glob(path)):
            # start every field at -99 so each column is defined even when
            # the corresponding header line is missing
            header_vars = dict.fromkeys(edgar_vars, -99)
            # the context manager closes the file even if reading fails
            with open(file, 'r') as f:
                for line in f:
                    for k, v in line_patterns.items():
                        # only the first occurrence of each field is kept
                        if header_vars[k] == -99:
                            match = v.search(line)
                            if match:
                                header_vars[k] = match.group(1)
                    # stop reading once the header section ends
                    if regex_endheader.search(line):
                        break

            # files without a <SEC-DOCUMENT> name are left out of the result
            if header_vars['file'] == -99:
                continue

            # Construct a link to the filing's index page on EDGAR. This needs
            # both the accession pieces and the CIK; when the CIK was not
            # found, hlink stays -99 instead of crashing on (-99).lstrip.
            match = edgar_vars['hlink'].search(header_vars['file'])
            if match and header_vars['cik'] != -99:
                header_vars['hlink'] = (
                    'http://www.sec.gov/Archives/edgar/data/'
                    + header_vars['cik'].lstrip('0')
                    + "/"
                    + match.group(3) + match.group(5) + match.group(6)
                    + "/"
                    + match.group(2)
                    + "-index.htm"
                )
            records.append(header_vars)

    # passing columns keeps the expected layout even when no file matched
    eframe = pd.DataFrame(records, columns=list(edgar_vars.keys()))
    eframe.to_csv(out, sep = ",", encoding = 'utf-8')
    print(f'Header File: {out} created')
    return eframe

In [38]:
# Build the output path with os.path.join: the previous literal
# '\filingsoutput.csv' contained the escape sequence '\f' (form feed), so the
# CSV was written to a mangled name next to, not inside, the project folder.
outfile = os.path.join(home, 'filingsoutput.csv')
edgar_dat = process_header(start_year, end_year, filings, outfile)
edgar_dat.head()

Header File: /Users/yubinkim/Dropbox/Summer paperilingsoutput.csv created


Unnamed: 0,file,cik,report_date,file_date,name,sic,hlink
0,0001035002-14-000012.txt,1035002,20131231,20140321,VALERO ENERGY CORP/TX,2911,http://www.sec.gov/Archives/edgar/data/1035002...
1,0001035002-15-000015.txt,1035002,20141231,20150320,VALERO ENERGY CORP/TX,2911,http://www.sec.gov/Archives/edgar/data/1035002...
2,0001035002-16-000079.txt,1035002,20151231,20160331,VALERO ENERGY CORP/TX,2911,http://www.sec.gov/Archives/edgar/data/1035002...
3,0001035002-17-000015.txt,1035002,20161231,20170323,VALERO ENERGY CORP/TX,2911,http://www.sec.gov/Archives/edgar/data/1035002...
4,0001193125-18-090034.txt,1035002,20180503,20180321,VALERO ENERGY CORP/TX,2911,http://www.sec.gov/Archives/edgar/data/1035002...


0    http://www.sec.gov/Archives/edgar/data/1035002...
1    http://www.sec.gov/Archives/edgar/data/1035002...
2    http://www.sec.gov/Archives/edgar/data/1035002...
3    http://www.sec.gov/Archives/edgar/data/1035002...
4    http://www.sec.gov/Archives/edgar/data/1035002...
5    http://www.sec.gov/Archives/edgar/data/55067/0...
Name: hlink, dtype: object
