In [1]:
import pandas as pd
import numpy as np
import wrds

In [2]:
# We use this everywhere
bea_codes = pd.read_excel('Data/User inputs/NAICS2BEA.xlsx')
bea_seg = pd.read_stata('Data/Temp/levelkey.dta')

# Ind_short to indcode
bea_seg.rename(columns = {'ind_short': 'indcode'}, inplace = True)

# Set up WRDS connection
db = wrds.Connection(wrds_username='tadej')

Loading library list...
Done


# Regulation index

All I do here is that I merge industry codes to the regulation index, and compute mean and median by industry group

In [3]:
# Import data
data_reg = pd.read_excel('Data/Raw inputs/regdata_by_3-digit_industry.xls')

# Rename columns, drop some
data_reg.rename(inplace = True, columns = {'industry': 'naics', 
                'Industry-relevant restriction count (Industry Regulation Index)': 'regindex'})
data_reg.drop(columns = ['Industry-relevant words'], inplace = True)

# Create log of reg
data_reg['logreg'] = np.log(data_reg['regindex'])

# Merge with  BEA codes
data_reg = data_reg.merge(bea_codes, how = 'left')

# Merge with BEA segments
data_reg = data_reg.merge(bea_seg, how = 'inner')

In [4]:
# Create some agg variables
gby = data_reg.groupby(['indcode', 'year'])

data_reg['a1m_regindex'] = gby['regindex'].transform(np.mean)
data_reg['a1med_regindex'] = gby['regindex'].transform(np.median)

data_reg['a1m_logreg'] = gby['logreg'].transform(np.mean)
data_reg['a1med_logreg'] = gby['logreg'].transform(np.median)

# Spreads

Here I do some industry code merging and light aggregating for the industry spread datasets by Glichrist and Zakrajšek

In [5]:
# Import data
data_spread = pd.read_csv('Data/Raw inputs/spr_naics3_q.csv', parse_dates = ['date'])

# Some date stuff
data_spread['year'] = data_spread['date'].dt.year
data_spread['qtr'] = data_spread['date'].dt.quarter

# Keep only 4th quarter
data_spread.query('qtr == 4', inplace = True)

# Reanme and drop columns
data_spread.rename(columns = {'naics3': 'naics'}, inplace = True)
data_spread.drop(columns = ['date'], inplace = True)

In [6]:
# Merge with bea codes
data_spread = data_spread.merge(bea_codes, how = 'left')

# Merge with bea segments
data_spread = data_spread.merge(bea_seg, how = 'inner')

In [7]:
# Some more or less aggregate stuff
data_spread.eval('nb_spavg = nbonds * spr_avg', inplace = True)

data_spread = data_spread.pivot_table(index = ['indcode', 'year'], 
                                      values = ['nbonds', 'nb_spavg'],
                                      aggfunc = np.sum).reset_index()

data_spread.eval('a1m_spread =  nb_spavg / nbonds', inplace = True)

# Keep stuff
data_spread = data_spread.filter(items = ['year', 'indcode', 'a1m_spread'])

# PDII

Here I do some industry code merging and light aggregating for the PDII occupational licensing datasets by Kleiner-Krueger

In [8]:
# Read in data
data_pdii = pd.read_stata('Data/Raw inputs/PDII_RDD_Survey.dta',
                          columns = ['baseid', 'q11', 'q11a', 'industry'])

# Generate new variables
data_pdii['q11ind'] = None
data_pdii.loc[data_pdii['q11'] == '2: no', 'q11ind'] = 0
data_pdii.loc[data_pdii['q11'] == '1: yes', 'q11ind'] = 1

data_pdii['q11aind'] = None
data_pdii.loc[data_pdii['q11a'] == '2: no', 'q11aind'] = 0
data_pdii.loc[data_pdii['q11a'] == '1: yes', 'q11aind'] = 1

# Generate 3 digit naics
data_pdii['naics'] = data_pdii.industry.astype(str).str.slice(0, 3).astype(int)

# Drop stuff
data_pdii.drop(columns = ['industry', 'q11', 'q11a'], inplace = True)

# Make vars numeric
data_pdii = data_pdii.astype(float)

In [9]:
# Merge with bea codes
data_pdii = data_pdii.merge(bea_codes, how = 'left')

# Merge with bea segments
data_pdii = data_pdii.merge(bea_seg, how = 'inner')

In [10]:
# Aggregate across ind_code
data_pdii = data_pdii.pivot_table(index = 'indcode', values = ['q11ind', 'q11aind'],
                                  aggfunc = np.mean)

# Rename
data_pdii.rename(columns = {'q11ind': 'a1m_licensed', 'q11aind': 'a1m_licreq'}, inplace = True)

# Bushee

This merges the Bushee dataset with Thomson Reuters 13F.
Bushee dataset is the one from the replication files, while the TR 13F is pulled from WRDS, specifically the `tfn.s34` database. I select the following columns:
- `year` as the year part of `rdate`
- `mgrno`, `shares`, `cusip`, `mgrname`

I filter the date so as to keep only the entries where the month part of `rdate` is 12.

After the merging I also group together some big investment firms. Here I exclude the recoding done in the stat file for Dimensional, because in the fund they assigned it the same number as Blackrock (typo I guess), and because all those firms they wanted to group under Dimensional already have the same number.

In [28]:
# Read in data, replace none
data_bushee = pd.read_stata('Data/Raw inputs/bushee_data_2015.dta')
data_bushee = data_bushee.replace('.', np.nan)

# Drop duplicated mgrno, year, keep max mgrno_v
data_bushee.sort_values('mgrno_v', ascending= False).reset_index(inplace = True)
data_bushee.drop_duplicates(['mgrno', 'year'], keep = 'last', inplace = True)

# Rename, drop
data_bushee = data_bushee.filter(items = ['year', 'mgrno', 'invpermclass'])\
                         .rename(columns = {'invpermclass': 'invclass'})

In [29]:
# Read in Thompson Reuters 13F
select_str = '''
SELECT date_part('year', rdate) AS year, mgrno, shares, cusip, mgrname
FROM tfn.s34 
WHERE date_part('month', rdate) = 12
'''

data_13f = db.raw_sql(select_str)

# Drop none cusip
data_13f.dropna(subset = ['cusip'], inplace = True)

# Drop duplicated mgrno&cusip&year -- in 97% of the cases shares are the same, 
# so it doesn't matter. As for the rest - if they don't care, neither do I
data_13f.drop_duplicates(subset = ['mgrno', 'cusip', 'year'], inplace = True)

In [30]:
# Merge Bushee with TR 13F
data_13f = data_13f.merge(data_bushee, how = 'outer')

In [32]:
# Create new variable for mgrno
data_13f['mgrno_mapped'] = data_13f['mgrno']

# Do some recoding
data_13f.loc[data_13f.mgrname.str.contains('BLACKROCK') == True, 'mrgno_mapped'] = 11386
data_13f.loc[data_13f.mgrname.str.contains('CAPITAL RESEARCH') == True, 'mrgno_mapped'] = 12740
data_13f.loc[data_13f.mgrname.str.contains('VANGUARD GROUP') == True, 'mrgno_mapped'] = 90457

fidelity = ["FIDELITY INTERNATIONAL", "FIDELITY INTERNATL LTD", 
    "FIDELITY INTL LTD", "FIDELITY INTL. LTD.", "FIDELITY MANAGEMENT & RESEARCH",
    "FIDELITY MGMT & RES CORP", "FIDELITY MGMT & RESEARCH (US)", "FIDELITY MGMT & RESEARCH CO"]

data_13f.loc[data_13f.mgrname.isin(fidelity), 'mrgno_mapped'] = 27700

state_str = ["STATE STR BK & TRUST CO BOSTON", "STATE STR CORP", 
    "STATE STR CORPORATION", "STATE STR GBL ADVR IRELAND LTD", "STATE STR RESEARCH & MGMT CO",
    "STATE STR RESEARCH & MGMT CO.", "STATE STR RESR & MGMT", "STATE STREET BOSTON CORP",
    "STATE STREET CORP", "STATE STREET RES. & MGMT", "STATE STREET RESR & MGMT"]

data_13f.loc[data_13f.mgrname.isin(state_str), 'mrgno_mapped'] = 81540

In [33]:
# Save what we have up to this point
data_13f.to_stata('Data/Intermediate/bushee_detailed.dta')

## Percentage ownership

Here I compute percentage ownership of each institution in each firm

In [42]:
# Drop some cols
data_13f = data_13f.filter(items = ['cusip', 'year', 'invclass', 'shares'])

# Sum by cusip, year, invclass
data_13f = data_13f.pivot_table

In [43]:
data_13f.columns

Index(['year', 'mgrno', 'shares', 'cusip', 'mgrname', 'invclass',
       'mrgno_mapped'],
      dtype='object')