# SEC Form 4 Data Collation
Full README.md see here: https://www.sec.gov/files/insider_transactions_readme.pdf 

# Overview of Merging Steps
1. Merge deriv_trans and nonderiv_trans by the same columns, rename primary key to trans_sk
2. Drop rows (keep only direct ownership, keep transactions with date before 2024 Q4, drop rows with NA transaction date, split dataset to those with 0 and non0 transaction amount (shares x price per share)
- we use the data with non0 transactions, total 3191965 rows, with transaction dates 1992 - 2024 
3. Merge with form 4 submission data (note that 1 submission usually have multiple transactions, max 30 each submission)
- this allows us to get submission/filing date and the CIK of shares they buy (ISSUECIK)

-- 4. Filter the transactions with Issue Trading Symbol (i.e. stock code) NOT in the historical stock price data, taken from Kaggle, in order to calculate abnormal returns later --

5. Merge with reporting owner data, to get name of the person who owns (i.e. acquires and disposes these shares)
- currently, we only merge those whos submission has only 1 reporting owner, only 1% have more than 1 reporting owner.

In [8]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.regression.rolling import RollingOLS
import yfinance as yf
pd.set_option('display.max_columns', None)

In [9]:
DATA_FOLDER = "FINAL_RAW_DATA"
LITIGATIONS_DATA_PATH = "infected.csv"
STOCK_PRICE_DATA_PATH = "dataset_summary.csv"
YEARS_THRESHOLD = (2005, 2021) # to match little sis network data

In [10]:
## Form Submission Main data
submission_data = pd.read_csv(f"{DATA_FOLDER}/SUBMISSION.csv") # ACCESSION_NUMBER is the primary key

## Transaction info for each submission (buy and sell), ACCESSION_NUMBER and (NON)DERIV_TRANS_SK are the primary keys
# One form (i.e. ACCESSION_NUMBER) can have multiple transactions (i.e. *_SK), transactions can be across multiple years, max 30 each 
# Duplicate *_SK keys are for different transactions, and there are max 2 of each duplicate _SK keys
nonderiv_trans_data = pd.read_csv(f"{DATA_FOLDER}/NONDERIV_TRANS.csv")
deriv_trans_data = pd.read_csv(f"{DATA_FOLDER}/DERIV_TRANS.csv")

## Holding info for each submission (what they have - After each transaction..?)
nonderiv_holding_data = pd.read_csv(f"{DATA_FOLDER}/NONDERIV_HOLDING.csv")
deriv_holding_data = pd.read_csv(f"{DATA_FOLDER}/DERIV_HOLDING.csv")

## Name info
reporting_owner_data = pd.read_csv(f"{DATA_FOLDER}/REPORTINGOWNER.csv")
names_data = pd.read_csv(f"{DATA_FOLDER}/OWNER_SIGNATURE.csv") 

## Additional info, to match with '*_FN' columns in all other datasets based on matching ACCESSION_NUMBER
footnotes_data = pd.read_csv(f"{DATA_FOLDER}/FOOTNOTES.csv")

  nonderiv_trans_data = pd.read_csv(f"{DATA_FOLDER}/NONDERIV_TRANS.csv")
  deriv_trans_data = pd.read_csv(f"{DATA_FOLDER}/DERIV_TRANS.csv")
  nonderiv_holding_data = pd.read_csv(f"{DATA_FOLDER}/NONDERIV_HOLDING.csv")
  deriv_holding_data = pd.read_csv(f"{DATA_FOLDER}/DERIV_HOLDING.csv")


In [11]:
# Overview of the data. Drop duplicates if any
dataframes = {
    'submission_data': submission_data,
    'nonderiv_trans_data': nonderiv_trans_data,
    'deriv_trans_data': deriv_trans_data,
    'nonderiv_holding_data': nonderiv_holding_data,
    'deriv_holding_data': deriv_holding_data,
    'reporting_owner_data': reporting_owner_data,
    'names_data': names_data,
    #'footnotes_data': footnotes_data
}

for name, df in dataframes.items():
    shape = df.shape
    print(f"{name}, {shape}")
    
    # Commented out because only names_data has duplicates, which is not used in merging

    #df.drop_duplicates(inplace=True)
    #if df.shape[0] != shape[0]:
    #    print(f"Duplicate rows removed: {shape[0] - df.shape[0]}")
    #else:
    #    print("No duplicate rows")
    #print()

submission_data, (2917488, 13)
nonderiv_trans_data, (4343860, 28)
deriv_trans_data, (1763084, 42)
nonderiv_holding_data, (1522788, 14)
deriv_holding_data, (1000283, 26)
reporting_owner_data, (3171123, 13)
names_data, (3119138, 3)


In [12]:
SELECTED_TRANSACTION_COLS = ['ACCESSION_NUMBER', 'SECURITY_TITLE', 'TRANS_DATE', 'DEEMED_EXECUTION_DATE', 'TRANS_CODE', 'EQUITY_SWAP_INVOLVED',
                             'TRANS_TIMELINESS', 'TRANS_SHARES', 'TRANS_PRICEPERSHARE', 'TRANS_ACQUIRED_DISP_CD',
                             'SHRS_OWND_FOLWNG_TRANS', 'DIRECT_INDIRECT_OWNERSHIP', 'NATURE_OF_OWNERSHIP']
# partial primary keys: 'NONDERIV_TRANS_SK', 'DERIV_TRANS_SK'
DERIV_TRANS_UNSURE_COLS = ['CONV_EXERCISE_PRICE', 'EQUITY_SWAP_INVOLVED', 'EXCERCISE_DATE', 'EXPIRATION_DATE', 'UNDLYNG_SEC_SHARES', 'UNDLYNG_SEC_VALUE']

SUBMISSION_COLS = ['ACCESSION_NUMBER', 'FILING_DATE', 'PERIOD_OF_REPORT', 'ISSUERCIK', 'ISSUERNAME', 'ISSUERTRADINGSYMBOL']

REPORTING_OWNER_COLS = ['RPTOWNERCIK', 'RPTOWNERNAME','RPTOWNER_RELATIONSHIP'] # there is also address data and filenumber

## Merge Transaction datasets

In [13]:
nonderiv_trans_data['TRANS_DATE'] = pd.to_datetime(nonderiv_trans_data['TRANS_DATE'], errors='coerce')
deriv_trans_data['TRANS_DATE'] = pd.to_datetime(deriv_trans_data['TRANS_DATE'], errors='coerce')

## Cannot directly concatenate the two dataframes as they have different columns some with identical names
df1 = nonderiv_trans_data[['NONDERIV_TRANS_SK'] + SELECTED_TRANSACTION_COLS].copy().rename(columns={'NONDERIV_TRANS_SK':'TRANS_SK'})
df2 = deriv_trans_data[['DERIV_TRANS_SK'] + SELECTED_TRANSACTION_COLS].copy().rename(columns={'DERIV_TRANS_SK':'TRANS_SK'})
all_transaction_data = pd.concat([df1,df2], axis=0, ignore_index=True).reset_index(drop=True)
print(all_transaction_data.shape)

(6106944, 14)


### Feature Engineering and Cleaning

In [14]:
## Create transaction amount column
all_transaction_data['trans_amt'] = all_transaction_data['TRANS_SHARES'] * all_transaction_data['TRANS_PRICEPERSHARE']

trans_amt_0 = all_transaction_data[all_transaction_data['trans_amt'] == 0].shape[0]
print(f"There are {trans_amt_0} transactions with amount 0, {trans_amt_0/(all_transaction_data.shape[0])*100:.2f}% of all transactions")

There are 2250213 transactions with amount 0, 36.85% of all transactions


In [15]:
# To ensure we can drop indirect ownership transactions
all_transaction_data['DIRECT_INDIRECT_OWNERSHIP'].value_counts()

DIRECT_INDIRECT_OWNERSHIP
D    5257448
I     849496
Name: count, dtype: int64

In [16]:
# Drop irrelevant rows

# Select only transactions from 2005 to 2021
all_transaction_data = all_transaction_data[(all_transaction_data['TRANS_DATE'].dt.year >= YEARS_THRESHOLD[0]) & (all_transaction_data['TRANS_DATE'].dt.year <= YEARS_THRESHOLD[1])]
print(all_transaction_data.shape)

# Filter out indirect ownership
all_transaction_direct = all_transaction_data[all_transaction_data['DIRECT_INDIRECT_OWNERSHIP'] == 'D']
print(all_transaction_direct.shape)

## Remove transactions with transaction date after 2024 Q4
all_transaction_direct = all_transaction_direct[all_transaction_direct['TRANS_DATE'] < pd.to_datetime('2024-12-31')]
print(all_transaction_direct.shape)

#Filter out rows with NA for transaction date
all_transaction_direct = all_transaction_direct[~(all_transaction_direct['TRANS_DATE'].isna())]
print(all_transaction_direct.shape)

#Split data where trans_amt is 0 and non0, use the non0 data
## e.g. non qualified stock option because is a form of compensation and will have 0 transaction amount
all_transaction_direct_comp = all_transaction_direct[all_transaction_direct['trans_amt'] == 0]
all_transaction_direct = all_transaction_direct[all_transaction_direct['trans_amt'] != 0]

print(all_transaction_direct.shape)

(4907535, 15)
(4213120, 15)
(4213120, 15)
(4213120, 15)
(2596384, 15)


In [17]:
all_transaction_direct[['TRANS_DATE']].describe() # now filtered

Unnamed: 0,TRANS_DATE
count,2596384
mean,2016-04-02 07:18:07.438528768
min,2005-01-02 00:00:00
25%,2013-05-19 00:00:00
50%,2016-02-05 00:00:00
75%,2019-02-14 00:00:00
max,2021-12-31 00:00:00


In [18]:
all_transaction_direct.head()

Unnamed: 0,TRANS_SK,ACCESSION_NUMBER,SECURITY_TITLE,TRANS_DATE,DEEMED_EXECUTION_DATE,TRANS_CODE,EQUITY_SWAP_INVOLVED,TRANS_TIMELINESS,TRANS_SHARES,TRANS_PRICEPERSHARE,TRANS_ACQUIRED_DISP_CD,SHRS_OWND_FOLWNG_TRANS,DIRECT_INDIRECT_OWNERSHIP,NATURE_OF_OWNERSHIP,trans_amt
0,2360796,0001179110-14-011078,Common Stock,2014-06-26,,S,0,,1046.0,28.17,D,43944.0,D,,29465.82
1,2360797,0001179110-14-011078,Common Stock,2014-06-26,,S,0,,637.0,28.17,D,43307.0,D,,17944.29
2,2360795,0001179110-14-011078,Common Stock,2014-06-26,,S,0,,1278.0,28.17,D,44990.0,D,,36001.26
5,2350316,0001144204-14-040532,Common Stock,2014-05-28,,A,0,,1176.0,10.6,A,50295.0,D,,12465.6
6,2350317,0001144204-14-040532,Common Stock,2014-06-27,,A,0,,4673.0,10.7,A,54968.0,D,,50001.1


## Merge with submission data

In [19]:
submission_data['FILING_DATE'] = pd.to_datetime(submission_data['FILING_DATE'], errors='coerce')
submission_data[['FILING_DATE']].describe() # 2011-2024

Unnamed: 0,FILING_DATE
count,2917488
mean,2017-10-29 01:33:20.134912
min,2011-01-03 00:00:00
25%,2014-04-08 00:00:00
50%,2017-10-11 00:00:00
75%,2021-05-04 00:00:00
max,2024-12-31 00:00:00


In [20]:
all_transaction_direct_2 = all_transaction_direct.merge(submission_data[SUBMISSION_COLS], on='ACCESSION_NUMBER', how='left')
all_transaction_direct_2.shape

(2596384, 20)

In [21]:
all_transaction_direct_2.head(2)

Unnamed: 0,TRANS_SK,ACCESSION_NUMBER,SECURITY_TITLE,TRANS_DATE,DEEMED_EXECUTION_DATE,TRANS_CODE,EQUITY_SWAP_INVOLVED,TRANS_TIMELINESS,TRANS_SHARES,TRANS_PRICEPERSHARE,TRANS_ACQUIRED_DISP_CD,SHRS_OWND_FOLWNG_TRANS,DIRECT_INDIRECT_OWNERSHIP,NATURE_OF_OWNERSHIP,trans_amt,FILING_DATE,PERIOD_OF_REPORT,ISSUERCIK,ISSUERNAME,ISSUERTRADINGSYMBOL
0,2360796,0001179110-14-011078,Common Stock,2014-06-26,,S,0,,1046.0,28.17,D,43944.0,D,,29465.82,2014-06-30,2014-06-26,926326,"OMNICELL, Inc",OMCL
1,2360797,0001179110-14-011078,Common Stock,2014-06-26,,S,0,,637.0,28.17,D,43307.0,D,,17944.29,2014-06-30,2014-06-26,926326,"OMNICELL, Inc",OMCL


In [22]:
all_transaction_direct_2['TRANS_DATE'].describe()

count                          2596384
mean     2016-04-02 07:18:07.438528768
min                2005-01-02 00:00:00
25%                2013-05-19 00:00:00
50%                2016-02-05 00:00:00
75%                2019-02-14 00:00:00
max                2021-12-31 00:00:00
Name: TRANS_DATE, dtype: object

In [23]:
all_transaction_direct_2['ISSUERTRADINGSYMBOL'].nunique()

13192

## Get Reporting Owner/Owner Signature
- currently the code joins only 1-1 matching, from all_transaction_direct_filterd to reporting_owner_data by 'ACCESSION_NUMBER'

In [24]:
# Get overalpping 1-1 matching of accession numbers across the three datasets to avoid duplicated rows from joining

## Get unique pk from all_transaction_direct_filterd
accession_num_unique = all_transaction_direct_2[['ACCESSION_NUMBER']].drop_duplicates()
print(accession_num_unique.shape)

## get unique pk from reporting_owner_data
matching_reporting_owner = reporting_owner_data[reporting_owner_data['ACCESSION_NUMBER'].isin(accession_num_unique['ACCESSION_NUMBER'])]
matching_rpt_pk = matching_reporting_owner[matching_reporting_owner['ACCESSION_NUMBER'].map(matching_reporting_owner['ACCESSION_NUMBER'].value_counts()) == 1][['ACCESSION_NUMBER']]
print(matching_rpt_pk.shape)

print("Final number of unique ACCESSION_NUMBERS with 1-1 matching names,", matching_rpt_pk.shape[0])

(1322672, 1)
(1306752, 1)
Final number of unique ACCESSION_NUMBERS with 1-1 matching names, 1306752


In [25]:
# Merge 1-1 matching data

## filter all_transaction_direct_filterd to get only ACCESSION_NUMBRE in matching_rpt_names_pk
all_transaction_direct_final = all_transaction_direct_2[all_transaction_direct_2['ACCESSION_NUMBER'].isin(matching_rpt_pk['ACCESSION_NUMBER'])]

## merge with reporting_owener_data 
all_transaction_direct_final = all_transaction_direct_final.merge(reporting_owner_data[['ACCESSION_NUMBER'] + REPORTING_OWNER_COLS], on='ACCESSION_NUMBER', how='left') 

print(all_transaction_direct_final.shape)
all_transaction_direct_final.head()

(2546985, 23)


Unnamed: 0,TRANS_SK,ACCESSION_NUMBER,SECURITY_TITLE,TRANS_DATE,DEEMED_EXECUTION_DATE,TRANS_CODE,EQUITY_SWAP_INVOLVED,TRANS_TIMELINESS,TRANS_SHARES,TRANS_PRICEPERSHARE,TRANS_ACQUIRED_DISP_CD,SHRS_OWND_FOLWNG_TRANS,DIRECT_INDIRECT_OWNERSHIP,NATURE_OF_OWNERSHIP,trans_amt,FILING_DATE,PERIOD_OF_REPORT,ISSUERCIK,ISSUERNAME,ISSUERTRADINGSYMBOL,RPTOWNERCIK,RPTOWNERNAME,RPTOWNER_RELATIONSHIP
0,2360796,0001179110-14-011078,Common Stock,2014-06-26,,S,0,,1046.0,28.17,D,43944.0,D,,29465.82,2014-06-30,2014-06-26,926326,"OMNICELL, Inc",OMCL,1453971,Ortigas-Wedekind Marga,Officer
1,2360797,0001179110-14-011078,Common Stock,2014-06-26,,S,0,,637.0,28.17,D,43307.0,D,,17944.29,2014-06-30,2014-06-26,926326,"OMNICELL, Inc",OMCL,1453971,Ortigas-Wedekind Marga,Officer
2,2360795,0001179110-14-011078,Common Stock,2014-06-26,,S,0,,1278.0,28.17,D,44990.0,D,,36001.26,2014-06-30,2014-06-26,926326,"OMNICELL, Inc",OMCL,1453971,Ortigas-Wedekind Marga,Officer
3,2350316,0001144204-14-040532,Common Stock,2014-05-28,,A,0,,1176.0,10.6,A,50295.0,D,,12465.6,2014-06-30,2014-05-28,1474464,"New York REIT, Inc.",NYRT,1545007,Bowman Scott J.,Director
4,2350317,0001144204-14-040532,Common Stock,2014-06-27,,A,0,,4673.0,10.7,A,54968.0,D,,50001.1,2014-06-30,2014-05-28,1474464,"New York REIT, Inc.",NYRT,1545007,Bowman Scott J.,Director


In [26]:
all_transaction_direct_final.to_csv(f"{DATA_FOLDER}/all_transactions_merged.csv")

# ######## DONE W MERGING HERE, @ Lewis can delete the rest below aha #######

# After Merging All transactions

In [27]:
all_transactions_final = pd.read_csv(f"{DATA_FOLDER}/all_transactions_merged.csv")
all_transactions_final.shape

(2546985, 24)

In [None]:
# keep only tickers that have more than 8 transactions
unique_ticker_trans_8above = pd.read_csv(f'{DATA_FOLDER}/unique_names_trans_8above.csv')
unique_ticker_trans_8above = df[['ISSUERTRADINGSYMBOL']].drop_duplicates()

all_transaction_ticker_final = all_transaction_direct_2[all_transaction_direct_2['ISSUERTRADINGSYMBOL'].isin(unique_ticker_trans_8above['TICKER'])]
all_transaction_ticker_final

# Test PERMNO - TICKER

In [None]:
#pd.set_option('display.max_rows', 100)
all_beta_daily.sort_values(by=["PERMNO", "DATE", "TICKER"], inplace=True)

# Calculate the percentage change in b_mkt within each (PERMNO, DATE) group
# all_beta_daily['b_mkt_pct_change'] = all_beta_daily.groupby(['PERMNO', 'DATE'])['b_mkt'].pct_change()

In [None]:
# get only permnos with 1 unique ticker
permno_ticker_unique = all_beta_daily.groupby(['TICKER'])['PERMNO'].agg(unique=lambda x: x.nunique()).reset_index()
permno_ticker_nonunique = permno_ticker_unique[permno_ticker_unique['unique'] > 1][['TICKER']]
# filter OUT permnos with multiple tickers, now ticker can be the primary key
all_beta_daily_nonunique_permno = all_beta_daily[all_beta_daily['TICKER'].isin(permno_ticker_nonunique['TICKER'])]
all_beta_daily_nonunique_permno.shape 

(6437, 14)

In [None]:
all_beta_daily_nonunique_permno.sort_values(by=["PERMNO", "DATE", "TICKER"], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_beta_daily_nonunique_permno.sort_values(by=["PERMNO", "DATE", "TICKER"], inplace=True)


In [None]:
all_beta_daily_nonunique_permno

Unnamed: 0,PERMNO,DATE,n,RET,alpha,b_mkt,b_smb,b_hml,b_umd,ivol,tvol,R2,exret,TICKER
118483,10225,2011-10-04,365,5.2583%,0.0003,0.9456,0.0951,-0.0406,0.0516,1.1265%,1.7524%,58.6780%,2.9278%,BEAM
118482,10225,2011-10-05,365,1.6313%,0.0003,0.9438,0.0876,-0.0193,0.0693,1.1248%,1.7535%,58.8503%,-0.1627%,BEAM
118481,10225,2011-10-06,365,1.1873%,0.0003,0.9413,0.0794,-0.0091,0.0805,1.1238%,1.7542%,58.9565%,-0.5741%,BEAM
118480,10225,2011-10-07,365,0.0652%,0.0004,0.9418,0.0671,-0.0124,0.1000,1.1240%,1.7540%,58.9300%,0.9294%,BEAM
118479,10225,2011-10-10,365,1.4115%,0.0004,0.9301,0.0665,-0.0192,0.1067,1.1263%,1.7485%,58.5031%,-1.7996%,BEAM
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
717128,11522,2000-08-25,365,0.0000%,0.0052,1.0646,0.5403,-0.3934,-0.7490,7.1399%,7.2428%,2.8192%,-0.3742%,BEAM
717127,11522,2000-08-28,365,-0.3289%,0.0053,1.0585,0.5397,-0.3847,-0.7410,7.1392%,7.2401%,2.7679%,-0.6747%,BEAM
717782,11522,2000-08-29,365,0.0000%,0.0052,1.0583,0.5379,-0.3825,-0.7375,7.1393%,7.2401%,2.7669%,-0.4307%,BEAM
717126,11522,2000-08-30,365,0.3300%,0.0052,1.0668,0.5410,-0.3813,-0.7433,7.1390%,7.2396%,2.7592%,0.6483%,BEAM


In [None]:
# get only permnos with 1 unique ticker
permno_ticker_unique = all_beta_daily.groupby(['TICKER'])['PERMNO'].agg(unique=lambda x: x.nunique()).reset_index()
permno_ticker_unique = permno_ticker_unique[permno_ticker_unique['unique'] == 2][['TICKER']]
# filter OUT permnos with multiple tickers, now ticker can be the primary key
two_permnos = all_beta_daily[all_beta_daily['TICKER'].isin(permno_ticker_unique['TICKER'])]
two_permnos.shape 

(4017849, 14)

In [None]:
all_beta_daily[all_beta_daily['TICKER']=='HVT'].sort_values(by='DATE')

Unnamed: 0,PERMNO,DATE,n,RET,alpha,b_mkt,b_smb,b_hml,b_umd,ivol,tvol,R2,exret,TICKER
165654,10294,1998-08-28,365,0.0000%,0.0018,0.6607,-0.5035,1.1322,-1.0397,2.9719%,3.0381%,4.3085%,0.5159%,HVT
165653,10294,1998-08-31,365,5.0000%,0.0019,0.5711,-0.6198,1.1562,-1.1007,2.9742%,3.0425%,4.4394%,5.5417%,HVT
165652,10294,1998-09-01,365,0.0000%,0.0019,0.5709,-0.6202,1.1397,-1.0946,2.9744%,3.0425%,4.4241%,0.9452%,HVT
165651,10294,1998-09-02,365,0.0000%,0.0019,0.5787,-0.6236,1.1648,-1.0914,2.9737%,3.0424%,4.4652%,0.8726%,HVT
165650,10294,1998-09-03,365,-5.3571%,0.0017,0.6308,-0.5688,1.2781,-1.0445,2.9851%,3.0564%,4.6100%,-4.8802%,HVT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159427,10294,2024-12-24,365,0.8297%,-0.0006,0.5503,0.9635,0.6465,-0.3095,2.4395%,2.7345%,20.4117%,0.5364%,HVT
159426,10294,2024-12-26,365,0.8229%,-0.0006,0.5515,0.9595,0.6466,-0.3132,2.4393%,2.7342%,20.4099%,-0.0812%,HVT
159425,10294,2024-12-27,365,0.3355%,-0.0005,0.5798,0.9387,0.6895,-0.3263,2.4182%,2.7234%,21.1554%,0.9594%,HVT
159424,10294,2024-12-30,365,-1.2156%,-0.0006,0.5760,0.9471,0.6757,-0.3127,2.4170%,2.7180%,20.9250%,-1.2039%,HVT


In [None]:
# do those with 1 value per month have the same date usually? Can I use this date to filter out those with duplicate months? 
# do those with very few values actually appear in transaction data?? 
# all_beta_monthly[all_beta_monthly['TICKER']=='SKT'].sort_values('DATE').head(30)

In [None]:
(all_beta_monthly.groupby(['PERMNO'])['TICKER'].agg(un=lambda x: x.nunique()).reset_index()['un'] == 1).sum()
#groupby(['symbol'])['date'].agg(
#    count='count').reset_index().sort_values(by='count', ascending=False) 4/5 are 1-1 matching

4795

In [None]:
# get only permnos with 1 unique ticker
permno_ticker_unique = all_beta_monthly.groupby(['PERMNO'])['TICKER'].agg(unique=lambda x: x.nunique()).reset_index()
permno_ticker_unique = permno_ticker_unique[permno_ticker_unique['unique'] == 1][['PERMNO']]
permno_ticker_unique

Unnamed: 0,PERMNO
0,10026
2,10032
3,10044
5,10065
8,10104
...,...
6243,93427
6244,93428
6245,93429
6246,93434


In [None]:
# filter OUT permnos with multiple tickers, now ticker can be the primary key
all_beta_daily_unique_permno = all_beta_daily[all_beta_daily['TICKER'].isin(permno_ticker_unique['TICKER'])]
all_beta_daily_unique_permno.shape 

(17631134, 14)

In [None]:
(all_beta_daily_unique_permno.groupby(['PERMNO'])['TICKER'].agg(unique=lambda x: x.nunique()).reset_index()['unique'] > 1).sum()

902

In [None]:
all_beta_daily_unique_permno['PERMNO'].nunique()

4562

In [None]:
all_beta_daily_unique_permno['DATE'] = pd.to_datetime(all_beta_daily_unique_permno['DATE'], errors='coerce')

all_transaction_beta_1_1 = all_transaction_ticker_subset_test.merge(all_beta_daily_unique_permno[['TICKER', 'DATE', 'b_mkt']], left_on=['ISSUERTRADINGSYMBOL', 'TRANS_DATE'], right_on=['TICKER', 'DATE'], how='left')
all_transaction_beta_1_1.shape # should be 1735239 rows # THERE ARE DUPLICATED ROWS CREATED IDK WHY . 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_beta_daily_unique_permno['DATE'] = pd.to_datetime(all_beta_daily_unique_permno['DATE'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_beta_daily_unique_permno['ISSUERTRADINGSYMBOL'] = all_beta_daily_unique_permno['TICKER']


(1735239, 23)

In [None]:
all_transaction_beta_1_1['b_mkt'].isna().sum() # 1619487

560204

In [None]:
560204/1735239

0.32283967799248403

In [None]:
# NA values for daily beta matching. 
275462/1724761

0.15971024391205507

In [None]:
# Monthly have slightly less NA values, but not significant. 
272474/1740426

0.156555923664666

In [None]:
12*(2024-1998)

312

In [None]:
# Inner join all_transaction_direct_2 with stock_price_data to get transaction data with avail stock price data
#all_transaction_direct_filterd = all_transaction_direct_2.merge(stock_price_data[['symbol']], left_on='ISSUERTRADINGSYMBOL', right_on='symbol', how='inner')
#all_transaction_direct_filterd.drop(columns=['symbol'], inplace=True)
#all_transaction_direct_filterd.shape

(2068637, 20)

## I am not NOT matching by Kaggle, since Kaggle data is INCOMPLETE

# #######STEPS HERE ONWARDS REQUIRE MODIFICATION, REFER TO GOOGLE DOC COMMENTS########

# Calculate Abnormal Returns 
Details refer to google doc: https://docs.google.com/document/d/12yqX3IYZeSGIn1X3HTskqh4hbwJfwo3XjEGcFEs5IqY/edit?tab=t.0 

### Prepare kaggle dataset and filter SEC Form 4 data

In [None]:
# Get Kaggle data for stocks from 1998-2021
stock_price_daily_data = pd.read_csv(f"{DATA_FOLDER}/stock_prices_latest.csv")
print("Starting shape for stock_price_daily_data", stock_price_daily_data.shape)

# Select s&p500 stocks as this is used as our market stock data
market_stock_data = stock_price_daily_data[stock_price_daily_data['symbol'] == 'SNP']

# Remove 7786 - 5413 symbols that do not exist in all_transaction_direct_final
unique_trading_symbols = all_transaction_direct_final[['ISSUERTRADINGSYMBOL']].drop_duplicates()
stock_price_daily_data = stock_price_daily_data[stock_price_daily_data['symbol'].isin(unique_trading_symbols['ISSUERTRADINGSYMBOL'])]
print("Shape after removing symbols", stock_price_daily_data.shape)

# Filter data to only include years 2005-2021
stock_price_daily_data['date'] = pd.to_datetime(stock_price_daily_data['date'], errors='coerce')
stock_price_daily_data_filtered = stock_price_daily_data[(stock_price_daily_data['date'].dt.year >= YEARS_THRESHOLD[0]) & (stock_price_daily_data['date'].dt.year <= YEARS_THRESHOLD[1])]
print("Filtered rows from stock_price_daily_data", stock_price_daily_data.shape[0] - stock_price_daily_data_filtered.shape[0])

stock_price_daily_data_filtered.sort_values(by=['symbol', 'date'], inplace=True)


## Create all_transaction_direct_final['actual_returns']
- Actual Return(on day t) = (Close Price on day t - Close Price day t-1) /  Close Price day t-1 
- Use kaggle dataset which contains daily historical stock price data, to get the close price on day t-1

In [None]:
# Step 0: Merge transaction data with stock prices for sanity check that values correspond.
test = all_transaction_direct_final.merge(stock_price_daily_data_filtered, left_on=['ISSUERTRADINGSYMBOL', 'TRANS_DATE'],
                                  right_on=['symbol', 'date'], how='left')
## Corresponds to 3/4 of dataset, where difference in TRANS_PRICEPERSHARE and close is less than 1. 
test['Compare_Values'] = test['TRANS_PRICEPERSHARE'] - test['close']
print("Most TRANS_PRICEPERSHARE correspond to the price on the market", test[test['Compare_Values'] < 1].shape[0]/test.shape[0])
print("There are some NA values: no historical stock price data on the day", test['close'].isna().sum()/test.shape[0])

# Step 1: Merge transactions with stock prices
all_transaction_direct_final_returns = all_transaction_direct_final.merge(stock_price_daily_data_filtered, 
                                            left_on=['ISSUERTRADINGSYMBOL', 'TRANS_DATE'], right_on=['symbol', 'date'], how='left')

# Step 2: Merge to get previous day's stock price
stock_price_daily_data_filtered['prev_close'] = stock_price_daily_data_filtered.groupby('symbol')['close_adjusted'].shift(1)

# Merge with transactions (again) to get previous day's stock price
all_transaction_direct_final_returns = all_transaction_direct_final_returns.merge(stock_price_daily_data_filtered[['symbol', 'date', 'prev_close']], 
                                  left_on=['ISSUERTRADINGSYMBOL', 'TRANS_DATE'], right_on=['symbol', 'date'], how='left')

# Step 3: Compute actual return using different methods
all_transaction_direct_final_returns['actual_return'] = (all_transaction_direct_final_returns['close_adjusted'] - all_transaction_direct_final_returns['prev_close']) / all_transaction_direct_final_returns['prev_close']
print("There are some NA actual_returns:", all_transaction_direct_final_returns['actual_return'].isna().sum()/test.shape[0])

Most TRANS_PRICEPERSHARE correspond to the price on the market 0.7504986424818296
There are some NA values: no historical stock price data on the day 0.11947488628293956


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stock_price_daily_data_filtered['prev_close'] = stock_price_daily_data_filtered.groupby('symbol')['close_adjusted'].shift(1)


There are some NA actual_returns: 0.11989466623556791


## Create all_transaction_direct_final['expected_returns']

### Create dataframe with risk_free_rate for each unique Year and Month
Download risk free rate data here: https://fred.stlouisfed.org/series/TB3MS

In [None]:
# Load the manually downloaded risk-free rate data
risk_free_rate = pd.read_csv(f"{DATA_FOLDER}/Risk_Free_Rate_TB3MS.csv")

# Convert date column to datetime
risk_free_rate['DATE'] = pd.to_datetime(risk_free_rate['observation_date'])

# Convert Rate to Decimal (Divide by 100)
risk_free_rate['Risk_Free_Rate'] = pd.to_numeric(risk_free_rate['TB3MS'], errors='coerce') / 100  

# Extract Year and Month
risk_free_rate['Year'] = risk_free_rate['DATE'].dt.year.astype(int)
risk_free_rate['Month'] = risk_free_rate['DATE'].dt.month.astype(int)

# Keep only necessary columns
risk_free_rate = risk_free_rate[['Year', 'Month', 'Risk_Free_Rate']]
risk_free_rate

Unnamed: 0,Year,Month,Risk_Free_Rate
0,1998,1,0.0504
1,1998,2,0.0509
2,1998,3,0.0503
3,1998,4,0.0495
4,1998,5,0.0500
...,...,...,...
283,2021,8,0.0005
284,2021,9,0.0004
285,2021,10,0.0005
286,2021,11,0.0005


In [None]:
# Merge with transaction data
all_transaction_direct_final_returns['Year'] = all_transaction_direct_final_returns['TRANS_DATE'].dt.year
all_transaction_direct_final_returns['Month'] = all_transaction_direct_final_returns['TRANS_DATE'].dt.month
all_transaction_direct_final_returns = all_transaction_direct_final_returns.merge(risk_free_rate, on = ['Year', 'Month'], how = 'left')

### Create dataframe with beta values for each Stock, for each Year and Month
1. Use historical Stock returns: Compute daily returns for each stock from the Kaggle dataset (unique for each company)
2. Get Get historical market returns: Compute daily returns for a market index (like S&P 500)
3. Run a linear regression: Regress the stock's returns against the market's returns.


In [None]:
# Download S&P 500 index data
sp500 = yf.download("^GSPC", start="1998-01-01", end="2022-12-31")
sp500.reset_index(inplace=True)

if isinstance(sp500.columns, pd.MultiIndex):
    sp500.columns = sp500.columns.get_level_values(0)

# Rename the 'Date' column to 'TRANS_DATE'
sp500.rename(columns={'Date': 'TRANS_DATE'}, inplace=True)

# Calculate Percentage Change to get actual market returns
sp500['market_return'] = sp500['Close'].pct_change()
sp500 = sp500[['TRANS_DATE', 'market_return']]

# Merge market data with transactions 
all_transaction_direct_final_returns = all_transaction_direct_final_returns.merge(sp500, on = 'TRANS_DATE', how = 'left')
print("Rows of where market_return is NA: ", all_transaction_direct_final_returns['market_return'].isna().sum()/all_transaction_direct_final_returns.shape[0])

YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  1 of 1 completed


Rows of where market_return is NA:  0.059219484876356844


### METHOD 1 FOR BETA, WHICH IS PROBABLY WRONG

In [None]:
all_transaction_direct_final_returns['Excess_Return'] = all_transaction_direct_final_returns['actual_return'] - all_transaction_direct_final_returns['Risk_Free_Rate']
all_transaction_direct_final_returns['Excess_Market_Return'] = all_transaction_direct_final_returns['market_return'] - all_transaction_direct_final_returns['Risk_Free_Rate']

window_size = 60
min_window = 24
all_transaction_direct_final_returns['Beta'] = np.nan

for ticker, group_data in all_transaction_direct_final_returns.groupby('ISSUERTRADINGSYMBOL'):
    
    group_data.dropna(subset=['Excess_Return', 'Excess_Market_Return'], inplace=True)
    
    if len(group_data) < min_window:
        continue  
    group_data = group_data.sort_values('TRANS_DATE')
    
    y = group_data['Excess_Return'] 
    X = sm.add_constant(group_data['Excess_Market_Return'])

    rolling_model = RollingOLS(endog = y, exog = X, window = min(group_data.shape[0], 60) )
    rolling_results = rolling_model.fit()

    #just get the Beta
    beta_series = rolling_results.params['Excess_Market_Return']
    all_transaction_direct_final_returns.loc[group_data.index, 'Beta'] = beta_series

In [None]:
all_transaction_direct_final_returns['expected_return'] = all_transaction_direct_final_returns['Risk_Free_Rate'] + all_transaction_direct_final_returns['Beta'] * (all_transaction_direct_final_returns['actual_return'] - all_transaction_direct_final_returns['Risk_Free_Rate'])
all_transaction_direct_final_returns['abnormal_return'] = all_transaction_direct_final_returns["actual_return"] - all_transaction_direct_final_returns['expected_return']

### METHOD 2 FOR BETA, WHICH IS TAKES TOO LONG FOR ME TO COMPUTE.. ODDLY

In [None]:
sp500['Year'] = sp500['TRANS_DATE'].dt.year
sp500['Month'] = sp500['TRANS_DATE'].dt.month

In [None]:
# Define function to compute rolling monthly beta (5 years = 60 months, minimum 2 years = 24 months)
# Aggregate stock and market excess returns to monthly level
monthly_all_transaction = all_transaction_direct_final_returns[['ISSUERTRADINGSYMBOL', 'Year', 'Month','Excess_Return']].groupby(['ISSUERTRADINGSYMBOL', 'Year', 'Month']).mean().reset_index()
# Merge stock and market excess returns
sp500_excess = sp500.merge(risk_free_rate, on = ['Year', 'Month'], how = 'left')
sp500_excess['Excess_Market_Return'] = sp500_excess['market_return'] - sp500_excess['Risk_Free_Rate']
monthly_data = monthly_all_transaction.merge(sp500_excess[['Year', 'Month', 'Excess_Market_Return']], 
                                           on=['Year', 'Month'], how='left')
print(monthly_data.columns)


def compute_rolling_beta(df, window=60, min_window=24):  
    df = df.dropna(subset=['Excess_Return', 'Excess_Market_Return'])
    rolling_betas = []

    for i in range(min_window, len(df)):
        subset = df.iloc[max(0, i-window):i]  # Use at most `window` months, but at least `min_window`
        X = subset['Excess_Market_Return']
        y = subset['Excess_Return']
        X = sm.add_constant(X)  # Add intercept

        model = sm.OLS(y, X).fit()
        rolling_betas.append((df.iloc[i]['Year'], df.iloc[i]['Month'], model.params['Excess_Market_Return']))

    return pd.DataFrame(rolling_betas, columns=['Year', 'Month', 'monthly_beta'])

# Compute rolling beta for each stock
monthly_betas = monthly_data.groupby('ISSUERTRADINGSYMBOL').apply(compute_rolling_beta).reset_index(drop=True)

Index(['ISSUERTRADINGSYMBOL', 'Year', 'Month', 'Excess_Return',
       'Excess_Market_Return'],
      dtype='object')


KeyboardInterrupt: 

In [None]:
all_transaction_direct_final_returns['Beta'].isna().sum()

501658

In [None]:
501658/all_transaction_direct_final_returns.shape[0]

0.24687687160097577

# Testing 

In [None]:
all_transaction_direct_final_returns[['ISSUERTRADINGSYMBOL', 'TRANS_DATE','actual_return', 'Risk_Free_Rate', 'Beta', 'abnormal_return']]

Unnamed: 0,ISSUERTRADINGSYMBOL,TRANS_DATE,actual_return,Risk_Free_Rate,Beta,abnormal_return
0,OMCL,2014-06-26,-0.001760,0.0004,1.343669,0.000742
1,OMCL,2014-06-26,-0.001760,0.0004,1.210671,0.000455
2,OMCL,2014-06-26,-0.001760,0.0004,1.137429,0.000297
3,NYRT,2014-05-28,0.004758,0.0003,,
4,NYRT,2014-06-27,-0.014808,0.0004,,
...,...,...,...,...,...,...
2032012,UFPI,2020-06-30,0.014757,0.0016,0.280470,0.009467
2032013,UFPI,2020-06-30,0.014757,0.0016,0.214228,0.010338
2032014,UFPI,2020-06-30,0.014757,0.0016,0.186849,0.010699
2032015,UFPI,2020-06-30,0.014757,0.0016,0.143713,0.011266


In [None]:
nvda_beta = pd.read_csv('FINAL_RAW_DATA/beta_NVDA.csv')
nvda_beta['Date'] = pd.to_datetime(nvda_beta['DATE'])
nvda_trans = all_transaction_direct_final_returns[(all_transaction_direct_final_returns['ISSUERTRADINGSYMBOL']=='NVDA')]

In [None]:
lewis_nvda_beta = pd.read_csv('FINAL_RAW_DATA/beta_values.csv')

In [None]:
beta_compare = nvda_beta.merge(all_transaction_direct_final_returns, left_on='Date', right_on='TRANS_DATE', how='left')
beta_compare[['Date', 'b_mkt', 'Beta']]

Unnamed: 0,Date,b_mkt,Beta
0,2005-01-31,2.1744,
1,2005-02-28,2.2077,
2,2005-03-31,1.9375,
3,2005-04-29,2.0028,
4,2005-05-31,2.1863,
...,...,...,...
142656,2020-12-31,1.6559,-0.463674
142657,2020-12-31,1.6559,-0.468066
142658,2020-12-31,1.6559,-0.108343
142659,2020-12-31,1.6559,-0.203310


In [None]:
beta_compare.head(1)

Unnamed: 0,PERMNO,DATE,n,RET,alpha,b_mkt,b_smb,b_hml,b_umd,ivol,tvol,R2,exret,TICKER,Date,TRANS_SK,ACCESSION_NUMBER,SECURITY_TITLE,TRANS_DATE,DEEMED_EXECUTION_DATE,TRANS_CODE,EQUITY_SWAP_INVOLVED,TRANS_TIMELINESS,TRANS_SHARES,TRANS_PRICEPERSHARE,TRANS_ACQUIRED_DISP_CD,SHRS_OWND_FOLWNG_TRANS,DIRECT_INDIRECT_OWNERSHIP,NATURE_OF_OWNERSHIP,trans_amt,FILING_DATE,PERIOD_OF_REPORT,ISSUERCIK,ISSUERNAME,ISSUERTRADINGSYMBOL,RPTOWNERCIK,RPTOWNERNAME,RPTOWNER_RELATIONSHIP,symbol_x,date_x,open,high,low,close,close_adjusted,volume,split_coefficient,symbol_y,date_y,prev_close,actual_return,Year,Month,Risk_Free_Rate,market_return,Excess_Return,Excess_Market_Return,Beta,expected_return,abnormal_return
0,86580,2005-01-31,60,-2.7165%,0.0455,2.1744,1.6452,0.0082,-1.0396,20.6319%,27.2428%,42.6443%,9.1165%,NVDA,2005-01-31,,,,NaT,,,,,,,,,,,,NaT,,,,,,,,,NaT,,,,,,,,,NaT,,,,,,,,,,,


In [None]:

beta_compare.sort_values(by=['ISSUERTRADINGSYMBOL', 'DATE']).to_csv('FINAL_RAW_DATA/beta_compare.csv', index=False)

In [None]:
beta_compare[['ISSUE']] #.to_csv('FINAL_RAW_DATA/beta_compare.csv', index=False)

In [None]:
nvda_beta['b_mkt'].isna().sum()

0

In [None]:
beta_compare[['Date', 'b_mkt', 'Beta']].drop_duplicates()

Unnamed: 0,Date,b_mkt,Beta
0,NaT,,
2,2005-01-31,2.1744,
4,2005-02-28,2.2077,
27,2005-03-31,1.9375,
33,2005-04-29,2.0028,
...,...,...,...
1939152,NaT,,1.374990
1939153,NaT,,1.472469
1939154,NaT,,1.428247
1939194,NaT,,0.293992


In [None]:
beta_compare[(beta_compare['b_mkt'] - beta_compare['Beta'])< 10][['DATE', 'b_mkt', 'Beta']]

Unnamed: 0,DATE,b_mkt,Beta
339,2014-11-28,1.7145,1.671422
340,2014-11-28,1.7145,1.529918
375,2015-02-27,1.8494,0.357698
376,2015-02-27,1.8494,0.539095
493,2016-03-31,1.1137,0.818705
546,2016-09-30,0.8842,1.751239
547,2016-09-30,0.8842,1.687791
735,2019-06-28,2.221,2.561233
843,2020-03-31,1.494,1.457157


In [None]:
sp500[(sp500['TRANS_DATE'].dt.year >= YEARS_THRESHOLD[0]) & (sp500['TRANS_DATE'].dt.year <= YEARS_THRESHOLD[1])].shape

(4280, 2)

In [None]:
market_stock_data['date'] = pd.to_datetime(market_stock_data['date'], errors='coerce')
market_stock_data[(market_stock_data['date'].dt.year >= YEARS_THRESHOLD[0]) & (market_stock_data['date'].dt.year <= YEARS_THRESHOLD[1])].shape

(4136, 9)

In [None]:
import pandas as pd
import pandas_market_calendars as mcal

# Get the NYSE trading calendar
nyse = mcal.get_calendar("NYSE")

# Define the date range
schedule = nyse.schedule(start_date="2005-01-01", end_date="2021-06-14")

# Count the number of trading days
total_trading_days = len(schedule)

print(f"Total U.S. Trading Days (2005-2021): {total_trading_days}")


Total U.S. Trading Days (2005-2021): 4140


In [None]:
# 4136

In [None]:
market_stock_data['date'].describe()

count                             5191
mean     2011-02-14 03:52:44.476979456
min                2000-10-18 00:00:00
25%                2005-12-17 12:00:00
50%                2011-02-14 00:00:00
75%                2016-04-12 12:00:00
max                2021-06-08 00:00:00
Name: date, dtype: object

In [None]:
stock_price_daily_data_filtered['symbol'].nunique()

5367

In [None]:
apl_goog_df = all_transaction_direct_final_returns[(all_transaction_direct_final_returns['ISSUERTRADINGSYMBOL']=='AAPL')|(all_transaction_direct_final_returns['ISSUERTRADINGSYMBOL']=='GOOG')] 
#[['ISSUERTRADINGSYMBOL', 'TRANS_DATE','TRANS_SHARES', 'TRANS_PRICEPERSHARE','trans_amt','actual_return']]

Unnamed: 0,ISSUERTRADINGSYMBOL,TRANS_DATE,TRANS_SHARES,TRANS_PRICEPERSHARE,trans_amt,actual_return
706,AAPL,2014-06-24,1391.0,90.83,126344.53,-0.006056
707,AAPL,2014-06-24,2900.0,91.47,265263.00,-0.006056
2810,GOOG,2014-06-18,4108.0,550.00,2259400.00,0.019079
3617,GOOG,2014-06-16,1179.0,549.26,647577.54,-0.013557
3618,GOOG,2014-06-16,1179.0,557.25,656997.75,-0.013557
...,...,...,...,...,...,...
2022263,AAPL,2015-04-15,1225.0,,,0.003800
2022865,AAPL,2015-04-01,53056.0,,,-0.001447
2022866,AAPL,2015-04-01,140126.0,,,-0.001447
2025202,AAPL,2019-08-24,560000.0,,,


In [None]:
test[['symbol', 'date','TRANS_SHARES', 'TRANS_PRICEPERSHARE','trans_amt','open', 'high', 'low', 'close', 'close_adjusted']].iloc[:100]
test['Compare_Values'] = test['TRANS_PRICEPERSHARE'] - test['close']

In [None]:
pd.set_option('max_colwidth', None)
footnotes_data[footnotes_data['ACCESSION_NUMBER']=='0000899243-20-018390']

Unnamed: 0,ACCESSION_NUMBER,FOOTNOTE_ID,FOOTNOTE_TXT
6445329,0000899243-20-018390,F1,"Each deferred share unit is the economic equivalent of one share of the Issuer's Common Shares. Generally, thirty days following the date that the Reporting Person ceases to serve on the board of directors of the Issuer, the deferred share units will be settled in cash based on the volume weighted average price of the Common Shares for the five immediately preceding days on which the Common Shares were trading on the Toronto Stock Exchange; provided, however that if the Issuer's Common Shares are trading on more than one exchange at such time, such calculation shall be based on the trading price over such five day period on the stock exchange with the higher average trading volume over the twenty trading days immediately prior to such date (such price the ""Market Value"")."
6445330,0000899243-20-018390,F2,"The price used to calculate the number of DSUs granted was C$1.67, which was the Market Value of the Issuer's Common Shares on the date of grant, converted from Canadian dollars to U.S. dollars using the Bank of Canada daily average exchange rate for the quarter ending June, 30 2020 of C$1.3889 = US $1.00."


In [None]:
test[test['Compare_Values'] < 1][['ACCESSION_NUMBER','symbol','SECURITY_TITLE', 'date','SHRS_OWND_FOLWNG_TRANS','Compare_Values','TRANS_SHARES','TRANS_PRICEPERSHARE','open', 'high', 'low', 'close', 'close_adjusted']]

Unnamed: 0,ACCESSION_NUMBER,symbol,SECURITY_TITLE,date,SHRS_OWND_FOLWNG_TRANS,Compare_Values,TRANS_SHARES,TRANS_PRICEPERSHARE,open,high,low,close,close_adjusted
0,0001179110-14-011078,OMCL,Common Stock,2014-06-26,43944.0,-0.19,1046.0,28.17,28.52,28.52,28.04,28.36,28.3600
1,0001179110-14-011078,OMCL,Common Stock,2014-06-26,43307.0,-0.19,637.0,28.17,28.52,28.52,28.04,28.36,28.3600
2,0001179110-14-011078,OMCL,Common Stock,2014-06-26,44990.0,-0.19,1278.0,28.17,28.52,28.52,28.04,28.36,28.3600
3,0001144204-14-040532,NYRT,Common Stock,2014-05-28,50295.0,0.04,1176.0,10.60,10.39,10.60,10.39,10.56,4.1393
4,0001144204-14-040532,NYRT,Common Stock,2014-06-27,54968.0,-0.60,4673.0,10.70,11.41,11.59,11.30,11.30,4.4442
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2032254,0001214659-20-005978,UFPI,Phantom Stock Units,2020-06-30,120089.0,0.00,29.0,49.51,48.40,49.95,47.96,49.51,49.5100
2032255,0001214659-20-005977,UFPI,phantom stock units,2020-06-30,26624.0,0.00,29.0,49.51,48.40,49.95,47.96,49.51,49.5100
2032256,0001214659-20-005976,UFPI,Phantom Stock Units,2020-06-30,16201.0,0.00,15.0,49.51,48.40,49.95,47.96,49.51,49.5100
2032257,0001214659-20-005975,UFPI,phantom stock units,2020-06-30,83921.0,0.00,29.0,49.51,48.40,49.95,47.96,49.51,49.5100


In [None]:
# Calculate abnormal returns
# all_transaction_direct_final['abnormal_returns'] = all_transaction_direct_final['actual_returns'] - all_transaction_direct_final['expected_returns']

# Getting subset of Data for Caitlyn's network analysis

In [None]:
# with TRANS_DATE: 2342363, without: 365605, without ISSUERNAME: 356387, without 'ISSUERCIK': 354851
unique_df = all_transaction_direct_final[['RPTOWNERNAME', 'RPTOWNERCIK', 'ISSUERTRADINGSYMBOL']].drop_duplicates()
unique_dates_df = all_transaction_direct_final[['TRANS_DATE', 'RPTOWNERNAME', 'RPTOWNERCIK', 'ISSUERTRADINGSYMBOL']].drop_duplicates()

In [None]:
all_transaction_direct_final.shape

(2492139, 23)

# Get tickers and reporting owners from 2005 to 2021, that have transacted more than 8 times

In [None]:
names_data_fr_caitlyn = all_transaction_direct_final[(all_transaction_direct_final['TRANS_DATE'].dt.year >= 2005) & (all_transaction_direct_final['TRANS_DATE'].dt.year <= 2021)]

In [None]:
names_data_fr_caitlyn_grouped = names_data_fr_caitlyn[['RPTOWNERNAME', 'ISSUERTRADINGSYMBOL', 'RPTOWNERCIK', 'ISSUERCIK', 'ISSUERNAME']]

In [None]:
names_data_fr_caitlyn_grouped

Unnamed: 0,RPTOWNERNAME,ISSUERTRADINGSYMBOL,RPTOWNERCIK,ISSUERCIK,ISSUERNAME
0,Ortigas-Wedekind Marga,OMCL,1453971,926326,"OMNICELL, Inc"
1,Ortigas-Wedekind Marga,OMCL,1453971,926326,"OMNICELL, Inc"
2,Ortigas-Wedekind Marga,OMCL,1453971,926326,"OMNICELL, Inc"
3,Bowman Scott J.,NYRT,1545007,1474464,"New York REIT, Inc."
4,Bowman Scott J.,NYRT,1545007,1474464,"New York REIT, Inc."
...,...,...,...,...,...
2492134,WEBSTER PATRICK M,UFPI,1405589,912767,UFP INDUSTRIES INC
2492135,UHLIG-EASTIN CHAD C.,UFPI,1686395,912767,UFP INDUSTRIES INC
2492136,Tutas David A.,UFPI,1567995,912767,UFP INDUSTRIES INC
2492137,MISSAD MATTHEW J,UFPI,1186284,912767,UFP INDUSTRIES INC


In [None]:
names_unique_caitlyn = names_data_fr_caitlyn_grouped.groupby(['RPTOWNERCIK'])['RPTOWNERCIK'].agg(
    count='count').reset_index().sort_values(by='count', ascending=False)
names_unique_caitlyn = names_unique_caitlyn[names_unique_caitlyn['count'] > 8][['RPTOWNERCIK']]
names_unique_caitlyn

In [None]:
names_unique_caitlyn = names_unique_caitlyn[names_unique_caitlyn['count'] > 8][['RPTOWNERCIK']]
names_unique_caitlyn

Unnamed: 0,RPTOWNERCIK
22481,1295231
22430,1294693
22467,1295032
6668,1184940
4414,1141722
...,...
28192,1367882
39619,1501805
28775,1376087
11788,1208654


In [None]:
names_data_fr_caitlyn_grouped

Unnamed: 0,RPTOWNERNAME,ISSUERTRADINGSYMBOL,RPTOWNERCIK,ISSUERCIK,ISSUERNAME
0,Ortigas-Wedekind Marga,OMCL,1453971,926326,"OMNICELL, Inc"
1,Ortigas-Wedekind Marga,OMCL,1453971,926326,"OMNICELL, Inc"
2,Ortigas-Wedekind Marga,OMCL,1453971,926326,"OMNICELL, Inc"
3,Bowman Scott J.,NYRT,1545007,1474464,"New York REIT, Inc."
4,Bowman Scott J.,NYRT,1545007,1474464,"New York REIT, Inc."
...,...,...,...,...,...
2492134,WEBSTER PATRICK M,UFPI,1405589,912767,UFP INDUSTRIES INC
2492135,UHLIG-EASTIN CHAD C.,UFPI,1686395,912767,UFP INDUSTRIES INC
2492136,Tutas David A.,UFPI,1567995,912767,UFP INDUSTRIES INC
2492137,MISSAD MATTHEW J,UFPI,1186284,912767,UFP INDUSTRIES INC


In [None]:
names_data_fr_caitlyn_final = names_data_fr_caitlyn_grouped.merge(names_unique_caitlyn, on = 'RPTOWNERCIK', how='inner').drop_duplicates()
names_data_fr_caitlyn_final.shape

(68558, 5)

In [None]:
names_data_fr_caitlyn_final

Unnamed: 0,RPTOWNERNAME,ISSUERTRADINGSYMBOL,RPTOWNERCIK,ISSUERCIK,ISSUERNAME
0,Ortigas-Wedekind Marga,OMCL,1453971,926326,"OMNICELL, Inc"
3,Bowman Scott J.,NYRT,1545007,1474464,"New York REIT, Inc."
5,VANDERHORST DAVID,TAIT,1317793,942126,TAITRON COMPONENTS INC
9,Ngo Nhat H,OMCL,1305020,926326,"OMNICELL, Inc"
12,Marks Alan Lee,EBAY,1434414,1065088,EBAY INC
...,...,...,...,...,...
1912724,MATHER COURTNEY,CZR,1608717,858339,CAESARS ENTERTAINMENT Corp
1912953,Wider Todd,ARYA,1642715,1746037,ARYA SCIENCES ACQUISITION CORP.
1912954,Conroy Kevin T,ARYA,1306119,1746037,ARYA SCIENCES ACQUISITION CORP.
1913232,"Match Group, Inc.",MTCH,891103,1575189,"Match Group Holdings II, LLC"


In [None]:
#names_data_fr_caitlyn_final.to_csv(f'{DATA_FOLDER}/unique_names_trans_8above.csv', index=False)
#unique_dates_df.to_csv(f'{DATA_FOLDER}/rptowner_trade_with_date.csv', index=False)

In [None]:
#df = pd.read_csv(f'{DATA_FOLDER}/unique_names_trans_8above.csv')
#unique_ticker_trans_8above = df[['ISSUERTRADINGSYMBOL']].drop_duplicates()
#unique_ticker_trans_8above

NameError: name 'pd' is not defined

In [None]:
#unique_ticker_trans_8above.to_csv(f'{DATA_FOLDER}/unique_ticker_trans_8above.csv',index=False)

# Top 100 companies by a) num of transactions and b) count of transactions

In [None]:
disposed_df = all_transaction_direct_final[all_transaction_direct_final['TRANS_ACQUIRED_DISP_CD']== 'D']
summary_df = disposed_df.groupby(['ISSUERCIK','ISSUERNAME'])['trans_amt'].agg(
    count='count',
    sum='sum').reset_index()
summary_df

Unnamed: 0,ISSUERCIK,ISSUERNAME,count,sum
0,1750,AAR CORP,328,1.635547e+08
1,1800,ABBOTT LABORATORIES,2826,1.268049e+09
2,1923,"SERVIDYNE, INC.",12,9.327140e+05
3,1947,ABT ASSOCIATES INC,1,9.622200e+04
4,2034,ACETO CORP,252,1.891873e+07
...,...,...,...,...
6274,1999001,Six Flags Entertainment Corporation/NEW,7,3.116466e+06
6275,2007919,"Inhibrx Biosciences, Inc.",0,0.000000e+00
6276,2012383,"BlackRock, Inc.",32,1.636032e+08
6277,2013745,"Calumet, Inc. /DE",7,6.901487e+06


In [None]:
summar_df_by_count = summary_df.sort_values(by='count', ascending=False)[:100]
summar_df_by_sum = summary_df.sort_values(by='sum', ascending=False)[:100]
summar_df_by_count.shape

(100, 4)

In [None]:
#summar_df_by_count.to_csv(f'{DATA_FOLDER}/summary_df_by_count.csv', index=False)
#summar_df_by_sum.to_csv(f'{DATA_FOLDER}/summary_df_by_sum.csv', index=False)

# Data Exploration

## Exploring transactions and holdings data

In [None]:
# Find overlapping ACCESSION_NUMBER values for nonderiv data
nonderv_holding_n = nonderiv_holding_data['ACCESSION_NUMBER'].nunique()
nonderv_trans_n = nonderiv_trans_data['ACCESSION_NUMBER'].nunique()
print(f"Unique ACCESSION_NUMBER for holdings: {nonderv_holding_n}, for transactions: {nonderv_trans_n}")

nonderiv_overlap_values = set(nonderiv_holding_data['ACCESSION_NUMBER']) & set(nonderiv_trans_data['ACCESSION_NUMBER'])
if nonderiv_overlap_values:
    print("Overlapping ACCESSION_NUMBER values found:", len(nonderiv_overlap_values), round(len(nonderiv_overlap_values) / min((nonderv_trans_n, nonderv_holding_n)), 2))
else:
    print("No overlap found.")

Unique ACCESSION_NUMBER for holdings: 805661, for transactions: 2200849
Overlapping ACCESSION_NUMBER values found: 603111 0.75


In [None]:
# Find overlapping ACCESSION_NUMBER values for deriv data
derv_holding_n = deriv_holding_data['ACCESSION_NUMBER'].nunique()
derv_trans_n = deriv_trans_data['ACCESSION_NUMBER'].nunique()
print(f"Unique ACCESSION_NUMBER for holdings: {derv_holding_n}, for transactions: {derv_trans_n}")

deriv_overlap_values = set(deriv_holding_data['ACCESSION_NUMBER']) & set(deriv_trans_data['ACCESSION_NUMBER'])
if deriv_overlap_values:
    print("Overlapping ACCESSION_NUMBER values found:", len(deriv_overlap_values), round(len(deriv_overlap_values) / min((derv_trans_n, derv_holding_n)), 2))
else:
    print("No overlap found.")

Unique ACCESSION_NUMBER for holdings: 246962, for transactions: 1107281
Overlapping ACCESSION_NUMBER values found: 83827 0.34


### Case: Overlapping ACCESSION_NUMBER for deriv transactions and holidngs
1. '0001062993-24-014375', 1 deriv transaction, 8 holdings, including varying values of 'SHRS_OWND_FOLWNG_TRANS' for different stocks, but no date of transaction
2. '0001654954-19-000382', 1 deriv transaction, 2 holdings, varying 'SHRS_OWND_FOLWNG_TRANS' for same stocks, also no date
3. '0001209191-11-009901', 1 deriv transaction, 5 holdings, varying 'SHRS_OWND_FOLWNG_TRANS' for 1 different and 4 same stocks, some have non-null EXERCISE_DATE
4. '0001437749-21-019709',
5. '0001181431-11-003532',
etc

* Note that nonderiv holdings have a different schema compared to deriv holdings

In [None]:
ACCESSION_NUMBER_TEST = '0001209191-11-009901'

In [None]:
deriv_trans_data[deriv_trans_data['ACCESSION_NUMBER']== ACCESSION_NUMBER_TEST]

Unnamed: 0,ACCESSION_NUMBER,DERIV_TRANS_SK,SECURITY_TITLE,SECURITY_TITLE_FN,CONV_EXERCISE_PRICE,CONV_EXERCISE_PRICE_FN,TRANS_DATE,TRANS_DATE_FN,DEEMED_EXECUTION_DATE,DEEMED_EXECUTION_DATE_FN,TRANS_FORM_TYPE,TRANS_CODE,EQUITY_SWAP_INVOLVED,EQUITY_SWAP_TRANS_CD_FN,TRANS_TIMELINESS,TRANS_TIMELINESS_FN,TRANS_SHARES,TRANS_SHARES_FN,TRANS_TOTAL_VALUE,TRANS_TOTAL_VALUE_FN,TRANS_PRICEPERSHARE,TRANS_PRICEPERSHARE_FN,TRANS_ACQUIRED_DISP_CD,TRANS_ACQUIRED_DISP_CD_FN,EXCERCISE_DATE,EXCERCISE_DATE_FN,EXPIRATION_DATE,EXPIRATION_DATE_FN,UNDLYNG_SEC_TITLE,UNDLYNG_SEC_TITLE_FN,UNDLYNG_SEC_SHARES,UNDLYNG_SEC_SHARES_FN,UNDLYNG_SEC_VALUE,UNDLYNG_SEC_VALUE_FN,SHRS_OWND_FOLWNG_TRANS,SHRS_OWND_FOLWNG_TRANS_FN,VALU_OWND_FOLWNG_TRANS,VALU_OWND_FOLWNG_TRANS_FN,DIRECT_INDIRECT_OWNERSHIP,DIRECT_INDIRECT_OWNERSHIP_FN,NATURE_OF_OWNERSHIP,NATURE_OF_OWNERSHIP_FN
1020924,0001209191-11-009901,1323168,Stock Option,,4.52,,2011-02-11,,,,4.0,M,0,,,,20000.0,,,,4.52,,D,,2001-08-13,,2011-02-13,,Common Stock,,20000.0,,,,0.0,,,,D,,,


In [None]:
deriv_holding_data[deriv_holding_data['ACCESSION_NUMBER']== ACCESSION_NUMBER_TEST]

Unnamed: 0,ACCESSION_NUMBER,DERIV_HOLDING_SK,SECURITY_TITLE,SECURITY_TITLE_FN,CONV_EXERCISE_PRICE,CONV_EXERCISE_PRICE_FN,TRANS_FORM_TYPE,TRANS_FORM_TYPE_FN,EXERCISE_DATE,EXERCISE_DATE_FN,EXPIRATION_DATE,EXPIRATION_DATE_FN,UNDLYNG_SEC_TITLE,UNDLYNG_SEC_TITLE_FN,UNDLYNG_SEC_SHARES,UNDLYNG_SEC_SHARES_FN,UNDLYNG_SEC_VALUE,UNDLYNG_SEC_VALUE_FN,SHRS_OWND_FOLWNG_TRANS,SHRS_OWND_FOLWNG_TRANS_FN,VALU_OWND_FOLWNG_TRANS,VALU_OWND_FOLWNG_TRANS_FN,DIRECT_INDIRECT_OWNERSHIP,DIRECT_INDIRECT_OWNERSHIP_FN,NATURE_OF_OWNERSHIP,NATURE_OF_OWNERSHIP_FN
567018,0001209191-11-009901,810244,Phantom Stock Units,,0.0,,,,,F4,,F4,Phantom Stock Units,,1688.63,,,,1688.63,,,,D,,,
567019,0001209191-11-009901,810245,Stock Option,,3.7,,,,2002-08-11,,2012-02-11,,Common Stock,,30000.0,,,,30000.0,,,,D,,,
567020,0001209191-11-009901,810248,Stock Option,,13.1,,,,,F6,2017-03-05,,Common Stock,,15000.0,,,,15000.0,,,,D,,,
567021,0001209191-11-009901,810247,Stock Option,,11.4,,,,,F5,2016-03-05,,Common Stock,,30000.0,,,,30000.0,,,,D,,,
567022,0001209191-11-009901,810246,Stock Option,,4.37,,,,2003-08-11,,2013-02-11,,Common Stock,,36000.0,,,,36000.0,,,,D,,,


## Exploring Name Info (1 submission with multiple reporting owners)
* 79206 have more than one reporting owner
* highest number of reporting owners is 10, because no more than 10 reporting persons can file any one Form 4 (see 2 cells below)


Why is the RPTOWNERNAME so weird? 

* Reccomendation: maybe we want to compile the names into 1 tuple, for each accession number?

In [None]:
submission_data['ISSUERCIK'].nunique() # there are issuerick with multiple issuernames

14185

In [None]:
(reporting_owner_data['ACCESSION_NUMBER'].value_counts() == 1).sum() # 0

2838282

In [None]:
reporting_owner_data['ACCESSION_NUMBER'].value_counts() > 1

ACCESSION_NUMBER
0001209191-21-007755     True
0001104659-23-051309     True
0001209191-15-052568     True
0001571049-14-000053     True
0001420295-17-000002     True
                        ...  
0001415889-24-010973    False
0001349334-24-000008    False
0001104659-24-048332    False
0001209191-24-005317    False
0001437749-20-014299    False
Name: count, Length: 2917488, dtype: bool

In [None]:
# See remarks for disclaimer about reporting persons
submission_data[submission_data['ACCESSION_NUMBER'] == '0001209191-21-007755']

Unnamed: 0,ACCESSION_NUMBER,FILING_DATE,PERIOD_OF_REPORT,DATE_OF_ORIG_SUB,NO_SECURITIES_OWNED,NOT_SUBJECT_SEC16,FORM3_HOLDINGS_REPORTED,FORM4_TRANS_REPORTED,DOCUMENT_TYPE,ISSUERCIK,ISSUERNAME,ISSUERTRADINGSYMBOL,REMARKS
2409946,0001209191-21-007755,2021-02-04,2021-02-02,,,0,,,4,1794515,ZoomInfo Technologies Inc.,ZI,"Because no more than 10 reporting persons can file any one Form 4 through the Securities and Exchange Commission's EDGAR system, Atlantic & Pacific VII-B has filed a separate Form 4."


In [None]:
reporting_owner_data[reporting_owner_data['ACCESSION_NUMBER'] == '0001209191-21-007755']

Unnamed: 0,ACCESSION_NUMBER,RPTOWNERCIK,RPTOWNERNAME,RPTOWNER_RELATIONSHIP,RPTOWNER_TITLE,RPTOWNER_TXT,RPTOWNER_STREET1,RPTOWNER_STREET2,RPTOWNER_CITY,RPTOWNER_STATE,RPTOWNER_ZIPCODE,RPTOWNER_STATE_DESC,FILE_NUMBER
2604188,0001209191-21-007755,1812579,"TA AP VII-B DO Subsidiary Partnership, L.P.","Director,TenPercentOwner",,,200 CLARENDON STREET,56TH FLOOR,BOSTON,MA,2116,,001-39310
2604189,0001209191-21-007755,1812606,"TA XI DO AIV II, L.P.","Director,TenPercentOwner",,,200 CLARENDON STREET,56TH FLOOR,BOSTON,MA,2116,,001-39310
2604190,0001209191-21-007755,1034569,"TA ASSOCIATES, L.P.","Director,TenPercentOwner",,,200 CLARENDON STREET,56TH FLOOR,BOSTON,MA,2116,,001-39310
2604191,0001209191-21-007755,1609539,"TA XI DO Feeder, L.P.","Director,TenPercentOwner",,,200 CLARENDON STREET,56TH FLOOR,BOSTON,MA,2116,,001-39310
2604192,0001209191-21-007755,1812605,"TA SDF III DO AIV II, L.P.","Director,TenPercentOwner",,,200 CLARENDON STREET,56TH FLOOR,BOSTON,MA,2116,,001-39310
2604193,0001209191-21-007755,1609553,"TA SDF III DO AIV, L.P.","Director,TenPercentOwner",,,200 CLARENDON STREET,56TH FLOOR,BOSTON,MA,2116,,001-39310
2604194,0001209191-21-007755,1609536,"TA XI DO AIV, L.P.","Director,TenPercentOwner",,,200 CLARENDON STREET,56TH FLOOR,BOSTON,MA,2116,,001-39310
2604195,0001209191-21-007755,1578035,"TA INVESTORS IV, L.P.","Director,TenPercentOwner",,,200 CLARENDON STREET,56TH FLOOR,BOSTON,MA,2116,,001-39310
2604196,0001209191-21-007755,1548681,TA Atlantic & Pacific VII-A L.P.,"Director,TenPercentOwner",,,200 CLARENDON STREET,56TH FLOOR,BOSTON,MA,2116,,001-39310
2604197,0001209191-21-007755,1609557,"TA SDF III DO Feeder, L.P.","Director,TenPercentOwner",,,200 CLARENDON STREET,56TH FLOOR,BOSTON,MA,2116,,001-39310


In [None]:
pd.set_option('display.max_colwidth', None)
names_data[names_data['ACCESSION_NUMBER'] == '0001209191-21-007755']
# seems like OWNERSIGNATURENAME requires .split(',') to extract alias used in Reporting owner name

Unnamed: 0,ACCESSION_NUMBER,OWNERSIGNATURENAME,OWNERSIGNATUREDATE
2561362,0001209191-21-007755,"TA SDF III DO AIV, L.P., by TA Associates, SDF III GP L.P., its General Partner, by TA Associates, L.P., its General Partner, by Jeffrey C. Hadden, its, General Counsel, /s/ Jeffrey C. Hadden",2021-02-04
2561363,0001209191-21-007755,"TA XI DO AIV, L.P., by TA Associates XI GP, L.P., its General Partner, by TA Associates, L.P., its General, Partner, by Jeffrey C. Hadden, its General Counsel, /s/ Jeffrey C. Hadden",2021-02-04
2561364,0001209191-21-007755,"TA Associates, L.P., by Jeffrey C. Hadden, its, General Counsel, /s/ Jeffrey C. Hadden",2021-02-04
2561365,0001209191-21-007755,"TA Atlantic & Pacific VII-A L.P., by TA Associates AP, VII GP L.P., its General Partner, by TA Associates, L.P., its General Partner, by Jeffrey C. Hadden, its, General Counsel, /s/ Jeffrey C. Hadden",2021-02-04
2561366,0001209191-21-007755,"TA Investors IV, L.P., by TA Associates, L.P., its General Partner, by Jeffrey C. Hadden, its, General Counsel, /s/ Jeffrey C. Hadden",2021-02-04
2561367,0001209191-21-007755,"TA SDF III DO AIV II, L.P., by TA Associates SDF, III GP, L.P., its General Partner, by TA Associates, L.P., its General Partner, by Jeffrey C. Hadden, its General, Counsel, /s/ Jeffrey C. Hadden",2021-02-04
2561368,0001209191-21-007755,"TA XI DO AIV II, L.P., by TA Associates XI GP, L.P, its General Partner, by TA Associates, L.P., its General, Partner, by Jeffrey C. Hadden, its General Counsel, /s/ Jeffrey C. Hadden",2021-02-04
2561369,0001209191-21-007755,"TA Associates AP VII-B DO Subsidiary Partnership, L.P., by TA Associates AP VII GP L.P., its General Partner, by TA Associates, L.P., its General Partner, by Jeffrey C., Hadden, its General Counsel, /s/ Jeffrey C. Hadden",2021-02-04
2561370,0001209191-21-007755,"TA SDF III DO Feeder, L.P., by TA Associates SDF III GP L.P., its General Partner, by TA Associates, L.P., its General Partner, by Jeffrey C., Hadden, its General Counsel, /s/ Jeffrey C. Hadden",2021-02-04
2561371,0001209191-21-007755,"TA XI DO Feeder, L.P., by TA Associates XI GP L.P., its General Partner, by TA Associates, L.P., its General Partner, Jeffrey C. Hadden, its, General Counsel, /s/ Jeffrey C. Hadden",2021-02-04


In [None]:
reporting_owner_data[reporting_owner_data['ACCESSION_NUMBER'] == '0001437749-20-014299']# '0001415889-24-010973']#'0001571049-14-000053']

Unnamed: 0,ACCESSION_NUMBER,RPTOWNERCIK,RPTOWNERNAME,RPTOWNER_RELATIONSHIP,RPTOWNER_TITLE,RPTOWNER_TXT,RPTOWNER_STREET1,RPTOWNER_STREET2,RPTOWNER_CITY,RPTOWNER_STATE,RPTOWNER_ZIPCODE,RPTOWNER_STATE_DESC,FILE_NUMBER
3171122,0001437749-20-014299,1815956,Hoffman Matthew Leo,Officer,Chief Financial Officer,,C/O THE CORETEC GROUP INC.,"6804 SOUTH CANTON AVENUE, SUITE 150",TULSA,OK,74136,,000-54697


In [None]:
pd.set_option('display.max_colwidth', None)
names_data[names_data['ACCESSION_NUMBER'] == '0001437749-20-014299']#'0001415889-24-010973']#'0001571049-14-000053']

Unnamed: 0,ACCESSION_NUMBER,OWNERSIGNATURENAME,OWNERSIGNATUREDATE
3119137,0001437749-20-014299,/s/ Matthew Hoffman,2020-06-30


In [None]:
names_data[names_data['OWNERSIGNATURENAME']=='/s/ Harry You']

Unnamed: 0,ACCESSION_NUMBER,OWNERSIGNATURENAME,OWNERSIGNATUREDATE
2507155,0000899243-21-014301,/s/ Harry You,2021-03-31


# Testing Litigation Data

In [None]:
data_litigations = pd.read_csv(f"{DATA_FOLDER}/{LITIGATIONS_DATA_PATH}")

In [None]:
data_litigations[['yr']].describe() # 1996 - 2017 

Unnamed: 0,yr
count,1222.0
mean,2006.533552
std,6.041633
min,1996.0
25%,2002.0
50%,2007.0
75%,2011.0
max,2017.0


In [None]:
pd.set_option('display.max_colwidth', None)
data_litigations.tail(2)

Unnamed: 0.1,Unnamed: 0,lt_no,yr,title,lt,class
1220,7977,24012,2017,Therapist Settles Charges of Insider Trading Ahead of Acquisition Announcement,"[A Seattle-based therapist has agreed to settle SEC charges that he traded in the stock of zulily, Inc Zulily based on information he learned from a Zulily employee during confidential counseling sessions , The SECs complaint alleges that, in July 2015, during counseling sessions, the Zulily employee told Kenneth Peer that Zulily was going to be acquired by Liberty Interactive, a media holding company On three occasions between July 21, 2015 and August 10, 2015, after counseling sessions with the Zulily employee, Peer purchased a total of over $28,000 of Zulily stock The complaint alleges that, before the market opened on August 17, 2015, Zulily announced that it had agreed to be acquired by Liberty Interactive in a tender offer By the end of trading that day, Zulilys stock allegedly had risen by 49%, with nearly 15 times the stocks average daily trading volume Shortly after the acquisition was announced, Peer allegedly sold all of his Zulily shares for illegal profits of approximately $10,000 , The SECs complaint charges Peer with violating Sections 10 b and 14 e of the Securities Exchange Act of 1934 and Rules 10b-5 and 14e-3 thereunder Without admitting or denying the SECs allegations, Peer agreed to disgorge $10,227 73 plus interest of $811 80 and pay a $10,227 73 penalty, for a total of $21,267 26 Peer also agreed to be enjoined from further violations of the charged provisions , The SECs investigation was conducted by Alice Liu Jensen and supervised by Steven D Buchholz, both of the Market Abuse Unit in the San Francisco Regional Office The SEC appreciates the assistance of FINRA in this matter , <img alt border0 height9 srcimagesarrowright_dkblue gif width10><a hreflitigationcomplaints2017comp24012 pdf>SEC Complaint<a>, , <i>https:www sec govlitigationlitreleases2017lr24012 htm<i><br>]",1
1221,7980,24015,2017,SEC Charges Former Employee and Friend with Insider Trading in Securities of International Rectifier Corporation,"[The today announced insider trading charges against a former employee of a semiconductor company and his friend for trading on nonpublic information that the company would be acquired , The SEC alleges that Lanny Brown learned that Infineon Technologies AG planned to acquire his then-employer, International Rectifier Corp IRC , before the deal was publicly announced According to the SECs complaint, Brown tipped his friend, Sean Fox, about the deal and both of them then acquired IRC call options The SEC further alleges that Brown and Fox concealed Browns involvement in the trading by depositing approximately $12,000 of their combined funds into Foxs brokerage account, and then used this account to purchase the call options for both of them The SEC also alleges that Fox closed out the option positions after the acquisition was publicly announced, and the two defendants made $369,720 in illicit profits To further hide Browns role in the trading, Fox allegedly funneled Browns share of the trading profits by paying several of Browns personal expenses and by writing checks to Browns children and stepchildren Brown and his wife then endorsed those checks and used the funds , The SECs complaint, filed in federal court in the District of Arizona, charges Brown and Fox with violating Section 10 b of the Securities Exchange Act of 1934 and Rule 10b-5 thereunder A criminal action is also pending against both Brown and Fox in the District of Arizona for the same underlying conduct In the SECs action, the defendants have consented to the entry of a final judgment that permanently enjoins them from future violations of the charged provisions of the federal securities laws The final judgment in the SECs action also orders them to pay, on a joint and several basis, disgorgement of $369,720 plus prejudgment interest of $43,147 79, with a credit for the monetary amount they have agreed to pay in the parallel criminal case against them , The settlements with the SEC are subject to court approval , The SECs investigation was conducted by Yolanda Ochoa and supervised by Finola H Manvelian of the Los Angeles office The SEC appreciates the assistance of the Financial Industry Regulatory Authority , <p classcenter>###, <img alt border0 height9 srcimagesarrowright_dkblue gif width10><a hreflitigationcomplaints2017comp24015 pdf>SEC Complaint<a>, , <i>https:www sec govlitigationlitreleases2017lr24015 htm<i><br>]",1


In [None]:
## DeepSeek's AI, number is index of litigations to avoid confusion
result_1220 = {
  "NameOfTrader": "Kenneth Peer",
  "Profession": "Seattle-based therapist",
  "CompanyInvolved": "zulily, Inc (Zulily)",
  "AcquiringCompany": "Liberty Interactive",
  "SourceOfInformation": "Zulily employee (learned during confidential counseling sessions)",
  "DateOfAcquisitionAnnouncement": "17-08-2015",
  "DatesOfIllegalTransactions": ["21-07-2015", "10-08-2015"],
  "TotalAmountInvested": 28000,
  "IllegalProfits": 10000,
  "StockPriceIncrease": "49%",
  "TradingVolumeIncrease": "15 times the average daily trading volume",
  "LegalConsequences": {
    "Disgorgement": 10227.73,
    "Interest": 811.80,
    "Penalty": 10227.73,
    "TotalPayment": 21267.26,
    "Injunction": "Enjoined from further violations of the charged provisions"
  },
  "SECCharges": [
    "Violation of Section 10(b) of the Securities Exchange Act of 1934",
    "Violation of Section 14(e) of the Securities Exchange Act of 1934",
    "Violation of Rule 10b-5",
    "Violation of Rule 14e-3"
  ],
  "SECInvestigators": {
    "Investigator": "Alice Liu Jensen",
    "Supervisor": "Steven D. Buchholz",
    "Unit": "Market Abuse Unit",
    "RegionalOffice": "San Francisco Regional Office"
  },
  "AssistanceProvidedBy": "FINRA"
}

result_1221 = {
  "NamesOfTraders": ["Lanny Brown", "Sean Fox"],
  "RelationToInsider": "Lanny Brown (former employee of International Rectifier Corp)",
  "CompanyInvolved": "International Rectifier Corp (IRC)",
  "AcquiringCompany": "Infineon Technologies AG",
  "DateOfAcquisitionAnnouncement": "Not explicitly stated in the text", #### THIS IS AN ISSUE 
  "DateOfIllegalTransactions": "Not explicitly stated in the text",     #### THIS IS AN ISSUE 
  "IllegalActivity": "Purchased IRC call options using nonpublic information about the acquisition",
  "FundsDepositedForTrading": 12000,
  "IllegalProfits": 369720,
  "PrejudgmentInterest": 43147.79,
  "LegalConsequences": "Permanent injunctions, disgorgement of $369,720, and prejudgment interest",
  "CriminalAction": "Pending in the District of Arizona",
  "SECInvestigators": {
    "Investigator": "Yolanda Ochoa",
    "Supervisor": "Finola H. Manvelian"
  },
  "AssistanceProvidedBy": "Financial Industry Regulatory Authority (FINRA)"
}