# SEC Form 4 Data Collation
For the full README, see: https://www.sec.gov/files/insider_transactions_readme.pdf

In [4]:
import pandas as pd

In [5]:
# Folder that holds the raw CSV inputs read by this notebook
DATA_FOLDER = 'FINAL_RAW_DATA'
# File names of the auxiliary datasets; they are joined onto DATA_FOLDER when read
LITIGATIONS_DATA_PATH = 'infected.csv'
STOCK_PRICE_DATA_PATH = 'dataset_summary.csv'

In [6]:
## Form submission main data (ACCESSION_NUMBER is the primary key)
submission_data = pd.read_csv(f"{DATA_FOLDER}/SUBMISSION.csv")

## Transaction info for each submission (buy and sell); ACCESSION_NUMBER and (NON)DERIV_TRANS_SK are the primary keys
# One form (i.e. ACCESSION_NUMBER) can have multiple transactions (i.e. *_SK), transactions can be across multiple years, max 30 each
# Duplicate *_SK keys are for different transactions, and there are max 2 of each duplicate _SK keys
nonderiv_trans_data = pd.read_csv(f"{DATA_FOLDER}/NONDERIV_TRANS.csv")
deriv_trans_data = pd.read_csv(f"{DATA_FOLDER}/DERIV_TRANS.csv")

## Holding info for each submission (what they have - After each transaction..?)
nonderiv_holding_data = pd.read_csv(f"{DATA_FOLDER}/NONDERIV_HOLDING.csv")
deriv_holding_data = pd.read_csv(f"{DATA_FOLDER}/DERIV_HOLDING.csv")

## Name info
reporting_owner_data = pd.read_csv(f"{DATA_FOLDER}/REPORTINGOWNER.csv")
# OWNER_SIGNATURE.csv is intentionally not loaded; restore the next line if `names_data` is needed
#names_data = pd.read_csv(f"{DATA_FOLDER}/OWNER_SIGNATURE.csv")

## Additional info, to match with '*_FN' columns in all other datasets based on matching ACCESSION_NUMBER
#footnotes_data = pd.read_csv(f"{DATA_FOLDER}/FOOTNOTES.csv")

# Each CSV is read exactly once: the transaction and holding files were previously
# re-read a second time at the end of this cell, doubling the load time for no effect.


In [7]:
# Overview of the data: drop duplicate rows, then report columns that are
# entirely null or that hold a single constant value
dataframes = {
    'submission_data': submission_data,
    'nonderiv_trans_data': nonderiv_trans_data,
    'deriv_trans_data': deriv_trans_data,
    'nonderiv_holding_data': nonderiv_holding_data,
    'deriv_holding_data': deriv_holding_data,
    'reporting_owner_data': reporting_owner_data,
    #'names_data': names_data,
    #'footnotes_data': footnotes_data
}

for name, df in dataframes.items():
    rows_before = len(df)
    print(f"{name}, {df.shape}")

    # In place on purpose: the dict values are the same objects as the
    # module-level frames, so those are de-duplicated too
    df.drop_duplicates(inplace=True)
    rows_removed = rows_before - len(df)
    print(f"Duplicate rows removed: {rows_removed}" if rows_removed else "No duplicate rows")

    all_null_mask = df.isnull().all()
    print(f"Columns with all null values: {df.columns[all_null_mask].tolist()}")

    # nunique() ignores nulls, so a column with one distinct non-null value counts as constant
    single_value_mask = (df.nunique() == 1).to_numpy()
    print(f"Columns with constant values: {df.columns[single_value_mask].tolist()}")
    print()

submission_data, (2917488, 13)
No duplicate rows
Columns with all null values: []
Columns with constant values: []

nonderiv_trans_data, (4343860, 28)
No duplicate rows
Columns with all null values: []
Columns with constant values: []

deriv_trans_data, (1763084, 42)
No duplicate rows
Columns with all null values: ['TRANS_ACQUIRED_DISP_CD_FN']
Columns with constant values: []

nonderiv_holding_data, (1522788, 14)
No duplicate rows
Columns with all null values: ['TRANS_FORM_TYPE_FN']
Columns with constant values: ['TRANS_FORM_TYPE']

deriv_holding_data, (1000283, 26)
No duplicate rows
Columns with all null values: ['TRANS_FORM_TYPE_FN']
Columns with constant values: ['TRANS_FORM_TYPE']

reporting_owner_data, (3171123, 13)
No duplicate rows
Columns with all null values: []
Columns with constant values: []



In [8]:
# Columns selected from both the non-derivative and derivative transaction tables
SELECTED_TRANSACTION_COLS = [
    'ACCESSION_NUMBER',
    'SECURITY_TITLE',
    'TRANS_DATE',
    'DEEMED_EXECUTION_DATE',
    'TRANS_CODE',
    'EQUITY_SWAP_INVOLVED',
    'TRANS_TIMELINESS',
    'TRANS_SHARES',
    'TRANS_PRICEPERSHARE',
    'TRANS_ACQUIRED_DISP_CD',
    'SHRS_OWND_FOLWNG_TRANS',
    'DIRECT_INDIRECT_OWNERSHIP',
    'NATURE_OF_OWNERSHIP',
]
# partial primary keys: 'NONDERIV_TRANS_SK', 'DERIV_TRANS_SK'

# Derivative-transaction columns whose usefulness is still undecided
DERIV_TRANS_UNSURE_COLS = [
    'CONV_EXERCISE_PRICE',
    'EQUITY_SWAP_INVOLVED',
    'EXCERCISE_DATE',
    'EXPIRATION_DATE',
    'UNDLYNG_SEC_SHARES',
    'UNDLYNG_SEC_VALUE',
]

# Columns taken from the submission table when merging
SUBMISSION_COLS = [
    'ACCESSION_NUMBER',
    'FILING_DATE',
    'PERIOD_OF_REPORT',
    'ISSUERCIK',
    'ISSUERNAME',
    'ISSUERTRADINGSYMBOL',
]

# Columns taken from the reporting-owner table (there is also address data and filenumber)
REPORTING_OWNER_COLS = [
    'RPTOWNERCIK',
    'RPTOWNERNAME',
    'RPTOWNER_RELATIONSHIP',
]

## Merge Transaction datasets

In [45]:
# Parse the transaction dates of both tables; values that cannot be parsed become NaT
for trans_table in (nonderiv_trans_data, deriv_trans_data):
    trans_table['TRANS_DATE'] = pd.to_datetime(trans_table['TRANS_DATE'], errors='coerce')

## The two tables cannot be concatenated directly: they have different columns, some with identical names.
## Select the shared columns and give the surrogate key a common name first.
df1 = nonderiv_trans_data[['NONDERIV_TRANS_SK', *SELECTED_TRANSACTION_COLS]].rename(
    columns={'NONDERIV_TRANS_SK': 'TRANS_SK'}
)
df2 = deriv_trans_data[['DERIV_TRANS_SK', *SELECTED_TRANSACTION_COLS]].rename(
    columns={'DERIV_TRANS_SK': 'TRANS_SK'}
)

# ignore_index=True already yields a fresh 0..n-1 index for the stacked frame
all_transaction_data = pd.concat([df1, df2], axis=0, ignore_index=True)
print(all_transaction_data.shape)

(6106944, 14)


In [46]:
# Feature Engineering

## Transaction amount = number of shares * price per share
all_transaction_data['trans_amt'] = all_transaction_data['TRANS_SHARES'].mul(
    all_transaction_data['TRANS_PRICEPERSHARE']
)

# Count the transactions whose amount is exactly 0 and report their share of all rows
zero_amount_mask = all_transaction_data['trans_amt'].eq(0)
trans_amt_0 = int(zero_amount_mask.sum())
total_rows = all_transaction_data.shape[0]
print(f"There are {trans_amt_0} transactions with amount 0, {trans_amt_0/(total_rows)*100:.2f}% of all transactions")

There are 2250213 transactions with amount 0, 36.85% of all transactions


In [36]:
# Count transactions per DIRECT_INDIRECT_OWNERSHIP value to see how many rows
# would be lost by dropping indirect-ownership transactions
all_transaction_data['DIRECT_INDIRECT_OWNERSHIP'].value_counts()

DIRECT_INDIRECT_OWNERSHIP
D    5257448
I     849496
Name: count, dtype: int64

In [47]:
# Drop irrelevant rows

## Keep only direct-ownership transactions (DIRECT_INDIRECT_OWNERSHIP == 'D')
all_transaction_direct = all_transaction_data[all_transaction_data['DIRECT_INDIRECT_OWNERSHIP'] == 'D']
print(all_transaction_direct.shape)

## Remove transactions with transaction date after 2024 Q4.
## The bound is exclusive on 2025-01-01 so that trades dated 2024-12-31 (the last
## day of Q4) are kept; a strict `< 2024-12-31` comparison would drop them.
all_transaction_direct = all_transaction_direct[all_transaction_direct['TRANS_DATE'] < pd.Timestamp('2025-01-01')]
print(all_transaction_direct.shape)

## Filter out rows with NA for transaction date
## (the comparison above is already False for NaT, so this is an explicit safety net)
all_transaction_direct = all_transaction_direct[all_transaction_direct['TRANS_DATE'].notna()]
print(all_transaction_direct.shape)

## Split data where trans_amt is 0 and non-0, use the non-0 data
## e.g. non qualified stock option because is a form of compensation and will have 0 transaction amount
# NOTE(review): rows whose trans_amt is NaN (missing shares or price) satisfy `!= 0`
# and therefore stay in all_transaction_direct - confirm this is intended.
all_transaction_direct_comp = all_transaction_direct[all_transaction_direct['trans_amt'] == 0]
all_transaction_direct = all_transaction_direct[all_transaction_direct['trans_amt'] != 0]

print(all_transaction_direct.shape)

(5257448, 15)
(5257281, 15)
(5257281, 15)
(3191965, 15)


In [None]:
# Summary statistics of the remaining transaction dates; min/max give the covered date range
all_transaction_direct[['TRANS_DATE']].describe() # 1992 - 2024

Unnamed: 0,TRANS_DATE
count,3191965
mean,2017-08-03 22:13:11.090628352
min,1992-11-13 00:00:00
25%,2014-01-02 00:00:00
50%,2017-05-19 00:00:00
75%,2021-02-26 00:00:00
max,2024-12-30 00:00:00


In [49]:
# Preview the first rows of the filtered direct-ownership transactions
all_transaction_direct.head()

Unnamed: 0,TRANS_SK,ACCESSION_NUMBER,SECURITY_TITLE,TRANS_DATE,DEEMED_EXECUTION_DATE,TRANS_CODE,EQUITY_SWAP_INVOLVED,TRANS_TIMELINESS,TRANS_SHARES,TRANS_PRICEPERSHARE,TRANS_ACQUIRED_DISP_CD,SHRS_OWND_FOLWNG_TRANS,DIRECT_INDIRECT_OWNERSHIP,NATURE_OF_OWNERSHIP,trans_amt
0,2360796,0001179110-14-011078,Common Stock,2014-06-26,,S,0,,1046.0,28.17,D,43944.0,D,,29465.82
1,2360797,0001179110-14-011078,Common Stock,2014-06-26,,S,0,,637.0,28.17,D,43307.0,D,,17944.29
2,2360795,0001179110-14-011078,Common Stock,2014-06-26,,S,0,,1278.0,28.17,D,44990.0,D,,36001.26
5,2350316,0001144204-14-040532,Common Stock,2014-05-28,,A,0,,1176.0,10.6,A,50295.0,D,,12465.6
6,2350317,0001144204-14-040532,Common Stock,2014-06-27,,A,0,,4673.0,10.7,A,54968.0,D,,50001.1


## Merge with submission data

In [50]:
# Parse FILING_DATE into datetimes (unparseable values become NaT), then summarise the range
filing_dates = pd.to_datetime(submission_data['FILING_DATE'], errors='coerce')
submission_data['FILING_DATE'] = filing_dates
submission_data[['FILING_DATE']].describe() # 2011-2024

Unnamed: 0,FILING_DATE
count,2917488
mean,2017-10-29 01:33:20.134912
min,2011-01-03 00:00:00
25%,2014-04-08 00:00:00
50%,2017-10-11 00:00:00
75%,2021-05-04 00:00:00
max,2024-12-31 00:00:00


In [51]:
# Attach the submission-level columns to every transaction via its ACCESSION_NUMBER.
# A left join keeps every transaction row even when no submission matches.
submission_subset = submission_data[SUBMISSION_COLS]
all_transaction_direct_2 = pd.merge(
    all_transaction_direct,
    submission_subset,
    how='left',
    on='ACCESSION_NUMBER',
)
all_transaction_direct_2.shape

(3191965, 20)

In [17]:
# Preview the merged frame: transaction columns followed by the submission columns
all_transaction_direct_2.head(2)

Unnamed: 0,TRANS_SK,ACCESSION_NUMBER,SECURITY_TITLE,TRANS_DATE,DEEMED_EXECUTION_DATE,TRANS_CODE,EQUITY_SWAP_INVOLVED,TRANS_TIMELINESS,TRANS_SHARES,TRANS_PRICEPERSHARE,TRANS_ACQUIRED_DISP_CD,SHRS_OWND_FOLWNG_TRANS,DIRECT_INDIRECT_OWNERSHIP,NATURE_OF_OWNERSHIP,trans_amt,FILING_DATE,PERIOD_OF_REPORT,ISSUERCIK,ISSUERNAME,ISSUERTRADINGSYMBOL
0,2360796,0001179110-14-011078,Common Stock,2014-06-26,,S,0,,1046.0,28.17,D,43944.0,D,,29465.82,2014-06-30,2014-06-26,926326,"OMNICELL, Inc",OMCL
1,2360797,0001179110-14-011078,Common Stock,2014-06-26,,S,0,,637.0,28.17,D,43307.0,D,,17944.29,2014-06-30,2014-06-26,926326,"OMNICELL, Inc",OMCL


## Filter by Kaggle dataset

In [52]:
# Load the stock price summary file and count its distinct ticker symbols
stock_price_path = f'{DATA_FOLDER}/{STOCK_PRICE_DATA_PATH}'
stock_price_data = pd.read_csv(stock_price_path)
stock_price_data['symbol'].nunique()

7786

In [53]:
# Keep only the transactions whose issuer ticker has stock price data available.
# The symbol list is de-duplicated and stripped of nulls before the inner join:
#   * a symbol listed more than once would otherwise multiply the matching transaction rows
#   * pandas matches null keys to each other, so a null symbol would otherwise pull in
#     transactions that have no ISSUERTRADINGSYMBOL at all
available_symbols = stock_price_data[['symbol']].dropna().drop_duplicates()
all_transaction_direct_filterd = (
    all_transaction_direct_2
    .merge(available_symbols, left_on='ISSUERTRADINGSYMBOL', right_on='symbol', how='inner')
    .drop(columns=['symbol'])  # helper join column, identical to ISSUERTRADINGSYMBOL
)
all_transaction_direct_filterd.shape

(2534023, 20)

## Get Reporting Owner/Owner Signature
- Currently the code keeps only 1-to-1 matches: a transaction in all_transaction_direct_filterd is joined to reporting_owner_data only when its 'ACCESSION_NUMBER' has exactly one reporting-owner row

In [54]:
# Find the accession numbers that match 1-1 between the filtered transactions and the reporting owners

## Distinct accession numbers present in all_transaction_direct_filterd
accession_num_unique = all_transaction_direct_filterd[['ACCESSION_NUMBER']].drop_duplicates()
print(accession_num_unique.shape)

## Reporting-owner rows belonging to those accession numbers
in_transactions = reporting_owner_data['ACCESSION_NUMBER'].isin(accession_num_unique['ACCESSION_NUMBER'])
matching_reporting_owner = reporting_owner_data[in_transactions]

## Keep only the accession numbers that have exactly one reporting-owner row
owner_rows_per_accession = matching_reporting_owner['ACCESSION_NUMBER'].value_counts()
single_owner_accessions = owner_rows_per_accession.index[owner_rows_per_accession == 1]
matching_rpt_pk = matching_reporting_owner.loc[
    matching_reporting_owner['ACCESSION_NUMBER'].isin(single_owner_accessions),
    ['ACCESSION_NUMBER'],
]
print(matching_rpt_pk.shape)

## Disabled: further restriction to accession numbers with exactly one row in names_data
#matching_reporting_owner_names = names_data[names_data['ACCESSION_NUMBER'].isin(matching_rpt_pk['ACCESSION_NUMBER'])]
#matching_rpt_names_pk = matching_reporting_owner_names[matching_reporting_owner_names['ACCESSION_NUMBER'].map(matching_reporting_owner_names['ACCESSION_NUMBER'].value_counts()) == 1][['ACCESSION_NUMBER']]
#print(matching_rpt_names_pk.shape)

print("Final number of unique ACCESSION_NUMBERS with 1-1 matching names,", matching_rpt_pk.shape[0])

(1325908, 1)
(1312567, 1)
Final number of unique ACCESSION_NUMBERS with 1-1 matching names, 1312567


In [55]:
# Merge the 1-1 matching data

## Keep only the transactions whose ACCESSION_NUMBER is in matching_rpt_pk
has_single_owner = all_transaction_direct_filterd['ACCESSION_NUMBER'].isin(matching_rpt_pk['ACCESSION_NUMBER'])

## Attach the reporting-owner columns to those transactions
owner_columns = reporting_owner_data[['ACCESSION_NUMBER', *REPORTING_OWNER_COLS]]
all_transaction_direct_final = pd.merge(
    all_transaction_direct_filterd[has_single_owner],
    owner_columns,
    how='left',
    on='ACCESSION_NUMBER',
)

print(all_transaction_direct_final.shape)
all_transaction_direct_final.head()

(2492139, 23)


Unnamed: 0,TRANS_SK,ACCESSION_NUMBER,SECURITY_TITLE,TRANS_DATE,DEEMED_EXECUTION_DATE,TRANS_CODE,EQUITY_SWAP_INVOLVED,TRANS_TIMELINESS,TRANS_SHARES,TRANS_PRICEPERSHARE,TRANS_ACQUIRED_DISP_CD,SHRS_OWND_FOLWNG_TRANS,DIRECT_INDIRECT_OWNERSHIP,NATURE_OF_OWNERSHIP,trans_amt,FILING_DATE,PERIOD_OF_REPORT,ISSUERCIK,ISSUERNAME,ISSUERTRADINGSYMBOL,RPTOWNERCIK,RPTOWNERNAME,RPTOWNER_RELATIONSHIP
0,2360796,0001179110-14-011078,Common Stock,2014-06-26,,S,0,,1046.0,28.17,D,43944.0,D,,29465.82,2014-06-30,2014-06-26,926326,"OMNICELL, Inc",OMCL,1453971,Ortigas-Wedekind Marga,Officer
1,2360797,0001179110-14-011078,Common Stock,2014-06-26,,S,0,,637.0,28.17,D,43307.0,D,,17944.29,2014-06-30,2014-06-26,926326,"OMNICELL, Inc",OMCL,1453971,Ortigas-Wedekind Marga,Officer
2,2360795,0001179110-14-011078,Common Stock,2014-06-26,,S,0,,1278.0,28.17,D,44990.0,D,,36001.26,2014-06-30,2014-06-26,926326,"OMNICELL, Inc",OMCL,1453971,Ortigas-Wedekind Marga,Officer
3,2350316,0001144204-14-040532,Common Stock,2014-05-28,,A,0,,1176.0,10.6,A,50295.0,D,,12465.6,2014-06-30,2014-05-28,1474464,"New York REIT, Inc.",NYRT,1545007,Bowman Scott J.,Director
4,2350317,0001144204-14-040532,Common Stock,2014-06-27,,A,0,,4673.0,10.7,A,54968.0,D,,50001.1,2014-06-30,2014-05-28,1474464,"New York REIT, Inc.",NYRT,1545007,Bowman Scott J.,Director


# Further Data Exploration and Cleaning (WIP)

In [56]:
# Show every column when displaying wide frames from here on
pd.set_option('display.max_columns', None)
# Summary statistics of the transaction amounts (count excludes NaN amounts)
all_transaction_direct_final[['trans_amt']].describe()

Unnamed: 0,trans_amt
count,2217795.0
mean,2890148000.0
std,2734796000000.0
min,0.01
25%,13393.03
50%,61208.0
75%,255515.5
max,4000000000000000.0


In [79]:
# List the column names of the derivative holdings table, one per line
print(*deriv_holding_data.columns, sep="\n")

ACCESSION_NUMBER
DERIV_HOLDING_SK
SECURITY_TITLE
SECURITY_TITLE_FN
CONV_EXERCISE_PRICE
CONV_EXERCISE_PRICE_FN
TRANS_FORM_TYPE
TRANS_FORM_TYPE_FN
EXERCISE_DATE
EXERCISE_DATE_FN
EXPIRATION_DATE
EXPIRATION_DATE_FN
UNDLYNG_SEC_TITLE
UNDLYNG_SEC_TITLE_FN
UNDLYNG_SEC_SHARES
UNDLYNG_SEC_SHARES_FN
UNDLYNG_SEC_VALUE
UNDLYNG_SEC_VALUE_FN
SHRS_OWND_FOLWNG_TRANS
SHRS_OWND_FOLWNG_TRANS_FN
VALU_OWND_FOLWNG_TRANS
VALU_OWND_FOLWNG_TRANS_FN
DIRECT_INDIRECT_OWNERSHIP
DIRECT_INDIRECT_OWNERSHIP_FN
NATURE_OF_OWNERSHIP
NATURE_OF_OWNERSHIP_FN


# Getting subset of Data for Caitlyn's network analysis

In [71]:
# Row counts noted by the author for different column choices (presumably of the de-duplicated frame - TODO confirm):
# with TRANS_DATE: 2342363, without: 365605, without ISSUERNAME: 356387, without 'ISSUERCIK': 354851
owner_issuer_cols = ['RPTOWNERNAME', 'RPTOWNERCIK', 'ISSUERTRADINGSYMBOL']

# Distinct owner/issuer combinations, without and with the transaction date
unique_df = all_transaction_direct_final[owner_issuer_cols].drop_duplicates()
unique_dates_df = all_transaction_direct_final[['TRANS_DATE', *owner_issuer_cols]].drop_duplicates()

In [77]:
# Number of transaction rows with a missing reporting owner name
all_transaction_direct_final['RPTOWNERNAME'].isna().sum()

949

In [78]:
# Inspect the rows whose reporting owner name is missing
all_transaction_direct_final[all_transaction_direct_final['RPTOWNERNAME'].isna()]

Unnamed: 0,TRANS_SK,ACCESSION_NUMBER,SECURITY_TITLE,TRANS_DATE,DEEMED_EXECUTION_DATE,TRANS_CODE,EQUITY_SWAP_INVOLVED,TRANS_TIMELINESS,TRANS_SHARES,TRANS_PRICEPERSHARE,TRANS_ACQUIRED_DISP_CD,SHRS_OWND_FOLWNG_TRANS,DIRECT_INDIRECT_OWNERSHIP,NATURE_OF_OWNERSHIP,trans_amt,FILING_DATE,PERIOD_OF_REPORT,ISSUERCIK,ISSUERNAME,ISSUERTRADINGSYMBOL,RPTOWNERCIK,RPTOWNERNAME,RPTOWNER_RELATIONSHIP
153750,640684,0001209191-20-065121,Common Stock,2020-12-21,,S,0,,200.00,35.01,D,3813.00,D,By The Praveen P. Tipirneni Irrevocable Trust ...,7002.000,2020-12-23,2020-12-21,1679363,,MORF,1779978,,"Director,Officer"
153751,640683,0001209191-20-065121,Common Stock,2020-12-21,,M,0,,200.00,4.32,A,4013.00,D,By The Praveen P. Tipirneni Irrevocable Trust ...,864.000,2020-12-23,2020-12-21,1679363,,MORF,1779978,,"Director,Officer"
153752,640688,0001209191-20-065121,Common Stock,2020-12-22,,S,0,,2268.00,35.05,D,3813.00,D,,79493.400,2020-12-23,2020-12-21,1679363,,MORF,1779978,,"Director,Officer"
153753,640687,0001209191-20-065121,Common Stock,2020-12-22,,M,0,,2268.00,4.32,A,6081.00,D,,9797.760,2020-12-23,2020-12-21,1679363,,MORF,1779978,,"Director,Officer"
154073,632980,0001639825-20-000223,Class A Common Stock,2020-12-22,,S,0,,58045.00,156.79,D,56213.00,D,,9100875.550,2020-12-23,2020-12-22,1639825,,PTON,1788276,,Officer
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2473947,2225569,0001482512-22-000029,LTIP Units,2021-12-29,,A,0,,20259.00,,A,90328.00,D,,,2022-01-03,2021-12-29,1482512,,HPP,1652508,,Officer
2474073,2223970,0001209191-22-000134,Deferred Compensation Notional Units,2021-12-31,,A,0,,285.00,103.79,A,1528.00,D,,29580.150,2022-01-03,2021-12-31,798354,,FISV,1199930,,Director
2474097,2223928,0000802681-22-000008,Phantom Stock,2021-12-31,,M,0,,11549.00,,D,0.00,D,,,2022-01-03,2021-12-31,802681,,BMTC,1357854,,Director
2474098,2223927,0000802681-22-000008,Phantom Stock,2021-12-31,,M,0,,27279.00,,D,0.00,D,,,2022-01-03,2021-12-31,802681,,BMTC,1357854,,Director


In [None]:
#unique_df.to_csv(f'{DATA_FOLDER}/rptowner_trade.csv', index=False)
#unique_dates_df.to_csv(f'{DATA_FOLDER}/rptowner_trade_with_date.csv', index=False)

# Data Exploration

## Exploring transactions and holdings data

In [None]:
# Compare the ACCESSION_NUMBER values of the non-derivative holdings and transactions tables
nonderiv_holding_ids = nonderiv_holding_data['ACCESSION_NUMBER']
nonderiv_trans_ids = nonderiv_trans_data['ACCESSION_NUMBER']

nonderv_holding_n = nonderiv_holding_ids.nunique()
nonderv_trans_n = nonderiv_trans_ids.nunique()
print(f"Unique ACCESSION_NUMBER for holdings: {nonderv_holding_n}, for transactions: {nonderv_trans_n}")

# Accession numbers that appear in both tables
nonderiv_overlap_values = set(nonderiv_holding_ids).intersection(nonderiv_trans_ids)
if not nonderiv_overlap_values:
    print("No overlap found.")
else:
    # Overlap count relative to the smaller of the two sets of accession numbers
    nonderiv_overlap_share = round(len(nonderiv_overlap_values) / min(nonderv_trans_n, nonderv_holding_n), 2)
    print("Overlapping ACCESSION_NUMBER values found:", len(nonderiv_overlap_values), nonderiv_overlap_share)

NameError: name 'nonderiv_holding_data' is not defined

In [None]:
# Compare the ACCESSION_NUMBER values of the derivative holdings and transactions tables
deriv_holding_ids = deriv_holding_data['ACCESSION_NUMBER']
deriv_trans_ids = deriv_trans_data['ACCESSION_NUMBER']

derv_holding_n = deriv_holding_ids.nunique()
derv_trans_n = deriv_trans_ids.nunique()
print(f"Unique ACCESSION_NUMBER for holdings: {derv_holding_n}, for transactions: {derv_trans_n}")

# Accession numbers that appear in both tables
deriv_overlap_values = set(deriv_holding_ids).intersection(deriv_trans_ids)
if not deriv_overlap_values:
    print("No overlap found.")
else:
    # Overlap count relative to the smaller of the two sets of accession numbers
    deriv_overlap_share = round(len(deriv_overlap_values) / min(derv_trans_n, derv_holding_n), 2)
    print("Overlapping ACCESSION_NUMBER values found:", len(deriv_overlap_values), deriv_overlap_share)

Unique ACCESSION_NUMBER for holdings: 246962, for transactions: 1107281
Overlapping ACCESSION_NUMBER values found: 83827 0.34


### Case: Overlapping ACCESSION_NUMBER for deriv transactions and holdings
1. '0001062993-24-014375', 1 deriv transaction, 8 holdings, including varying values of 'SHRS_OWND_FOLWNG_TRANS' for different stocks, but no date of transaction
2. '0001654954-19-000382', 1 deriv transaction, 2 holdings, varying 'SHRS_OWND_FOLWNG_TRANS' for same stocks, also no date
3. '0001209191-11-009901', 1 deriv transaction, 5 holdings, varying 'SHRS_OWND_FOLWNG_TRANS' for 1 different and 4 same stocks, some have non-null EXERCISE_DATE
4. '0001437749-21-019709',
5. '0001181431-11-003532',
etc

* Note that nonderiv holdings have a different schema compared to deriv holdings

In [None]:
# Accession number inspected in the next cells (case 3 in the list above:
# 1 deriv transaction, 5 holdings)
ACCESSION_NUMBER_TEST = '0001209191-11-009901'

In [None]:
# Derivative transactions filed under the test accession number
deriv_trans_data[deriv_trans_data['ACCESSION_NUMBER']== ACCESSION_NUMBER_TEST]

Unnamed: 0,ACCESSION_NUMBER,DERIV_TRANS_SK,SECURITY_TITLE,SECURITY_TITLE_FN,CONV_EXERCISE_PRICE,CONV_EXERCISE_PRICE_FN,TRANS_DATE,TRANS_DATE_FN,DEEMED_EXECUTION_DATE,DEEMED_EXECUTION_DATE_FN,...,UNDLYNG_SEC_VALUE,UNDLYNG_SEC_VALUE_FN,SHRS_OWND_FOLWNG_TRANS,SHRS_OWND_FOLWNG_TRANS_FN,VALU_OWND_FOLWNG_TRANS,VALU_OWND_FOLWNG_TRANS_FN,DIRECT_INDIRECT_OWNERSHIP,DIRECT_INDIRECT_OWNERSHIP_FN,NATURE_OF_OWNERSHIP,NATURE_OF_OWNERSHIP_FN
1020924,0001209191-11-009901,1323168,Stock Option,,4.52,,2011-02-11,,,,...,,,0.0,,,,D,,,


In [None]:
# Derivative holdings filed under the same test accession number
deriv_holding_data[deriv_holding_data['ACCESSION_NUMBER']== ACCESSION_NUMBER_TEST]

Unnamed: 0,ACCESSION_NUMBER,DERIV_HOLDING_SK,SECURITY_TITLE,SECURITY_TITLE_FN,CONV_EXERCISE_PRICE,CONV_EXERCISE_PRICE_FN,TRANS_FORM_TYPE,TRANS_FORM_TYPE_FN,EXERCISE_DATE,EXERCISE_DATE_FN,...,UNDLYNG_SEC_VALUE,UNDLYNG_SEC_VALUE_FN,SHRS_OWND_FOLWNG_TRANS,SHRS_OWND_FOLWNG_TRANS_FN,VALU_OWND_FOLWNG_TRANS,VALU_OWND_FOLWNG_TRANS_FN,DIRECT_INDIRECT_OWNERSHIP,DIRECT_INDIRECT_OWNERSHIP_FN,NATURE_OF_OWNERSHIP,NATURE_OF_OWNERSHIP_FN
567018,0001209191-11-009901,810244,Phantom Stock Units,,0.0,,,,,F4,...,,,1688.63,,,,D,,,
567019,0001209191-11-009901,810245,Stock Option,,3.7,,,,2002-08-11,,...,,,30000.0,,,,D,,,
567020,0001209191-11-009901,810248,Stock Option,,13.1,,,,,F6,...,,,15000.0,,,,D,,,
567021,0001209191-11-009901,810247,Stock Option,,11.4,,,,,F5,...,,,30000.0,,,,D,,,
567022,0001209191-11-009901,810246,Stock Option,,4.37,,,,2003-08-11,,...,,,36000.0,,,,D,,,


## Exploring Name Info (1 submission with multiple reporting owners)
* 79206 have more than one reporting owner
* highest number of reporting owners is 10, because no more than 10 reporting persons can file any one Form 4 (see 2 cells below)


Why is the RPTOWNERNAME so weird? 

* Recommendation: maybe we want to compile the names into 1 tuple, for each accession number?

In [None]:
# Number of distinct issuer CIKs
submission_data['ISSUERCIK'].nunique() # note: some ISSUERCIK values appear with multiple issuer names

14185

In [None]:
# Number of accession numbers that appear exactly once in reporting_owner_data
(reporting_owner_data['ACCESSION_NUMBER'].value_counts() == 1).sum()

2838282

In [None]:
# Flag, per accession number, whether it has more than one row in reporting_owner_data
reporting_owner_data['ACCESSION_NUMBER'].value_counts() > 1

ACCESSION_NUMBER
0001209191-21-007755     True
0001104659-23-051309     True
0001209191-15-052568     True
0001571049-14-000053     True
0001420295-17-000002     True
                        ...  
0001415889-24-010973    False
0001349334-24-000008    False
0001104659-24-048332    False
0001209191-24-005317    False
0001437749-20-014299    False
Name: count, Length: 2917488, dtype: bool

In [None]:
# Submission row for an accession number that has multiple reporting owners;
# see its REMARKS column for the disclaimer about reporting persons
submission_data[submission_data['ACCESSION_NUMBER'] == '0001209191-21-007755']

Unnamed: 0,ACCESSION_NUMBER,FILING_DATE,PERIOD_OF_REPORT,DATE_OF_ORIG_SUB,NO_SECURITIES_OWNED,NOT_SUBJECT_SEC16,FORM3_HOLDINGS_REPORTED,FORM4_TRANS_REPORTED,DOCUMENT_TYPE,ISSUERCIK,ISSUERNAME,ISSUERTRADINGSYMBOL,REMARKS
2409946,0001209191-21-007755,2021-02-04,2021-02-02,,,0,,,4,1794515,ZoomInfo Technologies Inc.,ZI,"Because no more than 10 reporting persons can file any one Form 4 through the Securities and Exchange Commission's EDGAR system, Atlantic & Pacific VII-B has filed a separate Form 4."


In [None]:
# All reporting-owner rows filed under that same accession number
reporting_owner_data[reporting_owner_data['ACCESSION_NUMBER'] == '0001209191-21-007755']

Unnamed: 0,ACCESSION_NUMBER,RPTOWNERCIK,RPTOWNERNAME,RPTOWNER_RELATIONSHIP,RPTOWNER_TITLE,RPTOWNER_TXT,RPTOWNER_STREET1,RPTOWNER_STREET2,RPTOWNER_CITY,RPTOWNER_STATE,RPTOWNER_ZIPCODE,RPTOWNER_STATE_DESC
2604188,0001209191-21-007755,1812579,"TA AP VII-B DO Subsidiary Partnership, L.P.","Director,TenPercentOwner",,,200 CLARENDON STREET,56TH FLOOR,BOSTON,MA,2116,
2604189,0001209191-21-007755,1812606,"TA XI DO AIV II, L.P.","Director,TenPercentOwner",,,200 CLARENDON STREET,56TH FLOOR,BOSTON,MA,2116,
2604190,0001209191-21-007755,1034569,"TA ASSOCIATES, L.P.","Director,TenPercentOwner",,,200 CLARENDON STREET,56TH FLOOR,BOSTON,MA,2116,
2604191,0001209191-21-007755,1609539,"TA XI DO Feeder, L.P.","Director,TenPercentOwner",,,200 CLARENDON STREET,56TH FLOOR,BOSTON,MA,2116,
2604192,0001209191-21-007755,1812605,"TA SDF III DO AIV II, L.P.","Director,TenPercentOwner",,,200 CLARENDON STREET,56TH FLOOR,BOSTON,MA,2116,
2604193,0001209191-21-007755,1609553,"TA SDF III DO AIV, L.P.","Director,TenPercentOwner",,,200 CLARENDON STREET,56TH FLOOR,BOSTON,MA,2116,
2604194,0001209191-21-007755,1609536,"TA XI DO AIV, L.P.","Director,TenPercentOwner",,,200 CLARENDON STREET,56TH FLOOR,BOSTON,MA,2116,
2604195,0001209191-21-007755,1578035,"TA INVESTORS IV, L.P.","Director,TenPercentOwner",,,200 CLARENDON STREET,56TH FLOOR,BOSTON,MA,2116,
2604196,0001209191-21-007755,1548681,TA Atlantic & Pacific VII-A L.P.,"Director,TenPercentOwner",,,200 CLARENDON STREET,56TH FLOOR,BOSTON,MA,2116,
2604197,0001209191-21-007755,1609557,"TA SDF III DO Feeder, L.P.","Director,TenPercentOwner",,,200 CLARENDON STREET,56TH FLOOR,BOSTON,MA,2116,


In [None]:
# Do not truncate long cell contents so the full signature text is visible
pd.set_option('display.max_colwidth', None)
# NOTE(review): names_data is only defined if the OWNER_SIGNATURE.csv read in the
# data-loading cell is uncommented; on a fresh kernel this line raises NameError.
names_data[names_data['ACCESSION_NUMBER'] == '0001209191-21-007755']
# seems like OWNERSIGNATURENAME requires .split(',') to extract alias used in Reporting owner name

Unnamed: 0,ACCESSION_NUMBER,OWNERSIGNATURENAME
2561362,0001209191-21-007755,"TA SDF III DO AIV, L.P., by TA Associates, SDF III GP L.P., its General Partner, by TA Associates, L.P., its General Partner, by Jeffrey C. Hadden, its, General Counsel, /s/ Jeffrey C. Hadden"
2561363,0001209191-21-007755,"TA XI DO AIV, L.P., by TA Associates XI GP, L.P., its General Partner, by TA Associates, L.P., its General, Partner, by Jeffrey C. Hadden, its General Counsel, /s/ Jeffrey C. Hadden"
2561364,0001209191-21-007755,"TA Associates, L.P., by Jeffrey C. Hadden, its, General Counsel, /s/ Jeffrey C. Hadden"
2561365,0001209191-21-007755,"TA Atlantic & Pacific VII-A L.P., by TA Associates AP, VII GP L.P., its General Partner, by TA Associates, L.P., its General Partner, by Jeffrey C. Hadden, its, General Counsel, /s/ Jeffrey C. Hadden"
2561366,0001209191-21-007755,"TA Investors IV, L.P., by TA Associates, L.P., its General Partner, by Jeffrey C. Hadden, its, General Counsel, /s/ Jeffrey C. Hadden"
2561367,0001209191-21-007755,"TA SDF III DO AIV II, L.P., by TA Associates SDF, III GP, L.P., its General Partner, by TA Associates, L.P., its General Partner, by Jeffrey C. Hadden, its General, Counsel, /s/ Jeffrey C. Hadden"
2561368,0001209191-21-007755,"TA XI DO AIV II, L.P., by TA Associates XI GP, L.P, its General Partner, by TA Associates, L.P., its General, Partner, by Jeffrey C. Hadden, its General Counsel, /s/ Jeffrey C. Hadden"
2561369,0001209191-21-007755,"TA Associates AP VII-B DO Subsidiary Partnership, L.P., by TA Associates AP VII GP L.P., its General Partner, by TA Associates, L.P., its General Partner, by Jeffrey C., Hadden, its General Counsel, /s/ Jeffrey C. Hadden"
2561370,0001209191-21-007755,"TA SDF III DO Feeder, L.P., by TA Associates SDF III GP L.P., its General Partner, by TA Associates, L.P., its General Partner, by Jeffrey C., Hadden, its General Counsel, /s/ Jeffrey C. Hadden"
2561371,0001209191-21-007755,"TA XI DO Feeder, L.P., by TA Associates XI GP L.P., its General Partner, by TA Associates, L.P., its General Partner, Jeffrey C. Hadden, its, General Counsel, /s/ Jeffrey C. Hadden"


In [None]:
# Reporting-owner rows for a single accession number; the trailing comments hold
# alternative accession numbers to swap in
reporting_owner_data[reporting_owner_data['ACCESSION_NUMBER'] == '0001437749-20-014299']# '0001415889-24-010973']#'0001571049-14-000053']

Unnamed: 0,ACCESSION_NUMBER,RPTOWNERCIK,RPTOWNERNAME,RPTOWNER_RELATIONSHIP,RPTOWNER_TITLE,RPTOWNER_TXT,RPTOWNER_STREET1,RPTOWNER_STREET2,RPTOWNER_CITY,RPTOWNER_STATE,RPTOWNER_ZIPCODE,RPTOWNER_STATE_DESC
3171122,0001437749-20-014299,1815956,Hoffman Matthew Leo,Officer,Chief Financial Officer,,C/O THE CORETEC GROUP INC.,"6804 SOUTH CANTON AVENUE, SUITE 150",TULSA,OK,74136,


In [None]:
# Do not truncate long cell contents
pd.set_option('display.max_colwidth', None)
# NOTE(review): names_data is only defined if the OWNER_SIGNATURE.csv read in the
# data-loading cell is uncommented; on a fresh kernel this line raises NameError.
names_data[names_data['ACCESSION_NUMBER'] == '0001437749-20-014299']#'0001415889-24-010973']#'0001571049-14-000053']

Unnamed: 0,ACCESSION_NUMBER,OWNERSIGNATURENAME
3119137,0001437749-20-014299,/s/ Matthew Hoffman


In [None]:
# Find the rows signed with one exact signature string
# NOTE(review): names_data is only defined if the OWNER_SIGNATURE.csv read in the
# data-loading cell is uncommented; on a fresh kernel this line raises NameError.
names_data[names_data['OWNERSIGNATURENAME']=='/s/ Harry You']

Unnamed: 0,ACCESSION_NUMBER,OWNERSIGNATURENAME
2507155,0000899243-21-014301,/s/ Harry You


# Testing Litigation Data

In [None]:
# Load the litigation dataset (file name LITIGATIONS_DATA_PATH inside DATA_FOLDER)
data_litigations = pd.read_csv(f"{DATA_FOLDER}/{LITIGATIONS_DATA_PATH}")

In [None]:
# Summary statistics of the 'yr' column; min/max give the covered years
data_litigations[['yr']].describe() # 1996 - 2017 

Unnamed: 0,yr
count,1222.0
mean,2006.533552
std,6.041633
min,1996.0
25%,2002.0
50%,2007.0
75%,2011.0
max,2017.0


In [None]:
# Do not truncate long cell contents so the full litigation text is visible
pd.set_option('display.max_colwidth', None)
# Show the last two litigation records
data_litigations.tail(2)

Unnamed: 0.1,Unnamed: 0,lt_no,yr,title,lt,class
1220,7977,24012,2017,Therapist Settles Charges of Insider Trading Ahead of Acquisition Announcement,"[A Seattle-based therapist has agreed to settle SEC charges that he traded in the stock of zulily, Inc Zulily based on information he learned from a Zulily employee during confidential counseling sessions , The SECs complaint alleges that, in July 2015, during counseling sessions, the Zulily employee told Kenneth Peer that Zulily was going to be acquired by Liberty Interactive, a media holding company On three occasions between July 21, 2015 and August 10, 2015, after counseling sessions with the Zulily employee, Peer purchased a total of over $28,000 of Zulily stock The complaint alleges that, before the market opened on August 17, 2015, Zulily announced that it had agreed to be acquired by Liberty Interactive in a tender offer By the end of trading that day, Zulilys stock allegedly had risen by 49%, with nearly 15 times the stocks average daily trading volume Shortly after the acquisition was announced, Peer allegedly sold all of his Zulily shares for illegal profits of approximately $10,000 , The SECs complaint charges Peer with violating Sections 10 b and 14 e of the Securities Exchange Act of 1934 and Rules 10b-5 and 14e-3 thereunder Without admitting or denying the SECs allegations, Peer agreed to disgorge $10,227 73 plus interest of $811 80 and pay a $10,227 73 penalty, for a total of $21,267 26 Peer also agreed to be enjoined from further violations of the charged provisions , The SECs investigation was conducted by Alice Liu Jensen and supervised by Steven D Buchholz, both of the Market Abuse Unit in the San Francisco Regional Office The SEC appreciates the assistance of FINRA in this matter , <img alt border0 height9 srcimagesarrowright_dkblue gif width10><a hreflitigationcomplaints2017comp24012 pdf>SEC Complaint<a>, , <i>https:www sec govlitigationlitreleases2017lr24012 htm<i><br>]",1
1221,7980,24015,2017,SEC Charges Former Employee and Friend with Insider Trading in Securities of International Rectifier Corporation,"[The today announced insider trading charges against a former employee of a semiconductor company and his friend for trading on nonpublic information that the company would be acquired , The SEC alleges that Lanny Brown learned that Infineon Technologies AG planned to acquire his then-employer, International Rectifier Corp IRC , before the deal was publicly announced According to the SECs complaint, Brown tipped his friend, Sean Fox, about the deal and both of them then acquired IRC call options The SEC further alleges that Brown and Fox concealed Browns involvement in the trading by depositing approximately $12,000 of their combined funds into Foxs brokerage account, and then used this account to purchase the call options for both of them The SEC also alleges that Fox closed out the option positions after the acquisition was publicly announced, and the two defendants made $369,720 in illicit profits To further hide Browns role in the trading, Fox allegedly funneled Browns share of the trading profits by paying several of Browns personal expenses and by writing checks to Browns children and stepchildren Brown and his wife then endorsed those checks and used the funds , The SECs complaint, filed in federal court in the District of Arizona, charges Brown and Fox with violating Section 10 b of the Securities Exchange Act of 1934 and Rule 10b-5 thereunder A criminal action is also pending against both Brown and Fox in the District of Arizona for the same underlying conduct In the SECs action, the defendants have consented to the entry of a final judgment that permanently enjoins them from future violations of the charged provisions of the federal securities laws The final judgment in the SECs action also orders them to pay, on a joint and several basis, disgorgement of $369,720 plus prejudgment interest of $43,147 79, with a credit for 
the monetary amount they have agreed to pay in the parallel criminal case against them , The settlements with the SEC are subject to court approval , The SECs investigation was conducted by Yolanda Ochoa and supervised by Finola H Manvelian of the Los Angeles office The SEC appreciates the assistance of the Financial Industry Regulatory Authority , <p classcenter>###, <img alt border0 height9 srcimagesarrowright_dkblue gif width10><a hreflitigationcomplaints2017comp24015 pdf>SEC Complaint<a>, , <i>https:www sec govlitigationlitreleases2017lr24015 htm<i><br>]",1


In [None]:
# Structured fields extracted by DeepSeek from a litigation release.
# The numeric suffix in the variable name is the litigation index, so results
# for different releases cannot be confused with each other.
result_1220 = {
    # Who traded and on which deal
    "NameOfTrader": "Kenneth Peer",
    "Profession": "Seattle-based therapist",
    "CompanyInvolved": "zulily, Inc (Zulily)",
    "AcquiringCompany": "Liberty Interactive",
    "SourceOfInformation": "Zulily employee (learned during confidential counseling sessions)",
    # Dates are recorded as DD-MM-YYYY strings
    "DateOfAcquisitionAnnouncement": "17-08-2015",
    "DatesOfIllegalTransactions": ["21-07-2015", "10-08-2015"],
    # Amounts and market reaction
    "TotalAmountInvested": 28000,
    "IllegalProfits": 10000,
    "StockPriceIncrease": "49%",
    "TradingVolumeIncrease": "15 times the average daily trading volume",
    # Outcome of the SEC action
    "LegalConsequences": {
        "Disgorgement": 10227.73,
        "Interest": 811.80,
        "Penalty": 10227.73,
        "TotalPayment": 21267.26,
        "Injunction": "Enjoined from further violations of the charged provisions",
    },
    "SECCharges": [
        "Violation of Section 10(b) of the Securities Exchange Act of 1934",
        "Violation of Section 14(e) of the Securities Exchange Act of 1934",
        "Violation of Rule 10b-5",
        "Violation of Rule 14e-3",
    ],
    "SECInvestigators": {
        "Investigator": "Alice Liu Jensen",
        "Supervisor": "Steven D. Buchholz",
        "Unit": "Market Abuse Unit",
        "RegionalOffice": "San Francisco Regional Office",
    },
    "AssistanceProvidedBy": "FINRA",
}

# Structured fields extracted for the litigation with index 1221
# (two traders; keys are kept exactly as they were recorded).
result_1221 = {
    "NamesOfTraders": ["Lanny Brown", "Sean Fox"],
    "RelationToInsider": "Lanny Brown (former employee of International Rectifier Corp)",
    "CompanyInvolved": "International Rectifier Corp (IRC)",
    "AcquiringCompany": "Infineon Technologies AG",
    # ISSUE: neither date could be extracted from the litigation text.
    "DateOfAcquisitionAnnouncement": "Not explicitly stated in the text",
    "DateOfIllegalTransactions": "Not explicitly stated in the text",
    "IllegalActivity": "Purchased IRC call options using nonpublic information about the acquisition",
    # Monetary amounts
    "FundsDepositedForTrading": 12000,
    "IllegalProfits": 369720,
    "PrejudgmentInterest": 43147.79,
    # Outcome
    "LegalConsequences": "Permanent injunctions, disgorgement of $369,720, and prejudgment interest",
    "CriminalAction": "Pending in the District of Arizona",
    "SECInvestigators": {
        "Investigator": "Yolanda Ochoa",
        "Supervisor": "Finola H. Manvelian",
    },
    "AssistanceProvidedBy": "Financial Industry Regulatory Authority (FINRA)",
}

In [None]:
# From the two known datasets with name info, attempt to find ["Lanny Brown", "Sean Fox"].
# Both columns are lower-cased so the searches below can be case-insensitive.
# NOTE(review): the `names_data` load (OWNER_SIGNATURE.csv) is commented out in the
# data-loading cell; it must be re-enabled for this cell to run on a fresh kernel.
# Missing values are dropped from both arrays: a NaN entry is a float, and the
# substring test `NAME.lower() in st` used in the search cells raises TypeError on it.
owner_signature = names_data['OWNERSIGNATURENAME'].dropna().str.lower().values
reportin_owner = reporting_owner_data['RPTOWNERNAME'].dropna().str.lower().values

In [None]:
# Case-insensitive substring search for NAME, first in the owner signatures and
# then in the reporting-owner names; every matching entry is printed.
NAME = 'murray'  # previously tried: 'Peer Kenneth', 'Peer', 'Kenneth Peer'
needle = NAME.lower()
for candidates in (owner_signature, reportin_owner):
    for candidate in candidates:
        if needle in candidate:
            print(candidate)

/s/dennis e murray
/s/margaret a murray
/s/ murray j. mccabe
/s/richard j. rubino, attorney-in-fact for murray a. goldberg
by: michelle novotny for: murray d. rode
by: michelle novotny for: murray d. rode
/s/ ryan lawrence as attorney-in-fact for r. michael murray, jr.
/s/ stephen p. murray
/s/antonio g gomes, attorney-in-fact for murray j. demo
donald b. murray
/s/ christopher g. ferro, attorney in fact to murray j. demo
martin murray
cher s. lawrence, attorney-in-fact for mr. murray smith
/s/ james t. holder, attorney-in-fact for mr. murray
murray s. levin
r. patrick murray, ii, as attorney-in-fact
r. patrick murray, ii, as attorney-in-fact
r. patrick murray, ii, as attorney-in-fact
r. patrick murray, ii, as attorney-in-fact
r. patrick murray, ii, as attorney-in-fact
r. patrick murray, ii, as attorney-in-fact
r. patrick murray, ii, as attorney-in-fact
r. patrick murray, ii, as attorney-in-fact
r. patrick murray, ii, as attorney-in-fact
/s/ john s. lamountain as attorney-in-fact for r

In [None]:
# Same case-insensitive substring search, now for the traders of litigation 1221.
NAME = 'Fox Sean'  # previously tried: 'Sean Fox', 'Brown Lanny', 'Lanny Brown'
needle = NAME.lower()
for candidates in (owner_signature, reportin_owner):
    for candidate in candidates:
        if needle in candidate:
            print(candidate)

# Discarded
# NOTE(review): presumably this name-lookup approach was abandoned — confirm.

In [None]:
# Further cleaning applied after Dary's name extraction.
names_data_clean = pd.read_csv(f'{DATA_FOLDER}/extracted_names.csv')

# Keep only the extracted names whose ACCESSION_NUMBER appears in matching_rpt_names_pk.
# .copy() makes the subset an independent DataFrame, so adding a column to it
# later does not raise pandas' SettingWithCopyWarning.
names_data_subset = names_data_clean[
    names_data_clean['ACCESSION_NUMBER'].isin(matching_rpt_names_pk['ACCESSION_NUMBER'])
].copy()

import re

def clean_names(name):
    """Strip legal boilerplate from a signature string and return the remaining names.

    Legal terms ("attorney-in-fact", "power attorney", "P.O.A.", "behalf" and the
    standalone word "by") are treated as separators between names.

    Args:
        name: Raw signature text. Non-string values (e.g. NaN for a missing
            cell) are treated as empty.

    Returns:
        The non-empty fragments joined with ', ', or '' when nothing remains.
    """
    # re.sub cannot handle non-string input such as float NaN; return an empty result instead.
    if not isinstance(name, str):
        return ''

    # Replace legal-related terms (case-insensitive) with a comma separator.
    # `\bby ` only matches "by" at the start of a word, so names that merely end
    # in "by" (e.g. "Ruby", "Bobby") are no longer cut in half.
    name = re.sub(r'(?i)(attorney[-\s]?in[-\s]?fact|power[-\s]?attorney|P\.O\.A\.?|behalf|\bby )', ',', name)

    # Normalise whitespace around commas, then collapse runs of whitespace
    name = re.sub(r'\s*,\s*', ',', name)
    name = re.sub(r'\s+', ' ', name).strip()

    # Split on commas, drop empty fragments and re-join in a uniform format
    names = [n.strip() for n in name.split(',') if n.strip()]
    return ', '.join(names)

# Add the cleaned signature names as a new column. `.assign` returns a new
# DataFrame, so no column is written into a slice of another frame (which would
# raise SettingWithCopyWarning) and re-running the cell gives the same result.
names_data_subset = names_data_subset.assign(
    OWNERSIGNATURENAME_FINAL=names_data_subset['extracted'].apply(clean_names)
)