# SEC Form 4 Data Collation
Refer to full readme.md for SEC data here: https://www.sec.gov/files/insider_transactions_readme.pdf 



### Overview of Merging Steps

1. **Combine Derivative & Non-Derivative Transactions**  
    - Merge `DERIV_TRANS` and `NONDERIV_TRANS` by their common columns.  
    - Rename their primary key columns (`DERIV_TRANS_SK` or `NONDERIV_TRANS_SK`) to `TRANS_SK`.

2. **Filter & Clean Transactions**  
    - **Keep only direct ownership** (drop indirect).  
    - **Exclude transactions past 2024-Q4**.  
    - **Drop rows with missing transaction dates**.  
    - **Calculate transaction amount** as *(shares × price per share)*, then split into zero vs. non-zero amounts.  
    - Retain only the **non-zero** transaction dataset (for instance, 3,191,965 rows from 1992–2024).

3. **Merge with Form 4 Submission Data** 
    - Each Form 4 submission can include up to 30 transactions.  
    - Merging adds **filing date**, **period of report**, and the **issuer’s CIK** (i.e., the company whose shares are being transacted).

4. **Merge with Reporting Owner Data**  
   - Includes the **owner’s name** and **relationship** details.  
   - Only keep submissions with exactly **one** reporting owner (covering ~99% of cases), avoiding the complexity of multi-owner filings.

#### Importing libraries and folders

In [11]:
#Required libraries
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.regression.rolling import RollingOLS
import yfinance as yf
import shutil
pd.set_option('display.max_columns', None)


#Data folders
DATA_FOLDER = "data/interim/FINAL_RAW_DATA"
STOCK_PRICE_DATA_PATH = "dataset_summary.csv"
YEARS_THRESHOLD = (2005, 2021) # to match little sis network data

## Form Submission Main data
submission_data = pd.read_csv(f"{DATA_FOLDER}/SUBMISSION.csv") # pk : ACCESSION_NUMBER

## Transaction for submissions, ACCESSION_NUMBER and (NON)DERIV_TRANS_SK are the primary keys
# One form (i.e. ACCESSION_NUMBER) can have multiple transactions (i.e. *_SK), transactions can be across multiple years, max 30 each 
# Duplicate *_SK keys are for different transactions, and there are max 2 of each duplicate _SK keys
nonderiv_trans_data = pd.read_csv(f"{DATA_FOLDER}/NONDERIV_TRANS.csv")
deriv_trans_data = pd.read_csv(f"{DATA_FOLDER}/DERIV_TRANS.csv")

## Holding info for each submission (what they have - After each transaction..?)
nonderiv_holding_data = pd.read_csv(f"{DATA_FOLDER}/NONDERIV_HOLDING.csv")
deriv_holding_data = pd.read_csv(f"{DATA_FOLDER}/DERIV_HOLDING.csv")

## Name info
reporting_owner_data = pd.read_csv(f"{DATA_FOLDER}/REPORTINGOWNER.csv")
#names_data = pd.read_csv(f"{DATA_FOLDER}/OWNER_SIGNATURE.csv") 

## Additional info, to match with '*_FN' columns in all other datasets based on matching ACCESSION_NUMBER
#footnotes_data = pd.read_csv(f"{DATA_FOLDER}/FOOTNOTES.csv")

  nonderiv_trans_data = pd.read_csv(f"{DATA_FOLDER}/NONDERIV_TRANS.csv")
  deriv_trans_data = pd.read_csv(f"{DATA_FOLDER}/DERIV_TRANS.csv")
  nonderiv_holding_data = pd.read_csv(f"{DATA_FOLDER}/NONDERIV_HOLDING.csv")
  deriv_holding_data = pd.read_csv(f"{DATA_FOLDER}/DERIV_HOLDING.csv")


#### Simple EDA

In [12]:
#Overview of data
dataframes = {
    'submission_data': submission_data,
    'nonderiv_trans_data': nonderiv_trans_data,
    'deriv_trans_data': deriv_trans_data,
    'nonderiv_holding_data': nonderiv_holding_data,
    'deriv_holding_data': deriv_holding_data,
    'reporting_owner_data': reporting_owner_data,
    #'names_data': names_data,
    #'footnotes_data': footnotes_data
}

for name, df in dataframes.items():
    shape = df.shape
    print(f"{name}, {shape}")

    df.drop_duplicates(inplace=True)
    if df.shape[0] != shape[0]:
       print(f"Duplicate rows removed: {shape[0] - df.shape[0]}")
    else:
       print("No duplicate rows")
    print()

submission_data, (2917488, 13)
No duplicate rows

nonderiv_trans_data, (4343860, 28)
No duplicate rows

deriv_trans_data, (1763084, 42)
No duplicate rows

nonderiv_holding_data, (1522788, 14)
No duplicate rows

deriv_holding_data, (1000283, 26)
No duplicate rows

reporting_owner_data, (3171123, 13)
No duplicate rows



#### Subset of columns required from each csv

In [13]:
SELECTED_TRANSACTION_COLS = ['ACCESSION_NUMBER', 'SECURITY_TITLE', 'TRANS_DATE', 'DEEMED_EXECUTION_DATE', 'TRANS_CODE', 'EQUITY_SWAP_INVOLVED',
                             'TRANS_TIMELINESS', 'TRANS_SHARES', 'TRANS_PRICEPERSHARE', 'TRANS_ACQUIRED_DISP_CD',
                             'SHRS_OWND_FOLWNG_TRANS', 'DIRECT_INDIRECT_OWNERSHIP', 'NATURE_OF_OWNERSHIP']

# partial primary keys: 'NONDERIV_TRANS_SK', 'DERIV_TRANS_SK'
DERIV_TRANS_UNSURE_COLS = ['CONV_EXERCISE_PRICE', 'EQUITY_SWAP_INVOLVED', 'EXCERCISE_DATE', 'EXPIRATION_DATE', 'UNDLYNG_SEC_SHARES', 'UNDLYNG_SEC_VALUE']

SUBMISSION_COLS = ['ACCESSION_NUMBER', 'FILING_DATE', 'PERIOD_OF_REPORT', 'ISSUERCIK', 'ISSUERNAME', 'ISSUERTRADINGSYMBOL']

REPORTING_OWNER_COLS = ['RPTOWNERCIK', 'RPTOWNERNAME','RPTOWNER_RELATIONSHIP'] # there is also address data and filenumber

#### Merge Transaction datasets

In [14]:
#Changing dtypes
nonderiv_trans_data['TRANS_DATE'] = pd.to_datetime(nonderiv_trans_data['TRANS_DATE'], errors='coerce')
deriv_trans_data['TRANS_DATE'] = pd.to_datetime(deriv_trans_data['TRANS_DATE'], errors='coerce')

## Cannot directly concatenate the two dataframes as they have different columns some with identical names
df1 = nonderiv_trans_data[['NONDERIV_TRANS_SK'] + SELECTED_TRANSACTION_COLS].copy().rename(columns={'NONDERIV_TRANS_SK':'TRANS_SK'})
df2 = deriv_trans_data[['DERIV_TRANS_SK'] + SELECTED_TRANSACTION_COLS].copy().rename(columns={'DERIV_TRANS_SK':'TRANS_SK'})
all_transaction_data = pd.concat([df1,df2], axis=0, ignore_index=True).reset_index(drop=True)
print(all_transaction_data.shape)

(6106944, 14)


#### Feature Engineering and Cleaning

In [15]:
## Create transaction amount column
all_transaction_data['trans_amt'] = all_transaction_data['TRANS_SHARES'] * all_transaction_data['TRANS_PRICEPERSHARE']

trans_amt_0 = all_transaction_data[all_transaction_data['trans_amt'] == 0].shape[0]
print(f"There are {trans_amt_0} transactions with amount 0, {trans_amt_0/(all_transaction_data.shape[0])*100:.2f}% of all transactions")

There are 2250213 transactions with amount 0, 36.85% of all transactions


In [16]:
# Value counts of direct/indirect ownership
all_transaction_data['DIRECT_INDIRECT_OWNERSHIP'].value_counts()

DIRECT_INDIRECT_OWNERSHIP
D    5257448
I     849496
Name: count, dtype: int64

#### Filtering out transactions

In [20]:


# 1. Select only transactions from 2005 to 2021
all_transaction_data = all_transaction_data[(all_transaction_data['TRANS_DATE'].dt.year >= YEARS_THRESHOLD[0]) & (all_transaction_data['TRANS_DATE'].dt.year <= YEARS_THRESHOLD[1])]
print(all_transaction_data.shape)

# 2. Filter out indirect ownership
all_transaction_direct = all_transaction_data[all_transaction_data['DIRECT_INDIRECT_OWNERSHIP'] == 'D']
print(all_transaction_direct.shape)

# 3. Filter out transactions with 0 transaction amount
## e.g. non qualified stock option because is a form of compensation and will have 0 transaction amount
all_transaction_direct_comp = all_transaction_direct[all_transaction_direct['trans_amt'] == 0]
all_transaction_direct = all_transaction_direct[all_transaction_direct['trans_amt'] != 0]

print("Resultant shape of transaction data:", all_transaction_direct.shape)

(4907535, 15)
(4213120, 15)
Resultant shape transaction data: (2596384, 15)


#### Merge with submission data

In [24]:
#Change dtypes of filing date
submission_data['FILING_DATE'] = pd.to_datetime(submission_data['FILING_DATE'], errors='coerce')

# Merge on ACCESSION_NUMBER
all_transaction_direct_2 = all_transaction_direct.merge(submission_data[SUBMISSION_COLS], on='ACCESSION_NUMBER', how='left')
print(all_transaction_direct_2.shape)

all_transaction_direct_2.head(2)

(2596384, 20)


Unnamed: 0,TRANS_SK,ACCESSION_NUMBER,SECURITY_TITLE,TRANS_DATE,DEEMED_EXECUTION_DATE,TRANS_CODE,EQUITY_SWAP_INVOLVED,TRANS_TIMELINESS,TRANS_SHARES,TRANS_PRICEPERSHARE,TRANS_ACQUIRED_DISP_CD,SHRS_OWND_FOLWNG_TRANS,DIRECT_INDIRECT_OWNERSHIP,NATURE_OF_OWNERSHIP,trans_amt,FILING_DATE,PERIOD_OF_REPORT,ISSUERCIK,ISSUERNAME,ISSUERTRADINGSYMBOL
0,2360796,0001179110-14-011078,Common Stock,2014-06-26,,S,0,,1046.0,28.17,D,43944.0,D,,29465.82,2014-06-30,2014-06-26,926326,"OMNICELL, Inc",OMCL
1,2360797,0001179110-14-011078,Common Stock,2014-06-26,,S,0,,637.0,28.17,D,43307.0,D,,17944.29,2014-06-30,2014-06-26,926326,"OMNICELL, Inc",OMCL


#### Get Reporting Owner/Owner Signature
- We only match those forms with 1 reporting owner. 

In [25]:
# Get overlapping 1-1 matching of accession numbers across the three datasets to avoid duplicated rows from joining

## Get unique pk from all_transaction_direct_filterd
accession_num_unique = all_transaction_direct_2[['ACCESSION_NUMBER']].drop_duplicates()

## get unique pk from reporting_owner_data
matching_reporting_owner = reporting_owner_data[reporting_owner_data['ACCESSION_NUMBER'].isin(accession_num_unique['ACCESSION_NUMBER'])]
matching_rpt_pk = matching_reporting_owner[matching_reporting_owner['ACCESSION_NUMBER'].map(matching_reporting_owner['ACCESSION_NUMBER'].value_counts()) == 1][['ACCESSION_NUMBER']]

print("Final number of unique ACCESSION_NUMBERS with 1-1 matching names,", matching_rpt_pk.shape[0])

(1322672, 1)
(1306752, 1)
Final number of unique ACCESSION_NUMBERS with 1-1 matching names, 1306752


In [26]:
# Merging

## filter all_transaction_direct_filterd to get only ACCESSION_NUMBRE in matching_rpt_names_pk
all_transaction_direct_final = all_transaction_direct_2[all_transaction_direct_2['ACCESSION_NUMBER'].isin(matching_rpt_pk['ACCESSION_NUMBER'])]

## merge with reporting_owener_data 
all_transaction_direct_final = all_transaction_direct_final.merge(reporting_owner_data[['ACCESSION_NUMBER'] + REPORTING_OWNER_COLS], on='ACCESSION_NUMBER', how='left') 

print(all_transaction_direct_final.shape)
all_transaction_direct_final.head()

(2546985, 23)


Unnamed: 0,TRANS_SK,ACCESSION_NUMBER,SECURITY_TITLE,TRANS_DATE,DEEMED_EXECUTION_DATE,TRANS_CODE,EQUITY_SWAP_INVOLVED,TRANS_TIMELINESS,TRANS_SHARES,TRANS_PRICEPERSHARE,TRANS_ACQUIRED_DISP_CD,SHRS_OWND_FOLWNG_TRANS,DIRECT_INDIRECT_OWNERSHIP,NATURE_OF_OWNERSHIP,trans_amt,FILING_DATE,PERIOD_OF_REPORT,ISSUERCIK,ISSUERNAME,ISSUERTRADINGSYMBOL,RPTOWNERCIK,RPTOWNERNAME,RPTOWNER_RELATIONSHIP
0,2360796,0001179110-14-011078,Common Stock,2014-06-26,,S,0,,1046.0,28.17,D,43944.0,D,,29465.82,2014-06-30,2014-06-26,926326,"OMNICELL, Inc",OMCL,1453971,Ortigas-Wedekind Marga,Officer
1,2360797,0001179110-14-011078,Common Stock,2014-06-26,,S,0,,637.0,28.17,D,43307.0,D,,17944.29,2014-06-30,2014-06-26,926326,"OMNICELL, Inc",OMCL,1453971,Ortigas-Wedekind Marga,Officer
2,2360795,0001179110-14-011078,Common Stock,2014-06-26,,S,0,,1278.0,28.17,D,44990.0,D,,36001.26,2014-06-30,2014-06-26,926326,"OMNICELL, Inc",OMCL,1453971,Ortigas-Wedekind Marga,Officer
3,2350316,0001144204-14-040532,Common Stock,2014-05-28,,A,0,,1176.0,10.6,A,50295.0,D,,12465.6,2014-06-30,2014-05-28,1474464,"New York REIT, Inc.",NYRT,1545007,Bowman Scott J.,Director
4,2350317,0001144204-14-040532,Common Stock,2014-06-27,,A,0,,4673.0,10.7,A,54968.0,D,,50001.1,2014-06-30,2014-05-28,1474464,"New York REIT, Inc.",NYRT,1545007,Bowman Scott J.,Director


#### Convert back into csv to zip

In [30]:
#convert to csv
all_transaction_direct_final.to_csv(f"{DATA_FOLDER}/all_transactions_merged.csv")

import shutil
#Zip folder
folder_to_zip = DATA_FOLDER
output_zip_file = "data/interim/FINAL_RAW_DATA.zip"
shutil.make_archive(output_zip_file.replace('.zip', ''), 'zip', folder_to_zip)

print(f"Folder '{folder_to_zip}' has been zipped as '{output_zip_file}'")


Folder 'data/interim/FINAL_RAW_DATA' has been zipped as 'data/interim/FINAL_RAW_DATA.zip'
