## Ensure that the relevant data is present in `data_untracked` before running

In [14]:
# Standard library
import os
import sys
import warnings

# Third-party
import numpy as np
import pandas as pd

# Put the parent of the current working directory on the import path so the
# modules imported below can be resolved (presumably they live there).
parent_dir = os.path.dirname(os.getcwd())
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

# Non-stdlib project modules; these must come after the sys.path tweak above.
from extract_data import extract
from create_features import features
from create_labels import labels
# from create_data import create_features

# Notebook-wide settings: silence all warnings and show every DataFrame column.
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)

### The downloads below are required only when creating the footnote analysis file

In [2]:
# Optional one-time NLTK resource downloads, kept commented out.
# NOTE(review): `nltk` is not imported in any visible cell — add `import nltk`
# before uncommenting these lines.
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download("wordnet")
# nltk.download("averaged_perceptron_tagger")

___
## Section 1. Create Data
___

### Extract Form 4 data if missing

In [3]:
# Instantiate the project's extractor and run its Form 4 creation step.
# NOTE(review): the recorded output shows this step finding 76 zip links,
# downloading quarterly `*_form345.zip` archives and then being interrupted
# (KeyboardInterrupt), so expect a long run when the data is missing.
data_extractor = extract.Data_Extractor()
data_extractor.create_form4()

Found 76 zip file links
Downloaded: ../../data_untracked/raw/sec_submissions/interim/2024q4_form345.zip
Downloaded: ../../data_untracked/raw/sec_submissions/interim/2024q3_form345.zip


KeyboardInterrupt: 

### Create labels

In [4]:
# Generate the label data when it is not already present (per the original
# note; the "missing" check itself is not visible in this notebook).
label_data_creator = labels.Label_Data_Creator()
label_data_creator.create_labels()



### Create features

In [4]:
# Instantiate the project's feature creator and run its feature-creation step.
# The recorded output ("Final features file present") suggests the work is
# skipped when the file already exists — TODO confirm in create_features.
# `feature_data_creator` is reused by later cells (train/test creation and the
# feature-name listing), so this cell must run before them.
feature_data_creator = features.Feature_Data_Creator()
feature_data_creator.create_features()

=== Final features file present ===


### Create training and testing data

In [6]:
# Create the training and testing files from the feature creator.
# NOTE(review): the recorded output says the files are built "based on
# quantile: 0.7" when they are not found; which column the quantile applies to
# is not visible from this notebook — confirm in create_features.
feature_data_creator.create_training_testing()

=== Training or Testing file not found. Begin creating based on quantile: 0.7
=== Saving Training and Testing ===


___
## Section 2. Modelling, Training and Validation
___

In [5]:
# Show the feature names exposed by the feature creator, under a header line.
print("features:", feature_data_creator.features, sep="\n")

features:
['js_bin', 's_bin', 'b_bin', 'jb_bin', 'ob_bin', 'g_bin', 'gift', 'distribution', 'charity', 'price', 'number', 'ball', 'pursuant', '10b5-1', '16b-3', 'lobbyist_score_final', 'total_senate_connections', 'total_house_connections', 'combined_seniority_score', 'PI_combined_total', 'net_trading_intensity', 'net_trading_amt', 'relative_trade_size_to_self', 'beneficial_ownership_score', 'title_score', 'TRANS_TIMELINESS_clean', 'execution_timeliness', 'filing_lag_days', 'filing_timeliness', 'is_lobby', 'has_lobby', 'has_donate', 'important_connections', 'full_congress_connections', 'sen_important_connections', 'sen_full_congress_connections', 'sen_t2_full_congress_connections', 'sen_t1_important_connections', 'sen_t1_full_congress_connections', 'house_t2_important_connections', 'house_t2_full_congress_connections', 'house_t1_important_connections', 'house_t1_full_congress_connections']


___
## Compare results
___

# DELETE ALL BELOW

In [6]:
# Project module supplying the folder and file-name constants used below.
from path_location import folder_location

# Load the CSV at PROCESSED_DATA_FOLDER/ABNORMAL_CSV into a DataFrame.
# NOTE(review): this rebinds `labels`, which the first cell imported as a module
# (`from create_labels import labels`). Re-running the "Create labels" cell after
# this one will fail until the import cell is executed again; consider a
# distinct name such as `labels_df` here and in the cells that read it.
labels = pd.read_csv(f'{folder_location.PROCESSED_DATA_FOLDER}/{folder_location.ABNORMAL_CSV}')

In [7]:
# Load the CSV at PROCESSED_DATA_FOLDER/FULL_FEATURES_FILE into a DataFrame.
# NOTE(review): this rebinds `features`, which the first cell imported as a module
# (`from create_features import features`). Re-running the "Create features" cell
# after this one will fail until the import cell is executed again; consider a
# distinct name such as `features_df` here and in the cells that read it.
features = pd.read_csv(f'{folder_location.PROCESSED_DATA_FOLDER}/{folder_location.FULL_FEATURES_FILE}')

In [9]:
# Column names of the labels frame as a plain Python list.
labels.columns.tolist()

['TRANS_SK',
 'ACCESSION_NUMBER',
 'SECURITY_TITLE',
 'TRANS_DATE',
 'DEEMED_EXECUTION_DATE',
 'TRANS_CODE',
 'EQUITY_SWAP_INVOLVED',
 'TRANS_TIMELINESS',
 'TRANS_SHARES',
 'TRANS_PRICEPERSHARE',
 'TRANS_ACQUIRED_DISP_CD',
 'SHRS_OWND_FOLWNG_TRANS',
 'DIRECT_INDIRECT_OWNERSHIP',
 'NATURE_OF_OWNERSHIP',
 'trans_amt',
 'FILING_DATE',
 'PERIOD_OF_REPORT',
 'ISSUERCIK',
 'ISSUERNAME',
 'ISSUERTRADINGSYMBOL',
 'RPTOWNERCIK',
 'NUM_RPTOWNERCIK',
 'RPTOWNERNAME_;',
 'RPTOWNER_RELATIONSHIP_;',
 'RPTOWNER_TITLE_#',
 'clean_ticker',
 'is_weird_ticker',
 'PERMNO',
 'date_x',
 'VOL',
 'PRC',
 'RET',
 'TICKER',
 'date_y',
 'actual_ret',
 'b_mkt',
 'risk_free_rate',
 'expected_ret',
 'abnormal_ret',
 'CAR_5_before',
 'CAR_5_after',
 'CAR_30_before',
 'CAR_30_after',
 'CAR_60_before',
 'CAR_60_after',
 'CAR_120_before',
 'CAR_120_after',
 'security_category',
 'effective_CAR_30_after',
 'effective_CAR_60_after',
 'effective_CAR_120_after',
 'local_score_30',
 'n_local_30',
 'isolation_raw_30',
 'loca

In [11]:
# Column names of the features frame as a plain Python list.
features.columns.tolist()

['Unnamed: 0',
 'TRANS_SK',
 'ACCESSION_NUMBER',
 'SECURITY_TITLE',
 'TRANS_DATE',
 'DEEMED_EXECUTION_DATE',
 'TRANS_CODE',
 'EQUITY_SWAP_INVOLVED',
 'TRANS_TIMELINESS',
 'TRANS_SHARES',
 'TRANS_PRICEPERSHARE',
 'TRANS_ACQUIRED_DISP_CD',
 'SHRS_OWND_FOLWNG_TRANS',
 'DIRECT_INDIRECT_OWNERSHIP',
 'NATURE_OF_OWNERSHIP',
 'trans_amt',
 'FILING_DATE',
 'PERIOD_OF_REPORT',
 'ISSUERCIK',
 'ISSUERNAME',
 'ISSUERTRADINGSYMBOL',
 'RPTOWNERCIK',
 'NUM_RPTOWNERCIK',
 'RPTOWNERNAME_;',
 'RPTOWNER_RELATIONSHIP_;',
 'RPTOWNER_TITLE_#',
 'clean_ticker',
 'is_weird_ticker',
 'PERMNO',
 'date_x',
 'VOL',
 'PRC',
 'RET',
 'TICKER',
 'date_y',
 'actual_ret',
 'b_mkt',
 'risk_free_rate',
 'expected_ret',
 'abnormal_ret',
 'CAR_5_before',
 'CAR_5_after',
 'CAR_30_before',
 'CAR_30_after',
 'CAR_60_before',
 'CAR_60_after',
 'CAR_120_before',
 'CAR_120_after',
 'security_category',
 'effective_CAR_30_after',
 'effective_CAR_60_after',
 'effective_CAR_120_after',
 'local_score_30',
 'n_local_30',
 'isolation_

In [13]:
# Summary statistics for the 'gift' feature column.
# NOTE(review): the recorded output has a median of 0 and a max of 22, so this
# looks like a count rather than a binary flag — confirm against create_features.
features['gift'].describe()

count    3.171001e+06
mean     2.500188e-02
std      2.435473e-01
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      2.200000e+01
Name: gift, dtype: float64