## Prerequisite: make sure the required data files are present in `data_untracked`

In [1]:
import numpy as np
import pandas as pd
import sys
import os

# Put the directory one level above the current working directory on the
# import path so the project packages imported below can be resolved.
parent_dir = os.path.split(os.getcwd())[0]
if sys.path.count(parent_dir) == 0:
    # Only add it once, so re-running this cell does not pile up duplicates.
    sys.path.append(parent_dir)
    
from create_labels import labels
from extract_data import extract 
from create_features import features
from preprocess_feature import preprocess

import warnings
# Silence every warning for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so it also hides deprecation and
# dtype warnings raised by the project modules — consider narrowing it.
warnings.filterwarnings("ignore")
# Do not truncate wide DataFrames: display every column.
pd.set_option('display.max_columns', None)

### The NLTK downloads below are required only when creating the footnote analysis file

In [3]:
# One-time NLTK resource downloads, kept commented out so that a normal run
# does not download anything. Uncomment them when the resources are needed.
# NOTE(review): `nltk` is not imported in the visible import cell — add
# `import nltk` before uncommenting these lines.
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download("wordnet")
# nltk.download("averaged_perceptron_tagger")

___
## Section 1. Create Data
___

### Extract Form 4 data if missing

In [3]:
# Build the Form 4 data through the project's `extract` module:
# create_form4() runs first, then merge_form4().
# NOTE(review): the section heading says this happens "if missing", so both
# calls presumably skip work when their outputs already exist — confirm in
# extract.py.
data_extractor = extract.Data_Extractor()
data_extractor.create_form4()
data_extractor.merge_form4()



### Create labels

In [4]:
# Create the label data through the project's `labels` module.
# NOTE(review): described as "if missing", so create_labels() presumably
# skips work when the label file already exists — confirm in labels.py.
label_data_creator = labels.Label_Data_Creator()
label_data_creator.create_labels()



### Create features

In [5]:
# Create the final features file through the project's `features` module.
# In the captured run below, the final file was not found, so it was assembled
# from the existing network, transaction, footnote and other-feature files,
# unwanted rows were removed, and the result was saved.
# `feature_data_creator` is kept in scope: a later cell prints its `.features`.
feature_data_creator = features.Feature_Data_Creator()
feature_data_creator.create_features()

=== Final features file not found. Begin creating ===
=== Network time_independent_features file is found. Extracting ===
=== Network time_dependent_features file is found. Extracting ===
=== Network zscore_features file is found. Extracting ===
=== Transaction Key file is found. Extracting ===
=== Footnote Key file is found. Extracting ===
=== Other Features Key file is found. Extracting ===
=== Removing unwanted rows ===
=== Before removal length 3171001 === 
=== After removal length 1786525 === 
=== Saving file ===


### Create training and testing data

In [6]:
# Full-feature pipeline: extract() is called without arguments here (the
# baseline cell passes an explicit column list instead), then the training and
# testing data are created. The captured output logs one
# "preprocess <column> with type <dtype>" line per processed column.
feature_preprocessor = preprocess.Feature_Preprocessor()
feature_preprocessor.extract()
feature_preprocessor.create_training_testing()

preprocess is_lobby with type object
preprocess has_lobby with type object
preprocess has_donate with type object
preprocess NODEID with type float64
preprocess important_connections with type int64
preprocess full_congress_connections with type int64
preprocess house_t2_important_connections with type int64
preprocess house_t2_full_congress_connections with type int64
preprocess house_t1_important_connections with type int64
preprocess house_t1_full_congress_connections with type int64
preprocess sen_important_connections with type int64
preprocess sen_full_congress_connections with type int64
preprocess sen_t2_important_connections with type int64
preprocess sen_t2_full_congress_connections with type int64
preprocess sen_t1_important_connections with type int64
preprocess sen_t1_full_congress_connections with type int64
preprocess full_congress_connections_z with type float64
preprocess sen_full_congress_connections_z with type float64
preprocess sen_t2_full_congress_connections_z wi

In [7]:
# Baseline pipeline: a separate preprocessor restricted to the single
# TRANS_CODE column.
feature_preprocessor_baseline = preprocess.Feature_Preprocessor()
feature_preprocessor_baseline.extract(["TRANS_CODE"])
# Create the baseline training/testing data from the baseline instance itself.
# Calling this on `feature_preprocessor` (the full-feature object from the
# previous cell) would ignore the TRANS_CODE-only extraction above and make
# this cell depend on state left behind by another cell.
feature_preprocessor_baseline.baseline_create_training_testing()

preprocess TRANS_CODE with type object
=== Begin creating based on quantile: 0.8
=== Saving baseline Training and Testing ===


___
## Section 2. Modelling, Training and Validation
___

In [8]:
# Show the feature names held by the feature creator under a "features:" heading.
print("features:", feature_data_creator.features, sep="\n")

features:
['js_bin', 's_bin', 'b_bin', 'jb_bin', 'ob_bin', 'gb_bin', 'gift', 'distribution', 'charity', 'price', 'number', 'ball', 'pursuant', '10b5-1', '16b-3', 'net_trading_intensity', 'net_trading_amt', 'relative_trade_size_to_self', 'beneficial_ownership_score', 'title_score', 'TRANS_TIMELINESS_clean', 'execution_timeliness', 'filing_lag_days', 'filing_timeliness', 'security_category', 'trans_amt', 'is_lobby', 'has_lobby', 'has_donate', 'NODEID', 'important_connections', 'full_congress_connections', 'house_t2_important_connections', 'house_t2_full_congress_connections', 'house_t1_important_connections', 'house_t1_full_congress_connections', 'sen_important_connections', 'sen_full_congress_connections', 'sen_t2_important_connections', 'sen_t2_full_congress_connections', 'sen_t1_important_connections', 'sen_t1_full_congress_connections', 'full_congress_connections_z', 'sen_full_congress_connections_z', 'sen_t2_full_congress_connections_z', 'house_t2_full_congress_connections_z', 'sen_

___
## Compare results
___