In [None]:
#Classification: “Is this insider transaction likely to be followed by a positive price movement (label=1) vs. negative/no movement (label=0) in the next day?”

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
import xgboost as xgb
import optuna

In [7]:
data = pd.read_csv("cleaned_data.csv")
data = data.drop(columns=['1w', '1m', '6m', "Unnamed: 17", "Unnamed: 0", "X", "Insider Name", "Ticker", 'Company Name'])
data

Unnamed: 0,Filing Date,Trade Date,Title,Trade Type,Price,Qty,Owned,ΔOwn,Value,1d
0,2024-12-10 17:04:00+00:00,2024-05-31 00:00:00+00:00,"SVP, Chief Accounting Officer",A - Grant,$94.14,265,15981,2%,"$24,948",0.014880
1,2024-12-10 17:01:00+00:00,2024-05-31 00:00:00+00:00,EVP - Chief Investment Officer,A - Grant,$93.00,268,268,New,"$24,924",0.014880
2,2024-12-10 16:59:00+00:00,2024-05-31 00:00:00+00:00,"EVP, Gen.Counsel, Secretary",A - Grant,$93.40,267,27270,1%,"$24,937",0.014880
3,2024-12-10 16:56:00+00:00,2024-05-31 00:00:00+00:00,EVP-COO,A - Grant,$96.17,259,1464,21%,"$24,908",0.014880
4,2024-12-10 16:53:00+00:00,2024-05-31 00:00:00+00:00,EVP - CFO,A - Grant,$96.20,169,23810,1%,"$16,258",0.014880
...,...,...,...,...,...,...,...,...,...,...
5206,2023-06-29 14:04:00+00:00,2023-06-29 00:00:00+00:00,"EVP, Chief Admin. Officer",G - Gift,$0.00,-435,31943,-1%,$0,0.017074
5207,2023-06-28 20:21:00+00:00,2023-06-26 00:00:00+00:00,Dir,G - Gift,$0.00,-5800,58465,-9%,$0,0.010717
5208,2023-06-28 20:19:00+00:00,2023-06-26 00:00:00+00:00,Dir,A - Grant,$12.13,1082,92104,1%,"$13,125",0.010717
5209,2023-06-28 20:18:00+00:00,2023-06-26 00:00:00+00:00,Dir,A - Grant,$12.13,875,12879,7%,"$10,614",0.010717


In [8]:
data["Price"] = pd.to_numeric(data['Price'].str.replace('$', '').str.replace(',', ''), errors='coerce')
data["Qty"] = pd.to_numeric(data['Qty'].str.replace(',', ''), errors='coerce')
data['Owned'] = data['Owned'].str.replace(',', '').astype(int)
data['ΔOwn'] = data['ΔOwn'].str.replace('%', '').str.replace("New", '0').str.replace('>', '').astype(float) / 100
data['Value'] = data['Value'].str.strip().str.replace('$', '').str.replace(',', '').str.replace("(", '').str.replace(")", '').astype(float)

data

  data["Price"] = pd.to_numeric(data['Price'].str.replace('$', '').str.replace(',', ''), errors='coerce')
  data['Value'] = data['Value'].str.strip().str.replace('$', '').str.replace(',', '').str.replace("(", '').str.replace(")", '').astype(float)


Unnamed: 0,Filing Date,Trade Date,Title,Trade Type,Price,Qty,Owned,ΔOwn,Value,1d
0,2024-12-10 17:04:00+00:00,2024-05-31 00:00:00+00:00,"SVP, Chief Accounting Officer",A - Grant,94.14,265,15981,0.02,24948.0,0.014880
1,2024-12-10 17:01:00+00:00,2024-05-31 00:00:00+00:00,EVP - Chief Investment Officer,A - Grant,93.00,268,268,0.00,24924.0,0.014880
2,2024-12-10 16:59:00+00:00,2024-05-31 00:00:00+00:00,"EVP, Gen.Counsel, Secretary",A - Grant,93.40,267,27270,0.01,24937.0,0.014880
3,2024-12-10 16:56:00+00:00,2024-05-31 00:00:00+00:00,EVP-COO,A - Grant,96.17,259,1464,0.21,24908.0,0.014880
4,2024-12-10 16:53:00+00:00,2024-05-31 00:00:00+00:00,EVP - CFO,A - Grant,96.20,169,23810,0.01,16258.0,0.014880
...,...,...,...,...,...,...,...,...,...,...
5206,2023-06-29 14:04:00+00:00,2023-06-29 00:00:00+00:00,"EVP, Chief Admin. Officer",G - Gift,0.00,-435,31943,-0.01,0.0,0.017074
5207,2023-06-28 20:21:00+00:00,2023-06-26 00:00:00+00:00,Dir,G - Gift,0.00,-5800,58465,-0.09,0.0,0.010717
5208,2023-06-28 20:19:00+00:00,2023-06-26 00:00:00+00:00,Dir,A - Grant,12.13,1082,92104,0.01,13125.0,0.010717
5209,2023-06-28 20:18:00+00:00,2023-06-26 00:00:00+00:00,Dir,A - Grant,12.13,875,12879,0.07,10614.0,0.010717


In [9]:
from sklearn.preprocessing import LabelEncoder

# Example: Encoding categorical columns
for col in ['Title', 'Trade Type']:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col].astype(str))

In [10]:
# Load the data (replace 'file_path.csv' with your actual data file)


# Convert 'Trade Date' and 'Filing Date' to datetime if not already done
data['Trade Date'] = pd.to_datetime(data['Trade Date'], errors='coerce')
data['Filing Date'] = pd.to_datetime(data['Filing Date'], errors='coerce')

# Ensure '1d' is numeric and the target variable
data['1d'] = pd.to_numeric(data['1d'], errors='coerce')

# Define the target (1 if movement is positive, 0 otherwise)
data['target'] = (data['1d'] > 0).astype(int)

# Drop rows with missing target values
# data = data.dropna(subset=['target'])
data = data.dropna().reset_index(drop=True)

# Feature selection (drop non-informative columns)
columns_to_drop = ['1d', 'Trade Date', 'Filing Date']  # Adjust as needed
X = data.drop(columns=columns_to_drop + ['target'])
y = data['target']

# Handle missing values in X
X = X.fillna(X.median())

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [11]:
# # X_train = X_train.dropna()
# X_test = X_test.dropna()
# y_train = y_train[X_train.index]
# y_test = y_test[X_test.index]
X_test

Unnamed: 0,Title,Trade Type,Price,Qty,Owned,ΔOwn,Value
2813,98,0,0.00,5790,39186,0.17,0.0
3474,250,1,0.00,-1226788,0,-1.00,0.0
2865,98,0,0.00,4762,9524,1.00,0.0
983,1,3,1788.31,6,198324,0.00,10730.0
1028,1,3,19.40,790,2881340,0.00,15327.0
...,...,...,...,...,...,...,...
1807,98,0,34.33,400,18211,0.02,13732.0
274,344,4,62.84,-14150,0,-1.00,889149.0
1234,148,3,21.01,5000,166164,0.03,105050.0
1637,99,3,71.83,159671,18150574,0.01,11469255.0


In [12]:
# Initialize and train a basic XGBoost model
baseline_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
baseline_model.fit(X_train, y_train)

# Evaluate baseline model
y_pred = baseline_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.71      0.73       551
           1       0.69      0.74      0.71       492

    accuracy                           0.72      1043
   macro avg       0.72      0.72      0.72      1043
weighted avg       0.72      0.72      0.72      1043



Parameters: { "use_label_encoder" } are not used.



In [81]:
from sklearn.metrics import roc_auc_score
import optuna
import xgboost as xgb

# Define the objective function for Optuna
def objective(trial):
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',  # Include eval_metric here
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'lambda': trial.suggest_float('lambda', 1e-8, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 10.0, log=True),
    }
    
    model = xgb.XGBClassifier(**params, use_label_encoder=False)
    model.fit(X_train, y_train)  # Remove eval_metric from fit()
    y_pred = model.predict_proba(X_test)[:, 1]
    return roc_auc_score(y_test, y_pred)

# Run Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Best hyperparameters
print("Best Hyperparameters:", study.best_params)

[I 2025-01-05 11:32:11,069] A new study created in memory with name: no-name-6b042640-1149-4900-8ae6-be6aca104093
Parameters: { "use_label_encoder" } are not used.

[I 2025-01-05 11:32:11,945] Trial 0 finished with value: 0.7968899856875157 and parameters: {'max_depth': 6, 'learning_rate': 0.058547445863093676, 'n_estimators': 410, 'subsample': 0.6423947015353754, 'colsample_bytree': 0.8018662441385621, 'lambda': 3.097170621579323e-05, 'alpha': 0.0010433247383948054}. Best is trial 0 with value: 0.7968899856875157.
Parameters: { "use_label_encoder" } are not used.

[I 2025-01-05 11:32:12,156] Trial 1 finished with value: 0.7856539477372995 and parameters: {'max_depth': 6, 'learning_rate': 0.27144455449976806, 'n_estimators': 94, 'subsample': 0.9806757832519486, 'colsample_bytree': 0.6508036169485752, 'lambda': 0.19999043165100688, 'alpha': 3.6567683880755274e-06}. Best is trial 0 with value: 0.7968899856875157.
Parameters: { "use_label_encoder" } are not used.

[I 2025-01-05 11:32:12,4

Best Hyperparameters: {'max_depth': 8, 'learning_rate': 0.22766898728512824, 'n_estimators': 452, 'subsample': 0.9713158295417347, 'colsample_bytree': 0.9062145078030461, 'lambda': 0.025824385857143403, 'alpha': 0.004852327352963468}


In [3]:
# Train the model with the best parameters
best_params = study.best_params
final_model = xgb.XGBClassifier(**best_params, use_label_encoder=False)
final_model.fit(X_train, y_train)

# Evaluate final model
y_pred = final_model.predict(X_test)
y_pred_proba = final_model.predict_proba(X_test)[:, 1]

print("Clasft Report:\n", classification_report(y_test, y_pred))
print("AUC-ROC Score:", roc_auc_score(y_test, y_pred_proba))

NameError: name 'study' is not defined

In [83]:
final_model.save_model("xgbInsider.json")

In [85]:
loaded_model = xgb.XGBClassifier()
loaded_model.load_model("xgbInsider.json")

In [87]:
best_params.items()

dict_items([('max_depth', 8), ('learning_rate', 0.22766898728512824), ('n_estimators', 452), ('subsample', 0.9713158295417347), ('colsample_bytree', 0.9062145078030461), ('lambda', 0.025824385857143403), ('alpha', 0.004852327352963468)])

In [89]:
import mlflow
import mlflow.sklearn

with mlflow.start_run():
        # 1) Log hyperparameters
        for key, value in best_params.items():
            mlflow.log_param(key, value)

        y_pred = loaded_model.predict(X_test)

        # 2) Compute & log metrics
        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='macro')
        
        mlflow.log_metric("accuracy", acc)
        mlflow.log_metric("f1_score", f1)

        # 3) Log model artifact
        mlflow.sklearn.log_model(loaded_model, "model")

        print(f"Logged metrics: accuracy={acc}, f1_score={f1}")

    

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 23)

In [37]:
# X_test.iloc[0].to_json()
X_test.iloc[25]

Title               6.00
Trade Type          4.00
Price               6.76
Qty           -180000.00
Owned         1589654.00
ΔOwn               -0.10
Value         1217050.00
Name: 2061, dtype: float64

In [39]:
import requests

url = "http://0.0.0.0:8000/insider_effect_predict"
payload = {"Data": X_test.iloc[40].to_json()}

response = requests.post(url, json=payload)

print("Response:", response.json())

Response: {'status': 'success', 'message': 'Prediction completed successfully', 'data': {'prediction': 'negative', 'details': 'The prediction indicates negative effect'}}


In [26]:
import json

j = json.loads(X_test.iloc[0].to_json())

In [28]:
pd.DataFrame(j, index=[0])

Unnamed: 0,Title,Trade Type,Price,Qty,Owned,ΔOwn,Value
0,98.0,0.0,0.0,5790.0,39186.0,0.17,0.0


In [107]:
final_model.predict(X_test.iloc[:20].rename(columns = {"Trade Type":"trade_type"}))

In [1]:
X_test.iloc[:1]

NameError: name 'X_test' is not defined