In [7]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
import joblib
import warnings

warnings.filterwarnings('ignore')

In [24]:
print("Loading datasets...")
try:
    # Load all datasets except for the press releases
    advisor_df = pd.read_csv('data/advisor_data_labeled.csv', parse_dates=['date'])
    financial_df = pd.read_csv('data/financial_data_labeled.csv', parse_dates=['date'])
    market_df = pd.read_csv('data/market_data_labeled.csv', parse_dates=['date'])
    social_df = pd.read_csv('data/raw_social_data_labeled (1).csv', parse_dates=['date'])
except FileNotFoundError as e:
    print(f"Error: {e}. Please make sure all required CSV files are in the same directory.")
    exit()

Loading datasets...


In [9]:
# --- 2. Pre-processing and Feature Engineering ---
print("Aggregating social media data...")
# Aggregate raw social data to get daily features per company
social_agg = social_df.groupby([social_df['date'].dt.date, 'company']).agg(
    social_post_count=('post_text', 'count'),
    avg_likes=('likes', 'mean'),
    avg_retweets=('retweets', 'mean'),
    label=('label', 'max') # Take the max label for the day as the fraud indicator
).reset_index()
social_agg['date'] = pd.to_datetime(social_agg['date'])


Aggregating social media data...


In [17]:
social_agg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3395 entries, 0 to 3394
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   date               3395 non-null   datetime64[ns]
 1   company            3395 non-null   object        
 2   social_post_count  3395 non-null   int64         
 3   avg_likes          3395 non-null   float64       
 4   avg_retweets       3395 non-null   float64       
 5   label              3395 non-null   int64         
dtypes: datetime64[ns](1), float64(2), int64(2), object(1)
memory usage: 159.3+ KB


In [25]:
# --- 2. Merge Data ---
print("Merging data...")
# Start with market_df as the base
merged_df = market_df.copy()

# Merge financial data on company name
merged_df = pd.merge(merged_df, financial_df.drop(columns=['date'], errors='ignore'), on='company', how='left', suffixes=('', '_fin'))

Merging data...


In [30]:
# --- 3. Company-Specific Anomaly Features ---
print("Creating company-specific anomaly features...")
merged_df = merged_df.sort_values(by=['company', 'date']).reset_index(drop=True)

features_to_engineer = ['price_change_1d', 'volume_spike', 'abnormal_return']
window = 30 # 30-day rolling window

for feature in features_to_engineer:
    if feature in merged_df.columns:
        rolling_stats = merged_df.groupby('company')[feature].rolling(window=window, min_periods=5)
        rolling_mean = rolling_stats.mean().reset_index(level=0, drop=True)
        rolling_std = rolling_stats.std().reset_index(level=0, drop=True)
        merged_df[f'{feature}_zscore'] = (merged_df[feature] - rolling_mean) / (rolling_std + 1e-6)

zscore_cols = [f for f in merged_df.columns if '_zscore' in f]
merged_df[zscore_cols] = merged_df[zscore_cols].fillna(0)
merged_df = merged_df.dropna(subset=['revenue_growth']) # Drop rows where financial data is missing

Creating company-specific anomaly features...


In [32]:
# --- 4. Final Data Preparation ---
print("Finalizing data for training...")
label_cols = [col for col in merged_df.columns if 'label' in col]
y = merged_df[label_cols].max(axis=1).fillna(0).astype(int)

features_to_drop = features_to_engineer + ['date', 'company', 'filing_type'] + label_cols
X = merged_df.drop(columns=features_to_drop)
X = X.fillna(X.median())

Finalizing data for training...


In [33]:
# --- 5. Model Training and Evaluation ---
print("Splitting data and training LightGBM model...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

lgb_model = lgb.LGBMClassifier(objective='binary', random_state=42)
lgb_model.fit(X_train, y_train)

y_pred_proba = lgb_model.predict_proba(X_test)[:, 1]
y_pred = (y_pred_proba > 0.5).astype(int)

accuracy = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_proba)

print("\n--- Model 1 Evaluation ---")
print(f"Accuracy: {accuracy:.4f}")
print(f"AUC Score: {auc:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Splitting data and training LightGBM model...
[LightGBM] [Info] Number of positive: 479773, number of negative: 447712
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.027950 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2308
[LightGBM] [Info] Number of data points in the train set: 927485, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.517284 -> initscore=0.069163
[LightGBM] [Info] Start training from score 0.069163

--- Model 1 Evaluation ---
Accuracy: 0.8845
AUC Score: 0.9740

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.98      0.89    149237
           1       0.97      0.80      0.88    159925

    accuracy                           0.88    309162
   macro avg       0.90      0.89      0.88    309162
weighted avg       0.90      0.88   

In [35]:
# --- 6. Save Artifacts ---
print("Saving model and feature list...")
joblib.dump(lgb_model, 'model_data/model_1_market_financial.joblib')
joblib.dump(list(X.columns), 'model_data/model_1_market_financial_features.joblib')
X_test.to_csv('test_data/model_1_X_test_data.csv', index=False)
y_test.to_csv('test_data/model_1_y_test_data.csv', index=False)
print("\nModel 1 (market-financial) and its feature list have been saved successfully!")

Saving model and feature list...

Model 1 (market-financial) and its feature list have been saved successfully!
