In [22]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
import joblib
import warnings

warnings.filterwarnings('ignore')

In [42]:
print("Loading datasets...")
try:
    # Load all datasets except for the press releases
    advisor_df = pd.read_csv('data/advisor_data_labeled.csv', parse_dates=['date'])
    financial_df = pd.read_csv('data/financial_data_labeled.csv', parse_dates=['date'])
    market_df = pd.read_csv('data/market_data_labeled.csv', parse_dates=['date'])
    social_df = pd.read_csv('data/raw_social_data_labeled (1).csv', parse_dates=['date'])
except FileNotFoundError as e:
    print(f"Error: {e}. Please make sure all required CSV files are in the same directory.")
    exit()

Loading datasets...


In [9]:
# --- 2. Pre-processing and Feature Engineering ---
print("Aggregating social media data...")
# Aggregate raw social data to get daily features per company
social_agg = social_df.groupby([social_df['date'].dt.date, 'company']).agg(
    social_post_count=('post_text', 'count'),
    avg_likes=('likes', 'mean'),
    avg_retweets=('retweets', 'mean'),
    label=('label', 'max') # Take the max label for the day as the fraud indicator
).reset_index()
social_agg['date'] = pd.to_datetime(social_agg['date'])


Aggregating social media data...


In [17]:
social_agg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3395 entries, 0 to 3394
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   date               3395 non-null   datetime64[ns]
 1   company            3395 non-null   object        
 2   social_post_count  3395 non-null   int64         
 3   avg_likes          3395 non-null   float64       
 4   avg_retweets       3395 non-null   float64       
 5   label              3395 non-null   int64         
dtypes: datetime64[ns](1), float64(2), int64(2), object(1)
memory usage: 159.3+ KB


In [57]:
# --- 2. Merge Data ---
print("Merging data...")
# Start with market_df as the base
merged_df = market_df.copy()

# Merge financial data on company name
merged_df = pd.merge(merged_df, financial_df.drop(columns=['date'], errors='ignore'), on='company', how='left', suffixes=('', '_fin'))

Merging data...


In [58]:
merged_df['company'].value_counts()

company
Tata Motors    343805
Reliance       318864
Infosys        287226
Wipro          286752
TCS               544
SBI               518
ICICI             516
Adani             491
ONGC              487
HDFC              466
Name: count, dtype: int64

In [59]:
# --- 3. Company-Specific Anomaly Features ---
print("Creating company-specific anomaly features...")
merged_df = merged_df.sort_values(by=['company', 'date']).reset_index(drop=True)

features_to_engineer = ['price_change_1d', 'volume_spike', 'abnormal_return']
window = 30 # 30-day rolling window

for feature in features_to_engineer:
    if feature in merged_df.columns:
        rolling_stats = merged_df.groupby('company')[feature].rolling(window=window, min_periods=5)
        rolling_mean = rolling_stats.mean().reset_index(level=0, drop=True)
        rolling_std = rolling_stats.std().reset_index(level=0, drop=True)
        merged_df[f'{feature}_zscore'] = (merged_df[feature] - rolling_mean) / (rolling_std + 1e-6)

zscore_cols = [f for f in merged_df.columns if '_zscore' in f]
# merged_df[zscore_cols] = merged_df[zscore_cols].fillna(np.median(merged_df[merged_df[zscore_cols].notna().sum(axis=1) > 0][zscore_cols]))
merged_df[zscore_cols] = merged_df[zscore_cols].fillna(merged_df[zscore_cols].median())
# merged_df = merged_df.dropna(subset=['revenue_growth']) # Drop rows where financial data is missing
merged_df['revenue_growth'] = merged_df['revenue_growth'].fillna(merged_df['revenue_growth'].median())
merged_df['profit_margin'] = merged_df['profit_margin'].fillna(merged_df['profit_margin'].median())
merged_df['debt_to_equity'] = merged_df['debt_to_equity'].fillna(merged_df['debt_to_equity'].median())
merged_df['roe'] = merged_df['roe'].fillna(merged_df['roe'].median())
merged_df['insider_trades'] = merged_df['insider_trades'].fillna(merged_df['insider_trades'].median())
merged_df['label_fin'] = merged_df['label_fin'].fillna(0)
merged_df['filing_type'] = merged_df['filing_type'].fillna('Others')


Creating company-specific anomaly features...


In [55]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1239669 entries, 0 to 1239668
Data columns (total 18 columns):
 #   Column                  Non-Null Count    Dtype         
---  ------                  --------------    -----         
 0   date                    1239669 non-null  datetime64[ns]
 1   company                 1239669 non-null  object        
 2   price_change_1d         1239669 non-null  float64       
 3   price_change_5d         1239669 non-null  float64       
 4   volatility_post         1239669 non-null  float64       
 5   abnormal_return         1239669 non-null  float64       
 6   volume_spike            1239669 non-null  float64       
 7   label                   1239669 non-null  int64         
 8   revenue_growth          1239669 non-null  float64       
 9   profit_margin           1236647 non-null  float64       
 10  debt_to_equity          1236647 non-null  float64       
 11  roe                     1236647 non-null  float64       
 12  insider_trades

In [60]:
merged_df.isna().sum()

date                      0
company                   0
price_change_1d           0
price_change_5d           0
volatility_post           0
abnormal_return           0
volume_spike              0
label                     0
revenue_growth            0
profit_margin             0
debt_to_equity            0
roe                       0
insider_trades            0
filing_type               0
label_fin                 0
price_change_1d_zscore    0
volume_spike_zscore       0
abnormal_return_zscore    0
dtype: int64

In [61]:
merged_df['company'].value_counts()

company
Tata Motors    343805
Reliance       318864
Infosys        287226
Wipro          286752
TCS               544
SBI               518
ICICI             516
Adani             491
ONGC              487
HDFC              466
Name: count, dtype: int64

In [62]:
# --- 4. Final Data Preparation ---
print("Finalizing data for training...")

# --- ADD THIS PART ---
# Convert company name to a numerical category
merged_df['company_cat'] = merged_df['company'].astype('category').cat.codes

# Create and save a mapping file for later use in prediction
company_mapping = merged_df[['company', 'company_cat']].drop_duplicates().reset_index(drop=True)
company_mapping.to_csv('data/company_to_category_map.csv', index=False)
print("Saved company-to-category mapping to 'company_to_category_map.csv'")
# --- END OF ADDITION ---

label_cols = [col for col in merged_df.columns if 'label' in col]
y = merged_df[label_cols].max(axis=1).fillna(0).astype(int)

features_to_drop = features_to_engineer + ['date', 'company', 'filing_type'] + label_cols
X = merged_df.drop(columns=features_to_drop)
X = X.fillna(X.median())

Finalizing data for training...
Saved company-to-category mapping to 'company_to_category_map.csv'


In [63]:
# --- 5. Model Training and Evaluation ---
print("Splitting data and training LightGBM model...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

lgb_model = lgb.LGBMClassifier(objective='binary', random_state=42)
lgb_model.fit(X_train, y_train)

y_pred_proba = lgb_model.predict_proba(X_test)[:, 1]
y_pred = (y_pred_proba > 0.5).astype(int)

accuracy = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_proba)

print("\n--- Model 1 Evaluation ---")
print(f"Accuracy: {accuracy:.4f}")
print(f"AUC Score: {auc:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Splitting data and training LightGBM model...
[LightGBM] [Info] Number of positive: 480699, number of negative: 449052
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.026008 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2318
[LightGBM] [Info] Number of data points in the train set: 929751, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.517019 -> initscore=0.068103
[LightGBM] [Info] Start training from score 0.068103

--- Model 1 Evaluation ---
Accuracy: 0.9001
AUC Score: 0.9812

Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.98      0.90    149684
           1       0.98      0.82      0.90    160234

    accuracy                           0.90    309918
   macro avg       0.91      0.90      0.90    309918
weighted avg       0.91      0.90   

In [64]:
# --- 6. Save Artifacts ---
print("Saving model and feature list...")
joblib.dump(lgb_model, 'model_data/model_1_market_financial.joblib')
joblib.dump(list(X.columns), 'model_data/model_1_market_financial_features.joblib')
X_test.to_csv('test_data/model_1_X_test_data.csv', index=False)
y_test.to_csv('test_data/model_1_y_test_data.csv', index=False)
print("\nModel 1 (market-financial) and its feature list have been saved successfully!")

Saving model and feature list...

Model 1 (market-financial) and its feature list have been saved successfully!
