In [1]:
import os
import sys
parent_dir = os.path.dirname(os.getcwd())
if parent_dir not in sys.path:
    sys.path.append(parent_dir)
    
from path_location import folder_location

PROCESSED_FOLDER = folder_location.PROCESSED_DATA_FOLDER
TRAINING_FILE = folder_location.TRAINING_FULL_FEATURES_FILE
TESTING_FILE = folder_location.TRAINING_FULL_FEATURES_FILE
TESTING_FEATURES_FILE = f"{PROCESSED_FOLDER}/{TESTING_FILE}"
TRAINING_FEATURES_FILE = f"{PROCESSED_FOLDER}/{TRAINING_FILE}"
NETWORK_RAW_FOLDERS = folder_location.PROFILE_DATA_FOLDERS

In [None]:
from sklearnex import patch_sklearn
patch_sklearn()

import pandas as pd, numpy as np, warnings
import matplotlib.pyplot as plt
from tqdm.auto                import tqdm
from sklearn.ensemble         import RandomForestClassifier, VotingClassifier, StackingClassifier
from sklearn.neural_network   import MLPClassifier
from sklearn.preprocessing    import StandardScaler
from sklearn.pipeline         import Pipeline
from imblearn.pipeline        import Pipeline as ImbPipeline
from imblearn.over_sampling   import SMOTE
from sklearn.metrics          import (
    classification_report, roc_auc_score,
    precision_recall_fscore_support
)
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model     import LogisticRegression
warnings.filterwarnings("ignore")

# ────────────────────────────────────────────────────────────────
# Helpers
# ────────────────────────────────────────────────────────────────
def make_pipe(clf, use_smote=False, scale=True):
    steps=[]
    if scale:     steps.append(("scaler", StandardScaler()))
    if use_smote: steps.append(("smote",  SMOTE(random_state=0)))
    steps.append(("clf", clf))
    return ImbPipeline(steps) if (use_smote or scale) else Pipeline(steps)

def find_thr_for_recall(y, p, target=0.75):
    best_thr, best_diff = 0.5, 1e9
    for thr in np.linspace(0,1,201):
        rec = precision_recall_fscore_support(
            y, (p>=thr).astype(int),
            average='binary', zero_division=0
        )[1]
        diff = abs(rec - target)
        if diff < best_diff:
            best_diff, best_thr = diff, thr
    return best_thr

def quick_report(name, y, p, tgt=0.75):
    auc = roc_auc_score(y, p)
    thr = find_thr_for_recall(y, p, tgt)
    print(f"\n{name}  |  ROC‑AUC={auc:.4f}  | thr@recall≈{tgt} → {thr:.3f}")
    print(classification_report(y, (p>=thr).astype(int), digits=3))


# ────────────────────────────────────────────────────────────────
# 1) Load, sort, one‑hot & align
# ────────────────────────────────────────────────────────────────
df_train = pd.read_csv(TRAINING_FEATURES_FILE, parse_dates=["TRANS_DATE"])
df_test  = pd.read_csv(TESTING_FEATURES_FILE,  parse_dates=["TRANS_DATE"])

df_train.sort_values("TRANS_DATE", inplace=True)

# ticker→sector mapping + one‑hot
tic_map = (
    pd.read_csv(f"{NETWORK_RAW_FOLDERS}/TIC to SIC.xlsx")[['tic','GSECTOR']]
      .set_index('tic')['GSECTOR'].to_dict()
)
for df in (df_train, df_test):
    # 1. map ticker → full 2‑digit sector
    df["GSECTOR"] = df["ISSUERTRADINGSYMBOL"].map(tic_map).fillna(-1).astype(int)
    
    # ► keep an *integer* copy we can aggregate on later
    df["GSEC_INT"] = df["GSECTOR"]               # <── NEW
    # (optional) also keep the 4‑digit group if you want to test it
    df["GGRP_INT"] = df["GSECTOR"] // 10         # e.g. 2030 etc.  <── NEW

    # 2. one‑hot encode for the RF
    dummies = pd.get_dummies(df["GSECTOR"], prefix="GSEC")
    df[dummies.columns] = dummies
    df.drop(columns=["GSECTOR"], inplace=True)


# now align so train & test share exactly the same dummy columns
# make sure *only* the one‑hot columns are aligned
all_gsec = sorted(
    {c for c in df_train.columns if c.startswith("GSEC")}
    | {c for c in df_test.columns  if c.startswith("GSEC")}
)

for df in (df_train, df_test):
    for c in all_gsec:
        if c not in df:
            df[c] = 0

# ────────────────────────────────────────────────────────────────
# 2) Drop extras, filter by year, optional subsample
# ────────────────────────────────────────────────────────────────
COLS_TO_DROP = [
    'TRANS_SK','ACCESSION_NUMBER','ISSUERTRADINGSYMBOL','NODEID',
    'RPTOWNERNAME_;','snorkel_prob'
]
LABEL = "y_pred"

df_train.drop(columns=COLS_TO_DROP, inplace=True)
df_test .drop(columns=COLS_TO_DROP, inplace=True)

df_train = df_train[df_train["TRANS_DATE"].dt.year > 2010].copy()
df_test  = df_test [df_test ["TRANS_DATE"].dt.year < 2020].copy()

RUN_ON_SUBSET = False
if RUN_ON_SUBSET:
    df_train = df_train.sample(300_000, random_state=5)
    df_test  = df_test .sample(100_000, random_state=5)

# ────────────────────────────────────────────────────────────────
# 3) Split out X/y & fillna
# ────────────────────────────────────────────────────────────────
X_tr = df_train.drop(columns=[LABEL, "TRANS_DATE"])
y_tr = df_train[LABEL].to_numpy()
X_te = df_test .drop(columns=[LABEL, "TRANS_DATE"])
y_te = df_test [LABEL].to_numpy()

X_tr.fillna(0.0, inplace=True)
X_te.fillna(0.0, inplace=True)




Intel(R) Extension for Scikit-learn* enabled (https://github.com/uxlfoundation/scikit-learn-intelex)


In [20]:
# ────────────────────────────────────────────────────────────────
# 0)  PREP –‑ pick feature lists
# ────────────────────────────────────────────────────────────────
gsec_cols   = [c for c in X_tr.columns if c.startswith("GSEC_")]
base_feats  = [c for c in X_tr.columns if c not in gsec_cols]
print(f"{len(gsec_cols)} sector dummies detected")

# ────────────────────────────────────────────────────────────────
# 1)  QUICK two‑model training  (keep n_estimators small = faster)
# ────────────────────────────────────────────────────────────────
rf_all = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    n_jobs=-1,
    class_weight="balanced",
    random_state=42,
)
rf_no  = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    n_jobs=-1,
    class_weight="balanced",
    random_state=42,
)

rf_all.fit(X_tr,           y_tr)
rf_no .fit(X_tr[base_feats], y_tr)

p_all = rf_all.predict_proba(X_te)[:,1]
p_no  = rf_no .predict_proba(X_te[base_feats])[:,1]

# ────────────────────────────────────────────────────────────────
# 2)  Compute per‑sector AUCs
# ────────────────────────────────────────────────────────────────
from sklearn.metrics import roc_auc_score
rows  = []
for col in gsec_cols:
    mask   = X_te[col] == 1
    if mask.sum() < 300:      # skip tiny sectors
        continue
    auc_all = roc_auc_score(y_te[mask], p_all[mask])
    auc_no  = roc_auc_score(y_te[mask], p_no [mask])
    rows.append({"sector": col.replace("GSEC_",""),
                 "auc_with": auc_all,
                 "auc_no":  auc_no,
                 "lift":     auc_all - auc_no})

auc_df = (pd.DataFrame(rows)
            .sort_values("lift", ascending=False)
            .reset_index(drop=True))
display(auc_df.head())



13 sector dummies detected


Unnamed: 0,sector,auc_with,auc_no,lift
0,30,0.681701,0.652424,0.029278
1,55,0.648093,0.625436,0.022658
2,-1,0.625951,0.607434,0.018517
3,40,0.631037,0.616272,0.014765
4,60,0.677199,0.663913,0.013285


In [26]:
from sklearnex import patch_sklearn
patch_sklearn()

import pandas as pd, numpy as np, warnings
from sklearn.ensemble      import RandomForestClassifier
from sklearn.calibration   import CalibratedClassifierCV
from sklearn.pipeline      import Pipeline
from imblearn.pipeline     import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.metrics       import roc_auc_score, classification_report
from joblib                import Parallel, delayed
from sklearn.model_selection import TimeSeriesSplit


warnings.filterwarnings("ignore")

# ───────────────────────────────────────────────────────
# 0) Helpers
# ───────────────────────────────────────────────────────
def make_pipe(clf, use_smote=False, scale=True):
    steps = []
    if scale:     steps.append(("scaler", StandardScaler()))
    if use_smote: steps.append(("smote",  SMOTE(random_state=0)))
    steps.append(("clf", clf))
    return ImbPipeline(steps) if (use_smote or scale) else Pipeline(steps)

def quick_report(name, y_true, probs):
    auc = roc_auc_score(y_true, probs)
    print(f"\n{name}  →  ROC‑AUC = {auc:.4f}")
    print(classification_report(y_true, (probs >= 0.5).astype(int), digits=3))


Intel(R) Extension for Scikit-learn* enabled (https://github.com/uxlfoundation/scikit-learn-intelex)


In [27]:


# ───────────────────────────────────────────────────────
# 1) Assume X_tr, y_tr, X_te, y_te are already prepared
#    (with your one‑hots for GSEC_* plus b_bin, s_bin, js_bin, jb_bin)
# ───────────────────────────────────────────────────────

# 2) Train your global RF and calibrate it
rf_pipe = make_pipe(
    RandomForestClassifier(
        n_estimators=200, max_depth=None,
        max_features='sqrt', min_samples_split=5,
        class_weight='balanced',
        random_state=0, n_jobs=-1, verbose=0
    ),
    use_smote=False, scale=False
)
rf_pipe.fit(X_tr, y_tr)

cal_rf = CalibratedClassifierCV(rf_pipe, method="isotonic", cv=5)
cal_rf.fit(X_tr, y_tr)

p_global_tr = cal_rf.predict_proba(X_tr)[:,1]
p_global_te = cal_rf.predict_proba(X_te)[:,1]
quick_report("Global RF (calibrated)", y_te, p_global_te)




Global RF (calibrated)  →  ROC‑AUC = 0.6816
              precision    recall  f1-score   support

           0      0.628     0.804     0.705    103106
           1      0.626     0.408     0.494     82993

    accuracy                          0.627    186099
   macro avg      0.627     0.606     0.599    186099
weighted avg      0.627     0.627     0.611    186099



In [28]:
import gc
gc.collect()

4534

In [32]:

# 3) Identify specialist columns: GSEC_* + your bin flags
sector_cols = [c for c in X_tr.columns if c.startswith("GSEC_")]
bin_cols    = [c for c in ("b_bin", "s_bin", "js_bin","jb_bin") if c in X_tr.columns]
spec_cols   = sector_cols + bin_cols


# 4) Fit a RF‑specialist for each flag in parallel
def fit_specialist(col):
    mask = X_tr[col].astype(bool)
    if mask.sum() < 20:
        return col, None
    spec = make_pipe(
        RandomForestClassifier(
            n_estimators=200, max_depth=None,
            max_features='sqrt', min_samples_split=5,
            class_weight='balanced',
            random_state=0, n_jobs=-1, verbose=0
        ),
        use_smote=False, scale=False
    )
    spec.fit(X_tr[mask], y_tr[mask])
    return col, spec

results = Parallel(n_jobs=-1, verbose=1)(
    delayed(fit_specialist)(col) for col in spec_cols
)
specialists = {col:pipe for col,pipe in results if pipe is not None}


# 5) Build level‑0 train & test matrices
def make_level0(X, p_global, specialists):
    lvl = pd.DataFrame({"p_global": p_global}, index=X.index)
    for col, pipe in specialists.items():
        col_pred = p_global.copy()
        mask = X[col].astype(bool)
        col_pred[mask] = pipe.predict_proba(X[mask])[:,1]
        lvl[f"p_spec_{col}"] = col_pred
    return lvl

lvl_tr = make_level0(X_tr, p_global_tr, specialists)
lvl_te = make_level0(X_te, p_global_te, specialists)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done  14 out of  17 | elapsed:  2.0min remaining:   26.2s
[Parallel(n_jobs=-1)]: Done  17 out of  17 | elapsed:  3.0min finished


In [7]:
# ────────────────────────────────────────────────────────────────
# 7) for comparison: also run a LogisticRegressionCV meta‐learner
# ────────────────────────────────────────────────────────────────
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import TimeSeriesSplit

# use the same 5‑fold time‑series split
tscv = TimeSeriesSplit(n_splits=5)

lr_meta = LogisticRegressionCV(
    Cs           = [0.01, 0.1, 1, 10],
    cv           = tscv,
    scoring      = "roc_auc",
    class_weight = "balanced",
    max_iter     = 3000,
    n_jobs       = -1,
    verbose      = 1,
    solver       = "lbfgs",
)
lr_meta.fit(lvl_tr, y_tr)
print("Meta‑learner (LogRegCV) best C:", lr_meta.C_[0])

p_meta_lr = lr_meta.predict_proba(lvl_te)[:, 1]
quick_report("Meta (LogRegCV)", y_te, p_meta_lr)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    5.1s remaining:    7.6s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    6.3s finished


Meta‑learner (LogRegCV) best C: 1.0

Meta (LogRegCV)  →  ROC‑AUC = 0.6823
              precision    recall  f1-score   support

           0      0.661     0.694     0.677    103106
           1      0.594     0.558     0.575     82993

    accuracy                          0.633    186099
   macro avg      0.628     0.626     0.626    186099
weighted avg      0.631     0.633     0.632    186099

